tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jcsample-sse2.asm (10345B)


      1 ;
      2 ; jcsample-sse2.asm - downsampling (SSE2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2016, 2024, D. R. Commander.
      6 ;
      7 ; Based on the x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
     12 
     13 %include "jsimdext.inc"
     14 
     15 ; --------------------------------------------------------------------------
     16    SECTION     SEG_TEXT        ; code section; SEG_TEXT is not defined in this file (presumably from jsimdext.inc)
     17    BITS        32              ; assemble what follows as 32-bit code
     18 ;
     19 ; Downsample pixel values of a single component.
     20 ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
     21 ; without smoothing.
     22 ; Each output byte is (even_sample + odd_sample + bias) >> 1, bias = 0,1,0,1,..
     23 ; GLOBAL(void)
     24 ; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
     25 ;                            JDIMENSION v_samp_factor,
     26 ;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
     27 ;                            JSAMPARRAY output_data);
     28 ; Saves/restores ebp, esi, edi; clobbers eax, ecx, edx, xmm0-xmm3, xmm6, xmm7.
     29 ; Argument offsets from the frame pointer b (ebp after the prologue below).
     30 %define img_width(b)    (b) + 8         ; JDIMENSION image_width
     31 %define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
     32 %define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
     33 %define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
     34 %define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
     35 %define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
     36 
     37    align       32
     38    GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)
     39 
     40 EXTN(jsimd_h2v1_downsample_sse2):
     41    push        ebp
     42    mov         ebp, esp                ; arguments are now at [ebp+8]..[ebp+28]
     43 ;   push        ebx                     ; unused
     44 ;   push        ecx                     ; need not be preserved
     45 ;   push        edx                     ; need not be preserved
     46    push        esi
     47    push        edi
     48    ; ecx = width_in_blocks * 8 (output_cols); nothing to do if that is zero
     49    mov         ecx, JDIMENSION [width_blks(ebp)]
     50    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
     51    jz          near .return
     52 
     53    mov         edx, JDIMENSION [img_width(ebp)]  ; edx = image_width
     54 
     55    ; -- expand_right_edge
     56    ; Fill columns image_width .. output_cols*2-1 of each of the max_v_samp_factor input rows with that row's last real sample.
     57    push        ecx                     ; save output_cols (popped at .expand_end)
     58    shl         ecx, 1                  ; output_cols * 2
     59    sub         ecx, edx                ; ecx = number of padding columns
     60    jle         short .expand_end       ; signed <= 0: no padding needed
     61 
     62    mov         eax, INT [max_v_samp(ebp)]  ; eax = number of rows to pad
     63    test        eax, eax
     64    jle         short .expand_end
     65 
     66    cld                                 ; clear DF so stosb advances edi upward
     67    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
     68    ALIGNX      16, 7
     69 .expandloop:
     70    push        eax                     ; al is overwritten with the fill byte below
     71    push        ecx                     ; rep stosb counts ecx down to zero
     72 
     73    mov         edi, JSAMPROW [esi]     ; edi = start of this input row
     74    add         edi, edx                ; edi = first column past image_width
     75    mov         al, JSAMPLE [edi-1]     ; al = last real sample of the row
     76 
     77    rep stosb                           ; store al into ecx bytes starting at edi
     78 
     79    pop         ecx                     ; padding column count again
     80    pop         eax                     ; row counter again
     81 
     82    add         esi, byte SIZEOF_JSAMPROW  ; next entry of input_data
     83    dec         eax
     84    jg          short .expandloop
     85 
     86 .expand_end:
     87    pop         ecx                     ; output_cols
     88 
     89    ; -- h2v1_downsample
     90    ; eax counts v_samp_factor rows; each pass turns one input row into one output row
     91    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
     92    test        eax, eax
     93    jle         near .return
     94    ; xmm7 = rounding bias per output word, xmm6 = mask keeping the low byte of each word
     95    mov         edx, 0x00010000         ; bias pattern
     96    movd        xmm7, edx
     97    pcmpeqw     xmm6, xmm6              ; xmm6 = all ones
     98    pshufd      xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
     99    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
    100 
    101    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
    102    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
    103    ALIGNX      16, 7
    104 .rowloop:
    105    push        ecx                     ; save output_cols for the next row
    106    push        edi                     ; save output_data array pointer
    107    push        esi                     ; save input_data array pointer
    108 
    109    mov         esi, JSAMPROW [esi]     ; inptr
    110    mov         edi, JSAMPROW [edi]     ; outptr
    111    ; fewer than SIZEOF_XMMWORD output columns in the row: fall into the tail path
    112    cmp         ecx, byte SIZEOF_XMMWORD
    113    jae         short .columnloop
    114    ALIGNX      16, 7
    115    ; Tail: load only one XMMWORD of input, zero the second half, and set ecx so that the loop ends after this pass.
    116 .columnloop_r8:
    117    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    118    pxor        xmm1, xmm1
    119    mov         ecx, SIZEOF_XMMWORD
    120    jmp         short .downsample
    121    ALIGNX      16, 7
    122    ; Main path: two XMMWORDs of input produce one XMMWORD of output.
    123 .columnloop:
    124    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    125    movdqa      xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
    126 
    127 .downsample:
    128    movdqa      xmm2, xmm0
    129    movdqa      xmm3, xmm1
    130    ; per 16-bit word: low byte = even-column sample (pand), high byte = odd-column sample (psrlw)
    131    pand        xmm0, xmm6
    132    psrlw       xmm2, BYTE_BIT
    133    pand        xmm1, xmm6
    134    psrlw       xmm3, BYTE_BIT
    135    ; word sum of each horizontal pair, plus bias, halved
    136    paddw       xmm0, xmm2
    137    paddw       xmm1, xmm3
    138    paddw       xmm0, xmm7
    139    paddw       xmm1, xmm7
    140    psrlw       xmm0, 1
    141    psrlw       xmm1, 1
    142    ; narrow the 16 word results to bytes (xmm0 words first, then xmm1)
    143    packuswb    xmm0, xmm1
    144    ; aligned store of a full XMMWORD (also on the tail path)
    145    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
    146 
    147    sub         ecx, byte SIZEOF_XMMWORD    ; outcol
    148    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr
    149    add         edi, byte 1*SIZEOF_XMMWORD  ; outptr
    150    cmp         ecx, byte SIZEOF_XMMWORD
    151    jae         short .columnloop           ; at least a full XMMWORD of output remains
    152    test        ecx, ecx
    153    jnz         short .columnloop_r8        ; a partial XMMWORD of output remains
    154    ; row finished: restore the array pointers and output_cols
    155    pop         esi
    156    pop         edi
    157    pop         ecx
    158    ; one input row and one output row are consumed per iteration
    159    add         esi, byte SIZEOF_JSAMPROW  ; input_data
    160    add         edi, byte SIZEOF_JSAMPROW  ; output_data
    161    dec         eax                        ; rowctr
    162    jg          near .rowloop
    163 
    164 .return:
    165    pop         edi
    166    pop         esi
    167 ;   pop         edx                     ; need not be preserved
    168 ;   pop         ecx                     ; need not be preserved
    169 ;   pop         ebx                     ; unused
    170    pop         ebp
    171    ret
    172 
    173 ; --------------------------------------------------------------------------
    174 ;
    175 ; Downsample pixel values of a single component.
    176 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
    177 ; without smoothing.
    178 ; Each output byte is (sum of a 2x2 input square + bias) >> 2, bias = 1,2,1,2,..
    179 ; GLOBAL(void)
    180 ; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
    181 ;                            JDIMENSION v_samp_factor,
    182 ;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
    183 ;                            JSAMPARRAY output_data);
    184 ; Saves/restores ebp, esi, edi; clobbers eax, ecx, edx, xmm0-xmm7.
    185 ; Argument offsets from the frame pointer b (ebp after the prologue below).
    186 %define img_width(b)    (b) + 8         ; JDIMENSION image_width
    187 %define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
    188 %define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
    189 %define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
    190 %define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
    191 %define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
    192 
    193    align       32
    194    GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)
    195 
    196 EXTN(jsimd_h2v2_downsample_sse2):
    197    push        ebp
    198    mov         ebp, esp                ; arguments are now at [ebp+8]..[ebp+28]
    199 ;   push        ebx                     ; unused
    200 ;   push        ecx                     ; need not be preserved
    201 ;   push        edx                     ; need not be preserved
    202    push        esi
    203    push        edi
    204    ; ecx = width_in_blocks * 8 (output_cols); nothing to do if that is zero
    205    mov         ecx, JDIMENSION [width_blks(ebp)]
    206    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
    207    jz          near .return
    208 
    209    mov         edx, JDIMENSION [img_width(ebp)]  ; edx = image_width
    210 
    211    ; -- expand_right_edge
    212    ; Fill columns image_width .. output_cols*2-1 of each of the max_v_samp_factor input rows with that row's last real sample.
    213    push        ecx                     ; save output_cols (popped at .expand_end)
    214    shl         ecx, 1                  ; output_cols * 2
    215    sub         ecx, edx                ; ecx = number of padding columns
    216    jle         short .expand_end       ; signed <= 0: no padding needed
    217 
    218    mov         eax, INT [max_v_samp(ebp)]  ; eax = number of rows to pad
    219    test        eax, eax
    220    jle         short .expand_end
    221 
    222    cld                                 ; clear DF so stosb advances edi upward
    223    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    224    ALIGNX      16, 7
    225 .expandloop:
    226    push        eax                     ; al is overwritten with the fill byte below
    227    push        ecx                     ; rep stosb counts ecx down to zero
    228 
    229    mov         edi, JSAMPROW [esi]     ; edi = start of this input row
    230    add         edi, edx                ; edi = first column past image_width
    231    mov         al, JSAMPLE [edi-1]     ; al = last real sample of the row
    232 
    233    rep stosb                           ; store al into ecx bytes starting at edi
    234 
    235    pop         ecx                     ; padding column count again
    236    pop         eax                     ; row counter again
    237 
    238    add         esi, byte SIZEOF_JSAMPROW  ; next entry of input_data
    239    dec         eax
    240    jg          short .expandloop
    241 
    242 .expand_end:
    243    pop         ecx                     ; output_cols
    244 
    245    ; -- h2v2_downsample
    246    ; eax counts v_samp_factor output rows; each pass turns two input rows into one output row
    247    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
    248    test        eax, eax
    249    jle         near .return
    250    ; xmm7 = rounding bias per output word, xmm6 = mask keeping the low byte of each word
    251    mov         edx, 0x00020001         ; bias pattern
    252    movd        xmm7, edx
    253    pcmpeqw     xmm6, xmm6              ; xmm6 = all ones
    254    pshufd      xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    255    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
    256 
    257    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
    258    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
    259    ALIGNX      16, 7
    260 .rowloop:
    261    push        ecx                     ; save output_cols for the next row pair
    262    push        edi                     ; save output_data array pointer
    263    push        esi                     ; save input_data array pointer
    264    ; edx/esi = the two consecutive input rows that feed this output row
    265    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
    266    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1
    267    mov         edi, JSAMPROW [edi]                    ; outptr
    268    ; fewer than SIZEOF_XMMWORD output columns in the row: fall into the tail path
    269    cmp         ecx, byte SIZEOF_XMMWORD
    270    jae         short .columnloop
    271    ALIGNX      16, 7
    272    ; Tail: load only one XMMWORD per input row, zero the second halves, and set ecx so that the loop ends after this pass.
    273 .columnloop_r8:
    274    movdqa      xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
    275    movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
    276    pxor        xmm2, xmm2
    277    pxor        xmm3, xmm3
    278    mov         ecx, SIZEOF_XMMWORD
    279    jmp         short .downsample
    280    ALIGNX      16, 7
    281    ; Main path: two XMMWORDs from each input row produce one XMMWORD of output.
    282 .columnloop:
    283    movdqa      xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
    284    movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
    285    movdqa      xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
    286    movdqa      xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
    287    ; first XMMWORD of each row: xmm0/xmm1 = word sums of horizontal pairs (row 0 / row 1)
    288 .downsample:
    289    movdqa      xmm4, xmm0
    290    movdqa      xmm5, xmm1
    291    pand        xmm0, xmm6              ; even-column samples as words
    292    psrlw       xmm4, BYTE_BIT          ; odd-column samples as words
    293    pand        xmm1, xmm6
    294    psrlw       xmm5, BYTE_BIT
    295    paddw       xmm0, xmm4
    296    paddw       xmm1, xmm5
    297    ; second XMMWORD of each row: xmm2/xmm3 = word sums of horizontal pairs (row 0 / row 1)
    298    movdqa      xmm4, xmm2
    299    movdqa      xmm5, xmm3
    300    pand        xmm2, xmm6
    301    psrlw       xmm4, BYTE_BIT
    302    pand        xmm3, xmm6
    303    psrlw       xmm5, BYTE_BIT
    304    paddw       xmm2, xmm4
    305    paddw       xmm3, xmm5
    306    ; add the two rows to get the 2x2 sums, add the bias, divide by 4
    307    paddw       xmm0, xmm1
    308    paddw       xmm2, xmm3
    309    paddw       xmm0, xmm7
    310    paddw       xmm2, xmm7
    311    psrlw       xmm0, 2
    312    psrlw       xmm2, 2
    313    ; narrow the 16 word results to bytes (xmm0 words first, then xmm2)
    314    packuswb    xmm0, xmm2
    315    ; aligned store of a full XMMWORD (also on the tail path)
    316    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
    317 
    318    sub         ecx, byte SIZEOF_XMMWORD    ; outcol
    319    add         edx, byte 2*SIZEOF_XMMWORD  ; inptr0
    320    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr1
    321    add         edi, byte 1*SIZEOF_XMMWORD  ; outptr
    322    cmp         ecx, byte SIZEOF_XMMWORD
    323    jae         near .columnloop            ; at least a full XMMWORD of output remains
    324    test        ecx, ecx
    325    jnz         near .columnloop_r8         ; a partial XMMWORD of output remains
    326    ; row finished: restore the array pointers and output_cols
    327    pop         esi
    328    pop         edi
    329    pop         ecx
    330    ; two input rows and one output row are consumed per iteration
    331    add         esi, byte 2*SIZEOF_JSAMPROW  ; input_data
    332    add         edi, byte 1*SIZEOF_JSAMPROW  ; output_data
    333    dec         eax                          ; rowctr
    334    jg          near .rowloop
    335 
    336 .return:
    337    pop         edi
    338    pop         esi
    339 ;   pop         edx                     ; need not be preserved
    340 ;   pop         ecx                     ; need not be preserved
    341 ;   pop         ebx                     ; unused
    342    pop         ebp
    343    ret
    344 
    345 ; For some reason, the OS X linker does not honor the request to align the
    346 ; segment unless we do this.
    347    align       32              ; pad the current position up to a 32-byte boundary