tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jcsample-mmx.asm (9482B)


      1 ;
      2 ; jcsample-mmx.asm - downsampling (MMX)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2016, 2024, D. R. Commander.
      6 ;
      7 ; Based on the x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
     12 
     13 %include "jsimdext.inc"
     14 
     15 ; --------------------------------------------------------------------------
     16    SECTION     SEG_TEXT
     17    BITS        32
     18 ;
     19 ; Downsample pixel values of a single component.
     20 ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
     21 ; without smoothing.
     22 ;
     23 ; GLOBAL(void)
     24 ; jsimd_h2v1_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor,
     25 ;                           JDIMENSION v_samp_factor,
     26 ;                           JDIMENSION width_in_blocks, JSAMPARRAY input_data,
     27 ;                           JSAMPARRAY output_data);
     28 ;
        ; What the code below does, step by step:
        ;   1. output_cols = width_in_blocks * 8.  If that is 0, return at once.
        ;   2. Right-edge expansion: when output_cols * 2 > image_width, each of
        ;      the first max_v_samp_factor rows of input_data is padded in place
        ;      by replicating the sample at row[image_width - 1] into
        ;      row[image_width .. output_cols * 2 - 1].
        ;   3. For each of v_samp_factor rows, every horizontally adjacent pair
        ;      of input samples (a, b) yields one output sample
        ;      (a + b + bias) >> 1, where bias alternates 0, 1, 0, 1, ... across
        ;      the output columns.
        ;
        ; Arguments are read from the stack through ebp (first argument at
        ; ebp + 8).  esi and edi are saved and restored; eax, ecx, edx and
        ; mm0-mm3, mm6, mm7 are overwritten; ebx is not touched.  emms is
        ; executed before returning on every path that used MMX registers (both
        ; early exits are taken before the first MMX instruction).
        ;
        ; NOTE(review): the column loop reads output_cols * 2 bytes from each
        ; input row and the padding step writes up to that offset, so the row
        ; buffers are presumably allocated at least that wide -- confirm with
        ; the caller.  All counts are tested with signed conditions (jle/jg).
     29 
        ; Offsets of the stack arguments relative to the frame pointer that is
        ; passed as b (ebp in the function below).
     30 %define img_width(b)    (b) + 8         ; JDIMENSION image_width
     31 %define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
     32 %define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
     33 %define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
     34 %define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
     35 %define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
     36 
     37    align       32
     38    GLOBAL_FUNCTION(jsimd_h2v1_downsample_mmx)
     39 
     40 EXTN(jsimd_h2v1_downsample_mmx):
     41    push        ebp
     42    mov         ebp, esp
     43 ;   push        ebx                     ; unused
     44 ;   push        ecx                     ; need not be preserved
     45 ;   push        edx                     ; need not be preserved
     46    push        esi
     47    push        edi
     48 
     49    mov         ecx, JDIMENSION [width_blks(ebp)]
     50    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
        ; ZF comes from the shift result: no output columns means nothing to do.
     51    jz          near .return
     52 
     53    mov         edx, JDIMENSION [img_width(ebp)]
     54 
     55    ; -- expand_right_edge
        ; ecx becomes the pad count (output_cols * 2 - image_width); the
        ; original output_cols is kept on the stack and popped at .expand_end.
        ; Padding is skipped when the pad count or max_v_samp_factor is <= 0.
     56 
     57    push        ecx
     58    shl         ecx, 1                  ; output_cols * 2
     59    sub         ecx, edx
     60    jle         short .expand_end
     61 
     62    mov         eax, INT [max_v_samp(ebp)]
     63    test        eax, eax
     64    jle         short .expand_end
     65 
        ; cld makes rep stosb store at ascending addresses.
     66    cld
     67    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
     68    ALIGNX      16, 7
     69 .expandloop:
        ; One iteration per row.  eax (remaining rows) and ecx (pad count) are
        ; saved because al is reused as the fill byte and rep stosb counts ecx
        ; down to zero.
     70    push        eax
     71    push        ecx
     72 
        ; edi = &row[image_width]; al = row[image_width - 1], the last real
        ; sample of the row.
        ; NOTE(review): this reads row[image_width - 1], so image_width is
        ; presumably always >= 1 -- confirm with the caller.
     73    mov         edi, JSAMPROW [esi]
     74    add         edi, edx
     75    mov         al, JSAMPLE [edi-1]
     76 
        ; Store al into ecx consecutive bytes starting at edi.
     77    rep stosb
     78 
     79    pop         ecx
     80    pop         eax
     81 
        ; Advance to the next row pointer and count the row.
     82    add         esi, byte SIZEOF_JSAMPROW
     83    dec         eax
     84    jg          short .expandloop
     85 
     86 .expand_end:
     87    pop         ecx                     ; output_cols
     88 
     89    ; -- h2v1_downsample
     90 
     91    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
     92    test        eax, eax
     93    jle         near .return
     94 
        ; Build the two loop-invariant constants:
        ;   mm7 = rounding bias, one 16-bit word per output sample, alternating
        ;         0 and 1 (the low word of edx is 0, the high word is 1, and
        ;         punpckldq copies that dword into both halves of mm7).
        ;   mm6 = all ones shifted right by BYTE_BIT within each word, i.e. a
        ;         mask that keeps the low byte of every word.
     95    mov         edx, 0x00010000         ; bias pattern
     96    movd        mm7, edx
     97    pcmpeqw     mm6, mm6
     98    punpckldq   mm7, mm7                ; mm7={0, 1, 0, 1}
     99    psrlw       mm6, BYTE_BIT           ; mm6={0xFF 0x00 0xFF 0x00 ..}
    100 
    101    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
    102    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
    103    ALIGNX      16, 7
    104 .rowloop:
        ; One iteration per output row.  ecx (output_cols) and the two row-array
        ; cursors are saved because the column loop counts ecx down and reuses
        ; esi/edi as sample pointers.
    105    push        ecx
    106    push        edi
    107    push        esi
    108 
    109    mov         esi, JSAMPROW [esi]     ; inptr
    110    mov         edi, JSAMPROW [edi]     ; outptr
    111    ALIGNX      16, 7
    112 .columnloop:
        ; Each iteration reads two MMWORDs (16 input samples) and writes one
        ; MMWORD (8 output samples).
    113 
    114    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]
    115    movq        mm1, MMWORD [esi+1*SIZEOF_MMWORD]
    116    movq        mm2, mm0
    117    movq        mm3, mm1
    118 
        ; Split each 16-bit word into its two samples: the mask keeps the low
        ; byte (even-indexed sample) in mm0/mm1, the shift moves the high byte
        ; (odd-indexed sample) down in mm2/mm3.
    119    pand        mm0, mm6
    120    psrlw       mm2, BYTE_BIT
    121    pand        mm1, mm6
    122    psrlw       mm3, BYTE_BIT
    123 
        ; Per word: (even + odd + bias) >> 1.  The result is at most
        ; (255 + 255 + 1) >> 1 = 255, so it fits in a byte.
    124    paddw       mm0, mm2
    125    paddw       mm1, mm3
    126    paddw       mm0, mm7
    127    paddw       mm1, mm7
    128    psrlw       mm0, 1
    129    psrlw       mm1, 1
    130 
        ; Narrow the eight 16-bit results (mm0 first, then mm1) to eight bytes.
    131    packuswb    mm0, mm1
    132 
    133    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0
    134 
        ; ecx is a multiple of 8 (it came from shl ecx, 3), so the count reaches
        ; exactly zero provided SIZEOF_MMWORD is 8 (defined outside this file;
        ; an MMX register is 8 bytes wide).
    135    add         esi, byte 2*SIZEOF_MMWORD  ; inptr
    136    add         edi, byte 1*SIZEOF_MMWORD  ; outptr
    137    sub         ecx, byte SIZEOF_MMWORD    ; outcol
    138    jnz         short .columnloop
    139 
    140    pop         esi
    141    pop         edi
    142    pop         ecx
    143 
        ; 1:1 vertical: input and output both advance by one row.
    144    add         esi, byte SIZEOF_JSAMPROW  ; input_data
    145    add         edi, byte SIZEOF_JSAMPROW  ; output_data
    146    dec         eax                        ; rowctr
    147    jg          short .rowloop
    148 
    149    emms                                ; empty MMX state
    150 
    151 .return:
        ; Common exit; only esi, edi and ebp remain on the stack at this point.
    152    pop         edi
    153    pop         esi
    154 ;   pop         edx                     ; need not be preserved
    155 ;   pop         ecx                     ; need not be preserved
    156 ;   pop         ebx                     ; unused
    157    pop         ebp
    158    ret
    159 
    160 ; --------------------------------------------------------------------------
    161 ;
    162 ; Downsample pixel values of a single component.
    163 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
    164 ; without smoothing.
    165 ;
    166 ; GLOBAL(void)
    167 ; jsimd_h2v2_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor,
    168 ;                           JDIMENSION v_samp_factor,
    169 ;                           JDIMENSION width_in_blocks, JSAMPARRAY input_data,
    170 ;                           JSAMPARRAY output_data);
    171 ;
        ; What the code below does, step by step:
        ;   1. output_cols = width_in_blocks * 8.  If that is 0, return at once.
        ;   2. Right-edge expansion: when output_cols * 2 > image_width, each of
        ;      the first max_v_samp_factor rows of input_data is padded in place
        ;      by replicating the sample at row[image_width - 1] into
        ;      row[image_width .. output_cols * 2 - 1].
        ;   3. For each of v_samp_factor output rows, two consecutive input rows
        ;      are read and every 2x2 group of input samples (a, b / c, d)
        ;      yields one output sample (a + b + c + d + bias) >> 2, where bias
        ;      alternates 1, 2, 1, 2, ... across the output columns.
        ;
        ; Arguments are read from the stack through ebp (first argument at
        ; ebp + 8).  esi and edi are saved and restored; eax, ecx, edx and
        ; mm0-mm7 are overwritten; ebx is not touched.  emms is executed before
        ; returning on every path that used MMX registers (both early exits are
        ; taken before the first MMX instruction).
        ;
        ; NOTE(review): the column loop reads output_cols * 2 bytes from each of
        ; 2 * v_samp_factor input rows, while only max_v_samp_factor rows are
        ; padded; presumably the caller guarantees both the row width and that
        ; the padded rows cover the rows read -- confirm with the caller.  All
        ; counts are tested with signed conditions (jle/jg).
    172 
        ; Offsets of the stack arguments relative to the frame pointer that is
        ; passed as b (ebp in the function below).
    173 %define img_width(b)    (b) + 8         ; JDIMENSION image_width
    174 %define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
    175 %define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
    176 %define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
    177 %define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
    178 %define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
    179 
    180    align       32
    181    GLOBAL_FUNCTION(jsimd_h2v2_downsample_mmx)
    182 
    183 EXTN(jsimd_h2v2_downsample_mmx):
    184    push        ebp
    185    mov         ebp, esp
    186 ;   push        ebx                     ; unused
    187 ;   push        ecx                     ; need not be preserved
    188 ;   push        edx                     ; need not be preserved
    189    push        esi
    190    push        edi
    191 
    192    mov         ecx, JDIMENSION [width_blks(ebp)]
    193    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
        ; ZF comes from the shift result: no output columns means nothing to do.
    194    jz          near .return
    195 
    196    mov         edx, JDIMENSION [img_width(ebp)]
    197 
    198    ; -- expand_right_edge
        ; ecx becomes the pad count (output_cols * 2 - image_width); the
        ; original output_cols is kept on the stack and popped at .expand_end.
        ; Padding is skipped when the pad count or max_v_samp_factor is <= 0.
    199 
    200    push        ecx
    201    shl         ecx, 1                  ; output_cols * 2
    202    sub         ecx, edx
    203    jle         short .expand_end
    204 
    205    mov         eax, INT [max_v_samp(ebp)]
    206    test        eax, eax
    207    jle         short .expand_end
    208 
        ; cld makes rep stosb store at ascending addresses.
    209    cld
    210    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    211    ALIGNX      16, 7
    212 .expandloop:
        ; One iteration per row.  eax (remaining rows) and ecx (pad count) are
        ; saved because al is reused as the fill byte and rep stosb counts ecx
        ; down to zero.
    213    push        eax
    214    push        ecx
    215 
        ; edi = &row[image_width]; al = row[image_width - 1], the last real
        ; sample of the row.
        ; NOTE(review): this reads row[image_width - 1], so image_width is
        ; presumably always >= 1 -- confirm with the caller.
    216    mov         edi, JSAMPROW [esi]
    217    add         edi, edx
    218    mov         al, JSAMPLE [edi-1]
    219 
        ; Store al into ecx consecutive bytes starting at edi.
    220    rep stosb
    221 
    222    pop         ecx
    223    pop         eax
    224 
        ; Advance to the next row pointer and count the row.
    225    add         esi, byte SIZEOF_JSAMPROW
    226    dec         eax
    227    jg          short .expandloop
    228 
    229 .expand_end:
    230    pop         ecx                     ; output_cols
    231 
    232    ; -- h2v2_downsample
    233 
    234    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
    235    test        eax, eax
    236    jle         near .return
    237 
        ; Build the two loop-invariant constants:
        ;   mm7 = rounding bias, one 16-bit word per output sample, alternating
        ;         1 and 2 (the low word of edx is 1, the high word is 2, and
        ;         punpckldq copies that dword into both halves of mm7).
        ;   mm6 = all ones shifted right by BYTE_BIT within each word, i.e. a
        ;         mask that keeps the low byte of every word.
    238    mov         edx, 0x00020001         ; bias pattern
    239    movd        mm7, edx
    240    pcmpeqw     mm6, mm6
    241    punpckldq   mm7, mm7                ; mm7={1, 2, 1, 2}
    242    psrlw       mm6, BYTE_BIT           ; mm6={0xFF 0x00 0xFF 0x00 ..}
    243 
    244    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
    245    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
    246    ALIGNX      16, 7
    247 .rowloop:
        ; One iteration per output row, consuming two input rows.  ecx
        ; (output_cols) and the two row-array cursors are saved because the
        ; column loop counts ecx down and reuses esi/edi as sample pointers.
        ; edx is free to hold inptr0: its earlier contents (image_width, then
        ; the bias pattern) are no longer needed.
    248    push        ecx
    249    push        edi
    250    push        esi
    251 
    252    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
    253    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1
    254    mov         edi, JSAMPROW [edi]                    ; outptr
    255    ALIGNX      16, 7
    256 .columnloop:
        ; Each iteration reads two MMWORDs (16 samples) from each of the two
        ; input rows and writes one MMWORD (8 output samples).
        ;   mm0 / mm2 = first / second MMWORD of row 0 (inptr0)
        ;   mm1 / mm3 = first / second MMWORD of row 1 (inptr1)
    257 
    258    movq        mm0, MMWORD [edx+0*SIZEOF_MMWORD]
    259    movq        mm1, MMWORD [esi+0*SIZEOF_MMWORD]
    260    movq        mm2, MMWORD [edx+1*SIZEOF_MMWORD]
    261    movq        mm3, MMWORD [esi+1*SIZEOF_MMWORD]
    262 
        ; First MMWORD of each row: per 16-bit word, add the low byte
        ; (even-indexed sample) and the high byte (odd-indexed sample).
        ; mm4/mm5 are scratch copies.
    263    movq        mm4, mm0
    264    movq        mm5, mm1
    265    pand        mm0, mm6
    266    psrlw       mm4, BYTE_BIT
    267    pand        mm1, mm6
    268    psrlw       mm5, BYTE_BIT
    269    paddw       mm0, mm4
    270    paddw       mm1, mm5
    271 
        ; Same horizontal pair sums for the second MMWORD of each row.
    272    movq        mm4, mm2
    273    movq        mm5, mm3
    274    pand        mm2, mm6
    275    psrlw       mm4, BYTE_BIT
    276    pand        mm3, mm6
    277    psrlw       mm5, BYTE_BIT
    278    paddw       mm2, mm4
    279    paddw       mm3, mm5
    280 
        ; Add the two rows, add the bias and divide by 4.  The result is at
        ; most (4 * 255 + 2) >> 2 = 255, so it fits in a byte.
    281    paddw       mm0, mm1
    282    paddw       mm2, mm3
    283    paddw       mm0, mm7
    284    paddw       mm2, mm7
    285    psrlw       mm0, 2
    286    psrlw       mm2, 2
    287 
        ; Narrow the eight 16-bit results (mm0 first, then mm2) to eight bytes.
    288    packuswb    mm0, mm2
    289 
    290    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0
    291 
        ; ecx is a multiple of 8 (it came from shl ecx, 3), so the count reaches
        ; exactly zero provided SIZEOF_MMWORD is 8 (defined outside this file;
        ; an MMX register is 8 bytes wide).
    292    add         edx, byte 2*SIZEOF_MMWORD  ; inptr0
    293    add         esi, byte 2*SIZEOF_MMWORD  ; inptr1
    294    add         edi, byte 1*SIZEOF_MMWORD  ; outptr
    295    sub         ecx, byte SIZEOF_MMWORD    ; outcol
    296    jnz         near .columnloop
    297 
    298    pop         esi
    299    pop         edi
    300    pop         ecx
    301 
        ; 2:1 vertical: input advances by two rows per output row.
    302    add         esi, byte 2*SIZEOF_JSAMPROW  ; input_data
    303    add         edi, byte 1*SIZEOF_JSAMPROW  ; output_data
    304    dec         eax                          ; rowctr
    305    jg          near .rowloop
    306 
    307    emms                                ; empty MMX state
    308 
    309 .return:
        ; Common exit; only esi, edi and ebp remain on the stack at this point.
    310    pop         edi
    311    pop         esi
    312 ;   pop         edx                     ; need not be preserved
    313 ;   pop         ecx                     ; need not be preserved
    314 ;   pop         ebx                     ; unused
    315    pop         ebp
    316    ret
    317 
    318 ; For some reason, the OS X linker does not honor the request to align the
    319 ; segment unless we do this.
    320    align       32