tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jcgryext-mmx.asm (12109B)


      1 ;
      2 ; jcgryext-mmx.asm - grayscale colorspace conversion (MMX)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2011, 2016, 2024, D. R. Commander.
      6 ;
      7 ; Based on the x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
     12 
     13 %include "jcolsamp.inc"
     14 
     15 ; --------------------------------------------------------------------------
     16 ;
     17 ; Convert some rows of samples to the output colorspace.
     18 ;
     19 ; GLOBAL(void)
     20 ; jsimd_rgb_gray_convert_mmx(JDIMENSION img_width, JSAMPARRAY input_buf,
     21 ;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
     22 ;                            int num_rows);
     23 ;
     24 
     25 %define img_width(b)   (b) + 8          ; JDIMENSION img_width  (b = frame base taken right after 'push ebp')
     26 %define input_buf(b)   (b) + 12         ; JSAMPARRAY input_buf
     27 %define output_buf(b)  (b) + 16         ; JSAMPIMAGE output_buf
     28 %define output_row(b)  (b) + 20         ; JDIMENSION output_row
     29 %define num_rows(b)    (b) + 24         ; int num_rows
     30 
     31 %define original_ebp   ebp + 0          ; slot at the aligned ebp that holds the pre-alignment frame base
     32 %define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
     33                                        ; mmword wk[WK_NUM]
     34 %define WK_NUM         2                ; number of mmword work slots; the function uses wk(0) and wk(1)
     35 %define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
     36 
     37    align       32
     38    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_mmx)
     39 ; Converts num_rows rows of RGB_PIXELSIZE-byte input pixels to one Y byte per pixel, SIZEOF_MMWORD pixels per pass.
     40 EXTN(jsimd_rgb_gray_convert_mmx):
     41    push        ebp
     42    mov         eax, esp                    ; eax = frame base: [eax] = caller's ebp, arguments start at eax+8
     43    sub         esp, byte 4                 ; room for the frame-base slot written below
     44    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
     45    mov         [esp], eax                  ; save frame base; 'pop esp' in the epilogue reloads it
     46    mov         ebp, esp                    ; ebp = aligned ebp
     47    lea         esp, [wk(0)]                ; esp = &wk[0], i.e. WK_NUM mmwords below ebp
     48    PUSHPIC     eax                     ; make room for the GOT address
     49    push        ebx                     ; saved here, restored at .return
     50 ;   push        ecx                     ; need not be preserved
     51 ;   push        edx                     ; need not be preserved
     52    push        esi                     ; saved here, restored at .return
     53    push        edi                     ; saved here, restored at .return
     54 
     55    GET_GOT     ebx                     ; get GOT address
     56    MOVPIC      POINTER [gotptr], ebx   ; save GOT address
     57 
     58    mov         ecx, JDIMENSION [img_width(eax)]  ; num_cols
     59    test        ecx, ecx
     60    jz          near .return            ; img_width == 0: nothing to convert (no MMX instruction has run yet)
     61 
     62    push        ecx                     ; park num_cols while ecx holds output_row
     63 
     64    mov         esi, JSAMPIMAGE [output_buf(eax)]  ; esi = output_buf
     65    mov         ecx, JDIMENSION [output_row(eax)]  ; ecx = output_row
     66    mov         edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]  ; edi = output_buf[0]
     67    lea         edi, [edi+ecx*SIZEOF_JSAMPROW]  ; edi = &output_buf[0][output_row]
     68 
     69    pop         ecx                     ; ecx = num_cols again
     70 
     71    mov         esi, JSAMPARRAY [input_buf(eax)]  ; esi = input_buf
     72    mov         eax, INT [num_rows(eax)]  ; eax = num_rows (the frame base in eax is not needed after this)
     73    test        eax, eax
     74    jle         near .return            ; num_rows <= 0: nothing to convert (no MMX instruction has run yet)
     75    ALIGNX      16, 7
     76 .rowloop:
     77    PUSHPIC     eax                     ; presumably saves num_rows; eax is reused for the GOT address below
     78    push        edi                     ; save position in the output row-pointer array
     79    push        esi                     ; save position in the input row-pointer array
     80    push        ecx                     ; col
     81 
     82    mov         esi, JSAMPROW [esi]     ; inptr
     83    mov         edi, JSAMPROW [edi]     ; outptr0
     84    MOVPIC      eax, POINTER [gotptr]   ; load GOT address (eax)
     85 
     86    cmp         ecx, byte SIZEOF_MMWORD
     87    jae         short .columnloop       ; at least SIZEOF_MMWORD columns: load a full group
     88    ALIGNX      16, 7
     89 
     90 %if RGB_PIXELSIZE == 3  ; ---------------
     91 
     92 .column_ld1:
     93    push        eax                     ; eax and edx are used as scratch until .column_ld4
     94    push        edx
     95    lea         ecx, [ecx+ecx*2]        ; imul ecx,RGB_PIXELSIZE
     96    test        cl, SIZEOF_BYTE         ; is the SIZEOF_BYTE bit of the remaining byte count set?
     97    jz          short .column_ld2
     98    sub         ecx, byte SIZEOF_BYTE   ; ecx = offset of the trailing byte
     99    xor         eax, eax
    100    mov         al, byte [esi+ecx]      ; eax = trailing byte, zero-extended
    101 .column_ld2:
    102    test        cl, SIZEOF_WORD         ; is the SIZEOF_WORD bit of the remaining byte count set?
    103    jz          short .column_ld4
    104    sub         ecx, byte SIZEOF_WORD   ; ecx = offset of the trailing word
    105    xor         edx, edx
    106    mov         dx, word [esi+ecx]      ; edx = trailing word, zero-extended
    107    shl         eax, WORD_BIT
    108    or          eax, edx                ; eax = (eax << WORD_BIT) | word
    109 .column_ld4:
    110    movd        mmA, eax                ; mmA = bytes gathered so far
    111    pop         edx
    112    pop         eax                     ; eax = GOT address again
    113    test        cl, SIZEOF_DWORD        ; is the SIZEOF_DWORD bit of the remaining byte count set?
    114    jz          short .column_ld8
    115    sub         ecx, byte SIZEOF_DWORD  ; ecx = offset of the trailing dword
    116    movd        mmG, dword [esi+ecx]
    117    psllq       mmA, DWORD_BIT          ; move the gathered bytes above the dword
    118    por         mmA, mmG                ; mmA = (gathered << DWORD_BIT) | dword
    119 .column_ld8:
    120    test        cl, SIZEOF_MMWORD       ; is one whole mmword available before the gathered tail?
    121    jz          short .column_ld16
    122    movq        mmG, mmA                ; gathered tail becomes the 2nd input mmword
    123    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]  ; 1st input mmword is loaded whole
    124    mov         ecx, SIZEOF_MMWORD      ; so that 'sub ecx' after the store leaves 0 columns
    125    jmp         short .rgb_gray_cnv
    126 .column_ld16:
    127    test        cl, 2*SIZEOF_MMWORD     ; are two whole mmwords available before the gathered tail?
    128    mov         ecx, SIZEOF_MMWORD      ; mov leaves the flags from 'test' intact for the jz below
    129    jz          short .rgb_gray_cnv     ; no: the gathered tail in mmA is all there is
    130    movq        mmF, mmA                ; gathered tail becomes the 3rd input mmword
    131    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
    132    movq        mmG, MMWORD [esi+1*SIZEOF_MMWORD]
    133    jmp         short .rgb_gray_cnv
    134    ALIGNX      16, 7
    135 
    136 .columnloop:
    137    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
    138    movq        mmG, MMWORD [esi+1*SIZEOF_MMWORD]
    139    movq        mmF, MMWORD [esi+2*SIZEOF_MMWORD]
    140 
    141 .rgb_gray_cnv:
    142    ; mmA=(00 10 20 01 11 21 02 12)
    143    ; mmG=(22 03 13 23 04 14 24 05)
    144    ; mmF=(15 25 06 16 26 07 17 27)
    145 
    146    movq        mmD, mmA
    147    psllq       mmA, 4*BYTE_BIT         ; mmA=(-- -- -- -- 00 10 20 01)
    148    psrlq       mmD, 4*BYTE_BIT         ; mmD=(11 21 02 12 -- -- -- --)
    149 
    150    punpckhbw   mmA, mmG                ; mmA=(00 04 10 14 20 24 01 05)
    151    psllq       mmG, 4*BYTE_BIT         ; mmG=(-- -- -- -- 22 03 13 23)
    152 
    153    punpcklbw   mmD, mmF                ; mmD=(11 15 21 25 02 06 12 16)
    154    punpckhbw   mmG, mmF                ; mmG=(22 26 03 07 13 17 23 27)
    155 
    156    movq        mmE, mmA
    157    psllq       mmA, 4*BYTE_BIT         ; mmA=(-- -- -- -- 00 04 10 14)
    158    psrlq       mmE, 4*BYTE_BIT         ; mmE=(20 24 01 05 -- -- -- --)
    159 
    160    punpckhbw   mmA, mmD                ; mmA=(00 02 04 06 10 12 14 16)
    161    psllq       mmD, 4*BYTE_BIT         ; mmD=(-- -- -- -- 11 15 21 25)
    162 
    163    punpcklbw   mmE, mmG                ; mmE=(20 22 24 26 01 03 05 07)
    164    punpckhbw   mmD, mmG                ; mmD=(11 13 15 17 21 23 25 27)
    165 
    166    pxor        mmH, mmH                ; mmH = 0, interleaved below to widen bytes to words
    167 
    168    movq        mmC, mmA
    169    punpcklbw   mmA, mmH                ; mmA=(00 02 04 06)
    170    punpckhbw   mmC, mmH                ; mmC=(10 12 14 16)
    171 
    172    movq        mmB, mmE
    173    punpcklbw   mmE, mmH                ; mmE=(20 22 24 26)
    174    punpckhbw   mmB, mmH                ; mmB=(01 03 05 07)
    175 
    176    movq        mmF, mmD
    177    punpcklbw   mmD, mmH                ; mmD=(11 13 15 17)
    178    punpckhbw   mmF, mmH                ; mmF=(21 23 25 27)
    179 
    180 %else  ; RGB_PIXELSIZE == 4 ; -----------
    181 
    182 .column_ld1:
    183    test        cl, SIZEOF_MMWORD/8     ; is the SIZEOF_MMWORD/8 bit of the remaining column count set?
    184    jz          short .column_ld2
    185    sub         ecx, byte SIZEOF_MMWORD/8
    186    movd        mmA, dword [esi+ecx*RGB_PIXELSIZE]  ; load the trailing dword at column ecx
    187 .column_ld2:
    188    test        cl, SIZEOF_MMWORD/4     ; is the SIZEOF_MMWORD/4 bit of the remaining column count set?
    189    jz          short .column_ld4
    190    sub         ecx, byte SIZEOF_MMWORD/4
    191    movq        mmF, mmA                ; what was gathered so far moves to the 2nd input mmword
    192    movq        mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]  ; load the mmword at column ecx
    193 .column_ld4:
    194    test        cl, SIZEOF_MMWORD/2     ; is the SIZEOF_MMWORD/2 bit of the remaining column count set?
    195    mov         ecx, SIZEOF_MMWORD      ; mov leaves the flags from 'test' intact; 'sub ecx' after the store then leaves 0
    196    jz          short .rgb_gray_cnv
    197    movq        mmD, mmA                ; gathered tail moves to the 3rd and 4th input mmwords
    198    movq        mmC, mmF
    199    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
    200    movq        mmF, MMWORD [esi+1*SIZEOF_MMWORD]
    201    jmp         short .rgb_gray_cnv
    202    ALIGNX      16, 7
    203 
    204 .columnloop:
    205    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
    206    movq        mmF, MMWORD [esi+1*SIZEOF_MMWORD]
    207    movq        mmD, MMWORD [esi+2*SIZEOF_MMWORD]
    208    movq        mmC, MMWORD [esi+3*SIZEOF_MMWORD]
    209 
    210 .rgb_gray_cnv:
    211    ; mmA=(00 10 20 30 01 11 21 31)
    212    ; mmF=(02 12 22 32 03 13 23 33)
    213    ; mmD=(04 14 24 34 05 15 25 35)
    214    ; mmC=(06 16 26 36 07 17 27 37)
    215 
    216    movq        mmB, mmA
    217    punpcklbw   mmA, mmF                ; mmA=(00 02 10 12 20 22 30 32)
    218    punpckhbw   mmB, mmF                ; mmB=(01 03 11 13 21 23 31 33)
    219 
    220    movq        mmG, mmD
    221    punpcklbw   mmD, mmC                ; mmD=(04 06 14 16 24 26 34 36)
    222    punpckhbw   mmG, mmC                ; mmG=(05 07 15 17 25 27 35 37)
    223 
    224    movq        mmE, mmA
    225    punpcklwd   mmA, mmD                ; mmA=(00 02 04 06 10 12 14 16)
    226    punpckhwd   mmE, mmD                ; mmE=(20 22 24 26 30 32 34 36)
    227 
    228    movq        mmH, mmB
    229    punpcklwd   mmB, mmG                ; mmB=(01 03 05 07 11 13 15 17)
    230    punpckhwd   mmH, mmG                ; mmH=(21 23 25 27 31 33 35 37)
    231 
    232    pxor        mmF, mmF                ; mmF = 0, interleaved below to widen bytes to words
    233 
    234    movq        mmC, mmA
    235    punpcklbw   mmA, mmF                ; mmA=(00 02 04 06)
    236    punpckhbw   mmC, mmF                ; mmC=(10 12 14 16)
    237 
    238    movq        mmD, mmB
    239    punpcklbw   mmB, mmF                ; mmB=(01 03 05 07)
    240    punpckhbw   mmD, mmF                ; mmD=(11 13 15 17)
    241 
    242    movq        mmG, mmE
    243    punpcklbw   mmE, mmF                ; mmE=(20 22 24 26)
    244    punpckhbw   mmG, mmF                ; mmG=(30 32 34 36)
    245 
    246    punpcklbw   mmF, mmH                ; mmH bytes land in the high byte of each word (mmF was 0)
    247    punpckhbw   mmH, mmH                ; each high mmH byte duplicated into both bytes of a word
    248    psrlw       mmF, BYTE_BIT           ; mmF=(21 23 25 27)
    249    psrlw       mmH, BYTE_BIT           ; mmH=(31 33 35 37)
    250 
    251 %endif  ; RGB_PIXELSIZE ; ---------------
    252 
    253    ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
    254    ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
    255 
    256    ; (Original)
    257    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    258    ;
    259    ; (This implementation)
    260    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
    261 
    262    movq        mm6, mm1
    263    punpcklwd   mm1, mm3
    264    punpckhwd   mm6, mm3
    265    pmaddwd     mm1, [GOTOFF(eax,PW_F0299_F0337)]  ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    266    pmaddwd     mm6, [GOTOFF(eax,PW_F0299_F0337)]  ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
    267 
    268    movq        mm7,  mm6               ; mm7=ROH*FIX(0.299)+GOH*FIX(0.337)
    269 
    270    movq        mm6, mm0
    271    punpcklwd   mm0, mm2
    272    punpckhwd   mm6, mm2
    273    pmaddwd     mm0, [GOTOFF(eax,PW_F0299_F0337)]  ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
    274    pmaddwd     mm6, [GOTOFF(eax,PW_F0299_F0337)]  ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
    275 
    276    movq        MMWORD [wk(0)], mm0     ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
    277    movq        MMWORD [wk(1)], mm6     ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
    278 
    279    movq        mm0, mm5                ; mm0=BO
    280    movq        mm6, mm4                ; mm6=BE
    281 
    282    movq        mm4, mm0
    283    punpcklwd   mm0, mm3
    284    punpckhwd   mm4, mm3
    285    pmaddwd     mm0, [GOTOFF(eax,PW_F0114_F0250)]  ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    286    pmaddwd     mm4, [GOTOFF(eax,PW_F0114_F0250)]  ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
    287 
    288    movq        mm3, [GOTOFF(eax,PD_ONEHALF)]  ; mm3=[PD_ONEHALF]
    289 
    290    paddd       mm0, mm1                ; mm0 += ROL*FIX(0.299)+GOL*FIX(0.337)
    291    paddd       mm4, mm7                ; mm4 += ROH*FIX(0.299)+GOH*FIX(0.337)
    292    paddd       mm0, mm3                ; mm0 += [PD_ONEHALF], added before the SCALEBITS shift
    293    paddd       mm4, mm3                ; mm4 += [PD_ONEHALF]
    294    psrld       mm0, SCALEBITS          ; mm0=YOL
    295    psrld       mm4, SCALEBITS          ; mm4=YOH
    296    packssdw    mm0, mm4                ; mm0=YO
    297 
    298    movq        mm4, mm6
    299    punpcklwd   mm6, mm2
    300    punpckhwd   mm4, mm2
    301    pmaddwd     mm6, [GOTOFF(eax,PW_F0114_F0250)]  ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    302    pmaddwd     mm4, [GOTOFF(eax,PW_F0114_F0250)]  ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
    303 
    304    movq        mm2, [GOTOFF(eax,PD_ONEHALF)]      ; mm2=[PD_ONEHALF]
    305 
    306    paddd       mm6, MMWORD [wk(0)]     ; mm6 += REL*FIX(0.299)+GEL*FIX(0.337)
    307    paddd       mm4, MMWORD [wk(1)]     ; mm4 += REH*FIX(0.299)+GEH*FIX(0.337)
    308    paddd       mm6, mm2                ; mm6 += [PD_ONEHALF], added before the SCALEBITS shift
    309    paddd       mm4, mm2                ; mm4 += [PD_ONEHALF]
    310    psrld       mm6, SCALEBITS          ; mm6=YEL
    311    psrld       mm4, SCALEBITS          ; mm4=YEH
    312    packssdw    mm6, mm4                ; mm6=YE
    313 
    314    psllw       mm0, BYTE_BIT           ; move each YO value into the high byte of its word
    315    por         mm6, mm0                ; mm6=Y
    316    movq        MMWORD [edi], mm6       ; Save Y
    317 
    318    sub         ecx, byte SIZEOF_MMWORD  ; one group of SIZEOF_MMWORD columns done
    319    add         esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; inptr
    320    add         edi, byte SIZEOF_MMWORD                ; outptr0
    321    cmp         ecx, byte SIZEOF_MMWORD
    322    jae         near .columnloop        ; another full group remains
    323    test        ecx, ecx
    324    jnz         near .column_ld1        ; a partial group remains: gather it piecewise
    325 
    326    pop         ecx                     ; col
    327    pop         esi                     ; back to the input row-pointer array
    328    pop         edi                     ; back to the output row-pointer array
    329    POPPIC      eax                     ; presumably restores the row counter saved by PUSHPIC at .rowloop
    330 
    331    add         esi, byte SIZEOF_JSAMPROW  ; input_buf
    332    add         edi, byte SIZEOF_JSAMPROW  ; next output row pointer
    333    dec         eax                        ; num_rows
    334    jg          near .rowloop              ; loop while rows remain
    335 
    336    emms                                ; empty MMX state
    337 
    338 .return:
    339    pop         edi                     ; restore registers pushed in the prologue
    340    pop         esi
    341 ;   pop         edx                     ; need not be preserved
    342 ;   pop         ecx                     ; need not be preserved
    343    pop         ebx
    344    mov         esp, ebp                ; esp <- aligned ebp
    345    pop         esp                     ; esp <- frame base saved at [ebp] (points at the pushed caller ebp)
    346    pop         ebp                     ; restore the caller's ebp
    347    ret
    348 
    349 ; For some reason, the OS X linker does not honor the request to align the
    350 ; segment unless we do this.
    351    align       32                      ; pad the end of this code to a 32-byte boundary