tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jcgryext-sse2.asm (12887B)


      1 ;
      2 ; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2)
      3 ;
      4 ; Copyright (C) 2011, 2016, 2024, D. R. Commander.
      5 ; Copyright (C) 2018, Matthias Räncker.
      6 ; Copyright (C) 2023, Aliaksiej Kandracienka.
      7 ;
      8 ; Based on the x86 SIMD extension for IJG JPEG library
      9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     11 ;
     12 ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
     13 
     14 %include "jcolsamp.inc"
     15 
     16 ; --------------------------------------------------------------------------
     17 ;
     18 ; Convert some rows of samples to the output colorspace.
     19 ;
        ; For each of num_rows rows, read img_width interleaved pixels of
        ; RGB_PIXELSIZE bytes each from input_buf and store one luminance (Y)
        ; byte per pixel into component plane 0 of output_buf, starting at row
        ; output_row.  Columns are processed SIZEOF_XMMWORD at a time; a final
        ; partial group is assembled with progressively wider loads so that the
        ; input row is not read past its last pixel.
        ;
        ; Output is always written with a full, aligned XMMWORD store (movdqa),
        ; even for a partial final group, so each output row must be
        ; XMMWORD-aligned and padded to a whole number of XMMWORDs.
        ;
     20 ; GLOBAL(void)
     21 ; jsimd_rgb_gray_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
     22 ;                             JSAMPIMAGE output_buf, JDIMENSION output_row,
     23 ;                             int num_rows);
     24 ;
     25 
        ; Argument registers as read by the function body.  Presumably they are
        ; loaded by COLLECT_ARGS, which is defined outside this file -- confirm
        ; against its definition.
     26 ; r10d = JDIMENSION img_width
     27 ; r11 = JSAMPARRAY input_buf
     28 ; r12 = JSAMPIMAGE output_buf
     29 ; r13d = JDIMENSION output_row
     30 ; r14d = int num_rows
     31 
        ; wk(i) is the address of the i-th XMMWORD scratch slot.  The slots sit
        ; immediately below r15, which the prologue sets to the aligned stack
        ; pointer before reserving WK_NUM slots.
     32 %define wk(i)   r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
     33 %define WK_NUM  2
     34 
     35    align       32
     36    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_sse2)
     37 
     38 EXTN(jsimd_rgb_gray_convert_sse2):
     39    ENDBR64
     40    push        rbp
     41    mov         rbp, rsp
     42    push        r15                     ; saved at [rbp-8]; restored in .return
     43    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
     44    ; Allocate stack space for wk array.  r15 is used to access it.
     45    mov         r15, rsp
     46    sub         rsp, byte (SIZEOF_XMMWORD * WK_NUM)
     47    COLLECT_ARGS 5
     48    push        rbx                     ; popped again at .return
     49 
     50    mov         ecx, r10d               ; rcx = img_width (zero-extended)
     51    test        rcx, rcx
     52    jz          near .return            ; zero-width image: nothing to convert
     53 
     54    push        rcx                     ; keep img_width while rcx is borrowed
     55 
           ; rdi = &output_buf[0][output_row], i.e. the first output row pointer
           ; of component plane 0.
     56    mov         rsi, r12
     57    mov         ecx, r13d
     58    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
     59    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]
     60 
     61    pop         rcx                     ; rcx = img_width again
     62 
     63    mov         rsi, r11                ; rsi = input_buf (array of row pointers)
     64    mov         eax, r14d               ; rax = num_rows (zero-extended)
     65    test        rax, rax
     66    jle         near .return
           ; NOTE(review): because eax was zero-extended into rax, the signed
           ; test above can only fire for num_rows == 0; a negative num_rows
           ; would be treated as a large positive count.  Confirm callers never
           ; pass a negative value.
     67 .rowloop:
           ; Per-row state: rsi/rdi walk the row-pointer arrays, rcx holds the
           ; column count.  All three are saved here and restored after the row.
     68    push        rdi
     69    push        rsi
     70    push        rcx                     ; col
     71 
     72    mov         rsip, JSAMPROW [rsi]    ; inptr
     73    mov         rdip, JSAMPROW [rdi]    ; outptr0
     74 
     75    cmp         rcx, byte SIZEOF_XMMWORD
     76    jae         near .columnloop        ; at least one full group of columns
     77 
     78 %if RGB_PIXELSIZE == 3  ; ---------------
     79 
           ; Partial group, 3 bytes per pixel.  rcx becomes the remaining byte
           ; count; each set bit of that count (1, 2, 4, 8, 16, 32) triggers one
           ; load of that size from the end of the data, and previously loaded
           ; bytes are shifted up to make room.  The result is laid out in
           ; xmmA/xmmF/xmmB as if a full group had been loaded.
     80 .column_ld1:
     81    push        rax                     ; rax/rdx are scratch for the
     82    push        rdx                     ; byte/word loads below
     83    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
     84    test        cl, SIZEOF_BYTE
     85    jz          short .column_ld2
     86    sub         rcx, byte SIZEOF_BYTE
     87    movzx       rax, byte [rsi+rcx]
     88 .column_ld2:
     89    test        cl, SIZEOF_WORD
     90    jz          short .column_ld4
     91    sub         rcx, byte SIZEOF_WORD
     92    movzx       rdx, word [rsi+rcx]
     93    shl         rax, WORD_BIT
     94    or          rax, rdx
     95 .column_ld4:
     96    movd        xmmA, eax               ; move the 0-3 gathered bytes into SIMD
     97    pop         rdx
     98    pop         rax
     99    test        cl, SIZEOF_DWORD
    100    jz          short .column_ld8
    101    sub         rcx, byte SIZEOF_DWORD
    102    movd        xmmF, XMM_DWORD [rsi+rcx]
    103    pslldq      xmmA, SIZEOF_DWORD
    104    por         xmmA, xmmF
    105 .column_ld8:
    106    test        cl, SIZEOF_MMWORD
    107    jz          short .column_ld16
    108    sub         rcx, byte SIZEOF_MMWORD
    109    movq        xmmB, XMM_MMWORD [rsi+rcx]
    110    pslldq      xmmA, SIZEOF_MMWORD
    111    por         xmmA, xmmB
    112 .column_ld16:
    113    test        cl, SIZEOF_XMMWORD
    114    jz          short .column_ld32
    115    movdqa      xmmF, xmmA              ; gathered tail becomes the 2nd XMMWORD
    116    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    117    mov         rcx, SIZEOF_XMMWORD     ; count this as one final full group
    118    jmp         short .rgb_gray_cnv
    119 .column_ld32:
    120    test        cl, 2*SIZEOF_XMMWORD
    121    mov         rcx, SIZEOF_XMMWORD     ; mov leaves the flags from test intact
    122    jz          short .rgb_gray_cnv
    123    movdqa      xmmB, xmmA              ; gathered tail becomes the 3rd XMMWORD
    124    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    125    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    126    jmp         short .rgb_gray_cnv
    127 
           ; Full group: three unaligned XMMWORD loads of packed 3-byte pixels.
    128 .columnloop:
    129    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    130    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    131    movdqu      xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
    132 
           ; De-interleave the packed pixels into per-component vectors.  In the
           ; lane diagrams below, the first hex digit is the component index
           ; within the pixel and the second is the column (0-F).
    133 .rgb_gray_cnv:
    134    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
    135    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    136    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
    137 
    138    movdqa      xmmG, xmmA
    139    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
    140    psrldq      xmmG, 8     ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
    141 
    142    punpckhbw   xmmA, xmmF  ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
    143    pslldq      xmmF, 8     ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
    144 
    145    punpcklbw   xmmG, xmmB  ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
    146    punpckhbw   xmmF, xmmB  ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
    147 
    148    movdqa      xmmD, xmmA
    149    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
    150    psrldq      xmmD, 8     ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
    151 
    152    punpckhbw   xmmA, xmmG  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
    153    pslldq      xmmG, 8     ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
    154 
    155    punpcklbw   xmmD, xmmF  ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
    156    punpckhbw   xmmG, xmmF  ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
    157 
    158    movdqa      xmmE, xmmA
    159    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
    160    psrldq      xmmE, 8     ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
    161 
    162    punpckhbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    163    pslldq      xmmD, 8     ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
    164 
    165    punpcklbw   xmmE, xmmG  ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
    166    punpckhbw   xmmD, xmmG  ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
    167 
    168    pxor        xmmH, xmmH  ; zero register used to widen bytes to words
    169 
    170    movdqa      xmmC, xmmA
    171    punpcklbw   xmmA, xmmH  ; xmmA=(00 02 04 06 08 0A 0C 0E)
    172    punpckhbw   xmmC, xmmH  ; xmmC=(10 12 14 16 18 1A 1C 1E)
    173 
    174    movdqa      xmmB, xmmE
    175    punpcklbw   xmmE, xmmH  ; xmmE=(20 22 24 26 28 2A 2C 2E)
    176    punpckhbw   xmmB, xmmH  ; xmmB=(01 03 05 07 09 0B 0D 0F)
    177 
    178    movdqa      xmmF, xmmD
    179    punpcklbw   xmmD, xmmH  ; xmmD=(11 13 15 17 19 1B 1D 1F)
    180    punpckhbw   xmmF, xmmH  ; xmmF=(21 23 25 27 29 2B 2D 2F)
    181 
    182 %else  ; RGB_PIXELSIZE == 4 ; -----------
    183 
           ; Partial group, 4 bytes per pixel.  rcx stays a pixel count; each set
           ; bit of the count (1, 2, 4, 8 pixels) triggers one load of that many
           ; pixels from the end of the data, and previously loaded pixels are
           ; moved up to make room.  The result is laid out in
           ; xmmA/xmmE/xmmF/xmmH as if a full group had been loaded.
    184 .column_ld1:
    185    test        cl, SIZEOF_XMMWORD/16
    186    jz          short .column_ld2
    187    sub         rcx, byte SIZEOF_XMMWORD/16
    188    movd        xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
    189 .column_ld2:
    190    test        cl, SIZEOF_XMMWORD/8
    191    jz          short .column_ld4
    192    sub         rcx, byte SIZEOF_XMMWORD/8
    193    movq        xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
    194    pslldq      xmmA, SIZEOF_MMWORD
    195    por         xmmA, xmmE
    196 .column_ld4:
    197    test        cl, SIZEOF_XMMWORD/4
    198    jz          short .column_ld8
    199    sub         rcx, byte SIZEOF_XMMWORD/4
    200    movdqa      xmmE, xmmA              ; gathered tail becomes the 2nd XMMWORD
    201    movdqu      xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
    202 .column_ld8:
    203    test        cl, SIZEOF_XMMWORD/2
    204    mov         rcx, SIZEOF_XMMWORD     ; mov leaves the flags from test intact
    205    jz          short .rgb_gray_cnv
    206    movdqa      xmmF, xmmA              ; gathered tail becomes the 3rd and
    207    movdqa      xmmH, xmmE              ; 4th XMMWORDs
    208    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    209    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    210    jmp         short .rgb_gray_cnv
    211 
           ; Full group: four unaligned XMMWORD loads of packed 4-byte pixels.
    212 .columnloop:
    213    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    214    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    215    movdqu      xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
    216    movdqu      xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
    217 
           ; De-interleave the packed pixels into per-component vectors.  In the
           ; lane diagrams below, the first hex digit is the component index
           ; within the pixel and the second is the column (0-F).
    218 .rgb_gray_cnv:
    219    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
    220    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    221    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
    222    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
    223 
    224    movdqa      xmmD, xmmA
    225    punpcklbw   xmmA, xmmE      ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
    226    punpckhbw   xmmD, xmmE      ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
    227 
    228    movdqa      xmmC, xmmF
    229    punpcklbw   xmmF, xmmH      ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
    230    punpckhbw   xmmC, xmmH      ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
    231 
    232    movdqa      xmmB, xmmA
    233    punpcklwd   xmmA, xmmF      ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
    234    punpckhwd   xmmB, xmmF      ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
    235 
    236    movdqa      xmmG, xmmD
    237    punpcklwd   xmmD, xmmC      ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
    238    punpckhwd   xmmG, xmmC      ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
    239 
    240    movdqa      xmmE, xmmA
    241    punpcklbw   xmmA, xmmD      ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    242    punpckhbw   xmmE, xmmD      ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
    243 
    244    movdqa      xmmH, xmmB
    245    punpcklbw   xmmB, xmmG      ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
    246    punpckhbw   xmmH, xmmG      ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
    247 
    248    pxor        xmmF, xmmF      ; zero register used to widen bytes to words
    249 
    250    movdqa      xmmC, xmmA
    251    punpcklbw   xmmA, xmmF      ; xmmA=(00 02 04 06 08 0A 0C 0E)
    252    punpckhbw   xmmC, xmmF      ; xmmC=(10 12 14 16 18 1A 1C 1E)
    253 
    254    movdqa      xmmD, xmmB
    255    punpcklbw   xmmB, xmmF      ; xmmB=(01 03 05 07 09 0B 0D 0F)
    256    punpckhbw   xmmD, xmmF      ; xmmD=(11 13 15 17 19 1B 1D 1F)
    257 
    258    movdqa      xmmG, xmmE
    259    punpcklbw   xmmE, xmmF      ; xmmE=(20 22 24 26 28 2A 2C 2E)
    260    punpckhbw   xmmG, xmmF      ; xmmG=(30 32 34 36 38 3A 3C 3E)
    261 
           ; All eight registers are live here, so xmmH is widened without a
           ; separate zero source: its bytes are unpacked into the high byte of
           ; each word (xmmF is still zero on entry to the first unpack) and
           ; then shifted down by BYTE_BIT.
    262    punpcklbw   xmmF, xmmH
    263    punpckhbw   xmmH, xmmH
    264    psrlw       xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)
    265    psrlw       xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)
    266 
    267 %endif  ; RGB_PIXELSIZE ; ---------------
    268 
           ; From here on the code addresses xmm0-xmm7 directly.  NOTE(review):
           ; the mapping from xmmA-xmmH to these registers is not defined in
           ; this file (presumably in jcolsamp.inc, per pixel format) -- confirm.
    269    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
    270    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
    271 
    272    ; (Original)
    273    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    274    ;
    275    ; (This implementation)
    276    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
           ;
           ; The G term is split across the two pmaddwd constant pairs (R,G) and
           ; (B,G).  Presumably this keeps every fixed-point coefficient within
           ; pmaddwd's signed 16-bit operand range -- confirm against the
           ; definitions of PW_F0299_F0337 and PW_F0114_F0250.
    277 
           ; Odd columns: interleave R and G words so that each pmaddwd lane
           ; yields one 32-bit R*0.299 + G*0.337 sum.
    278    movdqa      xmm6, xmm1
    279    punpcklwd   xmm1, xmm3
    280    punpckhwd   xmm6, xmm3
    281    pmaddwd     xmm1, [rel PW_F0299_F0337]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    282    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
    283 
    284    movdqa      xmm7, xmm6              ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
    285 
           ; Even columns: same R/G partial sums, parked in wk[] because the
           ; registers are needed for the B/G terms.
    286    movdqa      xmm6, xmm0
    287    punpcklwd   xmm0, xmm2
    288    punpckhwd   xmm6, xmm2
    289    pmaddwd     xmm0, [rel PW_F0299_F0337]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
    290    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
    291 
    292    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
    293    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
    294 
    295    movdqa      xmm0, xmm5              ; xmm0=BO
    296    movdqa      xmm6, xmm4              ; xmm6=BE
    297 
           ; Odd columns: B*0.114 + G*0.250, then add the R/G partial sums and
           ; the rounding constant, and scale back down.
    298    movdqa      xmm4, xmm0
    299    punpcklwd   xmm0, xmm3
    300    punpckhwd   xmm4, xmm3
    301    pmaddwd     xmm0, [rel PW_F0114_F0250]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    302    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
    303 
    304    movdqa      xmm3, [rel PD_ONEHALF]      ; xmm3=[PD_ONEHALF]
    305 
    306    paddd       xmm0, xmm1
    307    paddd       xmm4, xmm7
    308    paddd       xmm0, xmm3
    309    paddd       xmm4, xmm3
    310    psrld       xmm0, SCALEBITS         ; xmm0=YOL
    311    psrld       xmm4, SCALEBITS         ; xmm4=YOH
    312    packssdw    xmm0, xmm4              ; xmm0=YO
    313 
           ; Even columns: same computation, using the partial sums saved in wk[].
    314    movdqa      xmm4, xmm6
    315    punpcklwd   xmm6, xmm2
    316    punpckhwd   xmm4, xmm2
    317    pmaddwd     xmm6, [rel PW_F0114_F0250]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    318    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
    319 
    320    movdqa      xmm2, [rel PD_ONEHALF]      ; xmm2=[PD_ONEHALF]
    321 
    322    paddd       xmm6, XMMWORD [wk(0)]
    323    paddd       xmm4, XMMWORD [wk(1)]
    324    paddd       xmm6, xmm2
    325    paddd       xmm4, xmm2
    326    psrld       xmm6, SCALEBITS         ; xmm6=YEL
    327    psrld       xmm4, SCALEBITS         ; xmm4=YEH
    328    packssdw    xmm6, xmm4              ; xmm6=YE
    329 
           ; Re-interleave: odd-column Y values move to the high byte of each
           ; word and are merged with the even-column values in the low byte,
           ; giving the Y bytes in column order.
    330    psllw       xmm0, BYTE_BIT
    331    por         xmm6, xmm0              ; xmm6=Y
    332    movdqa      XMMWORD [rdi], xmm6     ; Save Y
    333 
           ; Advance to the next group.  After a partial group rcx was forced to
           ; SIZEOF_XMMWORD, so the subtraction leaves 0 and the row ends.
    334    sub         rcx, byte SIZEOF_XMMWORD
    335    add         rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
    336    add         rdi, byte SIZEOF_XMMWORD                ; outptr0
    337    cmp         rcx, byte SIZEOF_XMMWORD
    338    jae         near .columnloop        ; another full group remains
    339    test        rcx, rcx
    340    jnz         near .column_ld1        ; fewer than a full group remains
    341 
    342    pop         rcx                     ; col
    343    pop         rsi
    344    pop         rdi
    345 
    346    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
    347    add         rdi, byte SIZEOF_JSAMPROW
    348    dec         rax                        ; num_rows
    349    jg          near .rowloop
    350 
    351 .return:
    352    pop         rbx
    353    UNCOLLECT_ARGS 5
    354    lea         rsp, [rbp-8]            ; rsp -> saved r15; drops the aligned wk area
    355    pop         r15
    356    pop         rbp
    357    ret
    358 
    359 ; For some reason, the OS X linker does not honor the request to align the
    360 ; segment unless we do this.
    361    align       32