tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jccolext-sse2.asm (17806B)


      1 ;
      2 ; jccolext-sse2.asm - colorspace conversion (64-bit SSE2)
      3 ;
      4 ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
      5 ; Copyright (C) 2018, Matthias Räncker.
      6 ; Copyright (C) 2023, Aliaksiej Kandracienka.
      7 ;
      8 ; Based on the x86 SIMD extension for IJG JPEG library
      9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     11 ;
     12 ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
     13 
     14 %include "jcolsamp.inc"
     15 
     16 ; --------------------------------------------------------------------------
     17 ;
     18 ; Convert some rows of samples to the output colorspace.
     19 ;
     20 ; GLOBAL(void)
     21 ; jsimd_rgb_ycc_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
     22 ;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
     23 ;                            int num_rows);
     24 ; Returns without converting anything if img_width == 0 or num_rows <= 0.
     25 ; Registers the body reads its arguments from after COLLECT_ARGS 5:
     26 ; r10d = JDIMENSION img_width
     27 ; r11 = JSAMPARRAY input_buf
     28 ; r12 = JSAMPIMAGE output_buf
     29 ; r13d = JDIMENSION output_row
     30 ; r14d = int num_rows
     31 
     32 %define wk(i)   r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
     33 %define WK_NUM  8
     34 
     35    align       32
     36    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_sse2)
     37 
     38 EXTN(jsimd_rgb_ycc_convert_sse2):
     39    ENDBR64
     40    push        rbp                     ; save the caller's frame pointer
     41    mov         rbp, rsp                ; rbp anchors the frame; .return rebuilds rsp from it
     42    push        r15                     ; saved at [rbp-8]; r15 becomes the wk[] base
     43    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
     44    ; Allocate stack space for wk array.  r15 is used to access it.
     45    mov         r15, rsp                ; r15 = aligned top of wk[]; wk(i) addresses below it
     46    sub         rsp, (SIZEOF_XMMWORD * WK_NUM)  ; reserve WK_NUM xmmwords
     47    COLLECT_ARGS 5                      ; macro defined outside this file; args are read from r10-r14 below
     48    push        rbx                     ; rbx is used as a row pointer; popped again at .return
     49 
     50    mov         ecx, r10d               ; rcx = img_width (32-bit move zero-extends)
     51    test        rcx, rcx
     52    jz          near .return            ; zero-width image: nothing to convert
     53 
     54    push        rcx                     ; park img_width while rcx carries output_row
     55 
     56    mov         rsi, r12                ; rsi = output_buf
     57    mov         ecx, r13d               ; rcx = output_row
     58    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]  ; rdi = output_buf[0]
     59    mov         rbxp, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]  ; rbx = output_buf[1]
     60    mov         rdxp, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]  ; rdx = output_buf[2]
     61    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]  ; rdi = &output_buf[0][output_row]
     62    lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]  ; rbx = &output_buf[1][output_row]
     63    lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]  ; rdx = &output_buf[2][output_row]
     64 
     65    pop         rcx                     ; rcx = img_width again
     66 
     67    mov         rsi, r11                ; rsi = input_buf
     68    mov         eax, r14d               ; rax = num_rows, zero-extended from 32 bits
     69    test        eax, eax                ; 32-bit test so the sign of the int num_rows is seen
     70    jle         near .return            ; num_rows <= 0: nothing to convert
     71 .rowloop:
     72    push        rdx                     ; save the four row-array cursors; the registers
     73    push        rbx                     ; are reused as per-row sample pointers below and
     74    push        rdi                     ; popped back after the column loop
     75    push        rsi
     76    push        rcx                     ; col
     77 
     78    mov         rsip, JSAMPROW [rsi]    ; inptr
     79    mov         rdip, JSAMPROW [rdi]    ; outptr0
     80    mov         rbxp, JSAMPROW [rbx]    ; outptr1
     81    mov         rdxp, JSAMPROW [rdx]    ; outptr2
     82 
     83    cmp         rcx, byte SIZEOF_XMMWORD
     84    jae         near .columnloop        ; a full group of SIZEOF_XMMWORD columns is available
     85 
     86 %if RGB_PIXELSIZE == 3  ; ---------------
     87 
     88 .column_ld1:                           ; partial group: gather the remaining bytes piecewise
     89    push        rax                     ; borrow rax/rdx as scratch for the byte/word loads
     90    push        rdx
     91    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
     92    test        cl, SIZEOF_BYTE
     93    jz          short .column_ld2
     94    sub         rcx, byte SIZEOF_BYTE
     95    movzx       rax, byte [rsi+rcx]     ; last byte of the tail
     96 .column_ld2:
     97    test        cl, SIZEOF_WORD
     98    jz          short .column_ld4
     99    sub         rcx, byte SIZEOF_WORD
    100    movzx       rdx, word [rsi+rcx]     ; the word below what has been gathered so far
    101    shl         rax, WORD_BIT           ; make room beneath the earlier byte
    102    or          rax, rdx
    103 .column_ld4:
    104    movd        xmmA, eax               ; start the vector with the bytes gathered in eax
    105    pop         rdx                     ; rdx = outptr2 again
    106    pop         rax                     ; rax = remaining row count again
    107    test        cl, SIZEOF_DWORD
    108    jz          short .column_ld8
    109    sub         rcx, byte SIZEOF_DWORD
    110    movd        xmmF, XMM_DWORD [rsi+rcx]
    111    pslldq      xmmA, SIZEOF_DWORD      ; shift earlier bytes up, then merge the new dword below
    112    por         xmmA, xmmF
    113 .column_ld8:
    114    test        cl, SIZEOF_MMWORD
    115    jz          short .column_ld16
    116    sub         rcx, byte SIZEOF_MMWORD
    117    movq        xmmB, XMM_MMWORD [rsi+rcx]
    118    pslldq      xmmA, SIZEOF_MMWORD     ; shift earlier bytes up, then merge the new qword below
    119    por         xmmA, xmmB
    120 .column_ld16:
    121    test        cl, SIZEOF_XMMWORD
    122    jz          short .column_ld32
    123    movdqa      xmmF, xmmA              ; gathered tail becomes the second input vector
    124    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    125    mov         rcx, SIZEOF_XMMWORD     ; count this as one full group so the loop ends after it
    126    jmp         short .rgb_ycc_cnv
    127 .column_ld32:
    128    test        cl, 2*SIZEOF_XMMWORD
    129    mov         rcx, SIZEOF_XMMWORD     ; mov leaves the flags from the test above intact
    130    jz          short .rgb_ycc_cnv
    131    movdqa      xmmB, xmmA              ; gathered tail becomes the third input vector
    132    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    133    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    134    jmp         short .rgb_ycc_cnv
    135 
    136 .columnloop:                           ; full group: three unaligned loads of packed pixels
    137    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    138    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    139    movdqu      xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
    140 
    141 .rgb_ycc_cnv:
    142    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
    143    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    144    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
    145 
    146    movdqa      xmmG, xmmA
    147    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
    148    psrldq      xmmG, 8     ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
    149 
    150    punpckhbw   xmmA, xmmF  ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
    151    pslldq      xmmF, 8     ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
    152 
    153    punpcklbw   xmmG, xmmB  ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
    154    punpckhbw   xmmF, xmmB  ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
    155 
    156    movdqa      xmmD, xmmA
    157    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
    158    psrldq      xmmD, 8     ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
    159 
    160    punpckhbw   xmmA, xmmG  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
    161    pslldq      xmmG, 8     ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
    162 
    163    punpcklbw   xmmD, xmmF  ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
    164    punpckhbw   xmmG, xmmF  ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
    165 
    166    movdqa      xmmE, xmmA
    167    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
    168    psrldq      xmmE, 8     ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
    169 
    170    punpckhbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    171    pslldq      xmmD, 8     ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
    172 
    173    punpcklbw   xmmE, xmmG  ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
    174    punpckhbw   xmmD, xmmG  ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
    175 
    176    pxor        xmmH, xmmH  ; zero register used to widen bytes to words below
    177 
    178    movdqa      xmmC, xmmA
    179    punpcklbw   xmmA, xmmH  ; xmmA=(00 02 04 06 08 0A 0C 0E)
    180    punpckhbw   xmmC, xmmH  ; xmmC=(10 12 14 16 18 1A 1C 1E)
    181 
    182    movdqa      xmmB, xmmE
    183    punpcklbw   xmmE, xmmH  ; xmmE=(20 22 24 26 28 2A 2C 2E)
    184    punpckhbw   xmmB, xmmH  ; xmmB=(01 03 05 07 09 0B 0D 0F)
    185 
    186    movdqa      xmmF, xmmD
    187    punpcklbw   xmmD, xmmH  ; xmmD=(11 13 15 17 19 1B 1D 1F)
    188    punpckhbw   xmmF, xmmH  ; xmmF=(21 23 25 27 29 2B 2D 2F)
    189 
    190 %else  ; RGB_PIXELSIZE == 4 ; -----------
    191 
    192 .column_ld1:                           ; partial group: gather the remaining pixels piecewise
    193    test        cl, SIZEOF_XMMWORD/16
    194    jz          short .column_ld2
    195    sub         rcx, byte SIZEOF_XMMWORD/16
    196    movd        xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
    197 .column_ld2:
    198    test        cl, SIZEOF_XMMWORD/8
    199    jz          short .column_ld4
    200    sub         rcx, byte SIZEOF_XMMWORD/8
    201    movq        xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
    202    pslldq      xmmA, SIZEOF_MMWORD     ; shift earlier bytes up, then merge the new qword below
    203    por         xmmA, xmmE
    204 .column_ld4:
    205    test        cl, SIZEOF_XMMWORD/4
    206    jz          short .column_ld8
    207    sub         rcx, byte SIZEOF_XMMWORD/4
    208    movdqa      xmmE, xmmA              ; gathered tail becomes the second input vector
    209    movdqu      xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
    210 .column_ld8:
    211    test        cl, SIZEOF_XMMWORD/2
    212    mov         rcx, SIZEOF_XMMWORD     ; mov leaves the flags from the test above intact
    213    jz          short .rgb_ycc_cnv
    214    movdqa      xmmF, xmmA              ; gathered tail moves up to the third and fourth vectors
    215    movdqa      xmmH, xmmE
    216    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    217    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    218    jmp         short .rgb_ycc_cnv
    219 
    220 .columnloop:                           ; full group: four unaligned loads of packed pixels
    221    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    222    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    223    movdqu      xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
    224    movdqu      xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
    225 
    226 .rgb_ycc_cnv:
    227    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
    228    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    229    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
    230    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
    231 
    232    movdqa      xmmD, xmmA
    233    punpcklbw   xmmA, xmmE      ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
    234    punpckhbw   xmmD, xmmE      ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
    235 
    236    movdqa      xmmC, xmmF
    237    punpcklbw   xmmF, xmmH      ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
    238    punpckhbw   xmmC, xmmH      ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
    239 
    240    movdqa      xmmB, xmmA
    241    punpcklwd   xmmA, xmmF      ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
    242    punpckhwd   xmmB, xmmF      ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
    243 
    244    movdqa      xmmG, xmmD
    245    punpcklwd   xmmD, xmmC      ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
    246    punpckhwd   xmmG, xmmC      ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
    247 
    248    movdqa      xmmE, xmmA
    249    punpcklbw   xmmA, xmmD      ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    250    punpckhbw   xmmE, xmmD      ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
    251 
    252    movdqa      xmmH, xmmB
    253    punpcklbw   xmmB, xmmG      ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
    254    punpckhbw   xmmH, xmmG      ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
    255 
    256    pxor        xmmF, xmmF      ; zero register used to widen bytes to words below
    257 
    258    movdqa      xmmC, xmmA
    259    punpcklbw   xmmA, xmmF      ; xmmA=(00 02 04 06 08 0A 0C 0E)
    260    punpckhbw   xmmC, xmmF      ; xmmC=(10 12 14 16 18 1A 1C 1E)
    261 
    262    movdqa      xmmD, xmmB
    263    punpcklbw   xmmB, xmmF      ; xmmB=(01 03 05 07 09 0B 0D 0F)
    264    punpckhbw   xmmD, xmmF      ; xmmD=(11 13 15 17 19 1B 1D 1F)
    265 
    266    movdqa      xmmG, xmmE
    267    punpcklbw   xmmE, xmmF      ; xmmE=(20 22 24 26 28 2A 2C 2E)
    268    punpckhbw   xmmG, xmmF      ; xmmG=(30 32 34 36 38 3A 3C 3E)
    269 
    270    punpcklbw   xmmF, xmmH      ; xmmF is zero here, so the data lands in the high byte of each word
    271    punpckhbw   xmmH, xmmH
    272    psrlw       xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)
    273    psrlw       xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)
    274 
    275 %endif  ; RGB_PIXELSIZE ; ---------------
    276 
    277    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
    278    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
    279 
    280    ; (Original)
    281    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    282    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    283    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    284    ;
    285    ; (This implementation)
    286    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
    287    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    288    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    289 
    290    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=RE
    291    movdqa      XMMWORD [wk(1)], xmm1   ; wk(1)=RO
    292    movdqa      XMMWORD [wk(2)], xmm4   ; wk(2)=BE
    293    movdqa      XMMWORD [wk(3)], xmm5   ; wk(3)=BO
    294 
    295    movdqa      xmm6, xmm1
    296    punpcklwd   xmm1, xmm3              ; interleave RO with GO so pmaddwd sums each R/G pair
    297    punpckhwd   xmm6, xmm3
    298    movdqa      xmm7, xmm1
    299    movdqa      xmm4, xmm6
    300    pmaddwd     xmm1, [rel PW_F0299_F0337]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    301    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
    302    pmaddwd     xmm7, [rel PW_MF016_MF033]  ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
    303    pmaddwd     xmm4, [rel PW_MF016_MF033]  ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
    304 
    305    movdqa      XMMWORD [wk(4)], xmm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
    306    movdqa      XMMWORD [wk(5)], xmm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
    307 
    308    pxor        xmm1, xmm1
    309    pxor        xmm6, xmm6
    310    punpcklwd   xmm1, xmm5              ; xmm1=BOL
    311    punpckhwd   xmm6, xmm5              ; xmm6=BOH
    312    psrld       xmm1, 1                 ; xmm1=BOL*FIX(0.500)
    313    psrld       xmm6, 1                 ; xmm6=BOH*FIX(0.500)
    314 
    315    movdqa      xmm5, [rel PD_ONEHALFM1_CJ]  ; xmm5=[PD_ONEHALFM1_CJ]
    316 
    317    paddd       xmm7, xmm1
    318    paddd       xmm4, xmm6
    319    paddd       xmm7, xmm5
    320    paddd       xmm4, xmm5
    321    psrld       xmm7, SCALEBITS         ; xmm7=CbOL
    322    psrld       xmm4, SCALEBITS         ; xmm4=CbOH
    323    packssdw    xmm7, xmm4              ; xmm7=CbO
    324 
    325    movdqa      xmm1, XMMWORD [wk(2)]   ; xmm1=BE
    326 
    327    movdqa      xmm6, xmm0
    328    punpcklwd   xmm0, xmm2              ; interleave RE with GE so pmaddwd sums each R/G pair
    329    punpckhwd   xmm6, xmm2
    330    movdqa      xmm5, xmm0
    331    movdqa      xmm4, xmm6
    332    pmaddwd     xmm0, [rel PW_F0299_F0337]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
    333    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
    334    pmaddwd     xmm5, [rel PW_MF016_MF033]  ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
    335    pmaddwd     xmm4, [rel PW_MF016_MF033]  ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
    336 
    337    movdqa      XMMWORD [wk(6)], xmm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
    338    movdqa      XMMWORD [wk(7)], xmm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
    339 
    340    pxor        xmm0, xmm0
    341    pxor        xmm6, xmm6
    342    punpcklwd   xmm0, xmm1              ; xmm0=BEL
    343    punpckhwd   xmm6, xmm1              ; xmm6=BEH
    344    psrld       xmm0, 1                 ; xmm0=BEL*FIX(0.500)
    345    psrld       xmm6, 1                 ; xmm6=BEH*FIX(0.500)
    346 
    347    movdqa      xmm1, [rel PD_ONEHALFM1_CJ]  ; xmm1=[PD_ONEHALFM1_CJ]
    348 
    349    paddd       xmm5, xmm0
    350    paddd       xmm4, xmm6
    351    paddd       xmm5, xmm1
    352    paddd       xmm4, xmm1
    353    psrld       xmm5, SCALEBITS         ; xmm5=CbEL
    354    psrld       xmm4, SCALEBITS         ; xmm4=CbEH
    355    packssdw    xmm5, xmm4              ; xmm5=CbE
    356 
    357    psllw       xmm7, BYTE_BIT          ; odd-column Cb into the high byte of each word
    358    por         xmm5, xmm7              ; xmm5=Cb
    359    movdqa      XMMWORD [rbx], xmm5     ; Save Cb
    360 
    361    movdqa      xmm0, XMMWORD [wk(3)]   ; xmm0=BO
    362    movdqa      xmm6, XMMWORD [wk(2)]   ; xmm6=BE
    363    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=RO
    364 
    365    movdqa      xmm4, xmm0
    366    punpcklwd   xmm0, xmm3              ; interleave BO with GO so pmaddwd sums each B/G pair
    367    punpckhwd   xmm4, xmm3
    368    movdqa      xmm7, xmm0
    369    movdqa      xmm5, xmm4
    370    pmaddwd     xmm0, [rel PW_F0114_F0250]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    371    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
    372    pmaddwd     xmm7, [rel PW_MF008_MF041]  ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
    373    pmaddwd     xmm5, [rel PW_MF008_MF041]  ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
    374 
    375    movdqa      xmm3, [rel PD_ONEHALF]      ; xmm3=[PD_ONEHALF]
    376 
    377    paddd       xmm0, XMMWORD [wk(4)]   ; add the R/G part of Y computed earlier
    378    paddd       xmm4, XMMWORD [wk(5)]
    379    paddd       xmm0, xmm3
    380    paddd       xmm4, xmm3
    381    psrld       xmm0, SCALEBITS         ; xmm0=YOL
    382    psrld       xmm4, SCALEBITS         ; xmm4=YOH
    383    packssdw    xmm0, xmm4              ; xmm0=YO
    384 
    385    pxor        xmm3, xmm3
    386    pxor        xmm4, xmm4
    387    punpcklwd   xmm3, xmm1              ; xmm3=ROL
    388    punpckhwd   xmm4, xmm1              ; xmm4=ROH
    389    psrld       xmm3, 1                 ; xmm3=ROL*FIX(0.500)
    390    psrld       xmm4, 1                 ; xmm4=ROH*FIX(0.500)
    391 
    392    movdqa      xmm1, [rel PD_ONEHALFM1_CJ]  ; xmm1=[PD_ONEHALFM1_CJ]
    393 
    394    paddd       xmm7, xmm3
    395    paddd       xmm5, xmm4
    396    paddd       xmm7, xmm1
    397    paddd       xmm5, xmm1
    398    psrld       xmm7, SCALEBITS         ; xmm7=CrOL
    399    psrld       xmm5, SCALEBITS         ; xmm5=CrOH
    400    packssdw    xmm7, xmm5              ; xmm7=CrO
    401 
    402    movdqa      xmm3, XMMWORD [wk(0)]   ; xmm3=RE
    403 
    404    movdqa      xmm4, xmm6
    405    punpcklwd   xmm6, xmm2              ; interleave BE with GE so pmaddwd sums each B/G pair
    406    punpckhwd   xmm4, xmm2
    407    movdqa      xmm1, xmm6
    408    movdqa      xmm5, xmm4
    409    pmaddwd     xmm6, [rel PW_F0114_F0250]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    410    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
    411    pmaddwd     xmm1, [rel PW_MF008_MF041]  ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
    412    pmaddwd     xmm5, [rel PW_MF008_MF041]  ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
    413 
    414    movdqa      xmm2, [rel PD_ONEHALF]      ; xmm2=[PD_ONEHALF]
    415 
    416    paddd       xmm6, XMMWORD [wk(6)]   ; add the R/G part of Y computed earlier
    417    paddd       xmm4, XMMWORD [wk(7)]
    418    paddd       xmm6, xmm2
    419    paddd       xmm4, xmm2
    420    psrld       xmm6, SCALEBITS         ; xmm6=YEL
    421    psrld       xmm4, SCALEBITS         ; xmm4=YEH
    422    packssdw    xmm6, xmm4              ; xmm6=YE
    423 
    424    psllw       xmm0, BYTE_BIT          ; odd-column Y into the high byte of each word
    425    por         xmm6, xmm0              ; xmm6=Y
    426    movdqa      XMMWORD [rdi], xmm6     ; Save Y
    427 
    428    pxor        xmm2, xmm2
    429    pxor        xmm4, xmm4
    430    punpcklwd   xmm2, xmm3              ; xmm2=REL
    431    punpckhwd   xmm4, xmm3              ; xmm4=REH
    432    psrld       xmm2, 1                 ; xmm2=REL*FIX(0.500)
    433    psrld       xmm4, 1                 ; xmm4=REH*FIX(0.500)
    434 
    435    movdqa      xmm0, [rel PD_ONEHALFM1_CJ]  ; xmm0=[PD_ONEHALFM1_CJ]
    436 
    437    paddd       xmm1, xmm2
    438    paddd       xmm5, xmm4
    439    paddd       xmm1, xmm0
    440    paddd       xmm5, xmm0
    441    psrld       xmm1, SCALEBITS         ; xmm1=CrEL
    442    psrld       xmm5, SCALEBITS         ; xmm5=CrEH
    443    packssdw    xmm1, xmm5              ; xmm1=CrE
    444 
    445    psllw       xmm7, BYTE_BIT          ; odd-column Cr into the high byte of each word
    446    por         xmm1, xmm7              ; xmm1=Cr
    447    movdqa      XMMWORD [rdx], xmm1     ; Save Cr
    448 
    449    sub         rcx, byte SIZEOF_XMMWORD                ; one group of columns done
    450    add         rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
    451    add         rdi, byte SIZEOF_XMMWORD                ; outptr0
    452    add         rbx, byte SIZEOF_XMMWORD                ; outptr1
    453    add         rdx, byte SIZEOF_XMMWORD                ; outptr2
    454    cmp         rcx, byte SIZEOF_XMMWORD
    455    jae         near .columnloop        ; another full group remains
    456    test        rcx, rcx
    457    jnz         near .column_ld1        ; a partial group remains
    458 
    459    pop         rcx                     ; col
    460    pop         rsi                     ; restore the row-array cursors saved at .rowloop
    461    pop         rdi
    462    pop         rbx
    463    pop         rdx
    464 
    465    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
    466    add         rdi, byte SIZEOF_JSAMPROW  ; next row pointer in output_buf[0]
    467    add         rbx, byte SIZEOF_JSAMPROW  ; next row pointer in output_buf[1]
    468    add         rdx, byte SIZEOF_JSAMPROW  ; next row pointer in output_buf[2]
    469    dec         rax                        ; num_rows
    470    jg          near .rowloop
    471 
    472 .return:
    473    pop         rbx                     ; balances the push after COLLECT_ARGS
    474    UNCOLLECT_ARGS 5
    475    lea         rsp, [rbp-8]            ; drop wk[] and alignment padding; rsp -> saved r15
    476    pop         r15
    477    pop         rbp
    478    ret
    479 
    480 ; For some reason, the OS X linker does not honor the request to align the
    481 ; segment unless we do this.
    482    align       32