tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jccolext-avx2.asm (24166B)


      1 ;
      2 ; jccolext-avx2.asm - colorspace conversion (64-bit AVX2)
      3 ;
      4 ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
      5 ; Copyright (C) 2015, Intel Corporation.
      6 ; Copyright (C) 2018, Matthias Räncker.
      7 ; Copyright (C) 2023, Aliaksiej Kandracienka.
      8 ;
      9 ; Based on the x86 SIMD extension for IJG JPEG library
     10 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     11 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     12 ;
     13 ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
     14 
     15 %include "jcolsamp.inc"
     16 
     17 ; --------------------------------------------------------------------------
     18 ;
     19 ; Convert some rows of samples to the output colorspace.
     20 ;
     21 ; GLOBAL(void)
     22 ; jsimd_rgb_ycc_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
     23 ;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
     24 ;                            int num_rows);
     25 ;
     26 
     27 ; r10d = JDIMENSION img_width
     28 ; r11 = JSAMPARRAY input_buf
     29 ; r12 = JSAMPIMAGE output_buf
     30 ; r13d = JDIMENSION output_row
     31 ; r14d = int num_rows
     32 
     33 %define wk(i)   r15 - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
     34 %define WK_NUM  8
     35 ; wk(0)..wk(WK_NUM-1) are the YMMWORD scratch slots directly below r15 (r15 is set in the prologue).
     36    align       32
     37    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_avx2)
     38 ; Entry point.  Returns without converting anything when img_width == 0 or num_rows <= 0.
     39 EXTN(jsimd_rgb_ycc_convert_avx2):
     40    ENDBR64
     41    push        rbp
     42    mov         rbp, rsp                ; rbp anchors the frame; the epilogue restores rsp from it
     43    push        r15                     ; saved at [rbp-8]
     44    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
     45    ; Allocate stack space for wk array.  r15 is used to access it.
     46    mov         r15, rsp                ; aligned top of the wk area, so vmovdqa on wk() is safe
     47    sub         rsp, (SIZEOF_YMMWORD * WK_NUM)
     48    COLLECT_ARGS 5                      ; NOTE(review): macro defined outside this file; presumably fills r10-r14 as listed above
     49    push        rbx
     50 ; rcx = img_width (column count); nothing to do for an empty row.
     51    mov         ecx, r10d
     52    test        rcx, rcx
     53    jz          near .return
     54 
     55    push        rcx                     ; keep img_width while rcx is borrowed for output_row
     56 
     57    mov         rsi, r12                ; rsi = output_buf
     58    mov         ecx, r13d               ; rcx = output_row
     59    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
     60    mov         rbxp, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
     61    mov         rdxp, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
     62    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]  ; rdi = &output_buf[0][output_row]
     63    lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]  ; rbx = &output_buf[1][output_row]
     64    lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]  ; rdx = &output_buf[2][output_row]
     65 
     66    pop         rcx                     ; rcx = img_width again
     67 ; rax = row counter.  num_rows is a signed int and the mov zero-extends it, so its sign is only visible in eax.
     68    mov         rsi, r11                ; rsi = input_buf
     69    mov         eax, r14d
     70    test        eax, eax                ; 32-bit test so that jle also rejects a negative num_rows
     71    jle         near .return
     72 .rowloop:
     73    push        rdx                     ; per-row pointers and the column count are restored at the end of the row
     74    push        rbx
     75    push        rdi
     76    push        rsi
     77    push        rcx                     ; col
     78 
     79    mov         rsip, JSAMPROW [rsi]    ; inptr
     80    mov         rdip, JSAMPROW [rdi]    ; outptr0
     81    mov         rbxp, JSAMPROW [rbx]    ; outptr1
     82    mov         rdxp, JSAMPROW [rdx]    ; outptr2
     83 ; Rows with fewer than SIZEOF_YMMWORD columns fall through to the partial-load path (.column_ld1).
     84    cmp         rcx, byte SIZEOF_YMMWORD
     85    jae         near .columnloop
     86 
     87 %if RGB_PIXELSIZE == 3  ; ---------------
     88 ; Partial block (fewer than SIZEOF_YMMWORD columns left): gather the remaining bytes, highest-addressed piece first.
     89 .column_ld1:
     90    push        rax                     ; rax and rdx are borrowed as scratch for the 1- and 2-byte pieces
     91    push        rdx
     92    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
     93    test        cl, SIZEOF_BYTE
     94    jz          short .column_ld2
     95    sub         rcx, byte SIZEOF_BYTE
     96    movzx       rax, byte [rsi+rcx]
     97 .column_ld2:
     98    test        cl, SIZEOF_WORD
     99    jz          short .column_ld4
    100    sub         rcx, byte SIZEOF_WORD
    101    movzx       rdx, word [rsi+rcx]
    102    shl         rax, WORD_BIT           ; earlier piece moves up, lower-addressed word goes below it
    103    or          rax, rdx
    104 .column_ld4:
    105    vmovd       xmmA, eax
    106    pop         rdx                     ; scratch no longer needed: restore outptr2 and the row counter
    107    pop         rax
    108    test        cl, SIZEOF_DWORD
    109    jz          short .column_ld8
    110    sub         rcx, byte SIZEOF_DWORD
    111    vmovd       xmmF, XMM_DWORD [rsi+rcx]
    112    vpslldq     xmmA, xmmA, SIZEOF_DWORD  ; make room below for the lower-addressed piece
    113    vpor        xmmA, xmmA, xmmF
    114 .column_ld8:
    115    test        cl, SIZEOF_MMWORD
    116    jz          short .column_ld16
    117    sub         rcx, byte SIZEOF_MMWORD
    118    vmovq       xmmB, XMM_MMWORD [rsi+rcx]
    119    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
    120    vpor        xmmA, xmmA, xmmB
    121 .column_ld16:
    122    test        cl, SIZEOF_XMMWORD
    123    jz          short .column_ld32
    124    sub         rcx, byte SIZEOF_XMMWORD
    125    vmovdqu     xmmB, XMM_MMWORD [rsi+rcx]
    126    vperm2i128  ymmA, ymmA, ymmA, 1     ; swap the 128-bit halves: pieces gathered so far move to the upper lane
    127    vpor        ymmA, ymmB
    128 .column_ld32:
    129    test        cl, SIZEOF_YMMWORD
    130    jz          short .column_ld64
    131    sub         rcx, byte SIZEOF_YMMWORD
    132    vmovdqa     ymmF, ymmA
    133    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    134 .column_ld64:
    135    test        cl, 2*SIZEOF_YMMWORD
    136    mov         rcx, SIZEOF_YMMWORD     ; mov keeps the flags of the test; rcx is preset so the loop-tail subtraction reaches zero
    137    jz          short .rgb_ycc_cnv
    138    vmovdqa     ymmB, ymmA
    139    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    140    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    141    jmp         short .rgb_ycc_cnv
    142 ; Full block: SIZEOF_YMMWORD pixels = three consecutive YMMWORDs of interleaved samples.
    143 .columnloop:
    144    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    145    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    146    vmovdqu     ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD]
    147 ; In the lane diagrams below, the first character is the component index and the second is the pixel index.
    148 .rgb_ycc_cnv:
    149    ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
    150    ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    151    ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
    152    ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
    153    ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
    154    ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
    155 
    156    vmovdqu     ymmC, ymmA
    157    vinserti128 ymmA, ymmF, xmmA, 0  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
    158                                     ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
    159    vinserti128 ymmC, ymmC, xmmB, 0  ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
    160                                     ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    161    vinserti128 ymmB, ymmB, xmmF, 0  ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
    162                                     ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
    163    vperm2i128  ymmF, ymmC, ymmC, 1  ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
    164                                     ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
    165 ; Three shift/unpack rounds follow; each one halves the interleave stride within every 128-bit lane.
    166    vmovdqa     ymmG, ymmA
    167    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
    168                                  ;       22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
    169    vpsrldq     ymmG, ymmG, 8     ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
    170                                  ;       2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
    171 
    172    vpunpckhbw  ymmA, ymmA, ymmF  ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
    173                                  ;       0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
    174    vpslldq     ymmF, ymmF, 8     ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
    175                                  ;       08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
    176 
    177    vpunpcklbw  ymmG, ymmG, ymmB  ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
    178                                  ;       2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
    179    vpunpckhbw  ymmF, ymmF, ymmB  ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
    180                                  ;       1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
    181 
    182    vmovdqa     ymmD, ymmA
    183    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
    184                                  ;       11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
    185    vpsrldq     ymmD, ymmD, 8     ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
    186                                  ;       1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
    187 
    188    vpunpckhbw  ymmA, ymmA, ymmG  ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
    189                                  ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
    190    vpslldq     ymmG, ymmG, 8     ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
    191                                  ;       04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
    192 
    193    vpunpcklbw  ymmD, ymmD, ymmF  ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
    194                                  ;       1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
    195    vpunpckhbw  ymmG, ymmG, ymmF  ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
    196                                  ;       2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
    197 
    198    vmovdqa     ymmE, ymmA
    199    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
    200                                  ;       20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
    201    vpsrldq     ymmE, ymmE, 8     ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
    202                                  ;       2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
    203 
    204    vpunpckhbw  ymmA, ymmA, ymmD  ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
    205                                  ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
    206    vpslldq     ymmD, ymmD, 8     ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
    207                                  ;       02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
    208 
    209    vpunpcklbw  ymmE, ymmE, ymmG  ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
    210                                  ;       2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
    211    vpunpckhbw  ymmD, ymmD, ymmG  ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
    212                                  ;       1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
    213 ; Widen every sample from byte to word by interleaving with a zeroed register.
    214    vpxor       ymmH, ymmH, ymmH
    215 
    216    vmovdqa     ymmC, ymmA
    217    vpunpcklbw  ymmA, ymmA, ymmH  ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
    218    vpunpckhbw  ymmC, ymmC, ymmH  ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
    219 
    220    vmovdqa     ymmB, ymmE
    221    vpunpcklbw  ymmE, ymmE, ymmH  ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
    222    vpunpckhbw  ymmB, ymmB, ymmH  ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
    223 
    224    vmovdqa     ymmF, ymmD
    225    vpunpcklbw  ymmD, ymmD, ymmH  ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
    226    vpunpckhbw  ymmF, ymmF, ymmH  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
    227 
    228 %else  ; RGB_PIXELSIZE == 4 ; -----------
    229 ; Partial block: rcx stays a column count here; each set bit selects one piece, addressed as rcx*RGB_PIXELSIZE.
    230 .column_ld1:
    231    test        cl, SIZEOF_XMMWORD/16
    232    jz          short .column_ld2
    233    sub         rcx, byte SIZEOF_XMMWORD/16
    234    vmovd       xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
    235 .column_ld2:
    236    test        cl, SIZEOF_XMMWORD/8
    237    jz          short .column_ld4
    238    sub         rcx, byte SIZEOF_XMMWORD/8
    239    vmovq       xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
    240    vpslldq     xmmA, xmmA, SIZEOF_MMWORD  ; make room below for the lower-addressed piece
    241    vpor        xmmA, xmmA, xmmF
    242 .column_ld4:
    243    test        cl, SIZEOF_XMMWORD/4
    244    jz          short .column_ld8
    245    sub         rcx, byte SIZEOF_XMMWORD/4
    246    vmovdqa     xmmF, xmmA
    247    vperm2i128  ymmF, ymmF, ymmF, 1     ; swap the 128-bit halves: pieces gathered so far move to the upper lane
    248    vmovdqu     xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
    249    vpor        ymmA, ymmA, ymmF
    250 .column_ld8:
    251    test        cl, SIZEOF_XMMWORD/2
    252    jz          short .column_ld16
    253    sub         rcx, byte SIZEOF_XMMWORD/2
    254    vmovdqa     ymmF, ymmA
    255    vmovdqu     ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE]
    256 .column_ld16:
    257    test        cl, SIZEOF_XMMWORD
    258    mov         rcx, SIZEOF_YMMWORD     ; mov keeps the flags of the test; rcx is preset so the loop-tail subtraction reaches zero
    259    jz          short .rgb_ycc_cnv
    260    vmovdqa     ymmE, ymmA
    261    vmovdqa     ymmH, ymmF
    262    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    263    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    264    jmp         short .rgb_ycc_cnv
    265 ; Full block: SIZEOF_YMMWORD pixels = four consecutive YMMWORDs of interleaved samples.
    266 .columnloop:
    267    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    268    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    269    vmovdqu     ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD]
    270    vmovdqu     ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD]
    271 ; In the lane diagrams below, the first character is the component index and the second is the pixel index.
    272 .rgb_ycc_cnv:
    273    ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    274    ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    275    ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
    276    ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
    277    ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
    278    ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
    279    ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
    280    ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
    281 
    282    vmovdqa     ymmB, ymmA
    283    vinserti128 ymmA, ymmA, xmmE, 1     ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    284                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
    285    vperm2i128  ymmE, ymmB, ymmE, 0x31  ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
    286                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
    287 
    288    vmovdqa     ymmB, ymmF
    289    vinserti128 ymmF, ymmF, xmmH, 1     ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
    290                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
    291    vperm2i128  ymmH, ymmB, ymmH, 0x31  ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
    292                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
    293 ; Byte/word unpack rounds transpose the pixel-interleaved samples into per-component even/odd groups.
    294    vmovdqa     ymmD, ymmA
    295    vpunpcklbw  ymmA, ymmA, ymmE      ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
    296                                      ;       0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
    297    vpunpckhbw  ymmD, ymmD, ymmE      ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
    298                                      ;       0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
    299 
    300    vmovdqa     ymmC, ymmF
    301    vpunpcklbw  ymmF, ymmF, ymmH      ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
    302                                      ;       0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
    303    vpunpckhbw  ymmC, ymmC, ymmH      ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
    304                                      ;       0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
    305 
    306    vmovdqa     ymmB, ymmA
    307    vpunpcklwd  ymmA, ymmA, ymmF      ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
    308                                      ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
    309    vpunpckhwd  ymmB, ymmB, ymmF      ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
    310                                      ;       0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
    311 
    312    vmovdqa     ymmG, ymmD
    313    vpunpcklwd  ymmD, ymmD, ymmC      ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
    314                                      ;       0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
    315    vpunpckhwd  ymmG, ymmG, ymmC      ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
    316                                      ;       0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
    317 
    318    vmovdqa     ymmE, ymmA
    319    vpunpcklbw  ymmA, ymmA, ymmD      ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
    320                                      ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
    321    vpunpckhbw  ymmE, ymmE, ymmD      ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
    322                                      ;       2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
    323 
    324    vmovdqa     ymmH, ymmB
    325    vpunpcklbw  ymmB, ymmB, ymmG      ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
    326                                      ;       0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
    327    vpunpckhbw  ymmH, ymmH, ymmG      ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
    328                                      ;       2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
    329 ; Widen every sample from byte to word by interleaving with a zeroed register.
    330    vpxor       ymmF, ymmF, ymmF
    331 
    332    vmovdqa     ymmC, ymmA
    333    vpunpcklbw  ymmA, ymmA, ymmF      ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
    334    vpunpckhbw  ymmC, ymmC, ymmF      ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
    335 
    336    vmovdqa     ymmD, ymmB
    337    vpunpcklbw  ymmB, ymmB, ymmF      ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
    338    vpunpckhbw  ymmD, ymmD, ymmF      ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
    339 
    340    vmovdqa     ymmG, ymmE
    341    vpunpcklbw  ymmE, ymmE, ymmF      ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
    342    vpunpckhbw  ymmG, ymmG, ymmF      ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
    343 ; ymmF is the zero register and is consumed last: samples land in the high byte of each word, then are shifted down.
    344    vpunpcklbw  ymmF, ymmF, ymmH
    345    vpunpckhbw  ymmH, ymmH, ymmH
    346    vpsrlw      ymmF, ymmF, BYTE_BIT  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
    347    vpsrlw      ymmH, ymmH, BYTE_BIT  ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
    348 
    349 %endif  ; RGB_PIXELSIZE ; ---------------
    350 
    351    ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
    352    ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
    353 
    354    ; (Original)
    355    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    356    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    357    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    358    ;
    359    ; (This implementation)
    360    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
    361    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    362    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    363 ; R and B are spilled to wk(0)-wk(3) because their registers are reused below; G stays in ymm2/ymm3.
    364    vmovdqa     YMMWORD [wk(0)], ymm0   ; wk(0)=RE
    365    vmovdqa     YMMWORD [wk(1)], ymm1   ; wk(1)=RO
    366    vmovdqa     YMMWORD [wk(2)], ymm4   ; wk(2)=BE
    367    vmovdqa     YMMWORD [wk(3)], ymm5   ; wk(3)=BO
    368 ; ---- Odd pixels: R/G terms of Y (kept in wk(4)/wk(5)) and of Cb ----
    369    vmovdqa     ymm6, ymm1
    370    vpunpcklwd  ymm1, ymm1, ymm3        ; interleave R and G words so that vpmaddwd forms one R*c0+G*c1 sum per dword
    371    vpunpckhwd  ymm6, ymm6, ymm3
    372    vmovdqa     ymm7, ymm1
    373    vmovdqa     ymm4, ymm6
    374    vpmaddwd    ymm1, ymm1, [rel PW_F0299_F0337]  ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    375    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
    376    vpmaddwd    ymm7, ymm7, [rel PW_MF016_MF033]  ; ymm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
    377    vpmaddwd    ymm4, ymm4, [rel PW_MF016_MF033]  ; ymm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
    378 
    379    vmovdqa     YMMWORD [wk(4)], ymm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
    380    vmovdqa     YMMWORD [wk(5)], ymm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
    381 ; Unpacking B above a zero word puts it in the upper half of each dword; the 1-bit right shift then halves it.
    382    vpxor       ymm1, ymm1, ymm1
    383    vpxor       ymm6, ymm6, ymm6
    384    vpunpcklwd  ymm1, ymm1, ymm5        ; ymm1=BOL
    385    vpunpckhwd  ymm6, ymm6, ymm5        ; ymm6=BOH
    386    vpsrld      ymm1, ymm1, 1           ; ymm1=BOL*FIX(0.500)
    387    vpsrld      ymm6, ymm6, 1           ; ymm6=BOH*FIX(0.500)
    388 
    389    vmovdqa     ymm5, [rel PD_ONEHALFM1_CJ]  ; ymm5=[PD_ONEHALFM1_CJ]
    390 
    391    vpaddd      ymm7, ymm7, ymm1
    392    vpaddd      ymm4, ymm4, ymm6
    393    vpaddd      ymm7, ymm7, ymm5
    394    vpaddd      ymm4, ymm4, ymm5
    395    vpsrld      ymm7, ymm7, SCALEBITS   ; ymm7=CbOL
    396    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=CbOH
    397    vpackssdw   ymm7, ymm7, ymm4        ; ymm7=CbO
    398 
    399    vmovdqa     ymm1, YMMWORD [wk(2)]   ; ymm1=BE
    400 ; ---- Even pixels: R/G terms of Y (kept in wk(6)/wk(7)) and of Cb ----
    401    vmovdqa     ymm6, ymm0
    402    vpunpcklwd  ymm0, ymm0, ymm2
    403    vpunpckhwd  ymm6, ymm6, ymm2
    404    vmovdqa     ymm5, ymm0
    405    vmovdqa     ymm4, ymm6
    406    vpmaddwd    ymm0, ymm0, [rel PW_F0299_F0337]  ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
    407    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
    408    vpmaddwd    ymm5, ymm5, [rel PW_MF016_MF033]  ; ymm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
    409    vpmaddwd    ymm4, ymm4, [rel PW_MF016_MF033]  ; ymm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
    410 
    411    vmovdqa     YMMWORD [wk(6)], ymm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
    412    vmovdqa     YMMWORD [wk(7)], ymm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
    413 
    414    vpxor       ymm0, ymm0, ymm0
    415    vpxor       ymm6, ymm6, ymm6
    416    vpunpcklwd  ymm0, ymm0, ymm1        ; ymm0=BEL
    417    vpunpckhwd  ymm6, ymm6, ymm1        ; ymm6=BEH
    418    vpsrld      ymm0, ymm0, 1           ; ymm0=BEL*FIX(0.500)
    419    vpsrld      ymm6, ymm6, 1           ; ymm6=BEH*FIX(0.500)
    420 
    421    vmovdqa     ymm1, [rel PD_ONEHALFM1_CJ]  ; ymm1=[PD_ONEHALFM1_CJ]
    422 
    423    vpaddd      ymm5, ymm5, ymm0
    424    vpaddd      ymm4, ymm4, ymm6
    425    vpaddd      ymm5, ymm5, ymm1
    426    vpaddd      ymm4, ymm4, ymm1
    427    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CbEL
    428    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=CbEH
    429    vpackssdw   ymm5, ymm5, ymm4        ; ymm5=CbE
    430 ; Merge: odd-pixel results go to the high byte of each word, even-pixel results stay in the low byte.
    431    vpsllw      ymm7, ymm7, BYTE_BIT
    432    vpor        ymm5, ymm5, ymm7        ; ymm5=Cb
    433    vmovdqu     YMMWORD [rbx], ymm5     ; Save Cb
    434 ; ---- Odd pixels: finish Y with the B/G terms, then compute Cr ----
    435    vmovdqa     ymm0, YMMWORD [wk(3)]   ; ymm0=BO
    436    vmovdqa     ymm6, YMMWORD [wk(2)]   ; ymm6=BE
    437    vmovdqa     ymm1, YMMWORD [wk(1)]   ; ymm1=RO
    438 
    439    vmovdqa     ymm4, ymm0
    440    vpunpcklwd  ymm0, ymm0, ymm3
    441    vpunpckhwd  ymm4, ymm4, ymm3
    442    vmovdqa     ymm7, ymm0
    443    vmovdqa     ymm5, ymm4
    444    vpmaddwd    ymm0, ymm0, [rel PW_F0114_F0250]  ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    445    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
    446    vpmaddwd    ymm7, ymm7, [rel PW_MF008_MF041]  ; ymm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
    447    vpmaddwd    ymm5, ymm5, [rel PW_MF008_MF041]  ; ymm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
    448 
    449    vmovdqa     ymm3, [rel PD_ONEHALF]            ; ymm3=[PD_ONEHALF]
    450 
    451    vpaddd      ymm0, ymm0, YMMWORD [wk(4)]
    452    vpaddd      ymm4, ymm4, YMMWORD [wk(5)]
    453    vpaddd      ymm0, ymm0, ymm3
    454    vpaddd      ymm4, ymm4, ymm3
    455    vpsrld      ymm0, ymm0, SCALEBITS   ; ymm0=YOL
    456    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YOH
    457    vpackssdw   ymm0, ymm0, ymm4        ; ymm0=YO
    458 
    459    vpxor       ymm3, ymm3, ymm3
    460    vpxor       ymm4, ymm4, ymm4
    461    vpunpcklwd  ymm3, ymm3, ymm1        ; ymm3=ROL
    462    vpunpckhwd  ymm4, ymm4, ymm1        ; ymm4=ROH
    463    vpsrld      ymm3, ymm3, 1           ; ymm3=ROL*FIX(0.500)
    464    vpsrld      ymm4, ymm4, 1           ; ymm4=ROH*FIX(0.500)
    465 
    466    vmovdqa     ymm1, [rel PD_ONEHALFM1_CJ]  ; ymm1=[PD_ONEHALFM1_CJ]
    467 
    468    vpaddd      ymm7, ymm7, ymm3
    469    vpaddd      ymm5, ymm5, ymm4
    470    vpaddd      ymm7, ymm7, ymm1
    471    vpaddd      ymm5, ymm5, ymm1
    472    vpsrld      ymm7, ymm7, SCALEBITS   ; ymm7=CrOL
    473    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CrOH
    474    vpackssdw   ymm7, ymm7, ymm5        ; ymm7=CrO
    475 ; ---- Even pixels: finish Y with the B/G terms, then compute Cr ----
    476    vmovdqa     ymm3, YMMWORD [wk(0)]   ; ymm3=RE
    477 
    478    vmovdqa     ymm4, ymm6
    479    vpunpcklwd  ymm6, ymm6, ymm2
    480    vpunpckhwd  ymm4, ymm4, ymm2
    481    vmovdqa     ymm1, ymm6
    482    vmovdqa     ymm5, ymm4
    483    vpmaddwd    ymm6, ymm6, [rel PW_F0114_F0250]  ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    484    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
    485    vpmaddwd    ymm1, ymm1, [rel PW_MF008_MF041]  ; ymm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
    486    vpmaddwd    ymm5, ymm5, [rel PW_MF008_MF041]  ; ymm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
    487 
    488    vmovdqa     ymm2, [rel PD_ONEHALF]            ; ymm2=[PD_ONEHALF]
    489 
    490    vpaddd      ymm6, ymm6, YMMWORD [wk(6)]
    491    vpaddd      ymm4, ymm4, YMMWORD [wk(7)]
    492    vpaddd      ymm6, ymm6, ymm2
    493    vpaddd      ymm4, ymm4, ymm2
    494    vpsrld      ymm6, ymm6, SCALEBITS   ; ymm6=YEL
    495    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YEH
    496    vpackssdw   ymm6, ymm6, ymm4        ; ymm6=YE
    497 ; Merge odd (high byte) and even (low byte) luma and store one YMMWORD of Y samples.
    498    vpsllw      ymm0, ymm0, BYTE_BIT
    499    vpor        ymm6, ymm6, ymm0        ; ymm6=Y
    500    vmovdqu     YMMWORD [rdi], ymm6     ; Save Y
    501 
    502    vpxor       ymm2, ymm2, ymm2
    503    vpxor       ymm4, ymm4, ymm4
    504    vpunpcklwd  ymm2, ymm2, ymm3        ; ymm2=REL
    505    vpunpckhwd  ymm4, ymm4, ymm3        ; ymm4=REH
    506    vpsrld      ymm2, ymm2, 1           ; ymm2=REL*FIX(0.500)
    507    vpsrld      ymm4, ymm4, 1           ; ymm4=REH*FIX(0.500)
    508 
    509    vmovdqa     ymm0, [rel PD_ONEHALFM1_CJ]  ; ymm0=[PD_ONEHALFM1_CJ]
    510 
    511    vpaddd      ymm1, ymm1, ymm2
    512    vpaddd      ymm5, ymm5, ymm4
    513    vpaddd      ymm1, ymm1, ymm0
    514    vpaddd      ymm5, ymm5, ymm0
    515    vpsrld      ymm1, ymm1, SCALEBITS   ; ymm1=CrEL
    516    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CrEH
    517    vpackssdw   ymm1, ymm1, ymm5        ; ymm1=CrE
    518 
    519    vpsllw      ymm7, ymm7, BYTE_BIT
    520    vpor        ymm1, ymm1, ymm7        ; ymm1=Cr
    521    vmovdqu     YMMWORD [rdx], ymm1     ; Save Cr
    522 ; ---- Advance by one block; loop on full blocks, then take the partial-load path once for any remainder ----
    523    sub         rcx, byte SIZEOF_YMMWORD
    524    add         rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; inptr
    525    add         rdi, byte SIZEOF_YMMWORD           ; outptr0
    526    add         rbx, byte SIZEOF_YMMWORD           ; outptr1
    527    add         rdx, byte SIZEOF_YMMWORD           ; outptr2
    528    cmp         rcx, byte SIZEOF_YMMWORD
    529    jae         near .columnloop
    530    test        rcx, rcx
    531    jnz         near .column_ld1
    532 ; ---- Row finished: restore the values pushed at .rowloop (reverse order) and step every row pointer ----
    533    pop         rcx                     ; col
    534    pop         rsi
    535    pop         rdi
    536    pop         rbx
    537    pop         rdx
    538 
    539    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
    540    add         rdi, byte SIZEOF_JSAMPROW
    541    add         rbx, byte SIZEOF_JSAMPROW
    542    add         rdx, byte SIZEOF_JSAMPROW
    543    dec         rax                        ; num_rows
    544    jg          near .rowloop
    545 
    546 .return:
    547    pop         rbx
    548    vzeroupper                          ; clear the upper YMM halves before returning to the caller
    549    UNCOLLECT_ARGS 5                    ; NOTE(review): macro defined outside this file; presumably the counterpart of COLLECT_ARGS
    550    lea         rsp, [rbp-8]            ; drop the aligned wk area: rsp now points at the saved r15
    551    pop         r15
    552    pop         rbp
    553    ret
    554 
    555 ; For some reason, the OS X linker does not honor the request to align the
    556 ; segment unless we do this.
    557    align       32