tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jcgryext-avx2.asm (19052B)


      1 ;
      2 ; jcgryext.asm - grayscale colorspace conversion (64-bit AVX2)
      3 ;
      4 ; Copyright (C) 2011, 2016, 2024, D. R. Commander.
      5 ; Copyright (C) 2015, Intel Corporation.
      6 ; Copyright (C) 2018, Matthias Räncker.
      7 ; Copyright (C) 2023, Aliaksiej Kandracienka.
      8 ;
      9 ; Based on the x86 SIMD extension for IJG JPEG library
     10 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     11 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     12 ;
     13 ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
     14 
     15 %include "jcolsamp.inc"
     16 
     17 ; --------------------------------------------------------------------------
     18 ;
     19 ; Convert some rows of samples to the output colorspace.
     20 ;
     21 ; GLOBAL(void)
     22 ; jsimd_rgb_gray_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
     23 ;                             JSAMPIMAGE output_buf, JDIMENSION output_row,
     24 ;                             int num_rows);
     25 ;
     26 
     27 ; r10d = JDIMENSION img_width
     28 ; r11 = JSAMPARRAY input_buf
     29 ; r12 = JSAMPIMAGE output_buf
     30 ; r13d = JDIMENSION output_row
     31 ; r14d = int num_rows
     32 
     33 %define wk(i)   r15 - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
     34 %define WK_NUM  2
     35 
     36    align       32
     37    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_avx2)
     38 
     39 EXTN(jsimd_rgb_gray_convert_avx2):
     40    ENDBR64
     41    push        rbp
     42    mov         rbp, rsp
     43    push        r15
     44    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
     45    ; Allocate stack space for wk array.  r15 is used to access it.
     46    mov         r15, rsp
     47    sub         rsp, byte (SIZEOF_YMMWORD * WK_NUM)
     48    COLLECT_ARGS 5
     49    push        rbx
     50 
     51    mov         ecx, r10d
     52    test        rcx, rcx
     53    jz          near .return
     54 
     55    push        rcx
     56 
     57    mov         rsi, r12
     58    mov         ecx, r13d
     59    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
     60    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]
     61 
     62    pop         rcx
     63 
     64    mov         rsi, r11
     65    mov         eax, r14d
     66    test        rax, rax
     67    jle         near .return
     68 .rowloop:
     69    push        rdi
     70    push        rsi
     71    push        rcx                     ; col
     72 
     73    mov         rsip, JSAMPROW [rsi]    ; inptr
     74    mov         rdip, JSAMPROW [rdi]    ; outptr0
     75 
     76    cmp         rcx, byte SIZEOF_YMMWORD
     77    jae         near .columnloop
     78 
     79 %if RGB_PIXELSIZE == 3  ; ---------------
     80 
     81 .column_ld1:
     82    push        rax
     83    push        rdx
     84    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
     85    test        cl, SIZEOF_BYTE
     86    jz          short .column_ld2
     87    sub         rcx, byte SIZEOF_BYTE
     88    movzx       rax, byte [rsi+rcx]
     89 .column_ld2:
     90    test        cl, SIZEOF_WORD
     91    jz          short .column_ld4
     92    sub         rcx, byte SIZEOF_WORD
     93    movzx       rdx, word [rsi+rcx]
     94    shl         rax, WORD_BIT
     95    or          rax, rdx
     96 .column_ld4:
     97    vmovd       xmmA, eax
     98    pop         rdx
     99    pop         rax
    100    test        cl, SIZEOF_DWORD
    101    jz          short .column_ld8
    102    sub         rcx, byte SIZEOF_DWORD
    103    vmovd       xmmF, XMM_DWORD [rsi+rcx]
    104    vpslldq     xmmA, xmmA, SIZEOF_DWORD
    105    vpor        xmmA, xmmA, xmmF
    106 .column_ld8:
    107    test        cl, SIZEOF_MMWORD
    108    jz          short .column_ld16
    109    sub         rcx, byte SIZEOF_MMWORD
    110    vmovq       xmmB, XMM_MMWORD [rsi+rcx]
    111    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
    112    vpor        xmmA, xmmA, xmmB
    113 .column_ld16:
    114    test        cl, SIZEOF_XMMWORD
    115    jz          short .column_ld32
    116    sub         rcx, byte SIZEOF_XMMWORD
    117    vmovdqu     xmmB, XMM_MMWORD [rsi+rcx]
    118    vperm2i128  ymmA, ymmA, ymmA, 1
    119    vpor        ymmA, ymmB
    120 .column_ld32:
    121    test        cl, SIZEOF_YMMWORD
    122    jz          short .column_ld64
    123    sub         rcx, byte SIZEOF_YMMWORD
    124    vmovdqa     ymmF, ymmA
    125    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    126 .column_ld64:
    127    test        cl, 2*SIZEOF_YMMWORD
    128    mov         rcx, SIZEOF_YMMWORD
    129    jz          short .rgb_gray_cnv
    130    vmovdqa     ymmB, ymmA
    131    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    132    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    133    jmp         short .rgb_gray_cnv
    134 
    135 .columnloop:
    136    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    137    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    138    vmovdqu     ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD]
    139 
    140 .rgb_gray_cnv:
    141    ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
    142    ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    143    ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
    144    ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
    145    ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
    146    ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
    147 
    148    vmovdqu     ymmC, ymmA
    149    vinserti128 ymmA, ymmF, xmmA, 0  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
    150                                     ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
    151    vinserti128 ymmC, ymmC, xmmB, 0  ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
    152                                     ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    153    vinserti128 ymmB, ymmB, xmmF, 0  ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
    154                                     ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
    155    vperm2i128  ymmF, ymmC, ymmC, 1  ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
    156                                     ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
    157 
    158    vmovdqa     ymmG, ymmA
    159    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
    160                                  ;       22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
    161    vpsrldq     ymmG, ymmG, 8     ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
    162                                  ;       2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
    163 
    164    vpunpckhbw  ymmA, ymmA, ymmF  ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
    165                                  ;       0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
    166    vpslldq     ymmF, ymmF, 8     ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
    167                                  ;       08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
    168 
    169    vpunpcklbw  ymmG, ymmG, ymmB  ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
    170                                  ;       2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
    171    vpunpckhbw  ymmF, ymmF, ymmB  ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
    172                                  ;       1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
    173 
    174    vmovdqa     ymmD, ymmA
    175    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
    176                                  ;       11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
    177    vpsrldq     ymmD, ymmD, 8     ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
    178                                  ;       1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
    179 
    180    vpunpckhbw  ymmA, ymmA, ymmG  ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
    181                                  ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
    182    vpslldq     ymmG, ymmG, 8     ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
    183                                  ;       04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
    184 
    185    vpunpcklbw  ymmD, ymmD, ymmF  ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
    186                                  ;       1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
    187    vpunpckhbw  ymmG, ymmG, ymmF  ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
    188                                  ;       2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
    189 
    190    vmovdqa     ymmE, ymmA
    191    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
    192                                  ;       20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
    193    vpsrldq     ymmE, ymmE, 8     ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
    194                                  ;       2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
    195 
    196    vpunpckhbw  ymmA, ymmA, ymmD  ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
    197                                  ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
    198    vpslldq     ymmD, ymmD, 8     ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
    199                                  ;       02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
    200 
    201    vpunpcklbw  ymmE, ymmE, ymmG  ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
    202                                  ;       2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
    203    vpunpckhbw  ymmD, ymmD, ymmG  ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
    204                                  ;       1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
    205 
    206    vpxor       ymmH, ymmH, ymmH
    207 
    208    vmovdqa     ymmC, ymmA
    209    vpunpcklbw  ymmA, ymmA, ymmH  ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
    210    vpunpckhbw  ymmC, ymmC, ymmH  ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
    211 
    212    vmovdqa     ymmB, ymmE
    213    vpunpcklbw  ymmE, ymmE, ymmH  ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
    214    vpunpckhbw  ymmB, ymmB, ymmH  ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
    215 
    216    vmovdqa     ymmF, ymmD
    217    vpunpcklbw  ymmD, ymmD, ymmH  ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
    218    vpunpckhbw  ymmF, ymmF, ymmH  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
    219 
    220 %else  ; RGB_PIXELSIZE == 4 ; -----------
    221 
    222 .column_ld1:
    223    test        cl, SIZEOF_XMMWORD/16
    224    jz          short .column_ld2
    225    sub         rcx, byte SIZEOF_XMMWORD/16
    226    vmovd       xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
    227 .column_ld2:
    228    test        cl, SIZEOF_XMMWORD/8
    229    jz          short .column_ld4
    230    sub         rcx, byte SIZEOF_XMMWORD/8
    231    vmovq       xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
    232    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
    233    vpor        xmmA, xmmA, xmmF
    234 .column_ld4:
    235    test        cl, SIZEOF_XMMWORD/4
    236    jz          short .column_ld8
    237    sub         rcx, byte SIZEOF_XMMWORD/4
    238    vmovdqa     xmmF, xmmA
    239    vperm2i128  ymmF, ymmF, ymmF, 1
    240    vmovdqu     xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
    241    vpor        ymmA, ymmA, ymmF
    242 .column_ld8:
    243    test        cl, SIZEOF_XMMWORD/2
    244    jz          short .column_ld16
    245    sub         rcx, byte SIZEOF_XMMWORD/2
    246    vmovdqa     ymmF, ymmA
    247    vmovdqu     ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE]
    248 .column_ld16:
    249    test        cl, SIZEOF_XMMWORD
    250    mov         rcx, SIZEOF_YMMWORD
    251    jz          short .rgb_gray_cnv
    252    vmovdqa     ymmE, ymmA
    253    vmovdqa     ymmH, ymmF
    254    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    255    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    256    jmp         short .rgb_gray_cnv
    257 
    258 .columnloop:
    259    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    260    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    261    vmovdqu     ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD]
    262    vmovdqu     ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD]
    263 
    264 .rgb_gray_cnv:
    265    ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    266    ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    267    ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
    268    ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
    269    ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
    270    ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
    271    ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
    272    ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
    273 
    274    vmovdqa     ymmB, ymmA
    275    vinserti128 ymmA, ymmA, xmmE, 1     ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    276                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
    277    vperm2i128  ymmE, ymmB, ymmE, 0x31  ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
    278                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
    279 
    280    vmovdqa     ymmB, ymmF
    281    vinserti128 ymmF, ymmF, xmmH, 1     ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
    282                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
    283    vperm2i128  ymmH, ymmB, ymmH, 0x31  ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
    284                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
    285 
    286    vmovdqa     ymmD, ymmA
    287    vpunpcklbw  ymmA, ymmA, ymmE      ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
    288                                      ;       0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
    289    vpunpckhbw  ymmD, ymmD, ymmE      ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
    290                                      ;       0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
    291 
    292    vmovdqa     ymmC, ymmF
    293    vpunpcklbw  ymmF, ymmF, ymmH      ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
    294                                      ;       0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
    295    vpunpckhbw  ymmC, ymmC, ymmH      ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
    296                                      ;       0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
    297 
    298    vmovdqa     ymmB, ymmA
    299    vpunpcklwd  ymmA, ymmA, ymmF      ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
    300                                      ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
    301    vpunpckhwd  ymmB, ymmB, ymmF      ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
    302                                      ;       0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
    303 
    304    vmovdqa     ymmG, ymmD
    305    vpunpcklwd  ymmD, ymmD, ymmC      ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
    306                                      ;       0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
    307    vpunpckhwd  ymmG, ymmG, ymmC      ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
    308                                      ;       0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
    309 
    310    vmovdqa     ymmE, ymmA
    311    vpunpcklbw  ymmA, ymmA, ymmD      ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
    312                                      ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
    313    vpunpckhbw  ymmE, ymmE, ymmD      ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
    314                                      ;       2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
    315 
    316    vmovdqa     ymmH, ymmB
    317    vpunpcklbw  ymmB, ymmB, ymmG      ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
    318                                      ;       0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
    319    vpunpckhbw  ymmH, ymmH, ymmG      ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
    320                                      ;       2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
    321 
    322    vpxor       ymmF, ymmF, ymmF
    323 
    324    vmovdqa     ymmC, ymmA
    325    vpunpcklbw  ymmA, ymmA, ymmF      ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
    326    vpunpckhbw  ymmC, ymmC, ymmF      ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
    327 
    328    vmovdqa     ymmD, ymmB
    329    vpunpcklbw  ymmB, ymmB, ymmF      ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
    330    vpunpckhbw  ymmD, ymmD, ymmF      ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
    331 
    332    vmovdqa     ymmG, ymmE
    333    vpunpcklbw  ymmE, ymmE, ymmF      ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
    334    vpunpckhbw  ymmG, ymmG, ymmF      ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
    335 
    336    vpunpcklbw  ymmF, ymmF, ymmH
    337    vpunpckhbw  ymmH, ymmH, ymmH
    338    vpsrlw      ymmF, ymmF, BYTE_BIT  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
    339    vpsrlw      ymmH, ymmH, BYTE_BIT  ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
    340 
    341 %endif  ; RGB_PIXELSIZE ; ---------------
    342 
    343    ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
    344    ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
    345 
    346    ; (Original)
    347    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    348    ;
    349    ; (This implementation)
    350    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
    351 
    352    vmovdqa     ymm6, ymm1
    353    vpunpcklwd  ymm1, ymm1, ymm3
    354    vpunpckhwd  ymm6, ymm6, ymm3
    355    vpmaddwd    ymm1, ymm1, [rel PW_F0299_F0337]  ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    356    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
    357 
    358    vmovdqa     ymm7, ymm6              ; ymm7=ROH*FIX(0.299)+GOH*FIX(0.337)
    359 
    360    vmovdqa     ymm6, ymm0
    361    vpunpcklwd  ymm0, ymm0, ymm2
    362    vpunpckhwd  ymm6, ymm6, ymm2
    363    vpmaddwd    ymm0, ymm0, [rel PW_F0299_F0337]  ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
    364    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
    365 
    366    vmovdqa     YMMWORD [wk(0)], ymm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
    367    vmovdqa     YMMWORD [wk(1)], ymm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
    368 
    369    vmovdqa     ymm0, ymm5              ; ymm0=BO
    370    vmovdqa     ymm6, ymm4              ; ymm6=BE
    371 
    372    vmovdqa     ymm4, ymm0
    373    vpunpcklwd  ymm0, ymm0, ymm3
    374    vpunpckhwd  ymm4, ymm4, ymm3
    375    vpmaddwd    ymm0, ymm0, [rel PW_F0114_F0250]  ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    376    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
    377 
    378    vmovdqa     ymm3, [rel PD_ONEHALF]            ; ymm3=[PD_ONEHALF]
    379 
    380    vpaddd      ymm0, ymm0, ymm1
    381    vpaddd      ymm4, ymm4, ymm7
    382    vpaddd      ymm0, ymm0, ymm3
    383    vpaddd      ymm4, ymm4, ymm3
    384    vpsrld      ymm0, ymm0, SCALEBITS   ; ymm0=YOL
    385    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YOH
    386    vpackssdw   ymm0, ymm0, ymm4        ; ymm0=YO
    387 
    388    vmovdqa     ymm4, ymm6
    389    vpunpcklwd  ymm6, ymm6, ymm2
    390    vpunpckhwd  ymm4, ymm4, ymm2
    391    vpmaddwd    ymm6, ymm6, [rel PW_F0114_F0250]  ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    392    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
    393 
    394    vmovdqa     ymm2, [rel PD_ONEHALF]            ; ymm2=[PD_ONEHALF]
    395 
    396    vpaddd      ymm6, ymm6, YMMWORD [wk(0)]
    397    vpaddd      ymm4, ymm4, YMMWORD [wk(1)]
    398    vpaddd      ymm6, ymm6, ymm2
    399    vpaddd      ymm4, ymm4, ymm2
    400    vpsrld      ymm6, ymm6, SCALEBITS   ; ymm6=YEL
    401    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YEH
    402    vpackssdw   ymm6, ymm6, ymm4        ; ymm6=YE
    403 
    404    vpsllw      ymm0, ymm0, BYTE_BIT
    405    vpor        ymm6, ymm6, ymm0        ; ymm6=Y
    406    vmovdqu     YMMWORD [rdi], ymm6     ; Save Y
    407 
    408    sub         rcx, byte SIZEOF_YMMWORD
    409    add         rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; inptr
    410    add         rdi, byte SIZEOF_YMMWORD           ; outptr0
    411    cmp         rcx, byte SIZEOF_YMMWORD
    412    jae         near .columnloop
    413    test        rcx, rcx
    414    jnz         near .column_ld1
    415 
    416    pop         rcx                     ; col
    417    pop         rsi
    418    pop         rdi
    419 
    420    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
    421    add         rdi, byte SIZEOF_JSAMPROW
    422    dec         rax                        ; num_rows
    423    jg          near .rowloop
    424 
    425 .return:
    426    pop         rbx
    427    vzeroupper
    428    UNCOLLECT_ARGS 5
    429    lea         rsp, [rbp-8]
    430    pop         r15
    431    pop         rbp
    432    ret
    433 
    434 ; For some reason, the OS X linker does not honor the request to align the
    435 ; segment unless we do this.
    436    align       32