jcgryext-avx2.asm (19052B)
;
; jcgryext.asm - grayscale colorspace conversion (64-bit AVX2)
;
; Copyright (C) 2011, 2016, 2024, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_gray_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
;                             JSAMPIMAGE output_buf, JDIMENSION output_row,
;                             int num_rows);
;
; Only component 0 of output_buf (luminance) is written.  Nothing is done
; when img_width is 0 or num_rows is <= 0.
;
; Register roles inside the function:
;   rcx = columns (pixels) still to convert in the current row
;   rsi = cursor into input_buf (row pointers); inptr inside the row loop
;   rdi = cursor into output_buf[0], starting at output_row (row pointers);
;         outptr0 inside the row loop
;   rax = rows still to convert
;   r15 = 32-byte-aligned base from which the wk[] scratch slots are addressed
;
; Each pass through .rgb_gray_cnv converts SIZEOF_YMMWORD pixels and stores a
; full YMMWORD of output.  A row tail shorter than SIZEOF_YMMWORD pixels is
; loaded piecewise (.column_ld*), so exactly the remaining input bytes are
; read, but a full YMMWORD is still stored.
; NOTE(review): this presumes that every output row buffer is padded to a
; multiple of SIZEOF_YMMWORD bytes -- confirm against the allocator.
;
; ymmA-ymmH / xmmA-xmmF, COLLECT_ARGS, EXTN, the SIZEOF_* constants and the
; PW_*/PD_* tables are not defined in this file; they presumably come from
; jcolsamp.inc and from the file that includes this one.

; r10d = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
; r13d = JDIMENSION output_row
; r14d = int num_rows

%define wk(i)   r15 - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
%define WK_NUM  2

    align       32
    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_avx2)

EXTN(jsimd_rgb_gray_convert_avx2):
    ENDBR64
    push        rbp
    mov         rbp, rsp
    push        r15
    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
    ; Allocate stack space for wk array.  r15 is used to access it.
    mov         r15, rsp
    sub         rsp, byte (SIZEOF_YMMWORD * WK_NUM)
    COLLECT_ARGS 5
    push        rbx

    mov         ecx, r10d               ; rcx = img_width (zero-extended)
    test        rcx, rcx
    jz          near .return

    push        rcx                     ; ecx is borrowed for output_row below

    mov         rsi, r12
    mov         ecx, r13d
    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]  ; rdi = &output_buf[0][output_row]

    pop         rcx                     ; rcx = img_width again

    mov         rsi, r11
    mov         eax, r14d
    ; num_rows is a signed int.  The mov above zero-extends it into rax, so
    ; the sign must be tested on eax; testing rax could never observe a
    ; negative count and the row loop would run ~2^32 times.
    test        eax, eax
    jle         near .return
.rowloop:
    push        rdi
    push        rsi
    push        rcx                     ; col

    mov         rsip, JSAMPROW [rsi]    ; inptr
    mov         rdip, JSAMPROW [rdi]    ; outptr0

    cmp         rcx, byte SIZEOF_YMMWORD
    jae         near .columnloop

%if RGB_PIXELSIZE == 3  ; ---------------

    ; Row tail: fewer than SIZEOF_YMMWORD pixels remain.  rcx is converted to
    ; a byte count, and each set bit of that count selects one load of the
    ; matching size, taken from the end of the row backwards.
.column_ld1:
    push        rax                     ; rax (row counter) and rdx are used
    push        rdx                     ; as scratch until .column_ld4
    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
    test        cl, SIZEOF_BYTE
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_BYTE
    movzx       rax, byte [rsi+rcx]
.column_ld2:
    test        cl, SIZEOF_WORD
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_WORD
    movzx       rdx, word [rsi+rcx]
    shl         rax, WORD_BIT
    or          rax, rdx
.column_ld4:
    vmovd       xmmA, eax
    pop         rdx
    pop         rax
    test        cl, SIZEOF_DWORD
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_DWORD
    vmovd       xmmF, XMM_DWORD [rsi+rcx]
    vpslldq     xmmA, xmmA, SIZEOF_DWORD
    vpor        xmmA, xmmA, xmmF
.column_ld8:
    test        cl, SIZEOF_MMWORD
    jz          short .column_ld16
    sub         rcx, byte SIZEOF_MMWORD
    vmovq       xmmB, XMM_MMWORD [rsi+rcx]
    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
    vpor        xmmA, xmmA, xmmB
.column_ld16:
    test        cl, SIZEOF_XMMWORD
    jz          short .column_ld32
    sub         rcx, byte SIZEOF_XMMWORD
    vmovdqu     xmmB, XMMWORD [rsi+rcx]  ; 16-byte load
    vperm2i128  ymmA, ymmA, ymmA, 1
    vpor        ymmA, ymmA, ymmB
.column_ld32:
    test        cl, SIZEOF_YMMWORD
    jz          short .column_ld64
    sub         rcx, byte SIZEOF_YMMWORD
    vmovdqa     ymmF, ymmA
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
.column_ld64:
    test        cl, 2*SIZEOF_YMMWORD
    mov         rcx, SIZEOF_YMMWORD     ; mov leaves the flags of the test intact
    jz          short .rgb_gray_cnv
    vmovdqa     ymmB, ymmA
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    jmp         short .rgb_gray_cnv

.columnloop:
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    vmovdqu     ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD]

.rgb_gray_cnv:
    ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
    ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
    ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
    ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
    ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)

    vmovdqu     ymmC, ymmA
    vinserti128 ymmA, ymmF, xmmA, 0  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
                                     ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
    vinserti128 ymmC, ymmC, xmmB, 0  ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
                                     ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    vinserti128 ymmB, ymmB, xmmF, 0  ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
                                     ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
    vperm2i128  ymmF, ymmC, ymmC, 1  ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
                                     ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)

    vmovdqa     ymmG, ymmA
    vpslldq     ymmA, ymmA, 8        ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
                                     ;       22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
    vpsrldq     ymmG, ymmG, 8        ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
                                     ;       2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)

    vpunpckhbw  ymmA, ymmA, ymmF     ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
                                     ;       0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
    vpslldq     ymmF, ymmF, 8        ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
                                     ;       08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)

    vpunpcklbw  ymmG, ymmG, ymmB     ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
                                     ;       2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
    vpunpckhbw  ymmF, ymmF, ymmB     ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
                                     ;       1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)

    vmovdqa     ymmD, ymmA
    vpslldq     ymmA, ymmA, 8        ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
                                     ;       11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
    vpsrldq     ymmD, ymmD, 8        ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
                                     ;       1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)

    vpunpckhbw  ymmA, ymmA, ymmG     ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
                                     ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
    vpslldq     ymmG, ymmG, 8        ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
                                     ;       04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)

    vpunpcklbw  ymmD, ymmD, ymmF     ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
                                     ;       1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
    vpunpckhbw  ymmG, ymmG, ymmF     ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
                                     ;       2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)

    vmovdqa     ymmE, ymmA
    vpslldq     ymmA, ymmA, 8        ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
                                     ;       20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
    vpsrldq     ymmE, ymmE, 8        ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
                                     ;       2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)

    vpunpckhbw  ymmA, ymmA, ymmD     ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
                                     ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
    vpslldq     ymmD, ymmD, 8        ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
                                     ;       02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)

    vpunpcklbw  ymmE, ymmE, ymmG     ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
                                     ;       2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
    vpunpckhbw  ymmD, ymmD, ymmG     ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
                                     ;       1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)

    vpxor       ymmH, ymmH, ymmH     ; zero source used to widen bytes to words

    vmovdqa     ymmC, ymmA
    vpunpcklbw  ymmA, ymmA, ymmH     ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
    vpunpckhbw  ymmC, ymmC, ymmH     ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)

    vmovdqa     ymmB, ymmE
    vpunpcklbw  ymmE, ymmE, ymmH     ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
    vpunpckhbw  ymmB, ymmB, ymmH     ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)

    vmovdqa     ymmF, ymmD
    vpunpcklbw  ymmD, ymmD, ymmH     ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
    vpunpckhbw  ymmF, ymmF, ymmH     ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)

%else  ; RGB_PIXELSIZE == 4 ; -----------

    ; Row tail: fewer than SIZEOF_YMMWORD pixels remain.  Each set bit of the
    ; pixel count in rcx selects one load of 1/2/4/8/16 pixels, taken from the
    ; end of the row backwards.
.column_ld1:
    test        cl, SIZEOF_XMMWORD/16
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_XMMWORD/16
    vmovd       xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
    test        cl, SIZEOF_XMMWORD/8
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_XMMWORD/8
    vmovq       xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
    vpor        xmmA, xmmA, xmmF
.column_ld4:
    test        cl, SIZEOF_XMMWORD/4
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_XMMWORD/4
    vmovdqa     xmmF, xmmA
    vperm2i128  ymmF, ymmF, ymmF, 1
    vmovdqu     xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
    vpor        ymmA, ymmA, ymmF
.column_ld8:
    test        cl, SIZEOF_XMMWORD/2
    jz          short .column_ld16
    sub         rcx, byte SIZEOF_XMMWORD/2
    vmovdqa     ymmF, ymmA
    vmovdqu     ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld16:
    test        cl, SIZEOF_XMMWORD
    mov         rcx, SIZEOF_YMMWORD     ; mov leaves the flags of the test intact
    jz          short .rgb_gray_cnv
    vmovdqa     ymmE, ymmA
    vmovdqa     ymmH, ymmF
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    jmp         short .rgb_gray_cnv

.columnloop:
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    vmovdqu     ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD]
    vmovdqu     ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD]

.rgb_gray_cnv:
    ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
    ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
    ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
    ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
    ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
    ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)

    vmovdqa     ymmB, ymmA
    vinserti128 ymmA, ymmA, xmmE, 1     ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
    vperm2i128  ymmE, ymmB, ymmE, 0x31  ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)

    vmovdqa     ymmB, ymmF
    vinserti128 ymmF, ymmF, xmmH, 1     ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
    vperm2i128  ymmH, ymmB, ymmH, 0x31  ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)

    vmovdqa     ymmD, ymmA
    vpunpcklbw  ymmA, ymmA, ymmE        ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
                                        ;       0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
    vpunpckhbw  ymmD, ymmD, ymmE        ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
                                        ;       0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)

    vmovdqa     ymmC, ymmF
    vpunpcklbw  ymmF, ymmF, ymmH        ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
                                        ;       0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
    vpunpckhbw  ymmC, ymmC, ymmH        ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
                                        ;       0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)

    vmovdqa     ymmB, ymmA
    vpunpcklwd  ymmA, ymmA, ymmF        ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
                                        ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
    vpunpckhwd  ymmB, ymmB, ymmF        ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
                                        ;       0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)

    vmovdqa     ymmG, ymmD
    vpunpcklwd  ymmD, ymmD, ymmC        ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
                                        ;       0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
    vpunpckhwd  ymmG, ymmG, ymmC        ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
                                        ;       0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)

    vmovdqa     ymmE, ymmA
    vpunpcklbw  ymmA, ymmA, ymmD        ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
                                        ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
    vpunpckhbw  ymmE, ymmE, ymmD        ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
                                        ;       2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)

    vmovdqa     ymmH, ymmB
    vpunpcklbw  ymmB, ymmB, ymmG        ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
                                        ;       0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
    vpunpckhbw  ymmH, ymmH, ymmG        ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
                                        ;       2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)

    vpxor       ymmF, ymmF, ymmF        ; zero source used to widen bytes to words

    vmovdqa     ymmC, ymmA
    vpunpcklbw  ymmA, ymmA, ymmF        ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
    vpunpckhbw  ymmC, ymmC, ymmF        ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)

    vmovdqa     ymmD, ymmB
    vpunpcklbw  ymmB, ymmB, ymmF        ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
    vpunpckhbw  ymmD, ymmD, ymmF        ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)

    vmovdqa     ymmG, ymmE
    vpunpcklbw  ymmE, ymmE, ymmF        ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
    vpunpckhbw  ymmG, ymmG, ymmF        ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)

    ; ymmF (the zero register) is overwritten here, so the last two vectors
    ; are widened by unpacking into the high byte of each word and shifting
    ; right instead of unpacking against zero.
    vpunpcklbw  ymmF, ymmF, ymmH
    vpunpckhbw  ymmH, ymmH, ymmH
    vpsrlw      ymmF, ymmF, BYTE_BIT    ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
    vpsrlw      ymmH, ymmH, BYTE_BIT    ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)

%endif  ; RGB_PIXELSIZE ; ---------------

    ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
    ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO

    ; (Original)
    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    ;
    ; (This implementation)
    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G

    vmovdqa     ymm6, ymm1
    vpunpcklwd  ymm1, ymm1, ymm3
    vpunpckhwd  ymm6, ymm6, ymm3
    vpmaddwd    ymm1, ymm1, [rel PW_F0299_F0337]  ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)

    vmovdqa     ymm7, ymm6              ; ymm7=ROH*FIX(0.299)+GOH*FIX(0.337)

    vmovdqa     ymm6, ymm0
    vpunpcklwd  ymm0, ymm0, ymm2
    vpunpckhwd  ymm6, ymm6, ymm2
    vpmaddwd    ymm0, ymm0, [rel PW_F0299_F0337]  ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)

    ; The even-pixel partial sums are parked in the stack scratch slots until
    ; the odd pixels are finished.
    vmovdqa     YMMWORD [wk(0)], ymm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
    vmovdqa     YMMWORD [wk(1)], ymm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)

    vmovdqa     ymm0, ymm5              ; ymm0=BO
    vmovdqa     ymm6, ymm4              ; ymm6=BE

    vmovdqa     ymm4, ymm0
    vpunpcklwd  ymm0, ymm0, ymm3
    vpunpckhwd  ymm4, ymm4, ymm3
    vpmaddwd    ymm0, ymm0, [rel PW_F0114_F0250]  ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)

    vmovdqa     ymm3, [rel PD_ONEHALF]  ; ymm3=[PD_ONEHALF]

    vpaddd      ymm0, ymm0, ymm1
    vpaddd      ymm4, ymm4, ymm7
    vpaddd      ymm0, ymm0, ymm3
    vpaddd      ymm4, ymm4, ymm3
    vpsrld      ymm0, ymm0, SCALEBITS   ; ymm0=YOL
    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YOH
    vpackssdw   ymm0, ymm0, ymm4        ; ymm0=YO

    vmovdqa     ymm4, ymm6
    vpunpcklwd  ymm6, ymm6, ymm2
    vpunpckhwd  ymm4, ymm4, ymm2
    vpmaddwd    ymm6, ymm6, [rel PW_F0114_F0250]  ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)

    vmovdqa     ymm2, [rel PD_ONEHALF]  ; ymm2=[PD_ONEHALF]

    vpaddd      ymm6, ymm6, YMMWORD [wk(0)]
    vpaddd      ymm4, ymm4, YMMWORD [wk(1)]
    vpaddd      ymm6, ymm6, ymm2
    vpaddd      ymm4, ymm4, ymm2
    vpsrld      ymm6, ymm6, SCALEBITS   ; ymm6=YEL
    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YEH
    vpackssdw   ymm6, ymm6, ymm4        ; ymm6=YE

    ; Interleave: odd-pixel results go to the high byte of each word, the
    ; even-pixel results stay in the low byte.
    vpsllw      ymm0, ymm0, BYTE_BIT
    vpor        ymm6, ymm6, ymm0        ; ymm6=Y
    vmovdqu     YMMWORD [rdi], ymm6     ; Save Y

    sub         rcx, byte SIZEOF_YMMWORD
    add         rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; inptr
    add         rdi, byte SIZEOF_YMMWORD           ; outptr0
    cmp         rcx, byte SIZEOF_YMMWORD
    jae         near .columnloop
    test        rcx, rcx
    jnz         near .column_ld1

    pop         rcx                     ; col
    pop         rsi
    pop         rdi

    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
    add         rdi, byte SIZEOF_JSAMPROW
    dec         rax                        ; num_rows
    jg          near .rowloop

.return:
    pop         rbx
    vzeroupper
    UNCOLLECT_ARGS 5
    lea         rsp, [rbp-8]            ; rsp -> saved r15 (undoes the alignment)
    pop         r15
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32