jccolext-avx2.asm (24166B)
;
; jccolext.asm - colorspace conversion (64-bit AVX2)
;
; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_ycc_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
;                            int num_rows);
;
; Returns immediately when img_width == 0 or num_rows <= 0.
;
; Stack frame:
;   rbp      frame pointer (caller's rbp saved at [rbp])
;   [rbp-8]  saved r15
;   r15      32-byte aligned address just above the wk[] scratch array;
;            wk(i) addresses slot i of WK_NUM ymmwords below r15
;
; Register roles inside the loops:
;   rax = rows remaining            rcx = columns (pixels) remaining
;   rsi = inptr                     rdi = outptr0 (Y)
;   rbx = outptr1 (Cb)              rdx = outptr2 (Cr)
;
; Each pass of the column loop converts SIZEOF_YMMWORD pixels and always
; stores a full ymmword to each output row.  A final partial group is loaded
; piecewise (.column_ld*) so that no input byte past the row is read; vector
; bytes beyond the remaining pixels are don't-care.
; NOTE(review): the output rows are presumably padded to a multiple of
; SIZEOF_YMMWORD samples by the caller -- confirm.
;

; After COLLECT_ARGS (defined outside this file) the arguments are in:
;
; r10d = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
; r13d = JDIMENSION output_row
; r14d = int num_rows

%define wk(i)   r15 - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
%define WK_NUM  8

    align       32
    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_avx2)

EXTN(jsimd_rgb_ycc_convert_avx2):
    ENDBR64
    push        rbp
    mov         rbp, rsp
    push        r15
    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
    ; Allocate stack space for wk array.  r15 is used to access it.
    mov         r15, rsp
    sub         rsp, (SIZEOF_YMMWORD * WK_NUM)
    COLLECT_ARGS 5
    push        rbx

    mov         ecx, r10d               ; rcx = img_width
    test        rcx, rcx
    jz          near .return

    push        rcx                     ; rcx is borrowed for output_row below

    ; Point rdi/rbx/rdx at row output_row of each of the three output planes.
    mov         rsi, r12
    mov         ecx, r13d
    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
    mov         rbxp, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
    mov         rdxp, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]
    lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]
    lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]

    pop         rcx                     ; rcx = img_width again

    mov         rsi, r11
    mov         eax, r14d               ; rax = num_rows (upper 32 bits zeroed)
    ; num_rows is a signed int, so it must be tested at 32-bit width: after
    ; the zero-extending move above, a 64-bit test could never see a negative
    ; value and a negative count would run the row loop ~2^32 times.
    test        eax, eax
    jle         near .return
.rowloop:
    ; Save the row-pointer cursors and the column count; the column loop
    ; below reuses these registers for the per-row sample pointers.
    push        rdx
    push        rbx
    push        rdi
    push        rsi
    push        rcx                     ; col

    mov         rsip, JSAMPROW [rsi]    ; inptr
    mov         rdip, JSAMPROW [rdi]    ; outptr0
    mov         rbxp, JSAMPROW [rbx]    ; outptr1
    mov         rdxp, JSAMPROW [rdx]    ; outptr2

    cmp         rcx, byte SIZEOF_YMMWORD
    jae         near .columnloop

%if RGB_PIXELSIZE == 3  ; ---------------

    ; Partial group: rcx becomes the number of remaining input bytes, and
    ; each set bit of that count selects one load of the matching size,
    ; working backwards from the end of the row.
.column_ld1:
    push        rax
    push        rdx
    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
    test        cl, SIZEOF_BYTE
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_BYTE
    movzx       rax, byte [rsi+rcx]
.column_ld2:
    test        cl, SIZEOF_WORD
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_WORD
    movzx       rdx, word [rsi+rcx]
    shl         rax, WORD_BIT
    or          rax, rdx
.column_ld4:
    vmovd       xmmA, eax
    pop         rdx
    pop         rax
    test        cl, SIZEOF_DWORD
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_DWORD
    vmovd       xmmF, XMM_DWORD [rsi+rcx]
    vpslldq     xmmA, xmmA, SIZEOF_DWORD
    vpor        xmmA, xmmA, xmmF
.column_ld8:
    test        cl, SIZEOF_MMWORD
    jz          short .column_ld16
    sub         rcx, byte SIZEOF_MMWORD
    vmovq       xmmB, XMM_MMWORD [rsi+rcx]
    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
    vpor        xmmA, xmmA, xmmB
.column_ld16:
    test        cl, SIZEOF_XMMWORD
    jz          short .column_ld32
    sub         rcx, byte SIZEOF_XMMWORD
    vmovdqu     xmmB, XMMWORD [rsi+rcx]  ; 16-byte load
    vperm2i128  ymmA, ymmA, ymmA, 1      ; move what was gathered to the high lane
    vpor        ymmA, ymmA, ymmB
.column_ld32:
    test        cl, SIZEOF_YMMWORD
    jz          short .column_ld64
    sub         rcx, byte SIZEOF_YMMWORD
    vmovdqa     ymmF, ymmA
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
.column_ld64:
    test        cl, 2*SIZEOF_YMMWORD
    ; mov leaves the flags from the test above intact.  rcx is forced to one
    ; full group so the loop tail sees zero columns left after this pass.
    mov         rcx, SIZEOF_YMMWORD
    jz          short .rgb_ycc_cnv
    vmovdqa     ymmB, ymmA
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    jmp         short .rgb_ycc_cnv

.columnloop:
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    vmovdqu     ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD]

.rgb_ycc_cnv:
    ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
    ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
    ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
    ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
    ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)

    vmovdqa     ymmC, ymmA
    vinserti128 ymmA, ymmF, xmmA, 0  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
                                     ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
    vinserti128 ymmC, ymmC, xmmB, 0  ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
                                     ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    vinserti128 ymmB, ymmB, xmmF, 0  ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
                                     ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
    vperm2i128  ymmF, ymmC, ymmC, 1  ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
                                     ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)

    vmovdqa     ymmG, ymmA
    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
                                  ;       22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
    vpsrldq     ymmG, ymmG, 8     ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
                                  ;       2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)

    vpunpckhbw  ymmA, ymmA, ymmF  ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
                                  ;       0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
    vpslldq     ymmF, ymmF, 8     ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
                                  ;       08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)

    vpunpcklbw  ymmG, ymmG, ymmB  ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
                                  ;       2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
    vpunpckhbw  ymmF, ymmF, ymmB  ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
                                  ;       1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)

    vmovdqa     ymmD, ymmA
    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
                                  ;       11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
    vpsrldq     ymmD, ymmD, 8     ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
                                  ;       1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)

    vpunpckhbw  ymmA, ymmA, ymmG  ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
                                  ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
    vpslldq     ymmG, ymmG, 8     ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
                                  ;       04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)

    vpunpcklbw  ymmD, ymmD, ymmF  ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
                                  ;       1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
    vpunpckhbw  ymmG, ymmG, ymmF  ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
                                  ;       2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)

    vmovdqa     ymmE, ymmA
    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
                                  ;       20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
    vpsrldq     ymmE, ymmE, 8     ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
                                  ;       2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)

    vpunpckhbw  ymmA, ymmA, ymmD  ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
                                  ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
    vpslldq     ymmD, ymmD, 8     ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
                                  ;       02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)

    vpunpcklbw  ymmE, ymmE, ymmG  ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
                                  ;       2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
    vpunpckhbw  ymmD, ymmD, ymmG  ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
                                  ;       1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)

    ; Widen each byte plane to 16-bit words by interleaving with zero.
    vpxor       ymmH, ymmH, ymmH

    vmovdqa     ymmC, ymmA
    vpunpcklbw  ymmA, ymmA, ymmH  ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
    vpunpckhbw  ymmC, ymmC, ymmH  ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)

    vmovdqa     ymmB, ymmE
    vpunpcklbw  ymmE, ymmE, ymmH  ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
    vpunpckhbw  ymmB, ymmB, ymmH  ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)

    vmovdqa     ymmF, ymmD
    vpunpcklbw  ymmD, ymmD, ymmH  ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
    vpunpckhbw  ymmF, ymmF, ymmH  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)

%else  ; RGB_PIXELSIZE == 4 ; -----------

    ; Partial group: each set bit of the remaining pixel count in rcx selects
    ; one load of the matching size, working backwards from the end of the row.
.column_ld1:
    test        cl, SIZEOF_XMMWORD/16
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_XMMWORD/16
    vmovd       xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
    test        cl, SIZEOF_XMMWORD/8
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_XMMWORD/8
    vmovq       xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
    vpor        xmmA, xmmA, xmmF
.column_ld4:
    test        cl, SIZEOF_XMMWORD/4
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_XMMWORD/4
    vmovdqa     xmmF, xmmA
    vperm2i128  ymmF, ymmF, ymmF, 1
    vmovdqu     xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
    vpor        ymmA, ymmA, ymmF
.column_ld8:
    test        cl, SIZEOF_XMMWORD/2
    jz          short .column_ld16
    sub         rcx, byte SIZEOF_XMMWORD/2
    vmovdqa     ymmF, ymmA
    vmovdqu     ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld16:
    test        cl, SIZEOF_XMMWORD
    ; mov leaves the flags from the test above intact.  rcx is forced to one
    ; full group so the loop tail sees zero columns left after this pass.
    mov         rcx, SIZEOF_YMMWORD
    jz          short .rgb_ycc_cnv
    vmovdqa     ymmE, ymmA
    vmovdqa     ymmH, ymmF
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    jmp         short .rgb_ycc_cnv

.columnloop:
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    vmovdqu     ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD]
    vmovdqu     ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD]

.rgb_ycc_cnv:
    ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
    ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
    ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
    ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
    ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
    ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)

    vmovdqa     ymmB, ymmA
    vinserti128 ymmA, ymmA, xmmE, 1     ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
    vperm2i128  ymmE, ymmB, ymmE, 0x31  ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)

    vmovdqa     ymmB, ymmF
    vinserti128 ymmF, ymmF, xmmH, 1     ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
    vperm2i128  ymmH, ymmB, ymmH, 0x31  ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)

    vmovdqa     ymmD, ymmA
    vpunpcklbw  ymmA, ymmA, ymmE  ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
                                  ;       0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
    vpunpckhbw  ymmD, ymmD, ymmE  ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
                                  ;       0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)

    vmovdqa     ymmC, ymmF
    vpunpcklbw  ymmF, ymmF, ymmH  ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
                                  ;       0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
    vpunpckhbw  ymmC, ymmC, ymmH  ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
                                  ;       0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)

    vmovdqa     ymmB, ymmA
    vpunpcklwd  ymmA, ymmA, ymmF  ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
                                  ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
    vpunpckhwd  ymmB, ymmB, ymmF  ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
                                  ;       0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)

    vmovdqa     ymmG, ymmD
    vpunpcklwd  ymmD, ymmD, ymmC  ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
                                  ;       0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
    vpunpckhwd  ymmG, ymmG, ymmC  ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
                                  ;       0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)

    vmovdqa     ymmE, ymmA
    vpunpcklbw  ymmA, ymmA, ymmD  ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
                                  ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
    vpunpckhbw  ymmE, ymmE, ymmD  ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
                                  ;       2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)

    vmovdqa     ymmH, ymmB
    vpunpcklbw  ymmB, ymmB, ymmG  ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
                                  ;       0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
    vpunpckhbw  ymmH, ymmH, ymmG  ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
                                  ;       2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)

    ; Widen each byte plane to 16-bit words by interleaving with zero.
    vpxor       ymmF, ymmF, ymmF

    vmovdqa     ymmC, ymmA
    vpunpcklbw  ymmA, ymmA, ymmF  ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
    vpunpckhbw  ymmC, ymmC, ymmF  ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)

    vmovdqa     ymmD, ymmB
    vpunpcklbw  ymmB, ymmB, ymmF  ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
    vpunpckhbw  ymmD, ymmD, ymmF  ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)

    vmovdqa     ymmG, ymmE
    vpunpcklbw  ymmE, ymmE, ymmF  ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
    vpunpckhbw  ymmG, ymmG, ymmF  ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)

    ; ymmF (the zero register) is itself a destination here, so the last
    ; plane is unpacked into the high byte of each word and shifted down.
    vpunpcklbw  ymmF, ymmF, ymmH
    vpunpckhbw  ymmH, ymmH, ymmH
    vpsrlw      ymmF, ymmF, BYTE_BIT  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
    vpsrlw      ymmH, ymmH, BYTE_BIT  ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)

%endif  ; RGB_PIXELSIZE ; ---------------

    ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
    ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO

    ; (Original)
    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    ;
    ; (This implementation)
    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE

    vmovdqa     YMMWORD [wk(0)], ymm0   ; wk(0)=RE
    vmovdqa     YMMWORD [wk(1)], ymm1   ; wk(1)=RO
    vmovdqa     YMMWORD [wk(2)], ymm4   ; wk(2)=BE
    vmovdqa     YMMWORD [wk(3)], ymm5   ; wk(3)=BO

    ; ---- Odd samples: R/G partial sums for Y (saved in wk) and for Cb ----

    vmovdqa     ymm6, ymm1
    vpunpcklwd  ymm1, ymm1, ymm3
    vpunpckhwd  ymm6, ymm6, ymm3
    vmovdqa     ymm7, ymm1
    vmovdqa     ymm4, ymm6
    vpmaddwd    ymm1, ymm1, [rel PW_F0299_F0337]  ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
    vpmaddwd    ymm7, ymm7, [rel PW_MF016_MF033]  ; ymm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
    vpmaddwd    ymm4, ymm4, [rel PW_MF016_MF033]  ; ymm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)

    vmovdqa     YMMWORD [wk(4)], ymm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
    vmovdqa     YMMWORD [wk(5)], ymm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)

    ; Place B in the high word of each dword, then halve it with a shift.
    vpxor       ymm1, ymm1, ymm1
    vpxor       ymm6, ymm6, ymm6
    vpunpcklwd  ymm1, ymm1, ymm5        ; ymm1=BOL
    vpunpckhwd  ymm6, ymm6, ymm5        ; ymm6=BOH
    vpsrld      ymm1, ymm1, 1           ; ymm1=BOL*FIX(0.500)
    vpsrld      ymm6, ymm6, 1           ; ymm6=BOH*FIX(0.500)

    vmovdqa     ymm5, [rel PD_ONEHALFM1_CJ]  ; ymm5=[PD_ONEHALFM1_CJ]

    vpaddd      ymm7, ymm7, ymm1
    vpaddd      ymm4, ymm4, ymm6
    vpaddd      ymm7, ymm7, ymm5
    vpaddd      ymm4, ymm4, ymm5
    vpsrld      ymm7, ymm7, SCALEBITS   ; ymm7=CbOL
    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=CbOH
    vpackssdw   ymm7, ymm7, ymm4        ; ymm7=CbO

    ; ---- Even samples: R/G partial sums for Y (saved in wk) and for Cb ----

    vmovdqa     ymm1, YMMWORD [wk(2)]   ; ymm1=BE

    vmovdqa     ymm6, ymm0
    vpunpcklwd  ymm0, ymm0, ymm2
    vpunpckhwd  ymm6, ymm6, ymm2
    vmovdqa     ymm5, ymm0
    vmovdqa     ymm4, ymm6
    vpmaddwd    ymm0, ymm0, [rel PW_F0299_F0337]  ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
    vpmaddwd    ymm5, ymm5, [rel PW_MF016_MF033]  ; ymm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
    vpmaddwd    ymm4, ymm4, [rel PW_MF016_MF033]  ; ymm4=REH*-FIX(0.168)+GEH*-FIX(0.331)

    vmovdqa     YMMWORD [wk(6)], ymm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
    vmovdqa     YMMWORD [wk(7)], ymm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)

    vpxor       ymm0, ymm0, ymm0
    vpxor       ymm6, ymm6, ymm6
    vpunpcklwd  ymm0, ymm0, ymm1        ; ymm0=BEL
    vpunpckhwd  ymm6, ymm6, ymm1        ; ymm6=BEH
    vpsrld      ymm0, ymm0, 1           ; ymm0=BEL*FIX(0.500)
    vpsrld      ymm6, ymm6, 1           ; ymm6=BEH*FIX(0.500)

    vmovdqa     ymm1, [rel PD_ONEHALFM1_CJ]  ; ymm1=[PD_ONEHALFM1_CJ]

    vpaddd      ymm5, ymm5, ymm0
    vpaddd      ymm4, ymm4, ymm6
    vpaddd      ymm5, ymm5, ymm1
    vpaddd      ymm4, ymm4, ymm1
    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CbEL
    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=CbEH
    vpackssdw   ymm5, ymm5, ymm4        ; ymm5=CbE

    ; Odd samples go to the high byte of each word, even samples stay low.
    vpsllw      ymm7, ymm7, BYTE_BIT
    vpor        ymm5, ymm5, ymm7        ; ymm5=Cb
    vmovdqu     YMMWORD [rbx], ymm5     ; Save Cb

    ; ---- Odd samples: finish Y, and compute Cr ----

    vmovdqa     ymm0, YMMWORD [wk(3)]   ; ymm0=BO
    vmovdqa     ymm6, YMMWORD [wk(2)]   ; ymm6=BE
    vmovdqa     ymm1, YMMWORD [wk(1)]   ; ymm1=RO

    vmovdqa     ymm4, ymm0
    vpunpcklwd  ymm0, ymm0, ymm3
    vpunpckhwd  ymm4, ymm4, ymm3
    vmovdqa     ymm7, ymm0
    vmovdqa     ymm5, ymm4
    vpmaddwd    ymm0, ymm0, [rel PW_F0114_F0250]  ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
    vpmaddwd    ymm7, ymm7, [rel PW_MF008_MF041]  ; ymm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
    vpmaddwd    ymm5, ymm5, [rel PW_MF008_MF041]  ; ymm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)

    vmovdqa     ymm3, [rel PD_ONEHALF]  ; ymm3=[PD_ONEHALF]

    vpaddd      ymm0, ymm0, YMMWORD [wk(4)]
    vpaddd      ymm4, ymm4, YMMWORD [wk(5)]
    vpaddd      ymm0, ymm0, ymm3
    vpaddd      ymm4, ymm4, ymm3
    vpsrld      ymm0, ymm0, SCALEBITS   ; ymm0=YOL
    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YOH
    vpackssdw   ymm0, ymm0, ymm4        ; ymm0=YO

    vpxor       ymm3, ymm3, ymm3
    vpxor       ymm4, ymm4, ymm4
    vpunpcklwd  ymm3, ymm3, ymm1        ; ymm3=ROL
    vpunpckhwd  ymm4, ymm4, ymm1        ; ymm4=ROH
    vpsrld      ymm3, ymm3, 1           ; ymm3=ROL*FIX(0.500)
    vpsrld      ymm4, ymm4, 1           ; ymm4=ROH*FIX(0.500)

    vmovdqa     ymm1, [rel PD_ONEHALFM1_CJ]  ; ymm1=[PD_ONEHALFM1_CJ]

    vpaddd      ymm7, ymm7, ymm3
    vpaddd      ymm5, ymm5, ymm4
    vpaddd      ymm7, ymm7, ymm1
    vpaddd      ymm5, ymm5, ymm1
    vpsrld      ymm7, ymm7, SCALEBITS   ; ymm7=CrOL
    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CrOH
    vpackssdw   ymm7, ymm7, ymm5        ; ymm7=CrO

    ; ---- Even samples: finish Y, and compute Cr ----

    vmovdqa     ymm3, YMMWORD [wk(0)]   ; ymm3=RE

    vmovdqa     ymm4, ymm6
    vpunpcklwd  ymm6, ymm6, ymm2
    vpunpckhwd  ymm4, ymm4, ymm2
    vmovdqa     ymm1, ymm6
    vmovdqa     ymm5, ymm4
    vpmaddwd    ymm6, ymm6, [rel PW_F0114_F0250]  ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
    vpmaddwd    ymm1, ymm1, [rel PW_MF008_MF041]  ; ymm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
    vpmaddwd    ymm5, ymm5, [rel PW_MF008_MF041]  ; ymm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)

    vmovdqa     ymm2, [rel PD_ONEHALF]  ; ymm2=[PD_ONEHALF]

    vpaddd      ymm6, ymm6, YMMWORD [wk(6)]
    vpaddd      ymm4, ymm4, YMMWORD [wk(7)]
    vpaddd      ymm6, ymm6, ymm2
    vpaddd      ymm4, ymm4, ymm2
    vpsrld      ymm6, ymm6, SCALEBITS   ; ymm6=YEL
    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YEH
    vpackssdw   ymm6, ymm6, ymm4        ; ymm6=YE

    vpsllw      ymm0, ymm0, BYTE_BIT
    vpor        ymm6, ymm6, ymm0        ; ymm6=Y
    vmovdqu     YMMWORD [rdi], ymm6     ; Save Y

    vpxor       ymm2, ymm2, ymm2
    vpxor       ymm4, ymm4, ymm4
    vpunpcklwd  ymm2, ymm2, ymm3        ; ymm2=REL
    vpunpckhwd  ymm4, ymm4, ymm3        ; ymm4=REH
    vpsrld      ymm2, ymm2, 1           ; ymm2=REL*FIX(0.500)
    vpsrld      ymm4, ymm4, 1           ; ymm4=REH*FIX(0.500)

    vmovdqa     ymm0, [rel PD_ONEHALFM1_CJ]  ; ymm0=[PD_ONEHALFM1_CJ]

    vpaddd      ymm1, ymm1, ymm2
    vpaddd      ymm5, ymm5, ymm4
    vpaddd      ymm1, ymm1, ymm0
    vpaddd      ymm5, ymm5, ymm0
    vpsrld      ymm1, ymm1, SCALEBITS   ; ymm1=CrEL
    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CrEH
    vpackssdw   ymm1, ymm1, ymm5        ; ymm1=CrE

    vpsllw      ymm7, ymm7, BYTE_BIT
    vpor        ymm1, ymm1, ymm7        ; ymm1=Cr
    vmovdqu     YMMWORD [rdx], ymm1     ; Save Cr

    ; Advance by one full group.  Loop while at least one full group
    ; remains, then take the piecewise-load path for any leftover pixels.
    sub         rcx, byte SIZEOF_YMMWORD
    add         rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; inptr
    add         rdi, byte SIZEOF_YMMWORD           ; outptr0
    add         rbx, byte SIZEOF_YMMWORD           ; outptr1
    add         rdx, byte SIZEOF_YMMWORD           ; outptr2
    cmp         rcx, byte SIZEOF_YMMWORD
    jae         near .columnloop
    test        rcx, rcx
    jnz         near .column_ld1

    pop         rcx                     ; col
    pop         rsi
    pop         rdi
    pop         rbx
    pop         rdx

    ; Step every row-pointer cursor to the next row.
    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
    add         rdi, byte SIZEOF_JSAMPROW
    add         rbx, byte SIZEOF_JSAMPROW
    add         rdx, byte SIZEOF_JSAMPROW
    dec         rax                        ; num_rows
    jg          near .rowloop

.return:
    pop         rbx
    vzeroupper
    UNCOLLECT_ARGS 5
    lea         rsp, [rbp-8]            ; drop wk[] and the alignment padding
    pop         r15
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32