jccolext-sse2.asm (17806B)
;
; jccolext.asm - colorspace conversion (64-bit SSE2)
;
; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_ycc_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
;                            int num_rows);
;
; Each input row holds img_width pixels of RGB_PIXELSIZE (3 or 4) bytes.
; For every row, one Y row, one Cb row and one Cr row are written, taken
; from output_buf[0..2] starting at index output_row.
;
; Processing is done in groups of SIZEOF_XMMWORD (16) pixels.  A final
; partial group is loaded piecewise so that no load reaches past the last
; input pixel, but it is still *stored* as a full XMMWORD.  Consequently:
;   - every output row must have room for img_width rounded up to a
;     multiple of SIZEOF_XMMWORD samples, and
;   - every output row pointer must be 16-byte aligned (stores use movdqa).
; NOTE(review): both properties are assumed to be guaranteed by the caller's
; buffer allocator; nothing in this file checks them.
;
; Nothing is done if img_width == 0 or num_rows <= 0.
;

; r10d = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
; r13d = JDIMENSION output_row
; r14d = int num_rows
;
; Register roles inside the loops:
;   rax = rows remaining         rcx = columns (pixels) remaining in row
;   rsi = input_buf / inptr      rdi = output_buf[0] / outptr0 (Y)
;   rbx = output_buf[1] / outptr1 (Cb)
;   rdx = output_buf[2] / outptr2 (Cr)
;   r15 = end of the aligned wk[] scratch array (see wk(i) below)

%define wk(i)   r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
%define WK_NUM  8

    align       32
    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_sse2)

EXTN(jsimd_rgb_ycc_convert_sse2):
    ENDBR64
    push        rbp
    mov         rbp, rsp
    push        r15                     ; saved at [rbp-8]; restored in .return
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    ; Allocate stack space for wk array.  r15 is used to access it.
    ; r15 marks the (aligned) top of the array; wk(i) indexes downward from
    ; it, so all wk() accesses may use movdqa.
    mov         r15, rsp
    sub         rsp, (SIZEOF_XMMWORD * WK_NUM)
    COLLECT_ARGS 5
    push        rbx

    mov         ecx, r10d               ; rcx = img_width (zero-extended)
    test        rcx, rcx
    jz          near .return            ; nothing to do for empty rows

    push        rcx                     ; rcx is borrowed for output_row below

    mov         rsi, r12
    mov         ecx, r13d
    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
    mov         rbxp, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
    mov         rdxp, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]  ; &output_buf[0][output_row]
    lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]  ; &output_buf[1][output_row]
    lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]  ; &output_buf[2][output_row]

    pop         rcx                     ; rcx = img_width again

    mov         rsi, r11
    mov         eax, r14d
    ; num_rows is a signed int.  The 32-bit mov above zero-extends into rax,
    ; so the sign has to be tested on eax; testing rax would never see a set
    ; sign bit and a negative count would be treated as ~4 billion rows.
    ; After this check rax is in [1, INT_MAX], so the 64-bit dec/jg at the
    ; bottom of the row loop is correct.
    test        eax, eax
    jle         near .return
.rowloop:
    ; Save the per-row array cursors and the column count; the registers are
    ; reused below for the pointers to the actual sample rows.
    push        rdx
    push        rbx
    push        rdi
    push        rsi
    push        rcx                     ; col

    mov         rsip, JSAMPROW [rsi]    ; inptr
    mov         rdip, JSAMPROW [rdi]    ; outptr0
    mov         rbxp, JSAMPROW [rbx]    ; outptr1
    mov         rdxp, JSAMPROW [rdx]    ; outptr2

    cmp         rcx, byte SIZEOF_XMMWORD
    jae         near .columnloop        ; at least one full group of 16 pixels

%if RGB_PIXELSIZE == 3  ; ---------------

    ; Partial group (1..15 pixels, i.e. 3..45 bytes).  The byte count is
    ; decomposed by its bits into 1/2/4/8/16/32-byte pieces, each loaded from
    ; its own offset, so the loads exactly tile the remaining input bytes.
    ; Register lanes not covered by a piece hold unspecified data.
.column_ld1:
    push        rax                     ; rax (row counter) and rdx (outptr2)
    push        rdx                     ;   are used as scratch below
    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
    test        cl, SIZEOF_BYTE
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_BYTE
    movzx       rax, byte [rsi+rcx]
.column_ld2:
    test        cl, SIZEOF_WORD
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_WORD
    movzx       rdx, word [rsi+rcx]
    shl         rax, WORD_BIT
    or          rax, rdx
.column_ld4:
    movd        xmmA, eax
    pop         rdx
    pop         rax
    test        cl, SIZEOF_DWORD
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_DWORD
    movd        xmmF, XMM_DWORD [rsi+rcx]
    pslldq      xmmA, SIZEOF_DWORD
    por         xmmA, xmmF
.column_ld8:
    test        cl, SIZEOF_MMWORD
    jz          short .column_ld16
    sub         rcx, byte SIZEOF_MMWORD
    movq        xmmB, XMM_MMWORD [rsi+rcx]
    pslldq      xmmA, SIZEOF_MMWORD
    por         xmmA, xmmB
.column_ld16:
    test        cl, SIZEOF_XMMWORD
    jz          short .column_ld32
    movdqa      xmmF, xmmA
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    ; Pretend a full group is pending so the loop tail subtracts down to 0.
    mov         rcx, SIZEOF_XMMWORD
    jmp         short .rgb_ycc_cnv
.column_ld32:
    test        cl, 2*SIZEOF_XMMWORD
    mov         rcx, SIZEOF_XMMWORD     ; mov leaves the flags from test intact
    jz          short .rgb_ycc_cnv
    movdqa      xmmB, xmmA
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    jmp         short .rgb_ycc_cnv

.columnloop:
    ; Full group: 16 pixels * 3 bytes = 3 XMMWORDs (unaligned loads).
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    movdqu      xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]

.rgb_ycc_cnv:
    ; In the lane diagrams below each entry is <component><pixel>: the first
    ; digit is the byte index within the pixel, the second the pixel number.
    ;
    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

    movdqa      xmmG, xmmA
    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
    psrldq      xmmG, 8     ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmF  ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
    pslldq      xmmF, 8     ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)

    punpcklbw   xmmG, xmmB  ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
    punpckhbw   xmmF, xmmB  ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)

    movdqa      xmmD, xmmA
    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
    psrldq      xmmD, 8     ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmG  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
    pslldq      xmmG, 8     ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)

    punpcklbw   xmmD, xmmF  ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
    punpckhbw   xmmG, xmmF  ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)

    movdqa      xmmE, xmmA
    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
    psrldq      xmmE, 8     ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    pslldq      xmmD, 8     ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)

    punpcklbw   xmmE, xmmG  ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
    punpckhbw   xmmD, xmmG  ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)

    ; Widen the de-interleaved bytes to 16-bit words (zero-extended).
    pxor        xmmH, xmmH

    movdqa      xmmC, xmmA
    punpcklbw   xmmA, xmmH  ; xmmA=(00 02 04 06 08 0A 0C 0E)
    punpckhbw   xmmC, xmmH  ; xmmC=(10 12 14 16 18 1A 1C 1E)

    movdqa      xmmB, xmmE
    punpcklbw   xmmE, xmmH  ; xmmE=(20 22 24 26 28 2A 2C 2E)
    punpckhbw   xmmB, xmmH  ; xmmB=(01 03 05 07 09 0B 0D 0F)

    movdqa      xmmF, xmmD
    punpcklbw   xmmD, xmmH  ; xmmD=(11 13 15 17 19 1B 1D 1F)
    punpckhbw   xmmF, xmmH  ; xmmF=(21 23 25 27 29 2B 2D 2F)

%else  ; RGB_PIXELSIZE == 4 ; -----------

    ; Partial group (1..15 pixels).  The pixel count is decomposed by its
    ; bits into 1/2/4/8-pixel pieces, each loaded from its own offset, so the
    ; loads exactly tile the remaining input pixels.  Register lanes not
    ; covered by a piece hold unspecified data.
.column_ld1:
    test        cl, SIZEOF_XMMWORD/16
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_XMMWORD/16
    movd        xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
    test        cl, SIZEOF_XMMWORD/8
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_XMMWORD/8
    movq        xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
    pslldq      xmmA, SIZEOF_MMWORD
    por         xmmA, xmmE
.column_ld4:
    test        cl, SIZEOF_XMMWORD/4
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_XMMWORD/4
    movdqa      xmmE, xmmA
    movdqu      xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8:
    test        cl, SIZEOF_XMMWORD/2
    ; Pretend a full group is pending so the loop tail subtracts down to 0.
    mov         rcx, SIZEOF_XMMWORD     ; mov leaves the flags from test intact
    jz          short .rgb_ycc_cnv
    movdqa      xmmF, xmmA
    movdqa      xmmH, xmmE
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    jmp         short .rgb_ycc_cnv

.columnloop:
    ; Full group: 16 pixels * 4 bytes = 4 XMMWORDs (unaligned loads).
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
    movdqu      xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]

.rgb_ycc_cnv:
    ; In the lane diagrams below each entry is <component><pixel>: the first
    ; digit is the byte index within the pixel, the second the pixel number.
    ;
    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

    movdqa      xmmD, xmmA
    punpcklbw   xmmA, xmmE  ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
    punpckhbw   xmmD, xmmE  ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)

    movdqa      xmmC, xmmF
    punpcklbw   xmmF, xmmH  ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
    punpckhbw   xmmC, xmmH  ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)

    movdqa      xmmB, xmmA
    punpcklwd   xmmA, xmmF  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
    punpckhwd   xmmB, xmmF  ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)

    movdqa      xmmG, xmmD
    punpcklwd   xmmD, xmmC  ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
    punpckhwd   xmmG, xmmC  ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)

    movdqa      xmmE, xmmA
    punpcklbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    punpckhbw   xmmE, xmmD  ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)

    movdqa      xmmH, xmmB
    punpcklbw   xmmB, xmmG  ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
    punpckhbw   xmmH, xmmG  ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)

    ; Widen the de-interleaved bytes to 16-bit words (zero-extended).
    pxor        xmmF, xmmF

    movdqa      xmmC, xmmA
    punpcklbw   xmmA, xmmF  ; xmmA=(00 02 04 06 08 0A 0C 0E)
    punpckhbw   xmmC, xmmF  ; xmmC=(10 12 14 16 18 1A 1C 1E)

    movdqa      xmmD, xmmB
    punpcklbw   xmmB, xmmF  ; xmmB=(01 03 05 07 09 0B 0D 0F)
    punpckhbw   xmmD, xmmF  ; xmmD=(11 13 15 17 19 1B 1D 1F)

    movdqa      xmmG, xmmE
    punpcklbw   xmmE, xmmF  ; xmmE=(20 22 24 26 28 2A 2C 2E)
    punpckhbw   xmmG, xmmF  ; xmmG=(30 32 34 36 38 3A 3C 3E)

    ; xmmF (the zero register) is itself a destination here, so the data
    ; bytes are first placed in the high byte of each word and then shifted
    ; down, which zero-extends them without needing another zero register.
    punpcklbw   xmmF, xmmH
    punpckhbw   xmmH, xmmH
    psrlw       xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)
    psrlw       xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)

%endif  ; RGB_PIXELSIZE ; ---------------

    ; From here on the physical register names are used.
    ; NOTE(review): xmmA..xmmH are presumably aliases (from the include file)
    ; chosen per pixel layout so that the R/G/B planes land as listed below.
    ;
    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO

    ; (Original)
    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    ;
    ; (This implementation)
    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    ;
    ; Suffixes: E/O = even/odd pixels, L/H = low/high four words of a
    ; register.  The 0.5 coefficient is applied by placing the sample in the
    ; upper word of a dword (zero-interleave) and shifting right by one.

    ; Spill the planes that get overwritten before their last use.
    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=RE
    movdqa      XMMWORD [wk(1)], xmm1   ; wk(1)=RO
    movdqa      XMMWORD [wk(2)], xmm4   ; wk(2)=BE
    movdqa      XMMWORD [wk(3)], xmm5   ; wk(3)=BO

    ; ---- Odd pixels: R/G partial sums for Y and Cb, then Cb ----

    movdqa      xmm6, xmm1
    punpcklwd   xmm1, xmm3
    punpckhwd   xmm6, xmm3
    movdqa      xmm7, xmm1
    movdqa      xmm4, xmm6
    pmaddwd     xmm1, [rel PW_F0299_F0337]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
    pmaddwd     xmm7, [rel PW_MF016_MF033]  ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
    pmaddwd     xmm4, [rel PW_MF016_MF033]  ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)

    movdqa      XMMWORD [wk(4)], xmm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
    movdqa      XMMWORD [wk(5)], xmm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)

    pxor        xmm1, xmm1
    pxor        xmm6, xmm6
    punpcklwd   xmm1, xmm5              ; xmm1=BOL
    punpckhwd   xmm6, xmm5              ; xmm6=BOH
    psrld       xmm1, 1                 ; xmm1=BOL*FIX(0.500)
    psrld       xmm6, 1                 ; xmm6=BOH*FIX(0.500)

    movdqa      xmm5, [rel PD_ONEHALFM1_CJ]  ; xmm5=[PD_ONEHALFM1_CJ]

    paddd       xmm7, xmm1
    paddd       xmm4, xmm6
    paddd       xmm7, xmm5
    paddd       xmm4, xmm5
    psrld       xmm7, SCALEBITS         ; xmm7=CbOL
    psrld       xmm4, SCALEBITS         ; xmm4=CbOH
    packssdw    xmm7, xmm4              ; xmm7=CbO

    ; ---- Even pixels: R/G partial sums for Y and Cb, then Cb ----

    movdqa      xmm1, XMMWORD [wk(2)]   ; xmm1=BE

    movdqa      xmm6, xmm0
    punpcklwd   xmm0, xmm2
    punpckhwd   xmm6, xmm2
    movdqa      xmm5, xmm0
    movdqa      xmm4, xmm6
    pmaddwd     xmm0, [rel PW_F0299_F0337]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
    pmaddwd     xmm5, [rel PW_MF016_MF033]  ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
    pmaddwd     xmm4, [rel PW_MF016_MF033]  ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)

    movdqa      XMMWORD [wk(6)], xmm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
    movdqa      XMMWORD [wk(7)], xmm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)

    pxor        xmm0, xmm0
    pxor        xmm6, xmm6
    punpcklwd   xmm0, xmm1              ; xmm0=BEL
    punpckhwd   xmm6, xmm1              ; xmm6=BEH
    psrld       xmm0, 1                 ; xmm0=BEL*FIX(0.500)
    psrld       xmm6, 1                 ; xmm6=BEH*FIX(0.500)

    movdqa      xmm1, [rel PD_ONEHALFM1_CJ]  ; xmm1=[PD_ONEHALFM1_CJ]

    paddd       xmm5, xmm0
    paddd       xmm4, xmm6
    paddd       xmm5, xmm1
    paddd       xmm4, xmm1
    psrld       xmm5, SCALEBITS         ; xmm5=CbEL
    psrld       xmm4, SCALEBITS         ; xmm4=CbEH
    packssdw    xmm5, xmm4              ; xmm5=CbE

    ; Re-interleave: even samples in the low byte of each word, odd samples
    ; shifted into the high byte, giving 16 consecutive output bytes.
    psllw       xmm7, BYTE_BIT
    por         xmm5, xmm7              ; xmm5=Cb
    movdqa      XMMWORD [rbx], xmm5     ; Save Cb

    ; ---- Odd pixels: B/G partial sums, then Y and Cr ----

    movdqa      xmm0, XMMWORD [wk(3)]   ; xmm0=BO
    movdqa      xmm6, XMMWORD [wk(2)]   ; xmm6=BE
    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=RO

    movdqa      xmm4, xmm0
    punpcklwd   xmm0, xmm3
    punpckhwd   xmm4, xmm3
    movdqa      xmm7, xmm0
    movdqa      xmm5, xmm4
    pmaddwd     xmm0, [rel PW_F0114_F0250]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
    pmaddwd     xmm7, [rel PW_MF008_MF041]  ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
    pmaddwd     xmm5, [rel PW_MF008_MF041]  ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)

    movdqa      xmm3, [rel PD_ONEHALF]  ; xmm3=[PD_ONEHALF]

    paddd       xmm0, XMMWORD [wk(4)]
    paddd       xmm4, XMMWORD [wk(5)]
    paddd       xmm0, xmm3
    paddd       xmm4, xmm3
    psrld       xmm0, SCALEBITS         ; xmm0=YOL
    psrld       xmm4, SCALEBITS         ; xmm4=YOH
    packssdw    xmm0, xmm4              ; xmm0=YO

    pxor        xmm3, xmm3
    pxor        xmm4, xmm4
    punpcklwd   xmm3, xmm1              ; xmm3=ROL
    punpckhwd   xmm4, xmm1              ; xmm4=ROH
    psrld       xmm3, 1                 ; xmm3=ROL*FIX(0.500)
    psrld       xmm4, 1                 ; xmm4=ROH*FIX(0.500)

    movdqa      xmm1, [rel PD_ONEHALFM1_CJ]  ; xmm1=[PD_ONEHALFM1_CJ]

    paddd       xmm7, xmm3
    paddd       xmm5, xmm4
    paddd       xmm7, xmm1
    paddd       xmm5, xmm1
    psrld       xmm7, SCALEBITS         ; xmm7=CrOL
    psrld       xmm5, SCALEBITS         ; xmm5=CrOH
    packssdw    xmm7, xmm5              ; xmm7=CrO

    ; ---- Even pixels: B/G partial sums, then Y and Cr ----

    movdqa      xmm3, XMMWORD [wk(0)]   ; xmm3=RE

    movdqa      xmm4, xmm6
    punpcklwd   xmm6, xmm2
    punpckhwd   xmm4, xmm2
    movdqa      xmm1, xmm6
    movdqa      xmm5, xmm4
    pmaddwd     xmm6, [rel PW_F0114_F0250]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
    pmaddwd     xmm1, [rel PW_MF008_MF041]  ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
    pmaddwd     xmm5, [rel PW_MF008_MF041]  ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)

    movdqa      xmm2, [rel PD_ONEHALF]      ; xmm2=[PD_ONEHALF]

    paddd       xmm6, XMMWORD [wk(6)]
    paddd       xmm4, XMMWORD [wk(7)]
    paddd       xmm6, xmm2
    paddd       xmm4, xmm2
    psrld       xmm6, SCALEBITS         ; xmm6=YEL
    psrld       xmm4, SCALEBITS         ; xmm4=YEH
    packssdw    xmm6, xmm4              ; xmm6=YE

    psllw       xmm0, BYTE_BIT
    por         xmm6, xmm0              ; xmm6=Y
    movdqa      XMMWORD [rdi], xmm6     ; Save Y

    pxor        xmm2, xmm2
    pxor        xmm4, xmm4
    punpcklwd   xmm2, xmm3              ; xmm2=REL
    punpckhwd   xmm4, xmm3              ; xmm4=REH
    psrld       xmm2, 1                 ; xmm2=REL*FIX(0.500)
    psrld       xmm4, 1                 ; xmm4=REH*FIX(0.500)

    movdqa      xmm0, [rel PD_ONEHALFM1_CJ]  ; xmm0=[PD_ONEHALFM1_CJ]

    paddd       xmm1, xmm2
    paddd       xmm5, xmm4
    paddd       xmm1, xmm0
    paddd       xmm5, xmm0
    psrld       xmm1, SCALEBITS         ; xmm1=CrEL
    psrld       xmm5, SCALEBITS         ; xmm5=CrEH
    packssdw    xmm1, xmm5              ; xmm1=CrE

    psllw       xmm7, BYTE_BIT
    por         xmm1, xmm7              ; xmm1=Cr
    movdqa      XMMWORD [rdx], xmm1     ; Save Cr

    ; Advance by one group of 16 pixels.  Loop while a full group remains;
    ; otherwise handle a non-empty remainder through the piecewise loader.
    sub         rcx, byte SIZEOF_XMMWORD
    add         rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
    add         rdi, byte SIZEOF_XMMWORD                ; outptr0
    add         rbx, byte SIZEOF_XMMWORD                ; outptr1
    add         rdx, byte SIZEOF_XMMWORD                ; outptr2
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         near .columnloop
    test        rcx, rcx
    jnz         near .column_ld1

    ; Row finished: restore the array cursors and step them to the next row.
    pop         rcx                     ; col
    pop         rsi
    pop         rdi
    pop         rbx
    pop         rdx

    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
    add         rdi, byte SIZEOF_JSAMPROW
    add         rbx, byte SIZEOF_JSAMPROW
    add         rdx, byte SIZEOF_JSAMPROW
    dec         rax                        ; num_rows
    jg          near .rowloop

.return:
    pop         rbx
    UNCOLLECT_ARGS 5
    lea         rsp, [rbp-8]            ; discard wk[] and alignment padding;
                                        ;   rsp -> saved r15
    pop         r15
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32