; jdcolext-sse2.asm (18531B)
;
; jdcolext.asm - colorspace conversion (SSE2)
;
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2012, 2016, 2024, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_ycc_rgb_convert_sse2(JDIMENSION out_width, JSAMPIMAGE input_buf,
;                            JDIMENSION input_row, JSAMPARRAY output_buf,
;                            int num_rows)
;
; Target: 32-bit x86, NASM/Yasm (Intel operand order).  All five arguments
; are read from the stack through the offsets defined below.
;
; The routine returns immediately, storing nothing, when out_width == 0 or
; num_rows <= 0.
;
; Register roles:
;   eax  unaligned frame base while the arguments are fetched, then the
;        remaining-row counter.  Inside a row the counter lives on the
;        stack and eax holds the GOT address; the RGB_PIXELSIZE == 3 tail
;        finally reuses it as scratch.
;   ecx  columns still to convert in the current row
;   esi  cursor into the row-pointer array input_buf[0]; inptr0 (Y) in a row
;   ebx  cursor into the row-pointer array input_buf[1]; inptr1 (Cb) in a row
;   edx  cursor into the row-pointer array input_buf[2]; inptr2 (Cr) in a row
;   edi  cursor into output_buf; outptr inside a row
;   xmm0-xmm7 are all overwritten.  ebp, ebx, esi and edi are saved and
;   restored; ecx and edx are not.
;
; Frame layout (relative to the XMMWORD-aligned ebp):
;   [ebp]      saved unaligned frame base, reloaded by "pop esp" on exit
;   wk(0..1)   two XMMWORD scratch slots directly below ebp
;   gotptr     one pointer-sized slot directly below wk(0)
;
; xmmA..xmmH are register aliases that are not defined in this file
; (presumably supplied by jcolsamp.inc according to the output pixel
; layout -- confirm there).
;
; NOTE(review): the Y/Cb/Cr rows are read with movdqa, one whole XMMWORD per
; iteration, even when fewer than SIZEOF_XMMWORD columns remain.  This
; presumably relies on the caller supplying XMMWORD-aligned rows padded to a
; multiple of SIZEOF_XMMWORD -- confirm against the row allocator.
;

%define out_width(b)   (b) + 8          ; JDIMENSION out_width
%define input_buf(b)   (b) + 12         ; JSAMPIMAGE input_buf
%define input_row(b)   (b) + 16         ; JDIMENSION input_row
%define output_buf(b)  (b) + 20         ; JSAMPARRAY output_buf
%define num_rows(b)    (b) + 24         ; int num_rows

%define original_ebp  ebp + 0
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
                                        ; xmmword wk[WK_NUM]
%define WK_NUM        2
%define gotptr        wk(0) - SIZEOF_POINTER  ; void * gotptr

    align       32
    GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_sse2)

EXTN(jsimd_ycc_rgb_convert_sse2):
    push        ebp
    mov         eax, esp                     ; eax = unaligned frame base
                                             ; ("original ebp": [eax] is the
                                             ; saved ebp, args start at eax+8)
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [esp], eax                   ; remember the unaligned base
    mov         ebp, esp                     ; ebp = aligned ebp
    lea         esp, [wk(0)]                 ; reserve wk[WK_NUM]
    PUSHPIC     eax                     ; make a room for GOT address
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    GET_GOT     ebx                     ; get GOT address
    MOVPIC      POINTER [gotptr], ebx   ; save GOT address

    mov         ecx, JDIMENSION [out_width(eax)]  ; num_cols
    test        ecx, ecx
    jz          near .return

    push        ecx                     ; ecx is borrowed for input_row below

    mov         edi, JSAMPIMAGE [input_buf(eax)]
    mov         ecx, JDIMENSION [input_row(eax)]
    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]
    lea         ebx, [ebx+ecx*SIZEOF_JSAMPROW]
    lea         edx, [edx+ecx*SIZEOF_JSAMPROW]

    pop         ecx                     ; ecx = num_cols again

    mov         edi, JSAMPARRAY [output_buf(eax)]
    mov         eax, INT [num_rows(eax)]
    test        eax, eax
    jle         near .return
    ALIGNX      16, 7
.rowloop:
    ; Save the per-row cursors and counters; .nextrow pops them in reverse.
    push        eax
    push        edi
    push        edx
    push        ebx
    push        esi
    push        ecx                     ; col

    mov         esi, JSAMPROW [esi]     ; inptr0
    mov         ebx, JSAMPROW [ebx]     ; inptr1
    mov         edx, JSAMPROW [edx]     ; inptr2
    mov         edi, JSAMPROW [edi]     ; outptr
    MOVPIC      eax, POINTER [gotptr]   ; load GOT address (eax)
    ALIGNX      16, 7
.columnloop:

    movdqa      xmm5, XMMWORD [ebx]     ; xmm5=Cb(0123456789ABCDEF)
    movdqa      xmm1, XMMWORD [edx]     ; xmm1=Cr(0123456789ABCDEF)

    pcmpeqw     xmm4, xmm4
    pcmpeqw     xmm7, xmm7
    psrlw       xmm4, BYTE_BIT
    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
    movdqa      xmm0, xmm4              ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}

    ; Split the chroma bytes into even and odd columns, widened to words.
    pand        xmm4, xmm5              ; xmm4=Cb(02468ACE)=CbE
    psrlw       xmm5, BYTE_BIT          ; xmm5=Cb(13579BDF)=CbO
    pand        xmm0, xmm1              ; xmm0=Cr(02468ACE)=CrE
    psrlw       xmm1, BYTE_BIT          ; xmm1=Cr(13579BDF)=CrO

    ; Adding 0xFF80 to each word subtracts 128 from every chroma sample.
    paddw       xmm4, xmm7
    paddw       xmm5, xmm7
    paddw       xmm0, xmm7
    paddw       xmm1, xmm7

    ; (Original)
    ; R = Y                + 1.40200 * Cr
    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
    ; B = Y + 1.77200 * Cb
    ;
    ; (This implementation)
    ; R = Y                + 0.40200 * Cr + Cr
    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
    ; B = Y - 0.22800 * Cb + Cb + Cb

    movdqa      xmm2, xmm4              ; xmm2=CbE
    movdqa      xmm3, xmm5              ; xmm3=CbO
    paddw       xmm4, xmm4              ; xmm4=2*CbE
    paddw       xmm5, xmm5              ; xmm5=2*CbO
    movdqa      xmm6, xmm0              ; xmm6=CrE
    movdqa      xmm7, xmm1              ; xmm7=CrO
    paddw       xmm0, xmm0              ; xmm0=2*CrE
    paddw       xmm1, xmm1              ; xmm1=2*CrO

    pmulhw      xmm4, [GOTOFF(eax,PW_MF0228)]  ; xmm4=(2*CbE * -FIX(0.22800))
    pmulhw      xmm5, [GOTOFF(eax,PW_MF0228)]  ; xmm5=(2*CbO * -FIX(0.22800))
    pmulhw      xmm0, [GOTOFF(eax,PW_F0402)]   ; xmm0=(2*CrE * FIX(0.40200))
    pmulhw      xmm1, [GOTOFF(eax,PW_F0402)]   ; xmm1=(2*CrO * FIX(0.40200))

    ; Add one and shift right by one: halves the doubled products with
    ; rounding.
    paddw       xmm4, [GOTOFF(eax,PW_ONE)]
    paddw       xmm5, [GOTOFF(eax,PW_ONE)]
    psraw       xmm4, 1                 ; xmm4=(CbE * -FIX(0.22800))
    psraw       xmm5, 1                 ; xmm5=(CbO * -FIX(0.22800))
    paddw       xmm0, [GOTOFF(eax,PW_ONE)]
    paddw       xmm1, [GOTOFF(eax,PW_ONE)]
    psraw       xmm0, 1                 ; xmm0=(CrE * FIX(0.40200))
    psraw       xmm1, 1                 ; xmm1=(CrO * FIX(0.40200))

    paddw       xmm4, xmm2
    paddw       xmm5, xmm3
    paddw       xmm4, xmm2              ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
    paddw       xmm5, xmm3              ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
    paddw       xmm0, xmm6              ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
    paddw       xmm1, xmm7              ; xmm1=(CrO * FIX(1.40200))=(R-Y)O

    ; Park (B-Y) in the scratch slots; xmm4/xmm5 are needed for the green
    ; term and for Y.
    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=(B-Y)E
    movdqa      XMMWORD [wk(1)], xmm5   ; wk(1)=(B-Y)O

    ; Interleave Cb with Cr so that each pmaddwd yields one dword per column.
    movdqa      xmm4, xmm2
    movdqa      xmm5, xmm3
    punpcklwd   xmm2, xmm6
    punpckhwd   xmm4, xmm6
    pmaddwd     xmm2, [GOTOFF(eax,PW_MF0344_F0285)]
    pmaddwd     xmm4, [GOTOFF(eax,PW_MF0344_F0285)]
    punpcklwd   xmm3, xmm7
    punpckhwd   xmm5, xmm7
    pmaddwd     xmm3, [GOTOFF(eax,PW_MF0344_F0285)]
    pmaddwd     xmm5, [GOTOFF(eax,PW_MF0344_F0285)]

    paddd       xmm2, [GOTOFF(eax,PD_ONEHALF)]
    paddd       xmm4, [GOTOFF(eax,PD_ONEHALF)]
    psrad       xmm2, SCALEBITS
    psrad       xmm4, SCALEBITS
    paddd       xmm3, [GOTOFF(eax,PD_ONEHALF)]
    paddd       xmm5, [GOTOFF(eax,PD_ONEHALF)]
    psrad       xmm3, SCALEBITS
    psrad       xmm5, SCALEBITS

    packssdw    xmm2, xmm4     ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
    packssdw    xmm3, xmm5     ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
    psubw       xmm2, xmm6     ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
    psubw       xmm3, xmm7     ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O

    movdqa      xmm5, XMMWORD [esi]     ; xmm5=Y(0123456789ABCDEF)

    pcmpeqw     xmm4, xmm4
    psrlw       xmm4, BYTE_BIT          ; xmm4={0xFF 0x00 0xFF 0x00 ..}
    pand        xmm4, xmm5              ; xmm4=Y(02468ACE)=YE
    psrlw       xmm5, BYTE_BIT          ; xmm5=Y(13579BDF)=YO

    ; packuswb saturates each word sum to an unsigned byte.
    paddw       xmm0, xmm4              ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
    paddw       xmm1, xmm5              ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
    packuswb    xmm0, xmm0              ; xmm0=R(02468ACE********)
    packuswb    xmm1, xmm1              ; xmm1=R(13579BDF********)

    paddw       xmm2, xmm4              ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
    paddw       xmm3, xmm5              ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
    packuswb    xmm2, xmm2              ; xmm2=G(02468ACE********)
    packuswb    xmm3, xmm3              ; xmm3=G(13579BDF********)

    paddw       xmm4, XMMWORD [wk(0)]   ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
    paddw       xmm5, XMMWORD [wk(1)]   ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
    packuswb    xmm4, xmm4              ; xmm4=B(02468ACE********)
    packuswb    xmm5, xmm5              ; xmm5=B(13579BDF********)

%if RGB_PIXELSIZE == 3  ; ---------------

    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
    ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)

    punpcklbw   xmmA, xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
    punpcklbw   xmmE, xmmB     ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
    punpcklbw   xmmD, xmmF     ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)

    movdqa      xmmG, xmmA
    movdqa      xmmH, xmmA
    punpcklwd   xmmA, xmmE     ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
    punpckhwd   xmmG, xmmE     ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)

    psrldq      xmmH, 2        ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
    psrldq      xmmE, 2        ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)

    movdqa      xmmC, xmmD
    movdqa      xmmB, xmmD
    punpcklwd   xmmD, xmmH     ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
    punpckhwd   xmmC, xmmH     ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)

    psrldq      xmmB, 2        ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)

    movdqa      xmmF, xmmE
    punpcklwd   xmmE, xmmB     ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
    punpckhwd   xmmF, xmmB     ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)

    pshufd      xmmH, xmmA, 0x4E  ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
    movdqa      xmmB, xmmE
    punpckldq   xmmA, xmmD     ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
    punpckldq   xmmE, xmmH     ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
    punpckhdq   xmmD, xmmB     ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)

    pshufd      xmmH, xmmG, 0x4E  ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
    movdqa      xmmB, xmmF
    punpckldq   xmmG, xmmC     ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
    punpckldq   xmmF, xmmH     ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
    punpckhdq   xmmC, xmmB     ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)

    punpcklqdq  xmmA, xmmE     ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
    punpcklqdq  xmmD, xmmG     ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    punpcklqdq  xmmF, xmmC     ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

    ; Fewer than SIZEOF_XMMWORD columns left: take the partial-store path.
    cmp         ecx, byte SIZEOF_XMMWORD
    jb          short .column_st32

    ; Full group: non-temporal stores need an XMMWORD-aligned outptr.
    test        edi, SIZEOF_XMMWORD-1
    jnz         short .out1
    ; --(aligned)-------------------
    movntdq     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
    movntdq     XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
    movntdq     XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
    jmp         short .out0
.out1:  ; --(unaligned)-----------------
    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
    movdqu      XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
.out0:
    add         edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
    sub         ecx, byte SIZEOF_XMMWORD
    jz          near .nextrow

    add         esi, byte SIZEOF_XMMWORD  ; inptr0
    add         ebx, byte SIZEOF_XMMWORD  ; inptr1
    add         edx, byte SIZEOF_XMMWORD  ; inptr2
    jmp         near .columnloop
    ALIGNX      16, 7

.column_st32:
    ; From here on ecx counts output bytes, not columns.
    lea         ecx, [ecx+ecx*2]            ; imul ecx, RGB_PIXELSIZE
    cmp         ecx, byte 2*SIZEOF_XMMWORD
    jb          short .column_st16
    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
    add         edi, byte 2*SIZEOF_XMMWORD  ; outptr
    movdqa      xmmA, xmmF                  ; remaining bytes now sit in xmmA
    sub         ecx, byte 2*SIZEOF_XMMWORD
    jmp         short .column_st15
.column_st16:
    cmp         ecx, byte SIZEOF_XMMWORD
    jb          short .column_st15
    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
    add         edi, byte SIZEOF_XMMWORD    ; outptr
    movdqa      xmmA, xmmD                  ; remaining bytes now sit in xmmA
    sub         ecx, byte SIZEOF_XMMWORD
.column_st15:
    ; Store the lower 8 bytes of xmmA to the output when it has enough
    ; space.
    cmp         ecx, byte SIZEOF_MMWORD
    jb          short .column_st7
    movq        XMM_MMWORD [edi], xmmA
    add         edi, byte SIZEOF_MMWORD
    sub         ecx, byte SIZEOF_MMWORD
    psrldq      xmmA, SIZEOF_MMWORD
.column_st7:
    ; Store the lower 4 bytes of xmmA to the output when it has enough
    ; space.
    cmp         ecx, byte SIZEOF_DWORD
    jb          short .column_st3
    movd        XMM_DWORD [edi], xmmA
    add         edi, byte SIZEOF_DWORD
    sub         ecx, byte SIZEOF_DWORD
    psrldq      xmmA, SIZEOF_DWORD
.column_st3:
    ; Store the lower 2 bytes of eax to the output when it has enough
    ; space.
    ; (eax stops holding the GOT address here; nothing below reads it as
    ; such, and .rowloop reloads it for the next row.)
    movd        eax, xmmA
    cmp         ecx, byte SIZEOF_WORD
    jb          short .column_st1
    mov         word [edi], ax
    add         edi, byte SIZEOF_WORD
    sub         ecx, byte SIZEOF_WORD
    shr         eax, 16
.column_st1:
    ; Store the lower 1 byte of eax to the output when it has enough
    ; space.
    test        ecx, ecx
    jz          short .nextrow
    mov         byte [edi], al

%else  ; RGB_PIXELSIZE == 4 ; -----------

%ifdef RGBX_FILLER_0XFF
    pcmpeqb     xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
    pcmpeqb     xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
%else
    pxor        xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
    pxor        xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
%endif
    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
    ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)

    punpcklbw   xmmA, xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
    punpcklbw   xmmE, xmmG     ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
    punpcklbw   xmmB, xmmD     ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
    punpcklbw   xmmF, xmmH     ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)

    movdqa      xmmC, xmmA
    punpcklwd   xmmA, xmmE     ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
    punpckhwd   xmmC, xmmE     ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
    movdqa      xmmG, xmmB
    punpcklwd   xmmB, xmmF     ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
    punpckhwd   xmmG, xmmF     ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)

    movdqa      xmmD, xmmA
    punpckldq   xmmA, xmmB     ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
    punpckhdq   xmmD, xmmB     ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    movdqa      xmmH, xmmC
    punpckldq   xmmC, xmmG     ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
    punpckhdq   xmmH, xmmG     ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

    ; Fewer than SIZEOF_XMMWORD columns left: take the partial-store path.
    cmp         ecx, byte SIZEOF_XMMWORD
    jb          short .column_st32

    ; Full group: non-temporal stores need an XMMWORD-aligned outptr.
    test        edi, SIZEOF_XMMWORD-1
    jnz         short .out1
    ; --(aligned)-------------------
    movntdq     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
    movntdq     XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
    movntdq     XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
    movntdq     XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
    jmp         short .out0
.out1:  ; --(unaligned)-----------------
    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
    movdqu      XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
    movdqu      XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
.out0:
    add         edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
    sub         ecx, byte SIZEOF_XMMWORD
    jz          near .nextrow

    add         esi, byte SIZEOF_XMMWORD  ; inptr0
    add         ebx, byte SIZEOF_XMMWORD  ; inptr1
    add         edx, byte SIZEOF_XMMWORD  ; inptr2
    jmp         near .columnloop
    ALIGNX      16, 7

.column_st32:
    ; In this branch ecx keeps counting pixels; each XMMWORD holds
    ; SIZEOF_XMMWORD/4 of them.
    cmp         ecx, byte SIZEOF_XMMWORD/2
    jb          short .column_st16
    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
    add         edi, byte 2*SIZEOF_XMMWORD  ; outptr
    movdqa      xmmA, xmmC
    movdqa      xmmD, xmmH
    sub         ecx, byte SIZEOF_XMMWORD/2
.column_st16:
    cmp         ecx, byte SIZEOF_XMMWORD/4
    jb          short .column_st15
    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
    add         edi, byte SIZEOF_XMMWORD    ; outptr
    movdqa      xmmA, xmmD
    sub         ecx, byte SIZEOF_XMMWORD/4
.column_st15:
    ; Store two pixels (8 bytes) of xmmA to the output when it has enough
    ; space.
    cmp         ecx, byte SIZEOF_XMMWORD/8
    jb          short .column_st7
    movq        XMM_MMWORD [edi], xmmA
    add         edi, byte SIZEOF_XMMWORD/8*4
    sub         ecx, byte SIZEOF_XMMWORD/8
    psrldq      xmmA, SIZEOF_XMMWORD/8*4
.column_st7:
    ; Store one pixel (4 bytes) of xmmA to the output when it has enough
    ; space.
    test        ecx, ecx
    jz          short .nextrow
    movd        XMM_DWORD [edi], xmmA

%endif  ; RGB_PIXELSIZE ; ---------------

    ALIGNX      16, 7

.nextrow:
    ; Restore what .rowloop pushed, in reverse order.
    pop         ecx
    pop         esi
    pop         ebx
    pop         edx
    pop         edi
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW
    add         ebx, byte SIZEOF_JSAMPROW
    add         edx, byte SIZEOF_JSAMPROW
    add         edi, byte SIZEOF_JSAMPROW  ; output_buf
    dec         eax                        ; num_rows
    jg          near .rowloop

    sfence                              ; flush the write buffer

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- original ebp
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32