jdcolext-sse2.asm (17352B)
;
; jdcolext.asm - colorspace conversion (64-bit SSE2)
;
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2012, 2016, 2024, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_ycc_rgb_convert_sse2(JDIMENSION out_width, JSAMPIMAGE input_buf,
;                            JDIMENSION input_row, JSAMPARRAY output_buf,
;                            int num_rows)
;

; r10d = JDIMENSION out_width
; r11 = JSAMPIMAGE input_buf
; r12d = JDIMENSION input_row
; r13 = JSAMPARRAY output_buf
; r14d = int num_rows
;
; Notes:
;   * COLLECT_ARGS / UNCOLLECT_ARGS are macros defined outside this file;
;     presumably they place the five arguments in r10-r14 as listed above
;     and restore the registers afterwards.
;   * wk(i) addresses entry i of a WK_NUM-entry xmmword scratch array that
;     lives directly below r15 on the xmmword-aligned stack.
;   * Each pass through .columnloop consumes SIZEOF_XMMWORD samples from each
;     of the three input rows and produces RGB_PIXELSIZE*SIZEOF_XMMWORD output
;     bytes.  The input rows are always read a full xmmword at a time with
;     aligned loads (movdqa), including for the final partial group; only the
;     output stores are trimmed to the remaining column count.
;   * RGB_PIXELSIZE, RGBX_FILLER_0XFF and the xmmA..xmmH register aliases are
;     not defined in this file; presumably they come from jcolsamp.inc and/or
;     the file that includes this one.
;   * Nothing is converted when out_width == 0 or num_rows <= 0.

%define wk(i)   r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
%define WK_NUM  2

    align       32
    GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_sse2)

EXTN(jsimd_ycc_rgb_convert_sse2):
    ENDBR64
    push        rbp
    mov         rbp, rsp
    push        r15
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    ; Allocate stack space for wk array.  r15 is used to access it.
    mov         r15, rsp
    sub         rsp, byte (SIZEOF_XMMWORD * WK_NUM)
    COLLECT_ARGS 5
    push        rbx

    mov         ecx, r10d               ; num_cols
    test        rcx, rcx
    jz          near .return

    push        rcx                     ; rcx is reused below for input_row

    ; rsi/rbx/rdx = address of the input_row-th row pointer in each of the
    ; three component planes of input_buf.
    mov         rdi, r11
    mov         ecx, r12d
    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
    lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]
    lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]
    lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]

    pop         rcx                     ; rcx = num_cols again

    mov         rdi, r13
    mov         eax, r14d               ; num_rows (zero-extended into rax)
    ; num_rows is a signed int.  Test the 32-bit register so that the sign
    ; flag reflects bit 31: after the zero-extending mov above, a 64-bit test
    ; would make a negative count look like a huge positive one.
    test        eax, eax
    jle         near .return
.rowloop:
    ; Save the per-row state; the column loop advances these registers.
    push        rax
    push        rdi
    push        rdx
    push        rbx
    push        rsi
    push        rcx                     ; col

    mov         rsip, JSAMPROW [rsi]    ; inptr0
    mov         rbxp, JSAMPROW [rbx]    ; inptr1
    mov         rdxp, JSAMPROW [rdx]    ; inptr2
    mov         rdip, JSAMPROW [rdi]    ; outptr
.columnloop:

    movdqa      xmm5, XMMWORD [rbx]     ; xmm5=Cb(0123456789ABCDEF)
    movdqa      xmm1, XMMWORD [rdx]     ; xmm1=Cr(0123456789ABCDEF)

    pcmpeqw     xmm4, xmm4
    pcmpeqw     xmm7, xmm7
    psrlw       xmm4, BYTE_BIT
    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
    movdqa      xmm0, xmm4              ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}

    ; Split the chroma bytes into even- and odd-indexed 16-bit words.
    pand        xmm4, xmm5              ; xmm4=Cb(02468ACE)=CbE
    psrlw       xmm5, BYTE_BIT          ; xmm5=Cb(13579BDF)=CbO
    pand        xmm0, xmm1              ; xmm0=Cr(02468ACE)=CrE
    psrlw       xmm1, BYTE_BIT          ; xmm1=Cr(13579BDF)=CrO

    ; Adding 0xFF80 (== -128 as a signed word) re-centers chroma around 0.
    paddw       xmm4, xmm7
    paddw       xmm5, xmm7
    paddw       xmm0, xmm7
    paddw       xmm1, xmm7

    ; (Original)
    ; R = Y                + 1.40200 * Cr
    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
    ; B = Y + 1.77200 * Cb
    ;
    ; (This implementation)
    ; R = Y                + 0.40200 * Cr + Cr
    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
    ; B = Y - 0.22800 * Cb + Cb + Cb

    movdqa      xmm2, xmm4              ; xmm2=CbE
    movdqa      xmm3, xmm5              ; xmm3=CbO
    paddw       xmm4, xmm4              ; xmm4=2*CbE
    paddw       xmm5, xmm5              ; xmm5=2*CbO
    movdqa      xmm6, xmm0              ; xmm6=CrE
    movdqa      xmm7, xmm1              ; xmm7=CrO
    paddw       xmm0, xmm0              ; xmm0=2*CrE
    paddw       xmm1, xmm1              ; xmm1=2*CrO

    pmulhw      xmm4, [rel PW_MF0228]   ; xmm4=(2*CbE * -FIX(0.22800))
    pmulhw      xmm5, [rel PW_MF0228]   ; xmm5=(2*CbO * -FIX(0.22800))
    pmulhw      xmm0, [rel PW_F0402]    ; xmm0=(2*CrE * FIX(0.40200))
    pmulhw      xmm1, [rel PW_F0402]    ; xmm1=(2*CrO * FIX(0.40200))

    ; (x + 1) >> 1: halve the doubled products with rounding.
    paddw       xmm4, [rel PW_ONE]
    paddw       xmm5, [rel PW_ONE]
    psraw       xmm4, 1                 ; xmm4=(CbE * -FIX(0.22800))
    psraw       xmm5, 1                 ; xmm5=(CbO * -FIX(0.22800))
    paddw       xmm0, [rel PW_ONE]
    paddw       xmm1, [rel PW_ONE]
    psraw       xmm0, 1                 ; xmm0=(CrE * FIX(0.40200))
    psraw       xmm1, 1                 ; xmm1=(CrO * FIX(0.40200))

    paddw       xmm4, xmm2
    paddw       xmm5, xmm3
    paddw       xmm4, xmm2              ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
    paddw       xmm5, xmm3              ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
    paddw       xmm0, xmm6              ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
    paddw       xmm1, xmm7              ; xmm1=(CrO * FIX(1.40200))=(R-Y)O

    ; Spill (B-Y) so that xmm4/xmm5 can be reused as temporaries below.
    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=(B-Y)E
    movdqa      XMMWORD [wk(1)], xmm5   ; wk(1)=(B-Y)O

    ; Interleave Cb with Cr so that each pmaddwd yields
    ; Cb * -FIX(0.344) + Cr * FIX(0.285) as a 32-bit sum.
    movdqa      xmm4, xmm2
    movdqa      xmm5, xmm3
    punpcklwd   xmm2, xmm6
    punpckhwd   xmm4, xmm6
    pmaddwd     xmm2, [rel PW_MF0344_F0285]
    pmaddwd     xmm4, [rel PW_MF0344_F0285]
    punpcklwd   xmm3, xmm7
    punpckhwd   xmm5, xmm7
    pmaddwd     xmm3, [rel PW_MF0344_F0285]
    pmaddwd     xmm5, [rel PW_MF0344_F0285]

    ; Round and scale the 32-bit sums back down by SCALEBITS.
    paddd       xmm2, [rel PD_ONEHALF]
    paddd       xmm4, [rel PD_ONEHALF]
    psrad       xmm2, SCALEBITS
    psrad       xmm4, SCALEBITS
    paddd       xmm3, [rel PD_ONEHALF]
    paddd       xmm5, [rel PD_ONEHALF]
    psrad       xmm3, SCALEBITS
    psrad       xmm5, SCALEBITS

    packssdw    xmm2, xmm4     ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
    packssdw    xmm3, xmm5     ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
    psubw       xmm2, xmm6     ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
    psubw       xmm3, xmm7     ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O

    movdqa      xmm5, XMMWORD [rsi]     ; xmm5=Y(0123456789ABCDEF)

    pcmpeqw     xmm4, xmm4
    psrlw       xmm4, BYTE_BIT          ; xmm4={0xFF 0x00 0xFF 0x00 ..}
    pand        xmm4, xmm5              ; xmm4=Y(02468ACE)=YE
    psrlw       xmm5, BYTE_BIT          ; xmm5=Y(13579BDF)=YO

    ; Add luma to each color difference; packuswb saturates to unsigned bytes.
    paddw       xmm0, xmm4            ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
    paddw       xmm1, xmm5            ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
    packuswb    xmm0, xmm0            ; xmm0=R(02468ACE********)
    packuswb    xmm1, xmm1            ; xmm1=R(13579BDF********)

    paddw       xmm2, xmm4            ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
    paddw       xmm3, xmm5            ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
    packuswb    xmm2, xmm2            ; xmm2=G(02468ACE********)
    packuswb    xmm3, xmm3            ; xmm3=G(13579BDF********)

    paddw       xmm4, XMMWORD [wk(0)]  ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
    paddw       xmm5, XMMWORD [wk(1)]  ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
    packuswb    xmm4, xmm4             ; xmm4=B(02468ACE********)
    packuswb    xmm5, xmm5             ; xmm5=B(13579BDF********)

%if RGB_PIXELSIZE == 3  ; ---------------

    ; In the byte diagrams below, the first hex digit appears to be the output
    ; component index within a pixel and the second the pixel index.

    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
    ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)

    punpcklbw   xmmA, xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
    punpcklbw   xmmE, xmmB     ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
    punpcklbw   xmmD, xmmF     ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)

    movdqa      xmmG, xmmA
    movdqa      xmmH, xmmA
    punpcklwd   xmmA, xmmE     ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
    punpckhwd   xmmG, xmmE     ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)

    psrldq      xmmH, 2        ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
    psrldq      xmmE, 2        ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)

    movdqa      xmmC, xmmD
    movdqa      xmmB, xmmD
    punpcklwd   xmmD, xmmH     ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
    punpckhwd   xmmC, xmmH     ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)

    psrldq      xmmB, 2        ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)

    movdqa      xmmF, xmmE
    punpcklwd   xmmE, xmmB     ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
    punpckhwd   xmmF, xmmB     ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)

    pshufd      xmmH, xmmA, 0x4E  ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
    movdqa      xmmB, xmmE
    punpckldq   xmmA, xmmD     ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
    punpckldq   xmmE, xmmH     ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
    punpckhdq   xmmD, xmmB     ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)

    pshufd      xmmH, xmmG, 0x4E  ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
    movdqa      xmmB, xmmF
    punpckldq   xmmG, xmmC     ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
    punpckldq   xmmF, xmmH     ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
    punpckhdq   xmmC, xmmB     ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)

    punpcklqdq  xmmA, xmmE     ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
    punpcklqdq  xmmD, xmmG     ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    punpcklqdq  xmmF, xmmC     ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

    ; Fewer than SIZEOF_XMMWORD columns left: take the partial-store path.
    cmp         rcx, byte SIZEOF_XMMWORD
    jb          short .column_st32

    test        rdi, SIZEOF_XMMWORD-1
    jnz         short .out1
    ; --(aligned)-------------------
    movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
    jmp         short .out0
.out1:  ; --(unaligned)-----------------
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
.out0:
    add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
    sub         rcx, byte SIZEOF_XMMWORD
    jz          near .nextrow

    add         rsi, byte SIZEOF_XMMWORD  ; inptr0
    add         rbx, byte SIZEOF_XMMWORD  ; inptr1
    add         rdx, byte SIZEOF_XMMWORD  ; inptr2
    jmp         near .columnloop

.column_st32:
    ; From here on rcx counts remaining output bytes rather than columns.
    lea         rcx, [rcx+rcx*2]            ; imul ecx, RGB_PIXELSIZE
    cmp         rcx, byte 2*SIZEOF_XMMWORD
    jb          short .column_st16
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
    movdqa      xmmA, xmmF
    sub         rcx, byte 2*SIZEOF_XMMWORD
    jmp         short .column_st15
.column_st16:
    cmp         rcx, byte SIZEOF_XMMWORD
    jb          short .column_st15
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    add         rdi, byte SIZEOF_XMMWORD    ; outptr
    movdqa      xmmA, xmmD
    sub         rcx, byte SIZEOF_XMMWORD
.column_st15:
    ; Store the lower 8 bytes of xmmA to the output when it has enough
    ; space.
    cmp         rcx, byte SIZEOF_MMWORD
    jb          short .column_st7
    movq        XMM_MMWORD [rdi], xmmA
    add         rdi, byte SIZEOF_MMWORD
    sub         rcx, byte SIZEOF_MMWORD
    psrldq      xmmA, SIZEOF_MMWORD
.column_st7:
    ; Store the lower 4 bytes of xmmA to the output when it has enough
    ; space.
    cmp         rcx, byte SIZEOF_DWORD
    jb          short .column_st3
    movd        XMM_DWORD [rdi], xmmA
    add         rdi, byte SIZEOF_DWORD
    sub         rcx, byte SIZEOF_DWORD
    psrldq      xmmA, SIZEOF_DWORD
.column_st3:
    ; Store the lower 2 bytes of rax to the output when it has enough
    ; space.
    movd        eax, xmmA
    cmp         rcx, byte SIZEOF_WORD
    jb          short .column_st1
    mov         word [rdi], ax
    add         rdi, byte SIZEOF_WORD
    sub         rcx, byte SIZEOF_WORD
    shr         rax, 16
.column_st1:
    ; Store the lower 1 byte of rax to the output when it has enough
    ; space.
    test        rcx, rcx
    jz          short .nextrow
    mov         byte [rdi], al

%else  ; RGB_PIXELSIZE == 4 ; -----------

    ; Synthesize the fourth (filler) byte of every pixel: all ones or zero.
%ifdef RGBX_FILLER_0XFF
    pcmpeqb     xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
    pcmpeqb     xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
%else
    pxor        xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
    pxor        xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
%endif
    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
    ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)

    punpcklbw   xmmA, xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
    punpcklbw   xmmE, xmmG     ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
    punpcklbw   xmmB, xmmD     ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
    punpcklbw   xmmF, xmmH     ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)

    movdqa      xmmC, xmmA
    punpcklwd   xmmA, xmmE     ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
    punpckhwd   xmmC, xmmE     ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
    movdqa      xmmG, xmmB
    punpcklwd   xmmB, xmmF     ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
    punpckhwd   xmmG, xmmF     ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)

    movdqa      xmmD, xmmA
    punpckldq   xmmA, xmmB     ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
    punpckhdq   xmmD, xmmB     ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    movdqa      xmmH, xmmC
    punpckldq   xmmC, xmmG     ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
    punpckhdq   xmmH, xmmG     ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

    ; Fewer than SIZEOF_XMMWORD columns left: take the partial-store path.
    cmp         rcx, byte SIZEOF_XMMWORD
    jb          short .column_st32

    test        rdi, SIZEOF_XMMWORD-1
    jnz         short .out1
    ; --(aligned)-------------------
    movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
    movntdq     XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
    jmp         short .out0
.out1:  ; --(unaligned)-----------------
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
    movdqu      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
.out0:
    add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
    sub         rcx, byte SIZEOF_XMMWORD
    jz          near .nextrow

    add         rsi, byte SIZEOF_XMMWORD  ; inptr0
    add         rbx, byte SIZEOF_XMMWORD  ; inptr1
    add         rdx, byte SIZEOF_XMMWORD  ; inptr2
    jmp         near .columnloop

.column_st32:
    ; In this path rcx keeps counting pixels (4 output bytes each).
    cmp         rcx, byte SIZEOF_XMMWORD/2
    jb          short .column_st16
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
    movdqa      xmmA, xmmC
    movdqa      xmmD, xmmH
    sub         rcx, byte SIZEOF_XMMWORD/2
.column_st16:
    cmp         rcx, byte SIZEOF_XMMWORD/4
    jb          short .column_st15
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    add         rdi, byte SIZEOF_XMMWORD    ; outptr
    movdqa      xmmA, xmmD
    sub         rcx, byte SIZEOF_XMMWORD/4
.column_st15:
    ; Store two pixels (8 bytes) of xmmA to the output when it has enough
    ; space.
    cmp         rcx, byte SIZEOF_XMMWORD/8
    jb          short .column_st7
    movq        XMM_MMWORD [rdi], xmmA
    add         rdi, byte SIZEOF_XMMWORD/8*4
    sub         rcx, byte SIZEOF_XMMWORD/8
    psrldq      xmmA, SIZEOF_XMMWORD/8*4
.column_st7:
    ; Store one pixel (4 bytes) of xmmA to the output when it has enough
    ; space.
    test        rcx, rcx
    jz          short .nextrow
    movd        XMM_DWORD [rdi], xmmA

%endif  ; RGB_PIXELSIZE ; ---------------

.nextrow:
    ; Restore the per-row state saved at the top of .rowloop.
    pop         rcx
    pop         rsi
    pop         rbx
    pop         rdx
    pop         rdi
    pop         rax

    ; Step every row-pointer cursor to the next row.
    add         rsi, byte SIZEOF_JSAMPROW
    add         rbx, byte SIZEOF_JSAMPROW
    add         rdx, byte SIZEOF_JSAMPROW
    add         rdi, byte SIZEOF_JSAMPROW  ; output_buf
    dec         rax                        ; num_rows
    jg          near .rowloop

    sfence                              ; flush the write buffer

.return:
    pop         rbx
    UNCOLLECT_ARGS 5
    lea         rsp, [rbp-8]            ; rsp -> saved r15 (undo alignment/wk)
    pop         r15
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32