jdmrgext-sse2.asm (20408B)
;
; jdmrgext.asm - merged upsampling/color conversion (64-bit SSE2)
;
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2012, 2016, 2024, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
;
; GLOBAL(void)
; jsimd_h2v1_merged_upsample_sse2(JDIMENSION output_width,
;                                 JSAMPIMAGE input_buf,
;                                 JDIMENSION in_row_group_ctr,
;                                 JSAMPARRAY output_buf);
;
; One XMMWORD of Cb and one of Cr are converted into (R-Y), (G-Y) and (B-Y)
; word vectors; each chroma sample is then combined with two horizontally
; adjacent Y samples, so one chroma XMMWORD serves two Y XMMWORDs.
;
; Register roles inside the body (as assigned by the code below):
;   rcx = number of output columns still to be written (col)
;   rsi = inptr0 (Y row)     rbx = inptr1 (Cb row)     rdx = inptr2 (Cr row)
;   rdi = outptr (output row, taken from output_buf[0])
;   al  = Yctr, counts the two Y XMMWORDs processed per chroma XMMWORD
;   r15 = base address used by wk() to reach the aligned scratch array
;
; The Y/Cb/Cr rows are loaded with movdqa, a whole XMMWORD at a time and
; before the remaining column count is examined, so every input row must be
; 16-byte aligned and readable in whole-XMMWORD units.  The output row may be
; unaligned (it is tested at run time).
;
; NOTE(review): xmmA..xmmH, rsip/rbxp/rdxp/rdip, COLLECT_ARGS/UNCOLLECT_ARGS,
; the PW_*/PD_* constants and the SIZEOF_* symbols are not defined in this
; file; presumably they come from jcolsamp.inc and the files that include
; this one - confirm there.
;

; r10d = JDIMENSION output_width
; r11 = JSAMPIMAGE input_buf
; r12d = JDIMENSION in_row_group_ctr
; r13 = JSAMPARRAY output_buf

; wk(i) is slot i of a WK_NUM-entry XMMWORD array that ends at the address
; held in r15: wk(0) is the lowest slot, wk(WK_NUM-1) ends exactly at r15.
%define wk(i)   r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
%define WK_NUM  3

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_sse2)

EXTN(jsimd_h2v1_merged_upsample_sse2):
    ENDBR64
    push        rbp
    mov         rbp, rsp
    push        r15                     ; saved at [rbp-8]; the epilogue
                                        ; restores rsp to this slot
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    ; Allocate stack space for wk array.  r15 is used to access it.
    mov         r15, rsp                ; r15 = aligned top of wk[]
    sub         rsp, byte (SIZEOF_XMMWORD * WK_NUM)
    COLLECT_ARGS 4
    push        rbx

    mov         ecx, r10d               ; col
    test        rcx, rcx
    jz          near .return            ; output_width == 0: nothing to do

    push        rcx                     ; rcx is reused below as the row index

    mov         rdi, r11                ; rdi = input_buf
    mov         ecx, r12d               ; rcx = in_row_group_ctr
    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
    mov         rdi, r13                ; rdi = output_buf
    mov         rsip, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW]  ; inptr0
    mov         rbxp, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW]  ; inptr1
    mov         rdxp, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW]  ; inptr2
    mov         rdip, JSAMPROW [rdi]                      ; outptr

    pop         rcx                     ; col

.columnloop:
    ; One iteration consumes one XMMWORD of Cb and of Cr and (via the Y loop
    ; below) up to two XMMWORDs of Y.

    movdqa      xmm6, XMMWORD [rbx]     ; xmm6=Cb(0123456789ABCDEF)
    movdqa      xmm7, XMMWORD [rdx]     ; xmm7=Cr(0123456789ABCDEF)

    pxor        xmm1, xmm1              ; xmm1=(all 0's)
    pcmpeqw     xmm3, xmm3
    psllw       xmm3, 7                 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}

    ; Zero-extend the chroma bytes to words.
    movdqa      xmm4, xmm6
    punpckhbw   xmm6, xmm1              ; xmm6=Cb(89ABCDEF)=CbH
    punpcklbw   xmm4, xmm1              ; xmm4=Cb(01234567)=CbL
    movdqa      xmm0, xmm7
    punpckhbw   xmm7, xmm1              ; xmm7=Cr(89ABCDEF)=CrH
    punpcklbw   xmm0, xmm1              ; xmm0=Cr(01234567)=CrL

    ; Adding 0xFF80 (== -128 as a signed word) re-centers chroma around zero.
    paddw       xmm6, xmm3
    paddw       xmm4, xmm3
    paddw       xmm7, xmm3
    paddw       xmm0, xmm3

    ; (Original)
    ; R = Y                + 1.40200 * Cr
    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
    ; B = Y + 1.77200 * Cb
    ;
    ; (This implementation)
    ; R = Y                + 0.40200 * Cr + Cr
    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
    ; B = Y - 0.22800 * Cb + Cb + Cb

    movdqa      xmm5, xmm6              ; xmm5=CbH
    movdqa      xmm2, xmm4              ; xmm2=CbL
    paddw       xmm6, xmm6              ; xmm6=2*CbH
    paddw       xmm4, xmm4              ; xmm4=2*CbL
    movdqa      xmm1, xmm7              ; xmm1=CrH
    movdqa      xmm3, xmm0              ; xmm3=CrL
    paddw       xmm7, xmm7              ; xmm7=2*CrH
    paddw       xmm0, xmm0              ; xmm0=2*CrL

    pmulhw      xmm6, [rel PW_MF0228]   ; xmm6=(2*CbH * -FIX(0.22800))
    pmulhw      xmm4, [rel PW_MF0228]   ; xmm4=(2*CbL * -FIX(0.22800))
    pmulhw      xmm7, [rel PW_F0402]    ; xmm7=(2*CrH * FIX(0.40200))
    pmulhw      xmm0, [rel PW_F0402]    ; xmm0=(2*CrL * FIX(0.40200))

    ; The products were formed from doubled inputs; add PW_ONE and shift
    ; right by one to halve them again.
    paddw       xmm6, [rel PW_ONE]
    paddw       xmm4, [rel PW_ONE]
    psraw       xmm6, 1                 ; xmm6=(CbH * -FIX(0.22800))
    psraw       xmm4, 1                 ; xmm4=(CbL * -FIX(0.22800))
    paddw       xmm7, [rel PW_ONE]
    paddw       xmm0, [rel PW_ONE]
    psraw       xmm7, 1                 ; xmm7=(CrH * FIX(0.40200))
    psraw       xmm0, 1                 ; xmm0=(CrL * FIX(0.40200))

    paddw       xmm6, xmm5
    paddw       xmm4, xmm2
    paddw       xmm6, xmm5              ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
    paddw       xmm4, xmm2              ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
    paddw       xmm7, xmm1              ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
    paddw       xmm0, xmm3              ; xmm0=(CrL * FIX(1.40200))=(R-Y)L

    ; The H halves are parked in wk[] for the second Y pass; the L halves
    ; stay in registers for the first pass.
    movdqa      XMMWORD [wk(0)], xmm6   ; wk(0)=(B-Y)H
    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=(R-Y)H

    ; Interleave Cb and Cr words so that pmaddwd can form
    ; Cb * -FIX(0.344) + Cr * FIX(0.285) as one dword per sample.
    movdqa      xmm6, xmm5
    movdqa      xmm7, xmm2
    punpcklwd   xmm5, xmm1
    punpckhwd   xmm6, xmm1
    pmaddwd     xmm5, [rel PW_MF0344_F0285]
    pmaddwd     xmm6, [rel PW_MF0344_F0285]
    punpcklwd   xmm2, xmm3
    punpckhwd   xmm7, xmm3
    pmaddwd     xmm2, [rel PW_MF0344_F0285]
    pmaddwd     xmm7, [rel PW_MF0344_F0285]

    ; Add PD_ONEHALF, then drop the SCALEBITS fraction bits.
    paddd       xmm5, [rel PD_ONEHALF]
    paddd       xmm6, [rel PD_ONEHALF]
    psrad       xmm5, SCALEBITS
    psrad       xmm6, SCALEBITS
    paddd       xmm2, [rel PD_ONEHALF]
    paddd       xmm7, [rel PD_ONEHALF]
    psrad       xmm2, SCALEBITS
    psrad       xmm7, SCALEBITS

    packssdw    xmm5, xmm6     ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
    packssdw    xmm2, xmm7     ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
    psubw       xmm5, xmm1     ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
    psubw       xmm2, xmm3     ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L

    movdqa      XMMWORD [wk(2)], xmm5   ; wk(2)=(G-Y)H

    ; First pass: xmm0/xmm2/xmm4 already hold (R-Y)L/(G-Y)L/(B-Y)L, so the
    ; reload at .Yloop_2nd is skipped.
    mov         al, 2                   ; Yctr
    jmp         short .Yloop_1st

.Yloop_2nd:
    ; Second pass: fetch the H halves saved above.
    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(R-Y)H
    movdqa      xmm2, XMMWORD [wk(2)]   ; xmm2=(G-Y)H
    movdqa      xmm4, XMMWORD [wk(0)]   ; xmm4=(B-Y)H

.Yloop_1st:
    movdqa      xmm7, XMMWORD [rsi]     ; xmm7=Y(0123456789ABCDEF)

    ; Split Y into even and odd samples, each zero-extended to a word, so
    ; that both share the same chroma word.
    pcmpeqw     xmm6, xmm6
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
    pand        xmm6, xmm7              ; xmm6=Y(02468ACE)=YE
    psrlw       xmm7, BYTE_BIT          ; xmm7=Y(13579BDF)=YO

    movdqa      xmm1, xmm0              ; xmm1=xmm0=(R-Y)(L/H)
    movdqa      xmm3, xmm2              ; xmm3=xmm2=(G-Y)(L/H)
    movdqa      xmm5, xmm4              ; xmm5=xmm4=(B-Y)(L/H)

    ; packuswb saturates each word to the unsigned byte range.
    paddw       xmm0, xmm6              ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
    paddw       xmm1, xmm7              ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
    packuswb    xmm0, xmm0              ; xmm0=R(02468ACE********)
    packuswb    xmm1, xmm1              ; xmm1=R(13579BDF********)

    paddw       xmm2, xmm6              ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
    paddw       xmm3, xmm7              ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
    packuswb    xmm2, xmm2              ; xmm2=G(02468ACE********)
    packuswb    xmm3, xmm3              ; xmm3=G(13579BDF********)

    paddw       xmm4, xmm6              ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
    paddw       xmm5, xmm7              ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
    packuswb    xmm4, xmm4              ; xmm4=B(02468ACE********)
    packuswb    xmm5, xmm5              ; xmm5=B(13579BDF********)

%if RGB_PIXELSIZE == 3  ; ---------------

    ; In the register diagrams below, a two-digit entry "cp" denotes
    ; component c (0, 1 or 2) of pixel p (0..F).

    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
    ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)

    punpcklbw   xmmA, xmmC      ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
    punpcklbw   xmmE, xmmB      ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
    punpcklbw   xmmD, xmmF      ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)

    movdqa      xmmG, xmmA
    movdqa      xmmH, xmmA
    punpcklwd   xmmA, xmmE      ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
    punpckhwd   xmmG, xmmE      ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)

    psrldq      xmmH, 2         ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
    psrldq      xmmE, 2         ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)

    movdqa      xmmC, xmmD
    movdqa      xmmB, xmmD
    punpcklwd   xmmD, xmmH      ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
    punpckhwd   xmmC, xmmH      ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)

    psrldq      xmmB, 2         ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)

    movdqa      xmmF, xmmE
    punpcklwd   xmmE, xmmB      ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
    punpckhwd   xmmF, xmmB      ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)

    pshufd      xmmH, xmmA, 0x4E  ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
    movdqa      xmmB, xmmE
    punpckldq   xmmA, xmmD      ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
    punpckldq   xmmE, xmmH      ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
    punpckhdq   xmmD, xmmB      ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)

    pshufd      xmmH, xmmG, 0x4E  ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
    movdqa      xmmB, xmmF
    punpckldq   xmmG, xmmC      ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
    punpckldq   xmmF, xmmH      ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
    punpckhdq   xmmC, xmmB      ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)

    punpcklqdq  xmmA, xmmE      ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
    punpcklqdq  xmmD, xmmG      ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    punpcklqdq  xmmF, xmmC      ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

    ; xmmA, xmmD, xmmF now hold the interleaved pixel bytes in output order.
    ; Fewer than SIZEOF_XMMWORD columns left: take the partial-store path.
    cmp         rcx, byte SIZEOF_XMMWORD
    jb          short .column_st32

    ; Non-temporal stores are used only when outptr is XMMWORD-aligned.
    test        rdi, SIZEOF_XMMWORD-1
    jnz         short .out1
    ; --(aligned)-------------------
    movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
    jmp         short .out0
.out1:  ; --(unaligned)-----------------
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
.out0:
    add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
    sub         rcx, byte SIZEOF_XMMWORD
    jz          near .endcolumn

    add         rsi, byte SIZEOF_XMMWORD  ; inptr0
    dec         al                        ; Yctr
    jnz         near .Yloop_2nd           ; second Y block, same chroma block

    add         rbx, byte SIZEOF_XMMWORD  ; inptr1
    add         rdx, byte SIZEOF_XMMWORD  ; inptr2
    jmp         near .columnloop

.column_st32:
    ; Partial store.  From here on rcx counts remaining output BYTES
    ; (columns * 3), and progressively smaller stores drain xmmA/xmmD/xmmF.
    lea         rcx, [rcx+rcx*2]            ; imul ecx, RGB_PIXELSIZE
    cmp         rcx, byte 2*SIZEOF_XMMWORD
    jb          short .column_st16
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
    movdqa      xmmA, xmmF                  ; remaining bytes now live in xmmA
    sub         rcx, byte 2*SIZEOF_XMMWORD
    jmp         short .column_st15
.column_st16:
    cmp         rcx, byte SIZEOF_XMMWORD
    jb          short .column_st15
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    add         rdi, byte SIZEOF_XMMWORD    ; outptr
    movdqa      xmmA, xmmD                  ; remaining bytes now live in xmmA
    sub         rcx, byte SIZEOF_XMMWORD
.column_st15:
    ; Store the lower 8 bytes of xmmA to the output when it has enough
    ; space.
    cmp         rcx, byte SIZEOF_MMWORD
    jb          short .column_st7
    movq        XMM_MMWORD [rdi], xmmA
    add         rdi, byte SIZEOF_MMWORD
    sub         rcx, byte SIZEOF_MMWORD
    psrldq      xmmA, SIZEOF_MMWORD
.column_st7:
    ; Store the lower 4 bytes of xmmA to the output when it has enough
    ; space.
    cmp         rcx, byte SIZEOF_DWORD
    jb          short .column_st3
    movd        XMM_DWORD [rdi], xmmA
    add         rdi, byte SIZEOF_DWORD
    sub         rcx, byte SIZEOF_DWORD
    psrldq      xmmA, SIZEOF_DWORD
.column_st3:
    ; Store the lower 2 bytes of rax to the output when it has enough
    ; space.
    ; (Loading eax overwrites Yctr in al; that is harmless because this path
    ; only continues to .endcolumn and never re-enters the loops.)
    movd        eax, xmmA
    cmp         rcx, byte SIZEOF_WORD
    jb          short .column_st1
    mov         word [rdi], ax
    add         rdi, byte SIZEOF_WORD
    sub         rcx, byte SIZEOF_WORD
    shr         rax, 16
.column_st1:
    ; Store the lower 1 byte of rax to the output when it has enough
    ; space.
    test        rcx, rcx
    jz          short .endcolumn
    mov         byte [rdi], al

%else  ; RGB_PIXELSIZE == 4 ; -----------

    ; The fourth (filler) component is either all ones or all zeros.
%ifdef RGBX_FILLER_0XFF
    pcmpeqb     xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
    pcmpeqb     xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
%else
    pxor        xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
    pxor        xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
%endif
    ; In the register diagrams below, a two-digit entry "cp" denotes
    ; component c (0..3) of pixel p (0..F).

    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
    ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)

    punpcklbw   xmmA, xmmC      ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
    punpcklbw   xmmE, xmmG      ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
    punpcklbw   xmmB, xmmD      ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
    punpcklbw   xmmF, xmmH      ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)

    movdqa      xmmC, xmmA
    punpcklwd   xmmA, xmmE      ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
    punpckhwd   xmmC, xmmE      ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
    movdqa      xmmG, xmmB
    punpcklwd   xmmB, xmmF      ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
    punpckhwd   xmmG, xmmF      ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)

    movdqa      xmmD, xmmA
    punpckldq   xmmA, xmmB      ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
    punpckhdq   xmmD, xmmB      ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    movdqa      xmmH, xmmC
    punpckldq   xmmC, xmmG      ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
    punpckhdq   xmmH, xmmG      ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

    ; xmmA, xmmD, xmmC, xmmH now hold the interleaved pixels in output order.
    ; Fewer than SIZEOF_XMMWORD columns left: take the partial-store path.
    cmp         rcx, byte SIZEOF_XMMWORD
    jb          short .column_st32

    ; Non-temporal stores are used only when outptr is XMMWORD-aligned.
    test        rdi, SIZEOF_XMMWORD-1
    jnz         short .out1
    ; --(aligned)-------------------
    movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
    movntdq     XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
    jmp         short .out0
.out1:  ; --(unaligned)-----------------
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
    movdqu      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
.out0:
    add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
    sub         rcx, byte SIZEOF_XMMWORD
    jz          near .endcolumn

    add         rsi, byte SIZEOF_XMMWORD  ; inptr0
    dec         al                        ; Yctr
    jnz         near .Yloop_2nd           ; second Y block, same chroma block

    add         rbx, byte SIZEOF_XMMWORD  ; inptr1
    add         rdx, byte SIZEOF_XMMWORD  ; inptr2
    jmp         near .columnloop

.column_st32:
    ; Partial store.  Here rcx keeps counting PIXELS (4 bytes each), so the
    ; thresholds are fractions of SIZEOF_XMMWORD: SIZEOF_XMMWORD/2 pixels
    ; fill two XMMWORDs, SIZEOF_XMMWORD/4 pixels fill one, and
    ; SIZEOF_XMMWORD/8 pixels fill half of one.
    cmp         rcx, byte SIZEOF_XMMWORD/2
    jb          short .column_st16
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
    movdqa      xmmA, xmmC                  ; shift the upper two XMMWORDs
    movdqa      xmmD, xmmH                  ; down into xmmA/xmmD
    sub         rcx, byte SIZEOF_XMMWORD/2
.column_st16:
    cmp         rcx, byte SIZEOF_XMMWORD/4
    jb          short .column_st15
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    add         rdi, byte SIZEOF_XMMWORD    ; outptr
    movdqa      xmmA, xmmD                  ; remaining pixels now in xmmA
    sub         rcx, byte SIZEOF_XMMWORD/4
.column_st15:
    ; Store two pixels (8 bytes) of xmmA to the output when it has enough
    ; space.
    cmp         rcx, byte SIZEOF_XMMWORD/8
    jb          short .column_st7
    movq        XMM_MMWORD [rdi], xmmA
    add         rdi, byte SIZEOF_XMMWORD/8*4
    sub         rcx, byte SIZEOF_XMMWORD/8
    psrldq      xmmA, SIZEOF_XMMWORD/8*4
.column_st7:
    ; Store one pixel (4 bytes) of xmmA to the output when it has enough
    ; space.
    test        rcx, rcx
    jz          short .endcolumn
    movd        XMM_DWORD [rdi], xmmA

%endif  ; RGB_PIXELSIZE ; ---------------

.endcolumn:
    sfence                              ; flush the write buffer

.return:
    pop         rbx
    UNCOLLECT_ARGS 4
    lea         rsp, [rbp-8]            ; drop wk[] and the alignment padding;
                                        ; rsp now points at the saved r15
    pop         r15
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
;
; GLOBAL(void)
; jsimd_h2v2_merged_upsample_sse2(JDIMENSION output_width,
;                                 JSAMPIMAGE input_buf,
;                                 JDIMENSION in_row_group_ctr,
;                                 JSAMPARRAY output_buf);
;
; Implemented as two calls to jsimd_h2v1_merged_upsample_sse2.  A temporary
; three-entry pointer array {Y, Cb, Cr} is built on the stack and passed in
; place of input_buf.  Its Y entry is input_buf[0] advanced by
; in_row_group_ctr rows, so that the callee's own indexing by
; in_row_group_ctr lands on Y row 2*in_row_group_ctr while Cb/Cr are indexed
; by in_row_group_ctr.  For the second call the Y entry and output_buf are
; each advanced by one row pointer, selecting Y row 2*in_row_group_ctr+1 and
; output_buf[1]; the Cb/Cr entries are unchanged, so both output rows share
; the same chroma row.
;

; r10d = JDIMENSION output_width
; r11 = JSAMPIMAGE input_buf
; r12d = JDIMENSION in_row_group_ctr
; r13 = JSAMPARRAY output_buf

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_sse2)

EXTN(jsimd_h2v2_merged_upsample_sse2):
    ENDBR64
    push        rbp
    mov         rbp, rsp
    COLLECT_ARGS 4
    push        rbx

    mov         eax, r10d               ; rax = output_width

    mov         rdi, r11                ; rdi = input_buf
    mov         ecx, r12d               ; rcx = in_row_group_ctr
    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
    mov         rdi, r13                ; rdi = output_buf
    lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]  ; pre-offset the Y row array

    ; Temporary JSAMPARRAY[3] on the stack (four slots are reserved, only
    ; three are written).
    sub         rsp, SIZEOF_JSAMPARRAY*4
    mov         JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip  ; inptr00
    mov         JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp  ; inptr1
    mov         JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp  ; inptr2
    mov         rbx, rsp                ; rbx = address of the temporary array

    ; Keep output_buf, in_row_group_ctr and output_width across the call.
    push        rdi
    push        rcx
    push        rax

    ; Marshal (output_width, array, in_row_group_ctr, output_buf) into the
    ; argument registers.  The move order matters: each source register is
    ; read before it is overwritten.
    %ifdef WIN64
    mov         r8, rcx
    mov         r9, rdi
    mov         rcx, rax
    mov         rdx, rbx
    %else
    mov         rdx, rcx
    mov         rcx, rdi
    mov         rdi, rax
    mov         rsi, rbx
    %endif

    call        EXTN(jsimd_h2v1_merged_upsample_sse2)

    pop         rax
    pop         rcx
    pop         rdi
    ; Reload the array entries from the stack for the second call.
    mov         rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
    mov         rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
    mov         rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]

    add         rdi, byte SIZEOF_JSAMPROW  ; outptr1
    add         rsi, byte SIZEOF_JSAMPROW  ; inptr01

    mov         JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip  ; inptr01
    mov         JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp  ; inptr1
    mov         JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp  ; inptr2
    mov         rbx, rsp                ; rbx = address of the temporary array

    push        rdi
    push        rcx
    push        rax

    ; Same argument marshalling as for the first call.
    %ifdef WIN64
    mov         r8, rcx
    mov         r9, rdi
    mov         rcx, rax
    mov         rdx, rbx
    %else
    mov         rdx, rcx
    mov         rcx, rdi
    mov         rdi, rax
    mov         rsi, rbx
    %endif

    call        EXTN(jsimd_h2v1_merged_upsample_sse2)

    pop         rax
    pop         rcx
    pop         rdi
    ; These reloaded values are not read again by the code visible below.
    mov         rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
    mov         rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
    mov         rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
    add         rsp, SIZEOF_JSAMPARRAY*4   ; release the temporary array

    pop         rbx
    UNCOLLECT_ARGS 4
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32