jdsample-sse2.asm (23196B)
;
; jdsample.asm - upsampling (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    ALIGNZ      32
    GLOBAL_DATA(jconst_fancy_upsample_sse2)

; Constant table used by the "fancy" (triangle filter) upsamplers in this
; file.  Each entry is one 16-bit word replicated 8 times, i.e. one full
; 128-bit operand for the packed-word instructions (pmullw / paddw).
;   PW_THREE          - multiplier applied to the nearer sample
;   PW_ONE,   PW_TWO  - terms added before the ">> 2" in the h2v1 routine
;   PW_SEVEN, PW_EIGHT - terms added before the ">> 4" in the h2v2 routine

EXTN(jconst_fancy_upsample_sse2):

PW_ONE   times 8 dw 1
PW_TWO   times 8 dw 2
PW_THREE times 8 dw 3
PW_SEVEN times 8 dw 7
PW_EIGHT times 8 dw 8

    ALIGNZ      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter".  This is a good compromise between
; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; For every input sample s[i] of a row, two output samples are produced:
;   out[2*i]   = (3 * s[i] + s[i-1] + 1) >> 2
;   out[2*i+1] = (3 * s[i] + s[i+1] + 2) >> 2
; At the row edges the missing neighbor is replaced by the edge sample itself
; (s[-1] := s[0], s[width] := s[width-1]).
;
; Notes derived from the code below:
;  - max_v_samp_factor rows are processed; nothing is done if either
;    max_v_samp_factor or downsampled_width is zero.
;  - Rows are handled SIZEOF_XMMWORD input samples per step with aligned
;    accesses (movdqa), and the column count is rounded up to a multiple of
;    SIZEOF_XMMWORD.  Input rows are therefore read, and output rows written
;    (at twice the width), up to that rounded-up size.
;  - When downsampled_width is not a multiple of SIZEOF_XMMWORD, one byte is
;    written into the input row at index downsampled_width (a copy of the
;    last real sample).
;  - Registers modified by the visible code: rax, rcx, dl, rsi, rdi,
;    xmm0-xmm7.  Whatever COLLECT_ARGS/UNCOLLECT_ARGS (jsimdext.inc) save or
;    restore is in addition to that.
;

; r10 = int max_v_samp_factor
; r11d = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)

EXTN(jsimd_h2v1_fancy_upsample_sse2):
    ENDBR64
    push        rbp
    mov         rbp, rsp
    COLLECT_ARGS 4

    mov         eax, r11d               ; colctr
    test        rax, rax
    jz          near .return

    ; max_v_samp_factor is a 32-bit int, so only the low half of r10 is
    ; meaningful; the calling convention leaves the upper 32 bits of an int
    ; argument register unspecified.  Sign-extend instead of copying all 64
    ; bits (same treatment as the 32-bit read of the width above).
    movsxd      rcx, r10d               ; rowctr
    test        rcx, rcx
    jz          near .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdip, JSAMPARRAY [rdi]  ; output_data
.rowloop:
    ; rax/rdi/rsi are reused as per-row column counter and sample pointers,
    ; so the row-level values are kept on the stack.
    push        rax                     ; colctr
    push        rdi
    push        rsi

    mov         rsip, JSAMPROW [rsi]    ; inptr
    mov         rdip, JSAMPROW [rdi]    ; outptr

    ; If the width is not a whole number of vectors, replicate the last real
    ; sample one position to the right so that it acts as its own right-hand
    ; neighbor.
    test        rax, SIZEOF_XMMWORD-1
    jz          short .skip
    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
.skip:
    pxor        xmm0, xmm0              ; xmm0=(all 0's)
    ; xmm7 = left neighbor of the first sample: sample 0 itself, in byte 0.
    pcmpeqb     xmm7, xmm7
    psrldq      xmm7, (SIZEOF_XMMWORD-1)
    pand        xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]

    ; Round colctr up to a multiple of SIZEOF_XMMWORD.
    add         rax, byte SIZEOF_XMMWORD-1
    and         rax, byte -SIZEOF_XMMWORD
    cmp         rax, byte SIZEOF_XMMWORD
    ja          short .columnloop

.columnloop_last:
    ; Last vector of the row: the right neighbor of sample 15 is sample 15
    ; itself (kept in byte 15 of xmm6).
    pcmpeqb     xmm6, xmm6
    pslldq      xmm6, (SIZEOF_XMMWORD-1)
    pand        xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    jmp         short .upsample

.columnloop:
    ; Not the last vector: the right neighbor of sample 15 is the first
    ; sample of the next vector, moved into byte 15 of xmm6.
    movdqa      xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    pslldq      xmm6, (SIZEOF_XMMWORD-1)

.upsample:
    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqa      xmm2, xmm1
    movdqa      xmm3, xmm1                ; xmm1=( 0  1  2 ... 13 14 15)
    pslldq      xmm2, 1                   ; xmm2=(--  0  1 ... 12 13 14)
    psrldq      xmm3, 1                   ; xmm3=( 1  2  3 ... 14 15 --)

    por         xmm2, xmm7                ; xmm2=(-1  0  1 ... 12 13 14)
    por         xmm3, xmm6                ; xmm3=( 1  2  3 ... 14 15 16)

    ; Carry sample 15 over as the left neighbor for the next vector.
    movdqa      xmm7, xmm1
    psrldq      xmm7, (SIZEOF_XMMWORD-1)  ; xmm7=(15 -- -- ... -- -- --)

    ; Widen bytes to 16-bit words (xmm0 is zero).
    movdqa      xmm4, xmm1
    punpcklbw   xmm1, xmm0                ; xmm1=( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm4, xmm0                ; xmm4=( 8  9 10 11 12 13 14 15)
    movdqa      xmm5, xmm2
    punpcklbw   xmm2, xmm0                ; xmm2=(-1  0  1  2  3  4  5  6)
    punpckhbw   xmm5, xmm0                ; xmm5=( 7  8  9 10 11 12 13 14)
    movdqa      xmm6, xmm3
    punpcklbw   xmm3, xmm0                ; xmm3=( 1  2  3  4  5  6  7  8)
    punpckhbw   xmm6, xmm0                ; xmm6=( 9 10 11 12 13 14 15 16)

    pmullw      xmm1, [rel PW_THREE]
    pmullw      xmm4, [rel PW_THREE]
    paddw       xmm2, [rel PW_ONE]
    paddw       xmm5, [rel PW_ONE]
    paddw       xmm3, [rel PW_TWO]
    paddw       xmm6, [rel PW_TWO]

    paddw       xmm2, xmm1
    paddw       xmm5, xmm4
    psrlw       xmm2, 2                 ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
    psrlw       xmm5, 2                 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
    paddw       xmm3, xmm1
    paddw       xmm6, xmm4
    psrlw       xmm3, 2                 ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
    psrlw       xmm6, 2                 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)

    ; Interleave: odd results go to the high byte of each word, even results
    ; stay in the low byte.
    psllw       xmm3, BYTE_BIT
    psllw       xmm6, BYTE_BIT
    por         xmm2, xmm3              ; xmm2=OutL=( 0  1  2 ... 13 14 15)
    por         xmm5, xmm6              ; xmm5=OutH=(16 17 18 ... 29 30 31)

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5

    sub         rax, byte SIZEOF_XMMWORD
    add         rsi, byte 1*SIZEOF_XMMWORD  ; inptr
    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
    cmp         rax, byte SIZEOF_XMMWORD
    ja          near .columnloop        ; more than one vector left
    test        eax, eax
    jnz         near .columnloop_last   ; exactly one vector left

    pop         rsi
    pop         rdi
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec         rcx                        ; rowctr
    jg          near .rowloop

.return:
    UNCOLLECT_ARGS 4
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; Each loop iteration consumes one input row (plus the rows directly above
; and below it) and produces two output rows:
;   vertical pass   : Int0[c] = 3 * row[0][c] + row[-1][c]   (upper output row)
;                     Int1[c] = 3 * row[0][c] + row[+1][c]   (lower output row)
;   horizontal pass : out[2*c]   = (3 * Int[c] + Int[c-1] + 8) >> 4
;                     out[2*c+1] = (3 * Int[c] + Int[c+1] + 7) >> 4
; At the row edges the missing horizontal neighbor is replaced by the edge
; value itself.
;
; Notes derived from the code below:
;  - The row pointers input_data[-1] and input_data[+1] are dereferenced, so
;    the caller must provide valid rows around every row that is processed.
;  - The 16-bit intermediate values are stored temporarily in the output rows
;    before being overwritten by the final 8-bit samples.
;  - Rows are accessed with movdqa and the column count is rounded up to a
;    multiple of SIZEOF_XMMWORD; when downsampled_width is not such a
;    multiple, one byte is written at index downsampled_width into each of
;    the three input rows.
;  - wk(0)/wk(1) carry the left neighbor, wk(2)/wk(3) the right neighbor, for
;    the upper/lower output row respectively.
;

; r10 = int max_v_samp_factor
; r11d = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

%define wk(i)   r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
%define WK_NUM  4

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)

EXTN(jsimd_h2v2_fancy_upsample_sse2):
    ENDBR64
    push        rbp
    mov         rbp, rsp
    push        r15
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    ; Allocate stack space for wk array.  r15 is used to access it.
    mov         r15, rsp
    sub         rsp, byte (SIZEOF_XMMWORD * WK_NUM)
    COLLECT_ARGS 4
    push        rbx

    mov         eax, r11d               ; colctr
    test        rax, rax
    jz          near .return

    ; max_v_samp_factor is a 32-bit int, so only the low half of r10 is
    ; meaningful; the calling convention leaves the upper 32 bits of an int
    ; argument register unspecified.  Sign-extend instead of copying all 64
    ; bits (same treatment as the 32-bit read of the width above).
    movsxd      rcx, r10d               ; rowctr
    test        rcx, rcx
    jz          near .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdip, JSAMPARRAY [rdi]  ; output_data
.rowloop:
    ; rax, rcx, rdi and rsi are all reused inside the row, so their row-level
    ; values are kept on the stack.
    push        rax                     ; colctr
    push        rcx
    push        rdi
    push        rsi

    mov         rcxp, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1

    ; If the width is not a whole number of vectors, replicate the last real
    ; sample of each input row one position to the right.  dl is the scratch
    ; byte, so rdx (outptr0) is preserved around the sequence.
    test        rax, SIZEOF_XMMWORD-1
    jz          short .skip
    push        rdx
    mov         dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
    pop         rdx
.skip:
    ; -- process the first column block

    movdqa      xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]  ; xmm0=row[ 0][0]
    movdqa      xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]  ; xmm1=row[-1][0]
    movdqa      xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; xmm2=row[+1][0]

    pxor        xmm3, xmm3              ; xmm3=(all 0's)
    movdqa      xmm4, xmm0
    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
    movdqa      xmm5, xmm1
    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
    movdqa      xmm6, xmm2
    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

    pmullw      xmm0, [rel PW_THREE]
    pmullw      xmm4, [rel PW_THREE]

    ; xmm7 = mask selecting the lowest 16-bit word only.
    pcmpeqb     xmm7, xmm7
    psrldq      xmm7, (SIZEOF_XMMWORD-2)

    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

    movdqa      XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1  ; temporarily save
    movdqa      XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5  ; the intermediate data
    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6

    ; Left neighbor of the first column is the first column itself.
    pand        xmm1, xmm7              ; xmm1=( 0 -- -- -- -- -- -- --)
    pand        xmm2, xmm7              ; xmm2=( 0 -- -- -- -- -- -- --)

    movdqa      XMMWORD [wk(0)], xmm1
    movdqa      XMMWORD [wk(1)], xmm2

    ; Round colctr up to a multiple of SIZEOF_XMMWORD.
    add         rax, byte SIZEOF_XMMWORD-1
    and         rax, byte -SIZEOF_XMMWORD
    cmp         rax, byte SIZEOF_XMMWORD
    ja          short .columnloop

.columnloop_last:
    ; -- process the last column block

    ; Right neighbor of the last column is the last column itself: keep only
    ; the highest word of the current block's intermediate data.
    pcmpeqb     xmm1, xmm1
    pslldq      xmm1, (SIZEOF_XMMWORD-2)
    movdqa      xmm2, xmm1

    pand        xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
    pand        xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]

    movdqa      XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
    movdqa      XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)

    jmp         near .upsample

.columnloop:
    ; -- process the next column block

    ; Run the vertical pass one block ahead so that the current block's
    ; right-hand neighbor is available for the horizontal pass.
    movdqa      xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]  ; xmm0=row[ 0][1]
    movdqa      xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]  ; xmm1=row[-1][1]
    movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]  ; xmm2=row[+1][1]

    pxor        xmm3, xmm3              ; xmm3=(all 0's)
    movdqa      xmm4, xmm0
    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
    movdqa      xmm5, xmm1
    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
    movdqa      xmm6, xmm2
    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

    pmullw      xmm0, [rel PW_THREE]
    pmullw      xmm4, [rel PW_THREE]

    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

    movdqa      XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1  ; temporarily save
    movdqa      XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5  ; the intermediate data
    movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6

    pslldq      xmm1, (SIZEOF_XMMWORD-2)  ; xmm1=(-- -- -- -- -- -- --  0)
    pslldq      xmm2, (SIZEOF_XMMWORD-2)  ; xmm2=(-- -- -- -- -- -- --  0)

    movdqa      XMMWORD [wk(2)], xmm1
    movdqa      XMMWORD [wk(3)], xmm2

.upsample:
    ; -- process the upper row

    movdqa      xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    movdqa      xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]

    movdqa      xmm0, xmm7                ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
    movdqa      xmm4, xmm3                ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
    psrldq      xmm0, 2                   ; xmm0=( 1  2  3  4  5  6  7 --)
    pslldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(-- -- -- -- -- -- --  8)
    movdqa      xmm5, xmm7
    movdqa      xmm6, xmm3
    psrldq      xmm5, (SIZEOF_XMMWORD-2)  ; xmm5=( 7 -- -- -- -- -- -- --)
    pslldq      xmm6, 2                   ; xmm6=(--  8  9 10 11 12 13 14)

    por         xmm0, xmm4                ; xmm0=( 1  2  3  4  5  6  7  8)
    por         xmm5, xmm6                ; xmm5=( 7  8  9 10 11 12 13 14)

    movdqa      xmm1, xmm7
    movdqa      xmm2, xmm3
    pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
    psrldq      xmm2, 2                   ; xmm2=( 9 10 11 12 13 14 15 --)
    movdqa      xmm4, xmm3
    psrldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(15 -- -- -- -- -- -- --)

    por         xmm1, XMMWORD [wk(0)]     ; xmm1=(-1  0  1  2  3  4  5  6)
    por         xmm2, XMMWORD [wk(2)]     ; xmm2=( 9 10 11 12 13 14 15 16)

    ; Column 15 becomes the left neighbor of the next block (upper row).
    movdqa      XMMWORD [wk(0)], xmm4

    pmullw      xmm7, [rel PW_THREE]
    pmullw      xmm3, [rel PW_THREE]
    paddw       xmm1, [rel PW_EIGHT]
    paddw       xmm5, [rel PW_EIGHT]
    paddw       xmm0, [rel PW_SEVEN]
    paddw       xmm2, [rel PW_SEVEN]

    paddw       xmm1, xmm7
    paddw       xmm5, xmm3
    psrlw       xmm1, 4                 ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
    psrlw       xmm5, 4                 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
    paddw       xmm0, xmm7
    paddw       xmm2, xmm3
    psrlw       xmm0, 4                 ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
    psrlw       xmm2, 4                 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)

    psllw       xmm0, BYTE_BIT
    psllw       xmm2, BYTE_BIT
    por         xmm1, xmm0              ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
    por         xmm5, xmm2              ; xmm5=Out0H=(16 17 18 ... 29 30 31)

    movdqa      XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5

    ; -- process the lower row

    movdqa      xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
    movdqa      xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]

    movdqa      xmm7, xmm6                ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
    movdqa      xmm3, xmm4                ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
    psrldq      xmm7, 2                   ; xmm7=( 1  2  3  4  5  6  7 --)
    pslldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(-- -- -- -- -- -- --  8)
    movdqa      xmm0, xmm6
    movdqa      xmm2, xmm4
    psrldq      xmm0, (SIZEOF_XMMWORD-2)  ; xmm0=( 7 -- -- -- -- -- -- --)
    pslldq      xmm2, 2                   ; xmm2=(--  8  9 10 11 12 13 14)

    por         xmm7, xmm3                ; xmm7=( 1  2  3  4  5  6  7  8)
    por         xmm0, xmm2                ; xmm0=( 7  8  9 10 11 12 13 14)

    movdqa      xmm1, xmm6
    movdqa      xmm5, xmm4
    pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
    psrldq      xmm5, 2                   ; xmm5=( 9 10 11 12 13 14 15 --)
    movdqa      xmm3, xmm4
    psrldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(15 -- -- -- -- -- -- --)

    por         xmm1, XMMWORD [wk(1)]     ; xmm1=(-1  0  1  2  3  4  5  6)
    por         xmm5, XMMWORD [wk(3)]     ; xmm5=( 9 10 11 12 13 14 15 16)

    ; Column 15 becomes the left neighbor of the next block (lower row).
    movdqa      XMMWORD [wk(1)], xmm3

    pmullw      xmm6, [rel PW_THREE]
    pmullw      xmm4, [rel PW_THREE]
    paddw       xmm1, [rel PW_EIGHT]
    paddw       xmm0, [rel PW_EIGHT]
    paddw       xmm7, [rel PW_SEVEN]
    paddw       xmm5, [rel PW_SEVEN]

    paddw       xmm1, xmm6
    paddw       xmm0, xmm4
    psrlw       xmm1, 4                 ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
    psrlw       xmm0, 4                 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
    paddw       xmm7, xmm6
    paddw       xmm5, xmm4
    psrlw       xmm7, 4                 ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
    psrlw       xmm5, 4                 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)

    psllw       xmm7, BYTE_BIT
    psllw       xmm5, BYTE_BIT
    por         xmm1, xmm7              ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
    por         xmm0, xmm5              ; xmm0=Out1H=(16 17 18 ... 29 30 31)

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0

    sub         rax, byte SIZEOF_XMMWORD
    add         rcx, byte 1*SIZEOF_XMMWORD  ; inptr1(above)
    add         rbx, byte 1*SIZEOF_XMMWORD  ; inptr0
    add         rsi, byte 1*SIZEOF_XMMWORD  ; inptr1(below)
    add         rdx, byte 2*SIZEOF_XMMWORD  ; outptr0
    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr1
    cmp         rax, byte SIZEOF_XMMWORD
    ja          near .columnloop        ; more than one block left
    test        rax, rax
    jnz         near .columnloop_last   ; exactly one block left

    pop         rsi
    pop         rdi
    pop         rcx
    pop         rax

    ; One input row yields two output rows.
    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         rcx, byte 2                  ; rowctr
    jg          near .rowloop

.return:
    pop         rbx
    UNCOLLECT_ARGS 4
    ; Undo the stack realignment: rbp-8 is where r15 was pushed.
    lea         rsp, [rbp-8]
    pop         r15
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Every input sample is written twice in a row to the output
; (out[2*i] = out[2*i+1] = in[i]) for max_v_samp_factor rows.
;
; Notes derived from the code below:
;  - The column count is output_width rounded up to a multiple of
;    2*SIZEOF_XMMWORD, so output rows are written up to that rounded-up
;    width; nothing is done if the rounded width or the row count is zero.
;  - All accesses use movdqa, i.e. aligned loads and stores.
;  - The column loop is unrolled twice; each half turns SIZEOF_XMMWORD input
;    bytes into 2*SIZEOF_XMMWORD output bytes.
;

; r10 = int max_v_samp_factor
; r11d = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)

EXTN(jsimd_h2v1_upsample_sse2):
    ENDBR64
    push        rbp
    mov         rbp, rsp
    COLLECT_ARGS 4

    mov         edx, r11d
    add         rdx, byte (2*SIZEOF_XMMWORD)-1
    and         rdx, byte -(2*SIZEOF_XMMWORD)
    jz          near .return

    ; max_v_samp_factor is a 32-bit int, so only the low half of r10 is
    ; meaningful; the calling convention leaves the upper 32 bits of an int
    ; argument register unspecified.  Sign-extend instead of copying all 64
    ; bits (same treatment as the 32-bit read of the width above).
    movsxd      rcx, r10d               ; rowctr
    test        rcx, rcx
    jz          short .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdip, JSAMPARRAY [rdi]  ; output_data
.rowloop:
    push        rdi
    push        rsi

    mov         rsip, JSAMPROW [rsi]    ; inptr
    mov         rdip, JSAMPROW [rdi]    ; outptr
    mov         rax, rdx                ; colctr
.columnloop:

    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

    ; Unpacking a register with itself duplicates every byte.
    movdqa      xmm1, xmm0
    punpcklbw   xmm0, xmm0
    punpckhbw   xmm1, xmm1

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

    sub         rax, byte 2*SIZEOF_XMMWORD
    jz          short .nextrow

    movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm2
    punpckhbw   xmm3, xmm3

    movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

    sub         rax, byte 2*SIZEOF_XMMWORD
    jz          short .nextrow

    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
    add         rdi, byte 4*SIZEOF_XMMWORD  ; outptr
    jmp         short .columnloop

.nextrow:
    pop         rsi
    pop         rdi

    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec         rcx                        ; rowctr
    jg          short .rowloop

.return:
    UNCOLLECT_ARGS 4
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Every input sample is replicated into a 2x2 block of output samples: each
; input row is doubled horizontally and the result is stored to two
; consecutive output rows.
;
; Notes derived from the code below:
;  - Each loop iteration consumes one input row, produces two output rows and
;    decreases the row counter by 2.
;  - The column count is output_width rounded up to a multiple of
;    2*SIZEOF_XMMWORD, so output rows are written up to that rounded-up
;    width; nothing is done if the rounded width or the row count is zero.
;  - All accesses use movdqa, i.e. aligned loads and stores.
;

; r10 = int max_v_samp_factor
; r11d = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)

EXTN(jsimd_h2v2_upsample_sse2):
    ENDBR64
    push        rbp
    mov         rbp, rsp
    COLLECT_ARGS 4
    push        rbx

    mov         edx, r11d
    add         rdx, byte (2*SIZEOF_XMMWORD)-1
    and         rdx, byte -(2*SIZEOF_XMMWORD)
    jz          near .return

    ; max_v_samp_factor is a 32-bit int, so only the low half of r10 is
    ; meaningful; the calling convention leaves the upper 32 bits of an int
    ; argument register unspecified.  Sign-extend instead of copying all 64
    ; bits (same treatment as the 32-bit read of the width above).
    movsxd      rcx, r10d               ; rowctr
    test        rcx, rcx
    jz          near .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdip, JSAMPARRAY [rdi]  ; output_data
.rowloop:
    push        rdi
    push        rsi

    mov         rsip, JSAMPROW [rsi]                    ; inptr
    mov         rbxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
    mov         rax, rdx                                ; colctr
.columnloop:

    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

    ; Unpacking a register with itself duplicates every byte.
    movdqa      xmm1, xmm0
    punpcklbw   xmm0, xmm0
    punpckhbw   xmm1, xmm1

    ; The same doubled data goes to both output rows.
    movdqa      XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
    movdqa      XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

    sub         rax, byte 2*SIZEOF_XMMWORD
    jz          short .nextrow

    movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm2
    punpckhbw   xmm3, xmm3

    movdqa      XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
    movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

    sub         rax, byte 2*SIZEOF_XMMWORD
    jz          short .nextrow

    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
    add         rbx, byte 4*SIZEOF_XMMWORD  ; outptr0
    add         rdi, byte 4*SIZEOF_XMMWORD  ; outptr1
    jmp         short .columnloop

.nextrow:
    pop         rsi
    pop         rdi

    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         rcx, byte 2                  ; rowctr
    jg          near .rowloop

.return:
    pop         rbx
    UNCOLLECT_ARGS 4
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32