; jdsample-mmx.asm (25380B)
;
; jdsample.asm - upsampling (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2024, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

; Word constants for the fancy (triangle-filter) upsamplers in this file.
; Every entry is four identical 16-bit words, so it can be used directly as
; the memory operand of a packed-word MMX instruction (pmullw / paddw):
;   PW_THREE          - weight applied to the nearer sample
;   PW_ONE,   PW_TWO  - rounding terms added before the ">> 2" in the
;                       h2v1 fancy routine (even / odd output samples)
;   PW_EIGHT, PW_SEVEN - rounding terms added before the ">> 4" in the
;                       h2v2 fancy routine (even / odd output samples)
; The table is addressed through GOTOFF(ebx, PW_xxx) by the routines.

    ALIGNZ      32
    GLOBAL_DATA(jconst_fancy_upsample_mmx)

EXTN(jconst_fancy_upsample_mmx):

PW_ONE   times 4 dw 1
PW_TWO   times 4 dw 2
PW_THREE times 4 dw 3
PW_SEVEN times 4 dw 7
PW_EIGHT times 4 dw 8

    ALIGNZ      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter".  This is a good compromise between
; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_mmx(int max_v_samp_factor,
;                               JDIMENSION downsampled_width,
;                               JSAMPARRAY input_data,
;                               JSAMPARRAY *output_data_ptr);
;
; 2:1 horizontal triangle-filter upsampling, one output row per input row.
; For every input sample s[i] two output samples are produced:
;     out[2*i]   = (3 * s[i] + s[i-1] + 1) >> 2
;     out[2*i+1] = (3 * s[i] + s[i+1] + 2) >> 2
; At the row edges the missing neighbor is replaced by the edge sample
; itself (s[-1] := s[0], s[width] := s[width-1]).
;
; Arguments are read from the stack relative to ebp (see the %defines).
; The routine returns immediately if downsampled_width or
; max_v_samp_factor is zero.
;
; Register roles inside the loops:
;     eax = colctr (samples left in the row, rounded up to a whole MMWORD)
;     ecx = rowctr
;     esi = input_data  (row-pointer cursor) / inptr  (inside a row)
;     edi = output_data (row-pointer cursor) / outptr (inside a row)
;     ebx = base for GOTOFF() constant addressing (set up by GET_GOT)
;     mm0 = zero, mm7 = left-neighbor carry, mm6 = right-neighbor carry
; esi and edi are saved/restored; ebx is handled by PUSHPIC/POPPIC (macros
; from jsimdext.inc -- presumably active only in PIC builds).  eax, ecx,
; edx and mm0-mm7 are clobbered.
;
; Buffer requirements visible in the code:
;   * Rows are processed in whole MMWORDs: colctr is rounded up to a
;     multiple of SIZEOF_MMWORD, so reads from the input row and writes to
;     the output row can extend past downsampled_width (resp. twice that)
;     up to the next MMWORD boundary.
;   * If downsampled_width is not a multiple of SIZEOF_MMWORD, one sample is
;     WRITTEN into the input row at index downsampled_width.
;   NOTE(review): this presumes the caller's row buffers are padded
;   accordingly -- confirm against the code that allocates them.
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_mmx)

EXTN(jsimd_h2v1_fancy_upsample_mmx):
    push        ebp
    mov         ebp, esp
    PUSHPIC     ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    GET_GOT     ebx                     ; get GOT address

    mov         eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
    test        eax, eax
    jz          near .return            ; nothing to do for an empty row

    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
    test        ecx, ecx
    jz          near .return            ; nothing to do for zero rows

    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]   ; output_data
    ALIGNX      16, 7
.rowloop:
    ; Save the per-row state; eax/esi/edi are reused as in-row cursors.
    push        eax                     ; colctr
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]     ; inptr
    mov         edi, JSAMPROW [edi]     ; outptr

    ; If the row does not fill its last MMWORD, replicate the last real
    ; sample one position to the right so it can serve as the right-hand
    ; neighbor of the final sample.
    test        eax, SIZEOF_MMWORD-1
    jz          short .skip
    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl  ; insert a dummy sample
.skip:
    pxor        mm0, mm0                ; mm0=(all 0's)
    ; mm7 = left neighbor for the first block: sample 0 itself, kept in the
    ; lowest byte (mask = all-ones shifted down to one byte).
    pcmpeqb     mm7, mm7
    psrlq       mm7, (SIZEOF_MMWORD-1)*BYTE_BIT
    pand        mm7, MMWORD [esi+0*SIZEOF_MMWORD]

    ; Round colctr up to a whole number of MMWORDs.
    add         eax, byte SIZEOF_MMWORD-1
    and         eax, byte -SIZEOF_MMWORD
    cmp         eax, byte SIZEOF_MMWORD
    ja          short .columnloop       ; more than one block in this row
    ALIGNX      16, 7

.columnloop_last:
    ; Last block of the row: the right neighbor of sample 7 is sample 7
    ; itself, kept in the highest byte.
    pcmpeqb     mm6, mm6
    psllq       mm6, (SIZEOF_MMWORD-1)*BYTE_BIT
    pand        mm6, MMWORD [esi+0*SIZEOF_MMWORD]
    jmp         short .upsample
    ALIGNX      16, 7

.columnloop:
    ; Not the last block: the right neighbor of sample 7 is the first
    ; sample of the next MMWORD, moved into the highest byte.
    movq        mm6, MMWORD [esi+1*SIZEOF_MMWORD]
    psllq       mm6, (SIZEOF_MMWORD-1)*BYTE_BIT

.upsample:
    movq        mm1, MMWORD [esi+0*SIZEOF_MMWORD]
    movq        mm2, mm1
    movq        mm3, mm1                ; mm1=( 0 1 2 3 4 5 6 7)
    psllq       mm2, BYTE_BIT           ; mm2=( - 0 1 2 3 4 5 6)
    psrlq       mm3, BYTE_BIT           ; mm3=( 1 2 3 4 5 6 7 -)

    por         mm2, mm7                ; mm2=(-1 0 1 2 3 4 5 6)
    por         mm3, mm6                ; mm3=( 1 2 3 4 5 6 7 8)

    ; Carry sample 7 of this block over as the left neighbor of the next.
    movq        mm7, mm1
    psrlq       mm7, (SIZEOF_MMWORD-1)*BYTE_BIT  ; mm7=( 7 - - - - - - -)

    ; Widen bytes to words (mm0 is zero).
    movq        mm4, mm1
    punpcklbw   mm1, mm0                ; mm1=( 0 1 2 3)
    punpckhbw   mm4, mm0                ; mm4=( 4 5 6 7)
    movq        mm5, mm2
    punpcklbw   mm2, mm0                ; mm2=(-1 0 1 2)
    punpckhbw   mm5, mm0                ; mm5=( 3 4 5 6)
    movq        mm6, mm3
    punpcklbw   mm3, mm0                ; mm3=( 1 2 3 4)
    punpckhbw   mm6, mm0                ; mm6=( 5 6 7 8)

    ; 3 * center, plus the rounding term on each neighbor.
    pmullw      mm1, [GOTOFF(ebx,PW_THREE)]
    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]
    paddw       mm2, [GOTOFF(ebx,PW_ONE)]
    paddw       mm5, [GOTOFF(ebx,PW_ONE)]
    paddw       mm3, [GOTOFF(ebx,PW_TWO)]
    paddw       mm6, [GOTOFF(ebx,PW_TWO)]

    paddw       mm2, mm1
    paddw       mm5, mm4
    psrlw       mm2, 2                  ; mm2=OutLE=( 0  2  4  6)
    psrlw       mm5, 2                  ; mm5=OutHE=( 8 10 12 14)
    paddw       mm3, mm1
    paddw       mm6, mm4
    psrlw       mm3, 2                  ; mm3=OutLO=( 1  3  5  7)
    psrlw       mm6, 2                  ; mm6=OutHO=( 9 11 13 15)

    ; Interleave: even results stay in the low byte of each word, odd
    ; results move to the high byte.
    psllw       mm3, BYTE_BIT
    psllw       mm6, BYTE_BIT
    por         mm2, mm3                ; mm2=OutL=( 0  1  2  3  4  5  6  7)
    por         mm5, mm6                ; mm5=OutH=( 8  9 10 11 12 13 14 15)

    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm2
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm5

    sub         eax, byte SIZEOF_MMWORD
    add         esi, byte 1*SIZEOF_MMWORD  ; inptr
    add         edi, byte 2*SIZEOF_MMWORD  ; outptr
    cmp         eax, byte SIZEOF_MMWORD
    ja          near .columnloop        ; at least two blocks left
    test        eax, eax
    jnz         near .columnloop_last   ; exactly one block left

    pop         esi
    pop         edi
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW  ; input_data
    add         edi, byte SIZEOF_JSAMPROW  ; output_data
    dec         ecx                        ; rowctr
    jg          near .rowloop

    emms                                ; empty MMX state

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    POPPIC      ebx
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_mmx(int max_v_samp_factor,
;                               JDIMENSION downsampled_width,
;                               JSAMPARRAY input_data,
;                               JSAMPARRAY *output_data_ptr);
;
; Each pass of the row loop consumes one input row together with the rows
; directly above and below it, and produces two output rows.
;
;   Vertical step (results are 16-bit words):
;       Int0[i] = 3 * row0[i] + rowAbove[i]     -> feeds output row 0
;       Int1[i] = 3 * row0[i] + rowBelow[i]     -> feeds output row 1
;   Horizontal step (applied to Int0 and Int1 alike):
;       out[2*i]   = (3 * Int[i] + Int[i-1] + 8) >> 4
;       out[2*i+1] = (3 * Int[i] + Int[i+1] + 7) >> 4
;   At the row edges the missing neighbor is the edge value itself.
;
; The intermediate words are parked in the output rows: eight input samples
; yield eight words = two MMWORDs, which is exactly the space the sixteen
; final output bytes will occupy.  The intermediate data of block N+1 is
; stored before the final bytes of block N are written.
;
; Register roles inside the column loop:
;     eax = colctr           ecx = inptr1 (row above)
;     ebx = inptr0           esi = inptr1 (row below)
;     edx = outptr0          edi = outptr1
; ebx doubles as the GOTOFF() base: the GOT address is kept in the stack
; slot `gotptr` and swapped into ebx with PUSHPIC/MOVPIC/POPPIC (macros from
; jsimdext.inc -- presumably active only in PIC builds) around the code that
; addresses the PW_* constants.
;
; Stack scratch (64-bit aligned, below the aligned ebp):
;     wk(0) / wk(1) = left-neighbor word for the upper / lower row
;     wk(2) / wk(3) = right-neighbor word for the upper / lower row
;
; ebx, esi and edi are saved/restored; eax, ecx, edx and mm0-mm7 are
; clobbered.  The routine returns immediately if downsampled_width or
; max_v_samp_factor is zero.
;
; Buffer requirements visible in the code:
;   * input_data[-1] and input_data[+1] are dereferenced for every row
;     processed, so valid row pointers must exist on both sides.
;   * Work is done in whole MMWORDs of input; reads and writes can extend
;     up to the next MMWORD boundary past the nominal row width.
;   * If downsampled_width is not a multiple of SIZEOF_MMWORD, one sample is
;     WRITTEN at index downsampled_width in each of the three input rows.
;   NOTE(review): this presumes padded row buffers -- confirm against the
;   code that allocates them.
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

%define original_ebp  ebp + 0
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
%define WK_NUM        4
%define gotptr        wk(0) - SIZEOF_POINTER  ; void *gotptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_mmx)

EXTN(jsimd_h2v2_fancy_upsample_mmx):
    push        ebp
    mov         eax, esp                    ; eax = original ebp
                                            ; (frame base before alignment;
                                            ; arguments are addressed off it)
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
    mov         [esp], eax                  ; saved for the epilogue's pop esp
    mov         ebp, esp                    ; ebp = aligned ebp
    lea         esp, [wk(0)]                ; reserve wk[WK_NUM]
    PUSHPIC     eax                     ; make a room for GOT address
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    GET_GOT     ebx                     ; get GOT address
    MOVPIC      POINTER [gotptr], ebx   ; save GOT address

    mov         edx, eax                ; edx = original ebp
    mov         eax, JDIMENSION [downsamp_width(edx)]  ; colctr
    test        eax, eax
    jz          near .return

    mov         ecx, INT [max_v_samp(edx)]  ; rowctr
    test        ecx, ecx
    jz          near .return

    mov         esi, JSAMPARRAY [input_data(edx)]  ; input_data
    mov         edi, POINTER [output_data_ptr(edx)]
    mov         edi, JSAMPARRAY [edi]   ; output_data
    ALIGNX      16, 7
.rowloop:
    ; Save the per-row state; all of these registers are reused below.
    push        eax                     ; colctr
    push        ecx
    push        edi
    push        esi

    mov         ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1

    ; If the row does not fill its last MMWORD, replicate the last real
    ; sample one position to the right in all three input rows.  edx is
    ; borrowed as a byte temporary, hence the push/pop around it.
    test        eax, SIZEOF_MMWORD-1
    jz          short .skip
    push        edx
    mov         dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
    pop         edx
.skip:
    ; -- process the first column block

    movq        mm0, MMWORD [ebx+0*SIZEOF_MMWORD]  ; mm0=row[ 0][0]
    movq        mm1, MMWORD [ecx+0*SIZEOF_MMWORD]  ; mm1=row[-1][0]
    movq        mm2, MMWORD [esi+0*SIZEOF_MMWORD]  ; mm2=row[+1][0]

    PUSHPIC     ebx
    MOVPIC      ebx, POINTER [gotptr]   ; load GOT address

    pxor        mm3, mm3                ; mm3=(all 0's)
    movq        mm4, mm0
    punpcklbw   mm0, mm3                ; mm0=row[ 0][0]( 0 1 2 3)
    punpckhbw   mm4, mm3                ; mm4=row[ 0][0]( 4 5 6 7)
    movq        mm5, mm1
    punpcklbw   mm1, mm3                ; mm1=row[-1][0]( 0 1 2 3)
    punpckhbw   mm5, mm3                ; mm5=row[-1][0]( 4 5 6 7)
    movq        mm6, mm2
    punpcklbw   mm2, mm3                ; mm2=row[+1][0]( 0 1 2 3)
    punpckhbw   mm6, mm3                ; mm6=row[+1][0]( 4 5 6 7)

    pmullw      mm0, [GOTOFF(ebx,PW_THREE)]
    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]

    ; mm7 = mask selecting the lowest word
    pcmpeqb     mm7, mm7
    psrlq       mm7, (SIZEOF_MMWORD-2)*BYTE_BIT

    paddw       mm1, mm0                ; mm1=Int0L=( 0 1 2 3)
    paddw       mm5, mm4                ; mm5=Int0H=( 4 5 6 7)
    paddw       mm2, mm0                ; mm2=Int1L=( 0 1 2 3)
    paddw       mm6, mm4                ; mm6=Int1H=( 4 5 6 7)

    movq        MMWORD [edx+0*SIZEOF_MMWORD], mm1  ; temporarily save
    movq        MMWORD [edx+1*SIZEOF_MMWORD], mm5  ; the intermediate data
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm2
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm6

    ; Left neighbor of the very first value is that value itself.
    pand        mm1, mm7                ; mm1=( 0 - - -)
    pand        mm2, mm7                ; mm2=( 0 - - -)

    movq        MMWORD [wk(0)], mm1
    movq        MMWORD [wk(1)], mm2

    POPPIC      ebx

    ; Round colctr up to a whole number of MMWORDs.
    add         eax, byte SIZEOF_MMWORD-1
    and         eax, byte -SIZEOF_MMWORD
    cmp         eax, byte SIZEOF_MMWORD
    ja          short .columnloop
    ALIGNX      16, 7

.columnloop_last:
    ; -- process the last column block

    PUSHPIC     ebx
    MOVPIC      ebx, POINTER [gotptr]   ; load GOT address

    ; Right neighbor of the last value is that value itself: keep only the
    ; highest word of the high intermediate MMWORD of each row.
    pcmpeqb     mm1, mm1
    psllq       mm1, (SIZEOF_MMWORD-2)*BYTE_BIT
    movq        mm2, mm1

    pand        mm1, MMWORD [edx+1*SIZEOF_MMWORD]  ; mm1=( - - - 7)
    pand        mm2, MMWORD [edi+1*SIZEOF_MMWORD]  ; mm2=( - - - 7)

    movq        MMWORD [wk(2)], mm1
    movq        MMWORD [wk(3)], mm2

    jmp         short .upsample
    ALIGNX      16, 7

.columnloop:
    ; -- process the next column block

    movq        mm0, MMWORD [ebx+1*SIZEOF_MMWORD]  ; mm0=row[ 0][1]
    movq        mm1, MMWORD [ecx+1*SIZEOF_MMWORD]  ; mm1=row[-1][1]
    movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]  ; mm2=row[+1][1]

    PUSHPIC     ebx
    MOVPIC      ebx, POINTER [gotptr]   ; load GOT address

    pxor        mm3, mm3                ; mm3=(all 0's)
    movq        mm4, mm0
    punpcklbw   mm0, mm3                ; mm0=row[ 0][1]( 0 1 2 3)
    punpckhbw   mm4, mm3                ; mm4=row[ 0][1]( 4 5 6 7)
    movq        mm5, mm1
    punpcklbw   mm1, mm3                ; mm1=row[-1][1]( 0 1 2 3)
    punpckhbw   mm5, mm3                ; mm5=row[-1][1]( 4 5 6 7)
    movq        mm6, mm2
    punpcklbw   mm2, mm3                ; mm2=row[+1][1]( 0 1 2 3)
    punpckhbw   mm6, mm3                ; mm6=row[+1][1]( 4 5 6 7)

    pmullw      mm0, [GOTOFF(ebx,PW_THREE)]
    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]

    paddw       mm1, mm0                ; mm1=Int0L=( 0 1 2 3)
    paddw       mm5, mm4                ; mm5=Int0H=( 4 5 6 7)
    paddw       mm2, mm0                ; mm2=Int1L=( 0 1 2 3)
    paddw       mm6, mm4                ; mm6=Int1H=( 4 5 6 7)

    movq        MMWORD [edx+2*SIZEOF_MMWORD], mm1  ; temporarily save
    movq        MMWORD [edx+3*SIZEOF_MMWORD], mm5  ; the intermediate data
    movq        MMWORD [edi+2*SIZEOF_MMWORD], mm2
    movq        MMWORD [edi+3*SIZEOF_MMWORD], mm6

    ; First value of the next block becomes the right neighbor of the
    ; current block's last value.
    psllq       mm1, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm1=( - - - 0)
    psllq       mm2, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm2=( - - - 0)

    movq        MMWORD [wk(2)], mm1
    movq        MMWORD [wk(3)], mm2

.upsample:
    ; -- process the upper row

    movq        mm7, MMWORD [edx+0*SIZEOF_MMWORD]  ; mm7=Int0L=( 0 1 2 3)
    movq        mm3, MMWORD [edx+1*SIZEOF_MMWORD]  ; mm3=Int0H=( 4 5 6 7)

    movq        mm0, mm7
    movq        mm4, mm3
    psrlq       mm0, 2*BYTE_BIT                  ; mm0=( 1 2 3 -)
    psllq       mm4, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( - - - 4)
    movq        mm5, mm7
    movq        mm6, mm3
    psrlq       mm5, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm5=( 3 - - -)
    psllq       mm6, 2*BYTE_BIT                  ; mm6=( - 4 5 6)

    por         mm0, mm4                         ; mm0=( 1 2 3 4)
    por         mm5, mm6                         ; mm5=( 3 4 5 6)

    movq        mm1, mm7
    movq        mm2, mm3
    psllq       mm1, 2*BYTE_BIT                  ; mm1=( - 0 1 2)
    psrlq       mm2, 2*BYTE_BIT                  ; mm2=( 5 6 7 -)
    movq        mm4, mm3
    psrlq       mm4, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( 7 - - -)

    por         mm1, MMWORD [wk(0)]              ; mm1=(-1 0 1 2)
    por         mm2, MMWORD [wk(2)]              ; mm2=( 5 6 7 8)

    ; Value 7 of this block is the left neighbor for the next block.
    movq        MMWORD [wk(0)], mm4

    pmullw      mm7, [GOTOFF(ebx,PW_THREE)]
    pmullw      mm3, [GOTOFF(ebx,PW_THREE)]
    paddw       mm1, [GOTOFF(ebx,PW_EIGHT)]
    paddw       mm5, [GOTOFF(ebx,PW_EIGHT)]
    paddw       mm0, [GOTOFF(ebx,PW_SEVEN)]
    paddw       mm2, [GOTOFF(ebx,PW_SEVEN)]

    paddw       mm1, mm7
    paddw       mm5, mm3
    psrlw       mm1, 4                  ; mm1=Out0LE=( 0  2  4  6)
    psrlw       mm5, 4                  ; mm5=Out0HE=( 8 10 12 14)
    paddw       mm0, mm7
    paddw       mm2, mm3
    psrlw       mm0, 4                  ; mm0=Out0LO=( 1  3  5  7)
    psrlw       mm2, 4                  ; mm2=Out0HO=( 9 11 13 15)

    psllw       mm0, BYTE_BIT
    psllw       mm2, BYTE_BIT
    por         mm1, mm0                ; mm1=Out0L=( 0  1  2  3  4  5  6  7)
    por         mm5, mm2                ; mm5=Out0H=( 8  9 10 11 12 13 14 15)

    ; Final bytes overwrite the intermediate words of this block.
    movq        MMWORD [edx+0*SIZEOF_MMWORD], mm1
    movq        MMWORD [edx+1*SIZEOF_MMWORD], mm5

    ; -- process the lower row

    movq        mm6, MMWORD [edi+0*SIZEOF_MMWORD]  ; mm6=Int1L=( 0 1 2 3)
    movq        mm4, MMWORD [edi+1*SIZEOF_MMWORD]  ; mm4=Int1H=( 4 5 6 7)

    movq        mm7, mm6
    movq        mm3, mm4
    psrlq       mm7, 2*BYTE_BIT                  ; mm7=( 1 2 3 -)
    psllq       mm3, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( - - - 4)
    movq        mm0, mm6
    movq        mm2, mm4
    psrlq       mm0, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm0=( 3 - - -)
    psllq       mm2, 2*BYTE_BIT                  ; mm2=( - 4 5 6)

    por         mm7, mm3                         ; mm7=( 1 2 3 4)
    por         mm0, mm2                         ; mm0=( 3 4 5 6)

    movq        mm1, mm6
    movq        mm5, mm4
    psllq       mm1, 2*BYTE_BIT                  ; mm1=( - 0 1 2)
    psrlq       mm5, 2*BYTE_BIT                  ; mm5=( 5 6 7 -)
    movq        mm3, mm4
    psrlq       mm3, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( 7 - - -)

    por         mm1, MMWORD [wk(1)]              ; mm1=(-1 0 1 2)
    por         mm5, MMWORD [wk(3)]              ; mm5=( 5 6 7 8)

    ; Value 7 of this block is the left neighbor for the next block.
    movq        MMWORD [wk(1)], mm3

    pmullw      mm6, [GOTOFF(ebx,PW_THREE)]
    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]
    paddw       mm1, [GOTOFF(ebx,PW_EIGHT)]
    paddw       mm0, [GOTOFF(ebx,PW_EIGHT)]
    paddw       mm7, [GOTOFF(ebx,PW_SEVEN)]
    paddw       mm5, [GOTOFF(ebx,PW_SEVEN)]

    paddw       mm1, mm6
    paddw       mm0, mm4
    psrlw       mm1, 4                  ; mm1=Out1LE=( 0  2  4  6)
    psrlw       mm0, 4                  ; mm0=Out1HE=( 8 10 12 14)
    paddw       mm7, mm6
    paddw       mm5, mm4
    psrlw       mm7, 4                  ; mm7=Out1LO=( 1  3  5  7)
    psrlw       mm5, 4                  ; mm5=Out1HO=( 9 11 13 15)

    psllw       mm7, BYTE_BIT
    psllw       mm5, BYTE_BIT
    por         mm1, mm7                ; mm1=Out1L=( 0  1  2  3  4  5  6  7)
    por         mm0, mm5                ; mm0=Out1H=( 8  9 10 11 12 13 14 15)

    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm1
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm0

    POPPIC      ebx

    sub         eax, byte SIZEOF_MMWORD
    add         ecx, byte 1*SIZEOF_MMWORD  ; inptr1(above)
    add         ebx, byte 1*SIZEOF_MMWORD  ; inptr0
    add         esi, byte 1*SIZEOF_MMWORD  ; inptr1(below)
    add         edx, byte 2*SIZEOF_MMWORD  ; outptr0
    add         edi, byte 2*SIZEOF_MMWORD  ; outptr1
    cmp         eax, byte SIZEOF_MMWORD
    ja          near .columnloop        ; at least two blocks left
    test        eax, eax
    jnz         near .columnloop_last   ; exactly one block left

    pop         esi
    pop         edi
    pop         ecx
    pop         eax

    ; One input row yields two output rows.
    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         ecx, byte 2                  ; rowctr
    jg          near .rowloop

    emms                                ; empty MMX state

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- original ebp
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width,
;                         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Every input sample is written twice in a row (each byte is unpacked with
; itself), one output row per input row.
;
; output_width is rounded up to a multiple of 2*SIZEOF_MMWORD before use, so
; whole MMWORDs are read from the input row and written to the output row
; even when the nominal width is smaller.
; NOTE(review): this presumes padded row buffers -- confirm against the code
; that allocates them.
; The routine returns immediately if the rounded width or
; max_v_samp_factor is zero.
;
; Register roles: edx = rounded output width, eax = colctr (output bytes
; left in the row), ecx = rowctr, esi = input_data / inptr,
; edi = output_data / outptr.  esi and edi are saved/restored; eax, ecx,
; edx and mm0-mm3 are clobbered.  ebx is not touched.
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define output_width(b)     (b) + 12    ; JDIMENSION output_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_upsample_mmx)

EXTN(jsimd_h2v1_upsample_mmx):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    ; Round the output width up to a whole pair of MMWORDs; the flags of
    ; the `and` decide the early exit.
    mov         edx, JDIMENSION [output_width(ebp)]
    add         edx, byte (2*SIZEOF_MMWORD)-1
    and         edx, byte -(2*SIZEOF_MMWORD)
    jz          short .return

    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
    test        ecx, ecx
    jz          short .return

    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]   ; output_data
    ALIGNX      16, 7
.rowloop:
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]     ; inptr
    mov         edi, JSAMPROW [edi]     ; outptr
    mov         eax, edx                ; colctr
    ALIGNX      16, 7
.columnloop:
    ; The loop body is unrolled twice; each half turns one input MMWORD
    ; into two output MMWORDs and may finish the row.

    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]

    movq        mm1, mm0
    punpcklbw   mm0, mm0                ; duplicate samples 0-3
    punpckhbw   mm1, mm1                ; duplicate samples 4-7

    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm1

    sub         eax, byte 2*SIZEOF_MMWORD
    jz          short .nextrow

    movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]

    movq        mm3, mm2
    punpcklbw   mm2, mm2
    punpckhbw   mm3, mm3

    movq        MMWORD [edi+2*SIZEOF_MMWORD], mm2
    movq        MMWORD [edi+3*SIZEOF_MMWORD], mm3

    sub         eax, byte 2*SIZEOF_MMWORD
    jz          short .nextrow

    add         esi, byte 2*SIZEOF_MMWORD  ; inptr
    add         edi, byte 4*SIZEOF_MMWORD  ; outptr
    jmp         short .columnloop
    ALIGNX      16, 7

.nextrow:
    pop         esi
    pop         edi

    add         esi, byte SIZEOF_JSAMPROW  ; input_data
    add         edi, byte SIZEOF_JSAMPROW  ; output_data
    dec         ecx                        ; rowctr
    jg          short .rowloop

    emms                                ; empty MMX state

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
;   pop         ebx                     ; unused
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width,
;                         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Every input sample is written twice in a row, and the resulting row is
; stored into two consecutive output rows.  Each pass of the row loop
; therefore consumes one input row, produces two output rows and lowers
; rowctr by 2.
;
; Width rounding, early exits and the padding caveat are the same as in the
; h2v1 routine above: output_width is rounded up to a multiple of
; 2*SIZEOF_MMWORD and whole MMWORDs are read and written.
;
; Register roles: edx = rounded output width, eax = colctr, ecx = rowctr,
; esi = input_data / inptr, ebx = outptr0, edi = output_data / outptr1.
; ebx, esi and edi are saved/restored; eax, ecx, edx and mm0-mm3 are
; clobbered.
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define output_width(b)     (b) + 12    ; JDIMENSION output_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_upsample_mmx)

EXTN(jsimd_h2v2_upsample_mmx):
    push        ebp
    mov         ebp, esp
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    ; Round the output width up to a whole pair of MMWORDs; the flags of
    ; the `and` decide the early exit.
    mov         edx, JDIMENSION [output_width(ebp)]
    add         edx, byte (2*SIZEOF_MMWORD)-1
    and         edx, byte -(2*SIZEOF_MMWORD)
    jz          near .return

    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
    test        ecx, ecx
    jz          short .return

    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]   ; output_data
    ALIGNX      16, 7
.rowloop:
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]                    ; inptr
    mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
    mov         eax, edx                               ; colctr
    ALIGNX      16, 7
.columnloop:
    ; The loop body is unrolled twice; each half turns one input MMWORD
    ; into two output MMWORDs, stored to both output rows.

    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]

    movq        mm1, mm0
    punpcklbw   mm0, mm0                ; duplicate samples 0-3
    punpckhbw   mm1, mm1                ; duplicate samples 4-7

    movq        MMWORD [ebx+0*SIZEOF_MMWORD], mm0
    movq        MMWORD [ebx+1*SIZEOF_MMWORD], mm1
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm1

    sub         eax, byte 2*SIZEOF_MMWORD
    jz          short .nextrow

    movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]

    movq        mm3, mm2
    punpcklbw   mm2, mm2
    punpckhbw   mm3, mm3

    movq        MMWORD [ebx+2*SIZEOF_MMWORD], mm2
    movq        MMWORD [ebx+3*SIZEOF_MMWORD], mm3
    movq        MMWORD [edi+2*SIZEOF_MMWORD], mm2
    movq        MMWORD [edi+3*SIZEOF_MMWORD], mm3

    sub         eax, byte 2*SIZEOF_MMWORD
    jz          short .nextrow

    add         esi, byte 2*SIZEOF_MMWORD  ; inptr
    add         ebx, byte 4*SIZEOF_MMWORD  ; outptr0
    add         edi, byte 4*SIZEOF_MMWORD  ; outptr1
    jmp         short .columnloop
    ALIGNX      16, 7

.nextrow:
    pop         esi
    pop         edi

    ; One input row yields two output rows.
    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         ecx, byte 2                  ; rowctr
    jg          short .rowloop

    emms                                ; empty MMX state

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32