jdsample-sse2.asm (26488B)
;
; jdsample.asm - upsampling (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2024, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.
;
; Target: 32-bit x86 (BITS 32), Intel/NASM syntax.  Arguments are read from
; the stack at [ebp+8] onward; ebx/esi/edi/ebp are saved and restored, while
; ecx and edx are used as scratch without being preserved.
; All helper macros (EXTN, GLOBAL_FUNCTION, ALIGNX, ALIGNZ, PUSHPIC, POPPIC,
; MOVPIC, GET_GOT, GOTOFF, ...) and the SIZEOF_*/type-name symbols come from
; jsimdext.inc.

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    ALIGNZ      32
    GLOBAL_DATA(jconst_fancy_upsample_sse2)

EXTN(jconst_fancy_upsample_sse2):

; Packed 16-bit constants (8 lanes each) used by the fancy upsamplers:
;   PW_THREE        weight of the nearer sample
;   PW_ONE/PW_TWO   rounding terms added before ">> 2" (h2v1)
;   PW_EIGHT/PW_SEVEN rounding terms added before ">> 4" (h2v2)
PW_ONE          times 8 dw 1
PW_TWO          times 8 dw 2
PW_THREE        times 8 dw 3
PW_SEVEN        times 8 dw 7
PW_EIGHT        times 8 dw 8

    ALIGNZ      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32

; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter".  This is a good compromise between
; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; Per input sample s[i] the code below produces two output samples:
;   out[2i]   = (3 * s[i] + s[i-1] + 1) >> 2
;   out[2i+1] = (3 * s[i] + s[i+1] + 2) >> 2
; with s[-1] taken as s[0] and the sample past the final block taken as that
; block's last byte.
;
; Register roles inside the loops:
;   eax = columns remaining (rounded up to a multiple of SIZEOF_XMMWORD)
;   ecx = rows remaining
;   esi = input row pointer / input_data cursor
;   edi = output row pointer / output_data cursor
;   ebx = GOT base (PIC builds)
;   xmm0 = zero, xmm7 = left-neighbour carry, xmm6 = right-neighbour carry
;
; NOTE(review): movdqa is used for every row access, so the row buffers are
; presumably 16-byte aligned, and rows are read/written in whole XMM blocks
; (plus one dummy byte written past the last input sample) -- confirm that
; the caller's buffers are padded accordingly.
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)

EXTN(jsimd_h2v1_fancy_upsample_sse2):
    push        ebp
    mov         ebp, esp
    PUSHPIC     ebx
    ; ecx and edx are not saved: they need not be preserved
    push        esi
    push        edi

    GET_GOT     ebx                     ; ebx = GOT address

    mov         eax, JDIMENSION [downsamp_width(ebp)]  ; eax = column counter
    test        eax, eax
    jz          near .done              ; zero width: nothing to do

    mov         ecx, INT [max_v_samp(ebp)]  ; ecx = row counter
    test        ecx, ecx
    jz          near .done              ; zero rows: nothing to do

    mov         esi, JSAMPARRAY [input_data(ebp)]    ; esi -> input row table
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]                ; edi -> output row table
    ALIGNX      16, 7
.row_loop:
    push        eax                     ; keep the unrounded column count
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]     ; esi = current input row
    mov         edi, JSAMPROW [edi]     ; edi = current output row

    ; If the width is not a whole number of XMM blocks, replicate the last
    ; real sample one position to the right so it has a right-hand neighbour.
    test        eax, SIZEOF_XMMWORD-1
    jz          short .no_pad
    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl
.no_pad:
    pxor        xmm0, xmm0              ; xmm0 = all zeros (for unpacking)
    pcmpeqb     xmm7, xmm7
    psrldq      xmm7, (SIZEOF_XMMWORD-1)        ; mask selecting byte 0 only
    pand        xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]
                                        ; xmm7 = s[0] in byte 0: acts as s[-1]

    add         eax, byte SIZEOF_XMMWORD-1
    and         eax, byte -SIZEOF_XMMWORD       ; round column count up
    cmp         eax, byte SIZEOF_XMMWORD
    ja          short .next_block       ; more than one block left
    ALIGNX      16, 7

.last_block:
    ; Final block: the right-hand neighbour of sample 15 is sample 15 itself.
    pcmpeqb     xmm6, xmm6
    pslldq      xmm6, (SIZEOF_XMMWORD-1)        ; mask selecting byte 15 only
    pand        xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]
    jmp         short .interpolate
    ALIGNX      16, 7

.next_block:
    ; Not the final block: fetch sample 0 of the following block into byte 15.
    movdqa      xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]
    pslldq      xmm6, (SIZEOF_XMMWORD-1)

.interpolate:
    movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
    movdqa      xmm2, xmm1
    movdqa      xmm3, xmm1              ; xmm1 = [ 0  1  2 ... 13 14 15]
    pslldq      xmm2, 1                 ; xmm2 = [--  0  1 ... 12 13 14]
    psrldq      xmm3, 1                 ; xmm3 = [ 1  2  3 ... 14 15 --]

    por         xmm2, xmm7              ; xmm2 = [-1  0  1 ... 12 13 14]
    por         xmm3, xmm6              ; xmm3 = [ 1  2  3 ... 14 15 16]

    movdqa      xmm7, xmm1
    psrldq      xmm7, (SIZEOF_XMMWORD-1)
                                        ; xmm7 = [15 -- ... --]: next left carry

    ; Widen all three vectors from bytes to words (low / high halves).
    movdqa      xmm4, xmm1
    punpcklbw   xmm1, xmm0              ; xmm1 = [ 0  1  2  3  4  5  6  7]
    punpckhbw   xmm4, xmm0              ; xmm4 = [ 8  9 10 11 12 13 14 15]
    movdqa      xmm5, xmm2
    punpcklbw   xmm2, xmm0              ; xmm2 = [-1  0  1  2  3  4  5  6]
    punpckhbw   xmm5, xmm0              ; xmm5 = [ 7  8  9 10 11 12 13 14]
    movdqa      xmm6, xmm3
    punpcklbw   xmm3, xmm0              ; xmm3 = [ 1  2  3  4  5  6  7  8]
    punpckhbw   xmm6, xmm0              ; xmm6 = [ 9 10 11 12 13 14 15 16]

    pmullw      xmm1, [GOTOFF(ebx,PW_THREE)]    ; 3 * centre sample
    pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]
    paddw       xmm2, [GOTOFF(ebx,PW_ONE)]      ; left neighbour + 1
    paddw       xmm5, [GOTOFF(ebx,PW_ONE)]
    paddw       xmm3, [GOTOFF(ebx,PW_TWO)]      ; right neighbour + 2
    paddw       xmm6, [GOTOFF(ebx,PW_TWO)]

    paddw       xmm2, xmm1
    paddw       xmm5, xmm4
    psrlw       xmm2, 2                 ; xmm2 = even outputs [ 0  2 ... 14]
    psrlw       xmm5, 2                 ; xmm5 = even outputs [16 18 ... 30]
    paddw       xmm3, xmm1
    paddw       xmm6, xmm4
    psrlw       xmm3, 2                 ; xmm3 = odd outputs  [ 1  3 ... 15]
    psrlw       xmm6, 2                 ; xmm6 = odd outputs  [17 19 ... 31]

    ; Interleave: odd outputs go into the high byte of each word.
    psllw       xmm3, BYTE_BIT
    psllw       xmm6, BYTE_BIT
    por         xmm2, xmm3              ; xmm2 = outputs [ 0  1  2 ... 15]
    por         xmm5, xmm6              ; xmm5 = outputs [16 17 18 ... 31]

    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5

    sub         eax, byte SIZEOF_XMMWORD
    add         esi, byte 1*SIZEOF_XMMWORD      ; advance input pointer
    add         edi, byte 2*SIZEOF_XMMWORD      ; advance output pointer
    cmp         eax, byte SIZEOF_XMMWORD
    ja          near .next_block        ; at least two blocks remain
    test        eax, eax
    jnz         near .last_block        ; exactly one block remains

    pop         esi
    pop         edi
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW       ; next input row entry
    add         edi, byte SIZEOF_JSAMPROW       ; next output row entry
    dec         ecx                             ; one row done
    jg          near .row_loop

.done:
    pop         edi
    pop         esi
    ; edx and ecx were not saved (need not be preserved)
    POPPIC      ebx
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; For each input row the routine reads the row above, the row itself and the
; row below (input_data[-1], [0], [+1]) and emits two output rows:
;   vertical pass:    Int0 = 3 * row[0] + row[-1]     (upper output row)
;                     Int1 = 3 * row[0] + row[+1]     (lower output row)
;   horizontal pass:  out[2i]   = (3 * Int[i] + Int[i-1] + 8) >> 4
;                     out[2i+1] = (3 * Int[i] + Int[i+1] + 7) >> 4
; The 16-bit intermediates are parked in the output rows themselves before
; being overwritten with the final bytes.
;
; Register roles inside the column loop:
;   eax = columns remaining        ecx = row above      ebx = current row
;   esi = row below                edx = upper out row  edi = lower out row
; ebx doubles as the GOT base: it is swapped in with PUSHPIC/MOVPIC and
; restored with POPPIC around each arithmetic section.
;
; Stack scratch (16-byte aligned):
;   wk(0)/wk(1) = left-neighbour carry for the upper/lower row
;   wk(2)/wk(3) = right-neighbour carry for the upper/lower row
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

%define WK_NUM        4                 ; number of xmmword scratch slots
%define original_ebp  ebp + 0
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
                                        ; xmmword wk[WK_NUM]
%define gotptr        wk(0) - SIZEOF_POINTER    ; void *gotptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)

EXTN(jsimd_h2v2_fancy_upsample_sse2):
    ; Build a 16-byte-aligned frame; the unaligned frame base is stashed at
    ; [aligned ebp] so the epilogue can recover it with "pop esp".
    push        ebp
    mov         eax, esp                        ; eax = unaligned frame base
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
    mov         [esp], eax
    mov         ebp, esp                        ; ebp = aligned frame base
    lea         esp, [wk(0)]                    ; reserve wk[0..WK_NUM-1]
    PUSHPIC     eax                     ; reserve the gotptr slot (PIC builds)
    push        ebx
    ; ecx and edx are not saved: they need not be preserved
    push        esi
    push        edi

    GET_GOT     ebx                     ; ebx = GOT address
    MOVPIC      POINTER [gotptr], ebx   ; remember it in the frame

    mov         edx, eax                ; edx = unaligned frame base (args)
    mov         eax, JDIMENSION [downsamp_width(edx)]  ; eax = column counter
    test        eax, eax
    jz          near .done

    mov         ecx, INT [max_v_samp(edx)]      ; ecx = row counter
    test        ecx, ecx
    jz          near .done

    mov         esi, JSAMPARRAY [input_data(edx)]    ; esi -> input row table
    mov         edi, POINTER [output_data_ptr(edx)]
    mov         edi, JSAMPARRAY [edi]                ; edi -> output row table
    ALIGNX      16, 7
.row_loop:
    push        eax                     ; keep the unrounded column count
    push        ecx
    push        edi
    push        esi

    mov         ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]  ; ecx = row above
    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; ebx = current row
    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; esi = row below
    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; edx = upper output row
    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; edi = lower output row

    ; If the width is not a whole number of XMM blocks, replicate the last
    ; real sample of each of the three input rows one position to the right.
    test        eax, SIZEOF_XMMWORD-1
    jz          short .no_pad
    push        edx                     ; dl is borrowed as a byte temporary
    mov         dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl
    pop         edx
.no_pad:
    ; ---- vertical pass for the first column block

    movdqa      xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD]  ; xmm0 = row[ 0], block 0
    movdqa      xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD]  ; xmm1 = row[-1], block 0
    movdqa      xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD]  ; xmm2 = row[+1], block 0

    PUSHPIC     ebx
    MOVPIC      ebx, POINTER [gotptr]   ; ebx = GOT address

    pxor        xmm3, xmm3              ; xmm3 = all zeros (for unpacking)
    movdqa      xmm4, xmm0
    punpcklbw   xmm0, xmm3              ; xmm0 = row[ 0] [ 0 ..  7]
    punpckhbw   xmm4, xmm3              ; xmm4 = row[ 0] [ 8 .. 15]
    movdqa      xmm5, xmm1
    punpcklbw   xmm1, xmm3              ; xmm1 = row[-1] [ 0 ..  7]
    punpckhbw   xmm5, xmm3              ; xmm5 = row[-1] [ 8 .. 15]
    movdqa      xmm6, xmm2
    punpcklbw   xmm2, xmm3              ; xmm2 = row[+1] [ 0 ..  7]
    punpckhbw   xmm6, xmm3              ; xmm6 = row[+1] [ 8 .. 15]

    pmullw      xmm0, [GOTOFF(ebx,PW_THREE)]    ; 3 * current row
    pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]

    pcmpeqb     xmm7, xmm7
    psrldq      xmm7, (SIZEOF_XMMWORD-2)        ; mask selecting word 0 only

    paddw       xmm1, xmm0              ; xmm1 = Int0 low  [ 0 ..  7]
    paddw       xmm5, xmm4              ; xmm5 = Int0 high [ 8 .. 15]
    paddw       xmm2, xmm0              ; xmm2 = Int1 low  [ 0 ..  7]
    paddw       xmm6, xmm4              ; xmm6 = Int1 high [ 8 .. 15]

    ; Park the intermediates in the output rows until the horizontal pass.
    movdqa      XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5
    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6

    pand        xmm1, xmm7              ; xmm1 = [ 0 -- -- -- -- -- -- --]
    pand        xmm2, xmm7              ; xmm2 = [ 0 -- -- -- -- -- -- --]

    movdqa      XMMWORD [wk(0)], xmm1   ; left edge: Int[-1] = Int[0]
    movdqa      XMMWORD [wk(1)], xmm2

    POPPIC      ebx

    add         eax, byte SIZEOF_XMMWORD-1
    and         eax, byte -SIZEOF_XMMWORD       ; round column count up
    cmp         eax, byte SIZEOF_XMMWORD
    ja          short .next_block       ; more than one block left
    ALIGNX      16, 7

.last_block:
    ; ---- final column block: right neighbour of lane 15 is lane 15 itself

    PUSHPIC     ebx
    MOVPIC      ebx, POINTER [gotptr]   ; ebx = GOT address

    pcmpeqb     xmm1, xmm1
    pslldq      xmm1, (SIZEOF_XMMWORD-2)        ; mask selecting word 7 only
    movdqa      xmm2, xmm1

    pand        xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
    pand        xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]

    movdqa      XMMWORD [wk(2)], xmm1   ; xmm1 = [-- -- -- -- -- -- -- 15]
    movdqa      XMMWORD [wk(3)], xmm2   ; xmm2 = [-- -- -- -- -- -- -- 15]

    jmp         near .interpolate
    ALIGNX      16, 7

.next_block:
    ; ---- vertical pass for the following column block (look-ahead)

    movdqa      xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD]  ; xmm0 = row[ 0], block 1
    movdqa      xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD]  ; xmm1 = row[-1], block 1
    movdqa      xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]  ; xmm2 = row[+1], block 1

    PUSHPIC     ebx
    MOVPIC      ebx, POINTER [gotptr]   ; ebx = GOT address

    pxor        xmm3, xmm3              ; xmm3 = all zeros (for unpacking)
    movdqa      xmm4, xmm0
    punpcklbw   xmm0, xmm3              ; xmm0 = row[ 0] [ 0 ..  7]
    punpckhbw   xmm4, xmm3              ; xmm4 = row[ 0] [ 8 .. 15]
    movdqa      xmm5, xmm1
    punpcklbw   xmm1, xmm3              ; xmm1 = row[-1] [ 0 ..  7]
    punpckhbw   xmm5, xmm3              ; xmm5 = row[-1] [ 8 .. 15]
    movdqa      xmm6, xmm2
    punpcklbw   xmm2, xmm3              ; xmm2 = row[+1] [ 0 ..  7]
    punpckhbw   xmm6, xmm3              ; xmm6 = row[+1] [ 8 .. 15]

    pmullw      xmm0, [GOTOFF(ebx,PW_THREE)]    ; 3 * current row
    pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]

    paddw       xmm1, xmm0              ; xmm1 = Int0 low  [ 0 ..  7]
    paddw       xmm5, xmm4              ; xmm5 = Int0 high [ 8 .. 15]
    paddw       xmm2, xmm0              ; xmm2 = Int1 low  [ 0 ..  7]
    paddw       xmm6, xmm4              ; xmm6 = Int1 high [ 8 .. 15]

    ; Park the look-ahead intermediates in the next output block.
    movdqa      XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5
    movdqa      XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6

    pslldq      xmm1, (SIZEOF_XMMWORD-2)        ; xmm1 = [-- ... --  0]
    pslldq      xmm2, (SIZEOF_XMMWORD-2)        ; xmm2 = [-- ... --  0]

    movdqa      XMMWORD [wk(2)], xmm1   ; right neighbour of lane 15 (upper)
    movdqa      XMMWORD [wk(3)], xmm2   ; right neighbour of lane 15 (lower)

.interpolate:
    ; ---- horizontal pass, upper output row

    movdqa      xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
    movdqa      xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]

    movdqa      xmm0, xmm7              ; xmm7 = Int0 low  [ 0 ..  7]
    movdqa      xmm4, xmm3              ; xmm3 = Int0 high [ 8 .. 15]
    psrldq      xmm0, 2                 ; xmm0 = [ 1  2  3  4  5  6  7 --]
    pslldq      xmm4, (SIZEOF_XMMWORD-2)        ; xmm4 = [-- ... --  8]
    movdqa      xmm5, xmm7
    movdqa      xmm6, xmm3
    psrldq      xmm5, (SIZEOF_XMMWORD-2)        ; xmm5 = [ 7 -- ... --]
    pslldq      xmm6, 2                 ; xmm6 = [--  8  9 10 11 12 13 14]

    por         xmm0, xmm4              ; xmm0 = [ 1  2  3  4  5  6  7  8]
    por         xmm5, xmm6              ; xmm5 = [ 7  8  9 10 11 12 13 14]

    movdqa      xmm1, xmm7
    movdqa      xmm2, xmm3
    pslldq      xmm1, 2                 ; xmm1 = [--  0  1  2  3  4  5  6]
    psrldq      xmm2, 2                 ; xmm2 = [ 9 10 11 12 13 14 15 --]
    movdqa      xmm4, xmm3
    psrldq      xmm4, (SIZEOF_XMMWORD-2)        ; xmm4 = [15 -- ... --]

    por         xmm1, XMMWORD [wk(0)]   ; xmm1 = [-1  0  1  2  3  4  5  6]
    por         xmm2, XMMWORD [wk(2)]   ; xmm2 = [ 9 10 11 12 13 14 15 16]

    movdqa      XMMWORD [wk(0)], xmm4   ; left carry for the next block

    pmullw      xmm7, [GOTOFF(ebx,PW_THREE)]    ; 3 * centre value
    pmullw      xmm3, [GOTOFF(ebx,PW_THREE)]
    paddw       xmm1, [GOTOFF(ebx,PW_EIGHT)]    ; left neighbour + 8
    paddw       xmm5, [GOTOFF(ebx,PW_EIGHT)]
    paddw       xmm0, [GOTOFF(ebx,PW_SEVEN)]    ; right neighbour + 7
    paddw       xmm2, [GOTOFF(ebx,PW_SEVEN)]

    paddw       xmm1, xmm7
    paddw       xmm5, xmm3
    psrlw       xmm1, 4                 ; xmm1 = even outputs [ 0  2 ... 14]
    psrlw       xmm5, 4                 ; xmm5 = even outputs [16 18 ... 30]
    paddw       xmm0, xmm7
    paddw       xmm2, xmm3
    psrlw       xmm0, 4                 ; xmm0 = odd outputs  [ 1  3 ... 15]
    psrlw       xmm2, 4                 ; xmm2 = odd outputs  [17 19 ... 31]

    ; Interleave: odd outputs go into the high byte of each word.
    psllw       xmm0, BYTE_BIT
    psllw       xmm2, BYTE_BIT
    por         xmm1, xmm0              ; xmm1 = outputs [ 0  1  2 ... 15]
    por         xmm5, xmm2              ; xmm5 = outputs [16 17 18 ... 31]

    movdqa      XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5

    ; ---- horizontal pass, lower output row

    movdqa      xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
    movdqa      xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]

    movdqa      xmm7, xmm6              ; xmm6 = Int1 low  [ 0 ..  7]
    movdqa      xmm3, xmm4              ; xmm4 = Int1 high [ 8 .. 15]
    psrldq      xmm7, 2                 ; xmm7 = [ 1  2  3  4  5  6  7 --]
    pslldq      xmm3, (SIZEOF_XMMWORD-2)        ; xmm3 = [-- ... --  8]
    movdqa      xmm0, xmm6
    movdqa      xmm2, xmm4
    psrldq      xmm0, (SIZEOF_XMMWORD-2)        ; xmm0 = [ 7 -- ... --]
    pslldq      xmm2, 2                 ; xmm2 = [--  8  9 10 11 12 13 14]

    por         xmm7, xmm3              ; xmm7 = [ 1  2  3  4  5  6  7  8]
    por         xmm0, xmm2              ; xmm0 = [ 7  8  9 10 11 12 13 14]

    movdqa      xmm1, xmm6
    movdqa      xmm5, xmm4
    pslldq      xmm1, 2                 ; xmm1 = [--  0  1  2  3  4  5  6]
    psrldq      xmm5, 2                 ; xmm5 = [ 9 10 11 12 13 14 15 --]
    movdqa      xmm3, xmm4
    psrldq      xmm3, (SIZEOF_XMMWORD-2)        ; xmm3 = [15 -- ... --]

    por         xmm1, XMMWORD [wk(1)]   ; xmm1 = [-1  0  1  2  3  4  5  6]
    por         xmm5, XMMWORD [wk(3)]   ; xmm5 = [ 9 10 11 12 13 14 15 16]

    movdqa      XMMWORD [wk(1)], xmm3   ; left carry for the next block

    pmullw      xmm6, [GOTOFF(ebx,PW_THREE)]    ; 3 * centre value
    pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]
    paddw       xmm1, [GOTOFF(ebx,PW_EIGHT)]    ; left neighbour + 8
    paddw       xmm0, [GOTOFF(ebx,PW_EIGHT)]
    paddw       xmm7, [GOTOFF(ebx,PW_SEVEN)]    ; right neighbour + 7
    paddw       xmm5, [GOTOFF(ebx,PW_SEVEN)]

    paddw       xmm1, xmm6
    paddw       xmm0, xmm4
    psrlw       xmm1, 4                 ; xmm1 = even outputs [ 0  2 ... 14]
    psrlw       xmm0, 4                 ; xmm0 = even outputs [16 18 ... 30]
    paddw       xmm7, xmm6
    paddw       xmm5, xmm4
    psrlw       xmm7, 4                 ; xmm7 = odd outputs  [ 1  3 ... 15]
    psrlw       xmm5, 4                 ; xmm5 = odd outputs  [17 19 ... 31]

    ; Interleave: odd outputs go into the high byte of each word.
    psllw       xmm7, BYTE_BIT
    psllw       xmm5, BYTE_BIT
    por         xmm1, xmm7              ; xmm1 = outputs [ 0  1  2 ... 15]
    por         xmm0, xmm5              ; xmm0 = outputs [16 17 18 ... 31]

    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0

    POPPIC      ebx                     ; ebx = current-row pointer again

    sub         eax, byte SIZEOF_XMMWORD
    add         ecx, byte 1*SIZEOF_XMMWORD      ; advance row above
    add         ebx, byte 1*SIZEOF_XMMWORD      ; advance current row
    add         esi, byte 1*SIZEOF_XMMWORD      ; advance row below
    add         edx, byte 2*SIZEOF_XMMWORD      ; advance upper output row
    add         edi, byte 2*SIZEOF_XMMWORD      ; advance lower output row
    cmp         eax, byte SIZEOF_XMMWORD
    ja          near .next_block        ; at least two blocks remain
    test        eax, eax
    jnz         near .last_block        ; exactly one block remains

    pop         esi
    pop         edi
    pop         ecx
    pop         eax

    add         esi, byte 1*SIZEOF_JSAMPROW     ; one input row consumed
    add         edi, byte 2*SIZEOF_JSAMPROW     ; two output rows produced
    sub         ecx, byte 2                     ; row counter -= 2
    jg          near .row_loop

.done:
    pop         edi
    pop         esi
    ; edx and ecx were not saved (need not be preserved)
    pop         ebx
    mov         esp, ebp                ; esp <- aligned frame base
    pop         esp                     ; esp <- unaligned frame base
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Every input byte is written twice in succession to the output row.  The
; column count is output_width rounded up to a multiple of 2*SIZEOF_XMMWORD,
; and the column loop is unrolled to handle two input blocks per iteration.
;
; Register roles: edx = rounded output width, eax = output columns remaining,
; ecx = rows remaining, esi = input pointer, edi = output pointer.
;
; NOTE(review): movdqa accesses and whole-block stores presumably rely on
; aligned, padded row buffers -- confirm against the caller.
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define output_width(b)     (b) + 12    ; JDIMENSION output_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)

EXTN(jsimd_h2v1_upsample_sse2):
    push        ebp
    mov         ebp, esp
    ; ebx is unused here; ecx and edx need not be preserved
    push        esi
    push        edi

    mov         edx, JDIMENSION [output_width(ebp)]
    add         edx, byte (2*SIZEOF_XMMWORD)-1
    and         edx, byte -(2*SIZEOF_XMMWORD)   ; round width up
    jz          short .done                     ; zero width: nothing to do

    mov         ecx, INT [max_v_samp(ebp)]      ; ecx = row counter
    test        ecx, ecx
    jz          short .done                     ; zero rows: nothing to do

    mov         esi, JSAMPARRAY [input_data(ebp)]    ; esi -> input row table
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]                ; edi -> output row table
    ALIGNX      16, 7
.row_loop:
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]     ; esi = current input row
    mov         edi, JSAMPROW [edi]     ; edi = current output row
    mov         eax, edx                ; eax = output columns for this row
    ALIGNX      16, 7
.col_loop:

    ; first input block -> two output blocks
    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]

    movdqa      xmm1, xmm0
    punpcklbw   xmm0, xmm0              ; duplicate bytes 0..7
    punpckhbw   xmm1, xmm1              ; duplicate bytes 8..15

    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1

    sub         eax, byte 2*SIZEOF_XMMWORD
    jz          short .row_done

    ; second input block -> two more output blocks
    movdqa      xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]

    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm2              ; duplicate bytes 0..7
    punpckhbw   xmm3, xmm3              ; duplicate bytes 8..15

    movdqa      XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3

    sub         eax, byte 2*SIZEOF_XMMWORD
    jz          short .row_done

    add         esi, byte 2*SIZEOF_XMMWORD      ; advance input pointer
    add         edi, byte 4*SIZEOF_XMMWORD      ; advance output pointer
    jmp         short .col_loop
    ALIGNX      16, 7

.row_done:
    pop         esi
    pop         edi

    add         esi, byte SIZEOF_JSAMPROW       ; next input row entry
    add         edi, byte SIZEOF_JSAMPROW       ; next output row entry
    dec         ecx                             ; one row done
    jg          short .row_loop

.done:
    pop         edi
    pop         esi
    ; edx and ecx were not saved (need not be preserved); ebx was unused
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Same byte duplication as the h2v1 box filter, but each expanded block is
; stored to two consecutive output rows, so one input row yields two output
; rows and the row counter drops by 2 per iteration.
;
; Register roles: edx = rounded output width, eax = output columns remaining,
; ecx = rows remaining, esi = input pointer, ebx/edi = the two output rows.
;
; NOTE(review): movdqa accesses and whole-block stores presumably rely on
; aligned, padded row buffers -- confirm against the caller.
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define output_width(b)     (b) + 12    ; JDIMENSION output_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)

EXTN(jsimd_h2v2_upsample_sse2):
    push        ebp
    mov         ebp, esp
    push        ebx
    ; ecx and edx are not saved: they need not be preserved
    push        esi
    push        edi

    mov         edx, JDIMENSION [output_width(ebp)]
    add         edx, byte (2*SIZEOF_XMMWORD)-1
    and         edx, byte -(2*SIZEOF_XMMWORD)   ; round width up
    jz          near .done                      ; zero width: nothing to do

    mov         ecx, INT [max_v_samp(ebp)]      ; ecx = row counter
    test        ecx, ecx
    jz          near .done                      ; zero rows: nothing to do

    mov         esi, JSAMPARRAY [input_data(ebp)]    ; esi -> input row table
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]                ; edi -> output row table
    ALIGNX      16, 7
.row_loop:
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]                    ; esi = input row
    mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; ebx = first output row
    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; edi = second output row
    mov         eax, edx                ; eax = output columns for this row
    ALIGNX      16, 7
.col_loop:

    ; first input block -> two output blocks in each output row
    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]

    movdqa      xmm1, xmm0
    punpcklbw   xmm0, xmm0              ; duplicate bytes 0..7
    punpckhbw   xmm1, xmm1              ; duplicate bytes 8..15

    movdqa      XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
    movdqa      XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1

    sub         eax, byte 2*SIZEOF_XMMWORD
    jz          short .row_done

    ; second input block -> two more output blocks in each output row
    movdqa      xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]

    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm2              ; duplicate bytes 0..7
    punpckhbw   xmm3, xmm3              ; duplicate bytes 8..15

    movdqa      XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
    movdqa      XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3

    sub         eax, byte 2*SIZEOF_XMMWORD
    jz          short .row_done

    add         esi, byte 2*SIZEOF_XMMWORD      ; advance input pointer
    add         ebx, byte 4*SIZEOF_XMMWORD      ; advance first output row
    add         edi, byte 4*SIZEOF_XMMWORD      ; advance second output row
    jmp         short .col_loop
    ALIGNX      16, 7

.row_done:
    pop         esi
    pop         edi

    add         esi, byte 1*SIZEOF_JSAMPROW     ; one input row consumed
    add         edi, byte 2*SIZEOF_JSAMPROW     ; two output rows produced
    sub         ecx, byte 2                     ; row counter -= 2
    jg          short .row_loop

.done:
    pop         edi
    pop         esi
    ; edx and ecx were not saved (need not be preserved)
    pop         ebx
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32