jidctred-sse2.asm (23213B)
;
; jidctred.asm - reduced-size IDCT (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2024, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.
;
; This file contains inverse-DCT routines that produce reduced-size
; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
; The following code is based directly on the IJG's original jidctred.c;
; see the jidctred.c for more details.

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------

; CONST_BITS is the number of fraction bits carried by the F_* multipliers
; below; PASS1_BITS is the extra scaling kept in the pass-1 (column) results.
%define CONST_BITS  13
%define PASS1_BITS  2

; Right-shift counts applied (with psrad) at the end of each pass:
;   P1 = pass 1 (columns), P2 = pass 2 (rows); _4 = 4x4 routine, _2 = 2x2.
; The PD_DESCALE_* vectors in the table below hold the matching rounding
; term, 1 << (shift - 1), which is added just before each shift.
%define DESCALE_P1_4  (CONST_BITS - PASS1_BITS + 1)
%define DESCALE_P2_4  (CONST_BITS + PASS1_BITS + 3 + 1)
%define DESCALE_P1_2  (CONST_BITS - PASS1_BITS + 2)
%define DESCALE_P2_2  (CONST_BITS + PASS1_BITS + 3 + 2)

; Fixed-point multipliers.  Each F_x_yyy is the real constant named in its
; trailing FIX() comment scaled by 2^CONST_BITS.  The literal values are used
; for the usual CONST_BITS == 13; otherwise they are derived from 30-bit
; fixed-point versions of the same constants by a rounding right shift.
%if CONST_BITS == 13
F_0_211  equ   1730                     ; FIX(0.211164243)
F_0_509  equ   4176                     ; FIX(0.509795579)
F_0_601  equ   4926                     ; FIX(0.601344887)
F_0_720  equ   5906                     ; FIX(0.720959822)
F_0_765  equ   6270                     ; FIX(0.765366865)
F_0_850  equ   6967                     ; FIX(0.850430095)
F_0_899  equ   7373                     ; FIX(0.899976223)
F_1_061  equ   8697                     ; FIX(1.061594337)
F_1_272  equ  10426                     ; FIX(1.272758580)
F_1_451  equ  11893                     ; FIX(1.451774981)
F_1_847  equ  15137                     ; FIX(1.847759065)
F_2_172  equ  17799                     ; FIX(2.172734803)
F_2_562  equ  20995                     ; FIX(2.562915447)
F_3_624  equ  29692                     ; FIX(3.624509785)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
; DESCALE(x, n) = x divided by 2^n, rounded to nearest (adds half, then shifts).
%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
F_0_211  equ  DESCALE( 226735879, 30 - CONST_BITS)  ; FIX(0.211164243)
F_0_509  equ  DESCALE( 547388834, 30 - CONST_BITS)  ; FIX(0.509795579)
F_0_601  equ  DESCALE( 645689155, 30 - CONST_BITS)  ; FIX(0.601344887)
F_0_720  equ  DESCALE( 774124714, 30 - CONST_BITS)  ; FIX(0.720959822)
F_0_765  equ  DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
F_0_850  equ  DESCALE( 913142361, 30 - CONST_BITS)  ; FIX(0.850430095)
F_0_899  equ  DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
F_1_061  equ  DESCALE(1139878239, 30 - CONST_BITS)  ; FIX(1.061594337)
F_1_272  equ  DESCALE(1366614119, 30 - CONST_BITS)  ; FIX(1.272758580)
F_1_451  equ  DESCALE(1558831516, 30 - CONST_BITS)  ; FIX(1.451774981)
F_1_847  equ  DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
F_2_172  equ  DESCALE(2332956230, 30 - CONST_BITS)  ; FIX(2.172734803)
F_2_562  equ  DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
F_3_624  equ  DESCALE(3891787747, 30 - CONST_BITS)  ; FIX(3.624509785)
%endif

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    ALIGNZ      32
    GLOBAL_DATA(jconst_idct_red_sse2)

; Constant table shared by both routines in this file; entries are addressed
; individually through GOTOFF(ebx, <label>).
;
; PW_* : one (c0, c1) pair of 16-bit multipliers repeated four times, i.e.
;        one 128-bit operand.  Used with pmaddwd on interleaved coefficient
;        pairs (a, b), giving a*c0 + b*c1 in each dword.
;        The label encodes the pair, with an M prefix marking a negated
;        value (e.g. PW_F184_MF076 = F_1_847, -F_0_765).
; PD_* : the rounding term 1 << (shift - 1) in each of four dwords.
; PB_CENTERJSAMP : CENTERJSAMPLE in each of 16 bytes; added with paddb to
;        the packed signed results just before they are stored.
EXTN(jconst_idct_red_sse2):

PW_F184_MF076   times 4  dw  F_1_847, -F_0_765
PW_F256_F089    times 4  dw  F_2_562,  F_0_899
PW_F106_MF217   times 4  dw  F_1_061, -F_2_172
PW_MF060_MF050  times 4  dw -F_0_601, -F_0_509
PW_F145_MF021   times 4  dw  F_1_451, -F_0_211
PW_F362_MF127   times 4  dw  F_3_624, -F_1_272
PW_F085_MF072   times 4  dw  F_0_850, -F_0_720
PD_DESCALE_P1_4 times 4  dd  1 << (DESCALE_P1_4 - 1)
PD_DESCALE_P2_4 times 4  dd  1 << (DESCALE_P2_4 - 1)
PD_DESCALE_P1_2 times 4  dd  1 << (DESCALE_P1_2 - 1)
PD_DESCALE_P2_2 times 4  dd  1 << (DESCALE_P2_2 - 1)
PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE

    ALIGNZ      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Perform
; dequantization and inverse DCT on one block of coefficients,
; producing a reduced-size 4x4 output block.
;
; GLOBAL(void)
; jsimd_idct_4x4_sse2(void *dct_table, JCOEFPTR coef_block,
;                     JSAMPARRAY output_buf, JDIMENSION output_col)
;
; Arguments are read from the stack at the offsets given by the %defines
; below, relative to the slot holding the caller's ebp.
;
; Inputs:  coef_block and dct_table are read row by row with movdqa/pmullw
;          memory operands, so both must be 16-byte aligned.  Coefficient
;          row 4 is never read, and column 4 of the pass-1 result is not
;          used by pass 2.
; Output:  one dword (four packed samples) is stored to each of
;          output_buf[0..3] at offset output_col.
; Clobbers: eax, edx, xmm0-xmm7, flags.  esi and edi are saved and restored
;          here; ebx is saved/restored through PUSHPIC/POPPIC and loaded by
;          GET_GOT (macros from jsimdext.inc) as the base for GOTOFF().
; Stack:   ebp is re-pointed at a 16-byte-aligned slot that holds the
;          original frame address; WK_NUM xmmword temporaries (wk(i)) live
;          directly below it.
;

%define dct_table(b)   (b) + 8          ; void *dct_table
%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
%define output_col(b)  (b) + 20         ; JDIMENSION output_col

%define original_ebp  ebp + 0
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
                                        ; xmmword wk[WK_NUM]
%define WK_NUM        2

    align       32
    GLOBAL_FUNCTION(jsimd_idct_4x4_sse2)

EXTN(jsimd_idct_4x4_sse2):
    ; Build an aligned frame: eax remembers the address of the saved ebp, esp
    ; is rounded down to a 16-byte boundary, and that address is stored at the
    ; aligned slot so the epilogue can recover it with "pop esp".
    push        ebp
    mov         eax, esp                     ; eax = original ebp
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [esp], eax
    mov         ebp, esp                     ; ebp = aligned ebp
    lea         esp, [wk(0)]                 ; reserve wk[0..WK_NUM-1]
    PUSHPIC     ebx
;   push        ecx                     ; unused
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    GET_GOT     ebx                     ; get GOT address

    ; ---- Pass 1: process columns from input.

    ; eax still holds the original frame address from the prologue, so the
    ; arguments are addressed through it without reloading [original_ebp].
;   mov         eax, [original_ebp]
    mov         edx, POINTER [dct_table(eax)]    ; quantptr
    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr

%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
    ; Shortcut for blocks whose AC terms are all zero.  First a cheap scalar
    ; test on one dword from each of rows 1 and 2 (DWBLOCK comes from
    ; jsimdext.inc; presumably the leading dword of the row), then a full
    ; test that ORs together rows 1, 2, 3, 5, 6 and 7.
    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
    jnz         short .columnDCT

    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    por         xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    por         xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    por         xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
    por         xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
    por         xmm0, xmm1
    ; Two saturating packs squeeze the eight ORed words into the low four
    ; bytes.  Signed saturation never turns a nonzero value into zero, so
    ; eax == 0 exactly when all eight words are zero.
    packsswb    xmm0, xmm0
    packsswb    xmm0, xmm0
    movd        eax, xmm0
    test        eax, eax
    jnz         short .columnDCT

    ; -- AC terms all zero

    ; Dequantize row 0 and scale it by 2^PASS1_BITS; every element of a
    ; column's pass-1 result is then that column's scaled DC value.
    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]

    psllw       xmm0, PASS1_BITS

    movdqa      xmm3, xmm0        ; xmm0=in0=(00 01 02 03 04 05 06 07)
    punpcklwd   xmm0, xmm0        ; xmm0=(00 00 01 01 02 02 03 03)
    punpckhwd   xmm3, xmm3        ; xmm3=(04 04 05 05 06 06 07 07)

    pshufd      xmm1, xmm0, 0x50  ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
    pshufd      xmm0, xmm0, 0xFA  ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
    pshufd      xmm6, xmm3, 0x50  ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
    pshufd      xmm3, xmm3, 0xFA  ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)

    jmp         near .column_end
    ALIGNX      16, 7
%endif
.columnDCT:

    ; -- Odd part

    ; Load and dequantize rows 1, 3, 5 and 7 (eight columns per register).
    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    movdqa      xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    pmullw      xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
    pmullw      xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
    movdqa      xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
    pmullw      xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
    pmullw      xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]

    ; Interleave (in1, in3) word pairs so that each pmaddwd produces
    ; in1*c0 + in3*c1 as a 32-bit value; L = columns 0-3, H = columns 4-7.
    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm0
    punpcklwd   xmm4, xmm1
    punpckhwd   xmm5, xmm1
    movdqa      xmm0, xmm4
    movdqa      xmm1, xmm5
    pmaddwd     xmm4, [GOTOFF(ebx,PW_F256_F089)]   ; xmm4=(tmp2L)
    pmaddwd     xmm5, [GOTOFF(ebx,PW_F256_F089)]   ; xmm5=(tmp2H)
    pmaddwd     xmm0, [GOTOFF(ebx,PW_F106_MF217)]  ; xmm0=(tmp0L)
    pmaddwd     xmm1, [GOTOFF(ebx,PW_F106_MF217)]  ; xmm1=(tmp0H)

    ; Same for the (in5, in7) pairs.
    movdqa      xmm6, xmm2
    movdqa      xmm7, xmm2
    punpcklwd   xmm6, xmm3
    punpckhwd   xmm7, xmm3
    movdqa      xmm2, xmm6
    movdqa      xmm3, xmm7
    pmaddwd     xmm6, [GOTOFF(ebx,PW_MF060_MF050)]  ; xmm6=(tmp2L)
    pmaddwd     xmm7, [GOTOFF(ebx,PW_MF060_MF050)]  ; xmm7=(tmp2H)
    pmaddwd     xmm2, [GOTOFF(ebx,PW_F145_MF021)]   ; xmm2=(tmp0L)
    pmaddwd     xmm3, [GOTOFF(ebx,PW_F145_MF021)]   ; xmm3=(tmp0H)

    ; Sum the two partial products of each odd-part term.
    paddd       xmm6, xmm4              ; xmm6=tmp2L
    paddd       xmm7, xmm5              ; xmm7=tmp2H
    paddd       xmm2, xmm0              ; xmm2=tmp0L
    paddd       xmm3, xmm1              ; xmm3=tmp0H

    ; Spill tmp0 to the aligned work area; all eight xmm registers are
    ; needed for the even part and the first half of the output stage.
    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=tmp0L
    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=tmp0H

    ; -- Even part

    ; Load and dequantize rows 0, 2 and 6.
    movdqa      xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
    movdqa      xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    movdqa      xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
    pmullw      xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
    pmullw      xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
    pmullw      xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]

    ; Widen in0 to 32 bits: unpacking under a zero register places each word
    ; in the upper half of a dword, and one arithmetic right shift then
    ; leaves the sign-extended value scaled by 2^(CONST_BITS+1).
    pxor        xmm1, xmm1
    pxor        xmm2, xmm2
    punpcklwd   xmm1, xmm4               ; xmm1=tmp0L
    punpckhwd   xmm2, xmm4               ; xmm2=tmp0H
    psrad       xmm1, (16-CONST_BITS-1)  ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
    psrad       xmm2, (16-CONST_BITS-1)  ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1

    movdqa      xmm3, xmm5              ; xmm5=in2=z2
    punpcklwd   xmm5, xmm0              ; xmm0=in6=z3
    punpckhwd   xmm3, xmm0
    pmaddwd     xmm5, [GOTOFF(ebx,PW_F184_MF076)]  ; xmm5=tmp2L
    pmaddwd     xmm3, [GOTOFF(ebx,PW_F184_MF076)]  ; xmm3=tmp2H

    movdqa      xmm4, xmm1
    movdqa      xmm0, xmm2
    paddd       xmm1, xmm5              ; xmm1=tmp10L
    paddd       xmm2, xmm3              ; xmm2=tmp10H
    psubd       xmm4, xmm5              ; xmm4=tmp12L
    psubd       xmm0, xmm3              ; xmm0=tmp12H

    ; -- Final output stage

    ; Rows 0 and 3 come from tmp10 +/- the odd-part tmp2 (xmm6/xmm7).
    movdqa      xmm5, xmm1
    movdqa      xmm3, xmm2
    paddd       xmm1, xmm6              ; xmm1=data0L
    paddd       xmm2, xmm7              ; xmm2=data0H
    psubd       xmm5, xmm6              ; xmm5=data3L
    psubd       xmm3, xmm7              ; xmm3=data3H

    movdqa      xmm6, [GOTOFF(ebx,PD_DESCALE_P1_4)]  ; xmm6=[PD_DESCALE_P1_4]

    ; Round (add half) and shift right by DESCALE_P1_4.
    paddd       xmm1, xmm6
    paddd       xmm2, xmm6
    psrad       xmm1, DESCALE_P1_4
    psrad       xmm2, DESCALE_P1_4
    paddd       xmm5, xmm6
    paddd       xmm3, xmm6
    psrad       xmm5, DESCALE_P1_4
    psrad       xmm3, DESCALE_P1_4

    packssdw    xmm1, xmm2        ; xmm1=data0=(00 01 02 03 04 05 06 07)
    packssdw    xmm5, xmm3        ; xmm5=data3=(30 31 32 33 34 35 36 37)

    ; Rows 1 and 2 come from tmp12 +/- the spilled odd-part tmp0.
    movdqa      xmm7, XMMWORD [wk(0)]  ; xmm7=tmp0L
    movdqa      xmm6, XMMWORD [wk(1)]  ; xmm6=tmp0H

    movdqa      xmm2, xmm4
    movdqa      xmm3, xmm0
    paddd       xmm4, xmm7              ; xmm4=data1L
    paddd       xmm0, xmm6              ; xmm0=data1H
    psubd       xmm2, xmm7              ; xmm2=data2L
    psubd       xmm3, xmm6              ; xmm3=data2H

    movdqa      xmm7, [GOTOFF(ebx,PD_DESCALE_P1_4)]  ; xmm7=[PD_DESCALE_P1_4]

    paddd       xmm4, xmm7
    paddd       xmm0, xmm7
    psrad       xmm4, DESCALE_P1_4
    psrad       xmm0, DESCALE_P1_4
    paddd       xmm2, xmm7
    paddd       xmm3, xmm7
    psrad       xmm2, DESCALE_P1_4
    psrad       xmm3, DESCALE_P1_4

    packssdw    xmm4, xmm0        ; xmm4=data1=(10 11 12 13 14 15 16 17)
    packssdw    xmm2, xmm3        ; xmm2=data2=(20 21 22 23 24 25 26 27)

    ; Transpose the 4-row x 8-column result so that each register holds two
    ; whole columns (four words each), the layout pass 2 expects.
    movdqa      xmm6, xmm1        ; transpose coefficients(phase 1)
    punpcklwd   xmm1, xmm4        ; xmm1=(00 10 01 11 02 12 03 13)
    punpckhwd   xmm6, xmm4        ; xmm6=(04 14 05 15 06 16 07 17)
    movdqa      xmm7, xmm2        ; transpose coefficients(phase 1)
    punpcklwd   xmm2, xmm5        ; xmm2=(20 30 21 31 22 32 23 33)
    punpckhwd   xmm7, xmm5        ; xmm7=(24 34 25 35 26 36 27 37)

    movdqa      xmm0, xmm1        ; transpose coefficients(phase 2)
    punpckldq   xmm1, xmm2        ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
    punpckhdq   xmm0, xmm2        ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
    movdqa      xmm3, xmm6        ; transpose coefficients(phase 2)
    punpckldq   xmm6, xmm7        ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
    punpckhdq   xmm3, xmm7        ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
.column_end:
    ; Both pass-1 paths arrive here with
    ;   xmm1=[col0 col1]  xmm0=[col2 col3]  xmm6=[col4 col5]  xmm3=[col6 col7]

    ; -- Prefetch the next coefficient block

    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]

    ; ---- Pass 2: process rows, store into output array.

    ; eax was overwritten by the zero-column test, so the frame address is
    ; reloaded from the aligned slot before fetching the output arguments.
    mov         eax, [original_ebp]
    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
    mov         eax, JDIMENSION [output_col(eax)]

    ; -- Even part

    ; Widen col0 (low half of xmm1) to 32 bits, scaled by 2^(CONST_BITS+1).
    pxor        xmm4, xmm4
    punpcklwd   xmm4, xmm1               ; xmm4=tmp0
    psrad       xmm4, (16-CONST_BITS-1)  ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1

    ; -- Odd part

    ; Pair col1 with col3 and col5 with col7 (high halves of the registers).
    punpckhwd   xmm1, xmm0
    punpckhwd   xmm6, xmm3
    movdqa      xmm5, xmm1
    movdqa      xmm2, xmm6
    pmaddwd     xmm1, [GOTOFF(ebx,PW_F256_F089)]    ; xmm1=(tmp2)
    pmaddwd     xmm6, [GOTOFF(ebx,PW_MF060_MF050)]  ; xmm6=(tmp2)
    pmaddwd     xmm5, [GOTOFF(ebx,PW_F106_MF217)]   ; xmm5=(tmp0)
    pmaddwd     xmm2, [GOTOFF(ebx,PW_F145_MF021)]   ; xmm2=(tmp0)

    paddd       xmm6, xmm1              ; xmm6=tmp2
    paddd       xmm2, xmm5              ; xmm2=tmp0

    ; -- Even part

    ; Pair col2 with col6 (low halves); col4 is not used.
    punpcklwd   xmm0, xmm3
    pmaddwd     xmm0, [GOTOFF(ebx,PW_F184_MF076)]  ; xmm0=tmp2

    movdqa      xmm7, xmm4
    paddd       xmm4, xmm0              ; xmm4=tmp10
    psubd       xmm7, xmm0              ; xmm7=tmp12

    ; -- Final output stage

    movdqa      xmm1, [GOTOFF(ebx,PD_DESCALE_P2_4)]  ; xmm1=[PD_DESCALE_P2_4]

    movdqa      xmm5, xmm4
    movdqa      xmm3, xmm7
    paddd       xmm4, xmm6              ; xmm4=data0=(00 10 20 30)
    paddd       xmm7, xmm2              ; xmm7=data1=(01 11 21 31)
    psubd       xmm5, xmm6              ; xmm5=data3=(03 13 23 33)
    psubd       xmm3, xmm2              ; xmm3=data2=(02 12 22 32)

    ; Round (add half) and shift right by DESCALE_P2_4.
    paddd       xmm4, xmm1
    paddd       xmm7, xmm1
    psrad       xmm4, DESCALE_P2_4
    psrad       xmm7, DESCALE_P2_4
    paddd       xmm5, xmm1
    paddd       xmm3, xmm1
    psrad       xmm5, DESCALE_P2_4
    psrad       xmm3, DESCALE_P2_4

    packssdw    xmm4, xmm3              ; xmm4=(00 10 20 30 02 12 22 32)
    packssdw    xmm7, xmm5              ; xmm7=(01 11 21 31 03 13 23 33)

    movdqa      xmm0, xmm4              ; transpose coefficients(phase 1)
    punpcklwd   xmm4, xmm7              ; xmm4=(00 01 10 11 20 21 30 31)
    punpckhwd   xmm0, xmm7              ; xmm0=(02 03 12 13 22 23 32 33)

    movdqa      xmm6, xmm4              ; transpose coefficients(phase 2)
    punpckldq   xmm4, xmm0              ; xmm4=(00 01 02 03 10 11 12 13)
    punpckhdq   xmm6, xmm0              ; xmm6=(20 21 22 23 30 31 32 33)

    ; Saturate to signed bytes, then add CENTERJSAMPLE to every byte.
    packsswb    xmm4, xmm6        ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
    paddb       xmm4, [GOTOFF(ebx,PB_CENTERJSAMP)]

    ; Rotate the register by 1, 2 and 3 dwords so each output row sits in
    ; the low dword of its own register, ready for movd.
    pshufd      xmm2, xmm4, 0x39  ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
    pshufd      xmm1, xmm4, 0x4E  ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
    pshufd      xmm3, xmm4, 0x93  ; xmm3=(30 31 32 33 00 01 02 03 10 ..)

    ; Store four samples to each of the four output rows at output_col.
    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
    mov         esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
    movd        XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
    movd        XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
    mov         edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
    mov         esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
    movd        XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
    movd        XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3

    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; unused
    POPPIC      ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- original ebp
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Perform dequantization and inverse DCT on one block of coefficients,
; producing a reduced-size 2x2 output block.
;
; GLOBAL(void)
; jsimd_idct_2x2_sse2(void *dct_table, JCOEFPTR coef_block,
;                     JSAMPARRAY output_buf, JDIMENSION output_col)
;
; Arguments are read from the stack at the offsets given by the %defines
; below, relative to ebp (a conventional, unaligned frame; this routine
; uses no stack temporaries).
;
; Inputs:  coef_block and dct_table are read with movdqa/pmullw memory
;          operands, so both must be 16-byte aligned.  Only coefficient
;          rows 0, 1, 3, 5 and 7 are read, and within them only columns
;          0, 1, 3, 5 and 7 contribute to the result.
; Output:  one 16-bit word (two packed samples) is stored to each of
;          output_buf[0..1] at offset output_col.
; Clobbers: eax, ecx, edx, xmm0-xmm7.  ebx, esi and edi are saved and
;          restored.  ebx is loaded by GET_GOT (from jsimdext.inc) as the
;          base for GOTOFF() and is reused as scratch after the last
;          constant access.
;

%define dct_table(b)   (b) + 8          ; void *dct_table
%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
%define output_col(b)  (b) + 20         ; JDIMENSION output_col

    align       32
    GLOBAL_FUNCTION(jsimd_idct_2x2_sse2)

EXTN(jsimd_idct_2x2_sse2):
    push        ebp
    mov         ebp, esp
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    GET_GOT     ebx                     ; get GOT address

    ; ---- Pass 1: process columns from input.

    mov         edx, POINTER [dct_table(ebp)]    ; quantptr
    mov         esi, JCOEFPTR [coef_block(ebp)]  ; inptr

    ; | input:                  | result:        |
    ; | 00 01 ** 03 ** 05 ** 07 |                |
    ; | 10 11 ** 13 ** 15 ** 17 |                |
    ; | ** ** ** ** ** ** ** ** |                |
    ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
    ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
    ; | 50 51 ** 53 ** 55 ** 57 |                |
    ; | ** ** ** ** ** ** ** ** |                |
    ; | 70 71 ** 73 ** 75 ** 77 |                |

    ; -- Odd part

    ; Load and dequantize rows 1, 3, 5 and 7.
    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    movdqa      xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    pmullw      xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
    pmullw      xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
    movdqa      xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
    pmullw      xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
    pmullw      xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]

    ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
    ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)

    ; Build a mask that keeps the upper (odd-column) word of each dword.
    pcmpeqd     xmm7, xmm7
    pslld       xmm7, WORD_BIT          ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}

    ; Column 0 (and, as a by-product, columns 1 and 3): interleave the low
    ; halves of the row pairs so pmaddwd forms in1*c0 + in3*c1 and
    ; in5*c0 + in7*c1 per dword.
    movdqa      xmm4, xmm0              ; xmm4=(10 11 ** 13 ** 15 ** 17)
    movdqa      xmm5, xmm2              ; xmm5=(50 51 ** 53 ** 55 ** 57)
    punpcklwd   xmm4, xmm1              ; xmm4=(10 30 11 31 ** ** 13 33)
    punpcklwd   xmm5, xmm3              ; xmm5=(50 70 51 71 ** ** 53 73)
    pmaddwd     xmm4, [GOTOFF(ebx,PW_F362_MF127)]
    pmaddwd     xmm5, [GOTOFF(ebx,PW_F085_MF072)]

    ; Columns 1, 3, 5, 7: shift the odd-column word of rows 1/5 down into the
    ; low word of each dword and merge it with the masked odd-column word of
    ; rows 3/7, giving the same (a, b) pairing without an unpack.
    psrld       xmm0, WORD_BIT          ; xmm0=(11 -- 13 -- 15 -- 17 --)
    pand        xmm1, xmm7              ; xmm1=(-- 31 -- 33 -- 35 -- 37)
    psrld       xmm2, WORD_BIT          ; xmm2=(51 -- 53 -- 55 -- 57 --)
    pand        xmm3, xmm7              ; xmm3=(-- 71 -- 73 -- 75 -- 77)
    por         xmm0, xmm1              ; xmm0=(11 31 13 33 15 35 17 37)
    por         xmm2, xmm3              ; xmm2=(51 71 53 73 55 75 57 77)
    pmaddwd     xmm0, [GOTOFF(ebx,PW_F362_MF127)]
    pmaddwd     xmm2, [GOTOFF(ebx,PW_F085_MF072)]

    paddd       xmm4, xmm5              ; xmm4=tmp0[col0 col1 **** col3]
    paddd       xmm0, xmm2              ; xmm0=tmp0[col1 col3 col5 col7]

    ; -- Even part

    movdqa      xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
    pmullw      xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]

    ; xmm6=(00 01 ** 03 ** 05 ** 07)

    ; Move each wanted word into the upper half of its dword (shift for the
    ; even columns, mask for the odd ones); the arithmetic right shift then
    ; leaves the sign-extended value scaled by 2^(CONST_BITS+2).
    movdqa      xmm1, xmm6              ; xmm1=(00 01 ** 03 ** 05 ** 07)
    pslld       xmm6, WORD_BIT          ; xmm6=(-- 00 -- ** -- ** -- **)
    pand        xmm1, xmm7              ; xmm1=(-- 01 -- 03 -- 05 -- 07)
    psrad       xmm6, (WORD_BIT-CONST_BITS-2)  ; xmm6=tmp10[col0 **** **** ****]
    psrad       xmm1, (WORD_BIT-CONST_BITS-2)  ; xmm1=tmp10[col1 col3 col5 col7]

    ; -- Final output stage

    movdqa      xmm3, xmm6
    movdqa      xmm5, xmm1
    paddd       xmm6, xmm4      ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
    paddd       xmm1, xmm0      ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
    psubd       xmm3, xmm4      ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
    psubd       xmm5, xmm0      ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)

    movdqa      xmm2, [GOTOFF(ebx,PD_DESCALE_P1_2)]  ; xmm2=[PD_DESCALE_P1_2]

    punpckldq   xmm6, xmm3              ; xmm6=(A0 B0 ** **)

    ; Regroup so the A/B values sit in the word-pair order pass 2 needs.
    movdqa      xmm7, xmm1
    punpcklqdq  xmm1, xmm5              ; xmm1=(A1 A3 B1 B3)
    punpckhqdq  xmm7, xmm5              ; xmm7=(A5 A7 B5 B7)

    ; Round (add half) and shift right by DESCALE_P1_2.
    paddd       xmm6, xmm2
    psrad       xmm6, DESCALE_P1_2

    paddd       xmm1, xmm2
    paddd       xmm7, xmm2
    psrad       xmm1, DESCALE_P1_2
    psrad       xmm7, DESCALE_P1_2

    ; -- Prefetch the next coefficient block

    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]

    ; ---- Pass 2: process rows, store into output array.

    mov         edi, JSAMPARRAY [output_buf(ebp)]  ; (JSAMPROW *)
    mov         eax, JDIMENSION [output_col(ebp)]

    ; | input:| result:|
    ; | A0 B0 |        |
    ; | A1 B1 | C0 C1  |
    ; | A3 B3 | D0 D1  |
    ; | A5 B5 |        |
    ; | A7 B7 |        |

    ; -- Odd part

    ; Narrow to words; pmaddwd then forms A1*c0 + A3*c1 (resp. A5/A7) for
    ; row 0 and the same for the B values of row 1.
    packssdw    xmm1, xmm1              ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
    packssdw    xmm7, xmm7              ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
    pmaddwd     xmm1, [GOTOFF(ebx,PW_F362_MF127)]
    pmaddwd     xmm7, [GOTOFF(ebx,PW_F085_MF072)]

    paddd       xmm1, xmm7              ; xmm1=tmp0[row0 row1 row0 row1]

    ; -- Even part

    pslld       xmm6, (CONST_BITS+2)    ; xmm6=tmp10[row0 row1 **** ****]

    ; -- Final output stage

    movdqa      xmm4, xmm6
    paddd       xmm6, xmm1     ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
    psubd       xmm4, xmm1     ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)

    punpckldq   xmm6, xmm4              ; xmm6=(C0 D0 C1 D1)

    ; Round (add half) and shift right by DESCALE_P2_2.
    paddd       xmm6, [GOTOFF(ebx,PD_DESCALE_P2_2)]
    psrad       xmm6, DESCALE_P2_2

    ; Saturate down to signed bytes, then add CENTERJSAMPLE to every byte.
    packssdw    xmm6, xmm6              ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
    packsswb    xmm6, xmm6              ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
    paddb       xmm6, [GOTOFF(ebx,PB_CENTERJSAMP)]

    ; No GOTOFF access follows, so ebx is free to be used as scratch here;
    ; the caller's value is restored by the pop below.
    pextrw      ebx, xmm6, 0x00         ; ebx=(C0 D0 -- --)
    pextrw      ecx, xmm6, 0x01         ; ecx=(C1 D1 -- --)

    ; Store two samples to each of the two output rows at output_col.
    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
    mov         esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
    mov         word [edx+eax*SIZEOF_JSAMPLE], bx
    mov         word [esi+eax*SIZEOF_JSAMPLE], cx

    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32