jidctflt-3dn.asm (17391B)
;
; jidctflt.asm - floating-point IDCT (3DNow! & MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2024, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.
;
; This file contains a floating-point implementation of the inverse DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jidctflt.c; see the jidctflt.c for more details.

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

; Constant table used by jsimd_idct_float_3dnow.  Every PD_* entry is a pair
; of identical single-precision values (times 2 dd), i.e. one 64-bit operand
; holding the same multiplier in both packed lanes.

    ALIGNZ      32
    GLOBAL_DATA(jconst_idct_float_3dnow)

EXTN(jconst_idct_float_3dnow):

; IDCT rotation multipliers.
PD_1_414        times 2 dd 1.414213562373095048801689
PD_1_847        times 2 dd 1.847759065022573512256366
PD_1_082        times 2 dd 1.082392200292393968799446
PD_2_613        times 2 dd 2.613125929752753055713286
; Rounding bias: the row pass adds this to each result and then keeps only
; the low 16 bits of the sum, which its comments describe as
; roundint(data/8).
PD_RNDINT_MAGIC times 2 dd 100663296.0  ; (float)(0x00C00000 << 3)
; Eight copies of CENTERJSAMPLE, added bytewise (paddb) to the packed output
; samples in the row pass.
PB_CENTERJSAMP  times 8 db CENTERJSAMPLE

    ALIGNZ      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_float_3dnow(void *dct_table, JCOEFPTR coef_block,
;                        JSAMPARRAY output_buf, JDIMENSION output_col)
;
; Arguments are read from the stack through the offset macros below.
; ebx, esi and edi are saved and restored; eax, ecx, edx and mm0-mm7 are
; used as scratch.  FEMMS is executed before returning.
;
; The routine runs two passes, each handling two columns/rows per iteration
; (DCTSIZE/2 iterations):
;   Pass 1 dequantizes the coefficients, transforms columns and writes the
;          transposed result to the on-stack FAST_FLOAT workspace.
;   Pass 2 transforms rows from the workspace, rounds, packs to bytes, adds
;          the sample-centering bias and stores two output rows of 8 samples.
;
; In the lane comments, a pair such as (20 21) names the two packed values
; by <row><column>.
;

; Argument offsets relative to the frame base b (the address of the saved
; ebp): +4 is the return address, so the first argument is at +8.
%define dct_table(b)   (b) + 8          ; void *dct_table
%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
%define output_col(b)  (b) + 20         ; JDIMENSION output_col

; Locals, addressed downward from the 8-byte-aligned ebp set up in the
; prologue.  [original_ebp] holds the unaligned frame base saved there.
%define original_ebp  ebp + 0
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
                                        ; mmword wk[WK_NUM]
%define WK_NUM        2
%define workspace     wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
                                        ; FAST_FLOAT workspace[DCTSIZE2]

    align       32
    GLOBAL_FUNCTION(jsimd_idct_float_3dnow)

EXTN(jsimd_idct_float_3dnow):
    push        ebp
    mov         eax, esp                    ; eax = frame base (-> saved ebp)
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
    mov         [esp], eax                  ; stash frame base at [original_ebp]
    mov         ebp, esp                    ; ebp = aligned ebp
    lea         esp, [workspace]            ; reserve wk[] and workspace[]
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    GET_GOT     ebx                     ; get GOT address

    ; ---- Pass 1: process columns from input, store into work array.

    ; eax still holds the frame base from the prologue, so the reload below
    ; stays commented out (assumes GET_GOT leaves eax intact -- it is defined
    ; in jsimdext.inc; confirm there).
;   mov         eax, [original_ebp]
    mov         edx, POINTER [dct_table(eax)]    ; quantptr
    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
    lea         edi, [workspace]                 ; FAST_FLOAT *wsptr
    mov         ecx, DCTSIZE/2                   ; ctr
    ALIGNX      16, 7
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
    ; Each dword load covers two adjacent 16-bit coefficients (two columns).
    ; OR rows 1..7 together; if every AC term of both columns is zero, take
    ; the DC-only shortcut.
    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
    jnz         short .columnDCT

    ; ebx is borrowed as a second accumulator, so the GOT address is saved.
    PUSHPIC     ebx                     ; save GOT address
    mov         ebx, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
    mov         eax, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
    or          ebx, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
    or          eax, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
    or          ebx, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
    or          eax, ebx
    ; The jnz below consumes the flags of the `or` above, so POPPIC must not
    ; modify flags (presumably a plain pop or empty; see jsimdext.inc).
    POPPIC      ebx                     ; restore GOT address
    jnz         short .columnDCT

    ; -- AC terms all zero

    movd        mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]

    ; Sign-extend the two packed words to dwords, then convert to float.
    punpcklwd   mm0, mm0
    psrad       mm0, (DWORD_BIT-WORD_BIT)
    pi2fd       mm0, mm0

    pfmul       mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

    ; Broadcast each column's dequantized DC value across its own register.
    movq        mm1, mm0
    punpckldq   mm0, mm0
    punpckhdq   mm1, mm1

    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0
    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0
    movq        MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0
    movq        MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1
    movq        MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1
    movq        MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
    jmp         near .nextcolumn
    ALIGNX      16, 7
%endif
.columnDCT:

    ; -- Even part

    movd        mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
    movd        mm1, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
    movd        mm2, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
    movd        mm3, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]

    ; Sign-extend word pairs to dwords, convert to float, then dequantize.
    punpcklwd   mm0, mm0
    punpcklwd   mm1, mm1
    psrad       mm0, (DWORD_BIT-WORD_BIT)
    psrad       mm1, (DWORD_BIT-WORD_BIT)
    pi2fd       mm0, mm0
    pi2fd       mm1, mm1

    pfmul       mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    pfmul       mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

    punpcklwd   mm2, mm2
    punpcklwd   mm3, mm3
    psrad       mm2, (DWORD_BIT-WORD_BIT)
    psrad       mm3, (DWORD_BIT-WORD_BIT)
    pi2fd       mm2, mm2
    pi2fd       mm3, mm3

    pfmul       mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    pfmul       mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

    movq        mm4, mm0
    movq        mm5, mm1
    pfsub       mm0, mm2                ; mm0=tmp11
    pfsub       mm1, mm3
    pfadd       mm4, mm2                ; mm4=tmp10
    pfadd       mm5, mm3                ; mm5=tmp13

    pfmul       mm1, [GOTOFF(ebx,PD_1_414)]
    pfsub       mm1, mm5                ; mm1=tmp12

    movq        mm6, mm4
    movq        mm7, mm0
    pfsub       mm4, mm5                ; mm4=tmp3
    pfsub       mm0, mm1                ; mm0=tmp2
    pfadd       mm6, mm5                ; mm6=tmp0
    pfadd       mm7, mm1                ; mm7=tmp1

    ; All eight MMX registers are needed for the odd part; spill tmp2/tmp3.
    movq        MMWORD [wk(1)], mm4     ; tmp3
    movq        MMWORD [wk(0)], mm0     ; tmp2

    ; -- Odd part

    movd        mm2, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
    movd        mm3, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
    movd        mm5, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
    movd        mm1, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]

    punpcklwd   mm2, mm2
    punpcklwd   mm3, mm3
    psrad       mm2, (DWORD_BIT-WORD_BIT)
    psrad       mm3, (DWORD_BIT-WORD_BIT)
    pi2fd       mm2, mm2
    pi2fd       mm3, mm3

    pfmul       mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    pfmul       mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

    punpcklwd   mm5, mm5
    punpcklwd   mm1, mm1
    psrad       mm5, (DWORD_BIT-WORD_BIT)
    psrad       mm1, (DWORD_BIT-WORD_BIT)
    pi2fd       mm5, mm5
    pi2fd       mm1, mm1

    pfmul       mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    pfmul       mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

    movq        mm4, mm2
    movq        mm0, mm5
    pfadd       mm2, mm1                ; mm2=z11
    pfadd       mm5, mm3                ; mm5=z13
    pfsub       mm4, mm1                ; mm4=z12
    pfsub       mm0, mm3                ; mm0=z10

    movq        mm1, mm2
    pfsub       mm2, mm5
    pfadd       mm1, mm5                ; mm1=tmp7

    pfmul       mm2, [GOTOFF(ebx,PD_1_414)]  ; mm2=tmp11

    movq        mm3, mm0
    pfadd       mm0, mm4
    pfmul       mm0, [GOTOFF(ebx,PD_1_847)]  ; mm0=z5
    pfmul       mm3, [GOTOFF(ebx,PD_2_613)]  ; mm3=(z10 * 2.613125930)
    pfmul       mm4, [GOTOFF(ebx,PD_1_082)]  ; mm4=(z12 * 1.082392200)
    ; pfsubr is the reversed subtract: mm3 = mm0 - mm3.
    pfsubr      mm3, mm0                ; mm3=tmp12
    pfsub       mm4, mm0                ; mm4=tmp10

    ; -- Final output stage

    pfsub       mm3, mm1                ; mm3=tmp6
    movq        mm5, mm6
    movq        mm0, mm7
    pfadd       mm6, mm1                ; mm6=data0=(00 01)
    pfadd       mm7, mm3                ; mm7=data1=(10 11)
    pfsub       mm5, mm1                ; mm5=data7=(70 71)
    pfsub       mm0, mm3                ; mm0=data6=(60 61)
    pfsub       mm2, mm3                ; mm2=tmp5

    movq        mm1, mm6                ; transpose coefficients
    punpckldq   mm6, mm7                ; mm6=(00 10)
    punpckhdq   mm1, mm7                ; mm1=(01 11)
    movq        mm3, mm0                ; transpose coefficients
    punpckldq   mm0, mm5                ; mm0=(60 70)
    punpckhdq   mm3, mm5                ; mm3=(61 71)

    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6
    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
    movq        MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
    movq        MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3

    movq        mm7, MMWORD [wk(0)]     ; mm7=tmp2
    movq        mm5, MMWORD [wk(1)]     ; mm5=tmp3

    pfadd       mm4, mm2                ; mm4=tmp4
    movq        mm6, mm7
    movq        mm1, mm5
    pfadd       mm7, mm2                ; mm7=data2=(20 21)
    pfadd       mm5, mm4                ; mm5=data4=(40 41)
    pfsub       mm6, mm2                ; mm6=data5=(50 51)
    pfsub       mm1, mm4                ; mm1=data3=(30 31)

    movq        mm0, mm7                ; transpose coefficients
    punpckldq   mm7, mm1                ; mm7=(20 30)
    punpckhdq   mm0, mm1                ; mm0=(21 31)
    movq        mm3, mm5                ; transpose coefficients
    punpckldq   mm5, mm6                ; mm5=(40 50)
    punpckhdq   mm3, mm6                ; mm3=(41 51)

    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7
    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0
    movq        MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
    movq        MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3

.nextcolumn:
    ; Advance by two input columns; the workspace is written transposed, so
    ; wsptr advances by two full rows.
    add         esi, byte 2*SIZEOF_JCOEF               ; coef_block
    add         edx, byte 2*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
    add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
    dec         ecx                                    ; ctr
    jnz         near .columnloop

    ; -- Prefetch the next coefficient block

    ; esi has advanced by 8 coefficients in total, hence the (DCTSIZE2-8)
    ; offset to reach the end of the current block.
    prefetch    [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
    prefetch    [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
    prefetch    [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
    prefetch    [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]

    ; ---- Pass 2: process rows from work array, store into output array.

    ; eax was clobbered by the zero-column test, so reload the frame base.
    mov         eax, [original_ebp]
    lea         esi, [workspace]                   ; FAST_FLOAT *wsptr
    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
    mov         eax, JDIMENSION [output_col(eax)]
    mov         ecx, DCTSIZE/2                     ; ctr
    ALIGNX      16, 7
.rowloop:

    ; -- Even part

    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
    movq        mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
    movq        mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]

    movq        mm4, mm0
    movq        mm5, mm1
    pfsub       mm0, mm2                ; mm0=tmp11
    pfsub       mm1, mm3
    pfadd       mm4, mm2                ; mm4=tmp10
    pfadd       mm5, mm3                ; mm5=tmp13

    pfmul       mm1, [GOTOFF(ebx,PD_1_414)]
    pfsub       mm1, mm5                ; mm1=tmp12

    movq        mm6, mm4
    movq        mm7, mm0
    pfsub       mm4, mm5                ; mm4=tmp3
    pfsub       mm0, mm1                ; mm0=tmp2
    pfadd       mm6, mm5                ; mm6=tmp0
    pfadd       mm7, mm1                ; mm7=tmp1

    movq        MMWORD [wk(1)], mm4     ; tmp3
    movq        MMWORD [wk(0)], mm0     ; tmp2

    ; -- Odd part

    movq        mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
    movq        mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
    movq        mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
    movq        mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]

    movq        mm4, mm2
    movq        mm0, mm5
    pfadd       mm2, mm1                ; mm2=z11
    pfadd       mm5, mm3                ; mm5=z13
    pfsub       mm4, mm1                ; mm4=z12
    pfsub       mm0, mm3                ; mm0=z10

    movq        mm1, mm2
    pfsub       mm2, mm5
    pfadd       mm1, mm5                ; mm1=tmp7

    pfmul       mm2, [GOTOFF(ebx,PD_1_414)]  ; mm2=tmp11

    movq        mm3, mm0
    pfadd       mm0, mm4
    pfmul       mm0, [GOTOFF(ebx,PD_1_847)]  ; mm0=z5
    pfmul       mm3, [GOTOFF(ebx,PD_2_613)]  ; mm3=(z10 * 2.613125930)
    pfmul       mm4, [GOTOFF(ebx,PD_1_082)]  ; mm4=(z12 * 1.082392200)
    ; pfsubr is the reversed subtract: mm3 = mm0 - mm3.
    pfsubr      mm3, mm0                ; mm3=tmp12
    pfsub       mm4, mm0                ; mm4=tmp10

    ; -- Final output stage

    pfsub       mm3, mm1                ; mm3=tmp6
    movq        mm5, mm6
    movq        mm0, mm7
    pfadd       mm6, mm1                ; mm6=data0=(00 10)
    pfadd       mm7, mm3                ; mm7=data1=(01 11)
    pfsub       mm5, mm1                ; mm5=data7=(07 17)
    pfsub       mm0, mm3                ; mm0=data6=(06 16)
    pfsub       mm2, mm3                ; mm2=tmp5

    ; Round to integer by adding the magic bias: the wanted value ends up in
    ; the low word of each dword, the high word is don't-care (**).
    movq        mm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; mm1=[PD_RNDINT_MAGIC]
    pcmpeqd     mm3, mm3
    psrld       mm3, WORD_BIT           ; mm3={0xFFFF 0x0000 0xFFFF 0x0000}

    pfadd       mm6, mm1                ; mm6=roundint(data0/8)=(00 ** 10 **)
    pfadd       mm7, mm1                ; mm7=roundint(data1/8)=(01 ** 11 **)
    pfadd       mm0, mm1                ; mm0=roundint(data6/8)=(06 ** 16 **)
    pfadd       mm5, mm1                ; mm5=roundint(data7/8)=(07 ** 17 **)

    ; Interleave even/odd columns: mask keeps the low word of one register,
    ; the shift moves the low word of the other into the high word.
    pand        mm6, mm3                ; mm6=(00 -- 10 --)
    pslld       mm7, WORD_BIT           ; mm7=(-- 01 -- 11)
    pand        mm0, mm3                ; mm0=(06 -- 16 --)
    pslld       mm5, WORD_BIT           ; mm5=(-- 07 -- 17)
    por         mm6, mm7                ; mm6=(00 01 10 11)
    por         mm0, mm5                ; mm0=(06 07 16 17)

    movq        mm1, MMWORD [wk(0)]     ; mm1=tmp2
    movq        mm3, MMWORD [wk(1)]     ; mm3=tmp3

    pfadd       mm4, mm2                ; mm4=tmp4
    movq        mm7, mm1
    movq        mm5, mm3
    pfadd       mm1, mm2                ; mm1=data2=(02 12)
    pfadd       mm3, mm4                ; mm3=data4=(04 14)
    pfsub       mm7, mm2                ; mm7=data5=(05 15)
    pfsub       mm5, mm4                ; mm5=data3=(03 13)

    ; mm1/mm3 were reused above, so the bias and the mask are rebuilt here.
    movq        mm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; mm2=[PD_RNDINT_MAGIC]
    pcmpeqd     mm4, mm4
    psrld       mm4, WORD_BIT           ; mm4={0xFFFF 0x0000 0xFFFF 0x0000}

    pfadd       mm3, mm2                ; mm3=roundint(data4/8)=(04 ** 14 **)
    pfadd       mm7, mm2                ; mm7=roundint(data5/8)=(05 ** 15 **)
    pfadd       mm1, mm2                ; mm1=roundint(data2/8)=(02 ** 12 **)
    pfadd       mm5, mm2                ; mm5=roundint(data3/8)=(03 ** 13 **)

    pand        mm3, mm4                ; mm3=(04 -- 14 --)
    pslld       mm7, WORD_BIT           ; mm7=(-- 05 -- 15)
    pand        mm1, mm4                ; mm1=(02 -- 12 --)
    pslld       mm5, WORD_BIT           ; mm5=(-- 03 -- 13)
    por         mm3, mm7                ; mm3=(04 05 14 15)
    por         mm1, mm5                ; mm1=(02 03 12 13)

    movq        mm2, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; mm2=[PB_CENTERJSAMP]

    ; Saturate the signed words to signed bytes, then add the centering
    ; bias bytewise.
    packsswb    mm6, mm3                ; mm6=(00 01 10 11 04 05 14 15)
    packsswb    mm1, mm0                ; mm1=(02 03 12 13 06 07 16 17)
    paddb       mm6, mm2
    paddb       mm1, mm2

    movq        mm4, mm6                ; transpose coefficients(phase 2)
    punpcklwd   mm6, mm1                ; mm6=(00 01 02 03 10 11 12 13)
    punpckhwd   mm4, mm1                ; mm4=(04 05 06 07 14 15 16 17)

    movq        mm7, mm6                ; transpose coefficients(phase 3)
    punpckldq   mm6, mm4                ; mm6=(00 01 02 03 04 05 06 07)
    punpckhdq   mm7, mm4                ; mm7=(10 11 12 13 14 15 16 17)

    ; ebx is borrowed as the second row pointer, so the GOT address is saved.
    PUSHPIC     ebx                     ; save GOT address

    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
    mov         ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7

    POPPIC      ebx                     ; restore GOT address

    add         esi, byte 2*SIZEOF_FAST_FLOAT  ; wsptr
    add         edi, byte 2*SIZEOF_JSAMPROW
    dec         ecx                            ; ctr
    jnz         near .rowloop

    femms                               ; empty MMX/3DNow! state

    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- original ebp
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32