jfdctint-mmx.asm (24349B)
;
; jfdctint.asm - accurate integer FDCT (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2020, 2024, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.
;
; This file contains a slower but more accurate integer implementation of the
; forward DCT (Discrete Cosine Transform). The following code is based
; directly on the IJG's original jfdctint.c; see the jfdctint.c for
; more details.

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------

; Fixed-point parameters.
;   CONST_BITS - number of fraction bits in the F_x_xxx multipliers below
;                (with CONST_BITS == 13 each value is the constant * 8192,
;                rounded).
;   PASS1_BITS - extra fraction bits kept in the pass 1 (row) results; pass 2
;                shifts them out again.
%define CONST_BITS  13
%define PASS1_BITS  2

; Right-shift amounts applied to the 32-bit products in pass 1 and pass 2.
%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
%define DESCALE_P2  (CONST_BITS + PASS1_BITS)

%if CONST_BITS == 13
; Precomputed FIX(x) values for CONST_BITS == 13.
F_0_298 equ  2446  ; FIX(0.298631336)
F_0_390 equ  3196  ; FIX(0.390180644)
F_0_541 equ  4433  ; FIX(0.541196100)
F_0_765 equ  6270  ; FIX(0.765366865)
F_0_899 equ  7373  ; FIX(0.899976223)
F_1_175 equ  9633  ; FIX(1.175875602)
F_1_501 equ 12299  ; FIX(1.501321110)
F_1_847 equ 15137  ; FIX(1.847759065)
F_1_961 equ 16069  ; FIX(1.961570560)
F_2_053 equ 16819  ; FIX(2.053119869)
F_2_562 equ 20995  ; FIX(2.562915447)
F_3_072 equ 25172  ; FIX(3.072711026)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
; Instead, each constant is written as an integer pre-scaled by 2^30 and
; rounded down to CONST_BITS fraction bits with DESCALE(x, 30 - CONST_BITS):
; add half of the divisor, then shift right by n.
%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
%endif

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    ALIGNZ      32
    GLOBAL_DATA(jconst_fdct_islow_mmx)

; Constant table used by jsimd_fdct_islow_mmx (addressed there through
; GOTOFF(ebx, <label>)).
;
; PW_* multiplier pairs: each entry is two 16-bit constants repeated twice
; ("times 2 dw a, b" = four words = one 64-bit MMX operand).  Multiplying
; interleaved (x, y) word pairs by such an entry with pmaddwd yields
; x*a + y*b in each 32-bit result lane.  The names spell the two factors in
; order: F = positive, MF = negative, and the digits are the magnitude in
; hundredths, truncated (e.g. F130 = 0.541 + 0.765 ~= 1.306).
;
; PD_DESCALE_P1 / PD_DESCALE_P2: 32-bit rounding terms, 1 << (n - 1), added
; before an arithmetic right shift by n = DESCALE_P1 / DESCALE_P2.
; PW_DESCALE_P2X: 16-bit rounding term added before the word-wise right shift
; by PASS1_BITS in pass 2.

EXTN(jconst_fdct_islow_mmx):

PW_F130_F054   times 2 dw  (F_0_541 + F_0_765),  F_0_541
PW_F054_MF130  times 2 dw  F_0_541, (F_0_541 - F_1_847)
PW_MF078_F117  times 2 dw  (F_1_175 - F_1_961),  F_1_175
PW_F117_F078   times 2 dw  F_1_175, (F_1_175 - F_0_390)
PW_MF060_MF089 times 2 dw  (F_0_298 - F_0_899), -F_0_899
PW_MF089_F060  times 2 dw -F_0_899, (F_1_501 - F_0_899)
PW_MF050_MF256 times 2 dw  (F_2_053 - F_2_562), -F_2_562
PW_MF256_F050  times 2 dw -F_2_562, (F_3_072 - F_2_562)
PD_DESCALE_P1  times 2 dd  1 << (DESCALE_P1 - 1)
PD_DESCALE_P2  times 2 dd  1 << (DESCALE_P2 - 1)
PW_DESCALE_P2X times 4 dw  1 << (PASS1_BITS - 1)

    ALIGNZ      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Perform the forward DCT on one block of samples.
;
; GLOBAL(void)
; jsimd_fdct_islow_mmx(DCTELEM *data)
;
; In:     data = first stack argument; it is read as [eax + 8], where eax is
;         the stack pointer right after "push ebp" (so [eax] = saved ebp,
;         [eax + 4] = return address).
; Out:    no return value.  The coefficients at data are read and overwritten
;         in place by two passes (rows, then columns).
;         NOTE(review): the block is presumably DCTSIZE x DCTSIZE DCTELEMs;
;         DCTSIZE, DCTELEM and MMBLOCK come from the include files - confirm.
; Clobb:  eax, ecx, edx, mm0-mm7, flags.  ebx is used as the base register
;         for GOTOFF() and is wrapped in PUSHPIC/POPPIC (macros from
;         jsimdext.inc; presumably they save/restore it - confirm).
; Stack:  esp is realigned to SIZEOF_MMWORD and WK_NUM mmword scratch slots
;         (wk(0), wk(1)) are reserved below the aligned frame pointer.
; MMX:    emms is executed before returning.
;

%define data(b)       (b) + 8           ; DCTELEM *data

; [original_ebp] is the slot at the aligned ebp where the prologue stores the
; pre-alignment stack pointer; wk(i) addresses scratch slot i below it.
%define original_ebp  ebp + 0
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
%define WK_NUM        2

    align       32
    GLOBAL_FUNCTION(jsimd_fdct_islow_mmx)

EXTN(jsimd_fdct_islow_mmx):
    push        ebp
    mov         eax, esp                    ; eax = pre-alignment esp (-> saved ebp)
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
    mov         [esp], eax                  ; remember the unaligned esp
    mov         ebp, esp                    ; ebp = aligned ebp
    lea         esp, [wk(0)]                ; reserve wk[0..WK_NUM-1]
    PUSHPIC     ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
;   push        esi                     ; unused
;   push        edi                     ; unused

    GET_GOT     ebx                     ; get GOT address

    ; ---- Pass 1: process rows.
    ;
    ; The loop runs DCTSIZE/4 times; edx advances by four rows
    ; (4*DCTSIZE*SIZEOF_DCTELEM bytes) per iteration.  Each iteration loads
    ; eight mmwords (MMBLOCK rows 0-3, two mmwords per row), transposes them
    ; in registers so that dataN holds element N of each of the four rows,
    ; and runs the 1-D DCT on four rows at once.  Outputs are left scaled up
    ; by PASS1_BITS and are stored back without undoing the transpose.

    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
    mov         ecx, DCTSIZE/4
    ALIGNX      16, 7
.rowloop:

    movq        mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
    movq        mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
    movq        mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
    movq        mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]

    ; mm0=(20 21 22 23), mm2=(24 25 26 27)
    ; mm1=(30 31 32 33), mm3=(34 35 36 37)

    movq        mm4, mm0                ; transpose coefficients(phase 1)
    punpcklwd   mm0, mm1                ; mm0=(20 30 21 31)
    punpckhwd   mm4, mm1                ; mm4=(22 32 23 33)
    movq        mm5, mm2                ; transpose coefficients(phase 1)
    punpcklwd   mm2, mm3                ; mm2=(24 34 25 35)
    punpckhwd   mm5, mm3                ; mm5=(26 36 27 37)

    movq        mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
    movq        mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
    movq        mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
    movq        mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]

    ; mm6=(00 01 02 03), mm1=(04 05 06 07)
    ; mm7=(10 11 12 13), mm3=(14 15 16 17)

    ; Spill two half-transposed registers; all eight MMX registers are
    ; needed for the remaining transpose steps.
    movq        MMWORD [wk(0)], mm4     ; wk(0)=(22 32 23 33)
    movq        MMWORD [wk(1)], mm2     ; wk(1)=(24 34 25 35)

    movq        mm4, mm6                ; transpose coefficients(phase 1)
    punpcklwd   mm6, mm7                ; mm6=(00 10 01 11)
    punpckhwd   mm4, mm7                ; mm4=(02 12 03 13)
    movq        mm2, mm1                ; transpose coefficients(phase 1)
    punpcklwd   mm1, mm3                ; mm1=(04 14 05 15)
    punpckhwd   mm2, mm3                ; mm2=(06 16 07 17)

    movq        mm7, mm6                ; transpose coefficients(phase 2)
    punpckldq   mm6, mm0                ; mm6=(00 10 20 30)=data0
    punpckhdq   mm7, mm0                ; mm7=(01 11 21 31)=data1
    movq        mm3, mm2                ; transpose coefficients(phase 2)
    punpckldq   mm2, mm5                ; mm2=(06 16 26 36)=data6
    punpckhdq   mm3, mm5                ; mm3=(07 17 27 37)=data7

    movq        mm0, mm7
    movq        mm5, mm6
    psubw       mm7, mm2                ; mm7=data1-data6=tmp6
    psubw       mm6, mm3                ; mm6=data0-data7=tmp7
    paddw       mm0, mm2                ; mm0=data1+data6=tmp1
    paddw       mm5, mm3                ; mm5=data0+data7=tmp0

    ; Swap: reload the spilled transpose halves and park tmp6/tmp7 in the
    ; same scratch slots until the odd part.
    movq        mm2, MMWORD [wk(0)]     ; mm2=(22 32 23 33)
    movq        mm3, MMWORD [wk(1)]     ; mm3=(24 34 25 35)
    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp6
    movq        MMWORD [wk(1)], mm6     ; wk(1)=tmp7

    movq        mm7, mm4                ; transpose coefficients(phase 2)
    punpckldq   mm4, mm2                ; mm4=(02 12 22 32)=data2
    punpckhdq   mm7, mm2                ; mm7=(03 13 23 33)=data3
    movq        mm6, mm1                ; transpose coefficients(phase 2)
    punpckldq   mm1, mm3                ; mm1=(04 14 24 34)=data4
    punpckhdq   mm6, mm3                ; mm6=(05 15 25 35)=data5

    movq        mm2, mm7
    movq        mm3, mm4
    paddw       mm7, mm1                ; mm7=data3+data4=tmp3
    paddw       mm4, mm6                ; mm4=data2+data5=tmp2
    psubw       mm2, mm1                ; mm2=data3-data4=tmp4
    psubw       mm3, mm6                ; mm3=data2-data5=tmp5

    ; -- Even part

    movq        mm1, mm5
    movq        mm6, mm0
    paddw       mm5, mm7                ; mm5=tmp10
    paddw       mm0, mm4                ; mm0=tmp11
    psubw       mm1, mm7                ; mm1=tmp13
    psubw       mm6, mm4                ; mm6=tmp12

    movq        mm7, mm5
    paddw       mm5, mm0                ; mm5=tmp10+tmp11
    psubw       mm7, mm0                ; mm7=tmp10-tmp11

    ; data0/data4 involve no multiplication; just scale up by PASS1_BITS.
    psllw       mm5, PASS1_BITS         ; mm5=data0
    psllw       mm7, PASS1_BITS         ; mm7=data4

    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
    movq        MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7

    ; (Original)
    ; z1 = (tmp12 + tmp13) * 0.541196100;
    ; data2 = z1 + tmp13 * 0.765366865;
    ; data6 = z1 + tmp12 * -1.847759065;
    ;
    ; (This implementation)
    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);

    ; Interleave (tmp13, tmp12) word pairs so that each pmaddwd lane
    ; computes tmp13*c0 + tmp12*c1 as a 32-bit value (L = low two
    ; elements, H = high two elements).
    movq        mm4, mm1                ; mm1=tmp13
    movq        mm0, mm1
    punpcklwd   mm4, mm6                ; mm6=tmp12
    punpckhwd   mm0, mm6
    movq        mm1, mm4
    movq        mm6, mm0
    pmaddwd     mm4, [GOTOFF(ebx,PW_F130_F054)]   ; mm4=data2L
    pmaddwd     mm0, [GOTOFF(ebx,PW_F130_F054)]   ; mm0=data2H
    pmaddwd     mm1, [GOTOFF(ebx,PW_F054_MF130)]  ; mm1=data6L
    pmaddwd     mm6, [GOTOFF(ebx,PW_F054_MF130)]  ; mm6=data6H

    ; Round (add 1 << (DESCALE_P1 - 1)) and shift right by DESCALE_P1.
    paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P1)]
    paddd       mm0, [GOTOFF(ebx,PD_DESCALE_P1)]
    psrad       mm4, DESCALE_P1
    psrad       mm0, DESCALE_P1
    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
    paddd       mm6, [GOTOFF(ebx,PD_DESCALE_P1)]
    psrad       mm1, DESCALE_P1
    psrad       mm6, DESCALE_P1

    ; Pack the 32-bit results back to 16-bit words (signed saturation).
    packssdw    mm4, mm0                ; mm4=data2
    packssdw    mm1, mm6                ; mm1=data6

    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
    movq        MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1

    ; -- Odd part

    movq        mm5, MMWORD [wk(0)]     ; mm5=tmp6
    movq        mm7, MMWORD [wk(1)]     ; mm7=tmp7

    movq        mm0, mm2                ; mm2=tmp4
    movq        mm6, mm3                ; mm3=tmp5
    paddw       mm0, mm5                ; mm0=z3
    paddw       mm6, mm7                ; mm6=z4

    ; (Original)
    ; z5 = (z3 + z4) * 1.175875602;
    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
    ; z3 += z5;  z4 += z5;
    ;
    ; (This implementation)
    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);

    movq        mm4, mm0
    movq        mm1, mm0
    punpcklwd   mm4, mm6
    punpckhwd   mm1, mm6
    movq        mm0, mm4
    movq        mm6, mm1
    pmaddwd     mm4, [GOTOFF(ebx,PW_MF078_F117)]  ; mm4=z3L
    pmaddwd     mm1, [GOTOFF(ebx,PW_MF078_F117)]  ; mm1=z3H
    pmaddwd     mm0, [GOTOFF(ebx,PW_F117_F078)]   ; mm0=z4L
    pmaddwd     mm6, [GOTOFF(ebx,PW_F117_F078)]   ; mm6=z4H

    ; All eight registers are live here (z3, z4, tmp4..tmp7); spill z3 so
    ; that mm4/mm1 can be reused as temporaries below.
    movq        MMWORD [wk(0)], mm4     ; wk(0)=z3L
    movq        MMWORD [wk(1)], mm1     ; wk(1)=z3H

    ; (Original)
    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
    ;
    ; (This implementation)
    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;

    ; (tmp4, tmp7) pairs -> data7 and data1
    movq        mm4, mm2
    movq        mm1, mm2
    punpcklwd   mm4, mm7
    punpckhwd   mm1, mm7
    movq        mm2, mm4
    movq        mm7, mm1
    pmaddwd     mm4, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm4=tmp4L
    pmaddwd     mm1, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm1=tmp4H
    pmaddwd     mm2, [GOTOFF(ebx,PW_MF089_F060)]   ; mm2=tmp7L
    pmaddwd     mm7, [GOTOFF(ebx,PW_MF089_F060)]   ; mm7=tmp7H

    paddd       mm4, MMWORD [wk(0)]     ; mm4=data7L
    paddd       mm1, MMWORD [wk(1)]     ; mm1=data7H
    paddd       mm2, mm0                ; mm2=data1L
    paddd       mm7, mm6                ; mm7=data1H

    paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P1)]
    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
    psrad       mm4, DESCALE_P1
    psrad       mm1, DESCALE_P1
    paddd       mm2, [GOTOFF(ebx,PD_DESCALE_P1)]
    paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P1)]
    psrad       mm2, DESCALE_P1
    psrad       mm7, DESCALE_P1

    packssdw    mm4, mm1                ; mm4=data7
    packssdw    mm2, mm7                ; mm2=data1

    movq        MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4
    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2

    ; (tmp5, tmp6) pairs -> data5 and data3
    movq        mm1, mm3
    movq        mm7, mm3
    punpcklwd   mm1, mm5
    punpckhwd   mm7, mm5
    movq        mm3, mm1
    movq        mm5, mm7
    pmaddwd     mm1, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm1=tmp5L
    pmaddwd     mm7, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm7=tmp5H
    pmaddwd     mm3, [GOTOFF(ebx,PW_MF256_F050)]   ; mm3=tmp6L
    pmaddwd     mm5, [GOTOFF(ebx,PW_MF256_F050)]   ; mm5=tmp6H

    paddd       mm1, mm0                ; mm1=data5L
    paddd       mm7, mm6                ; mm7=data5H
    paddd       mm3, MMWORD [wk(0)]     ; mm3=data3L
    paddd       mm5, MMWORD [wk(1)]     ; mm5=data3H

    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
    paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P1)]
    psrad       mm1, DESCALE_P1
    psrad       mm7, DESCALE_P1
    paddd       mm3, [GOTOFF(ebx,PD_DESCALE_P1)]
    paddd       mm5, [GOTOFF(ebx,PD_DESCALE_P1)]
    psrad       mm3, DESCALE_P1
    psrad       mm5, DESCALE_P1

    packssdw    mm1, mm7                ; mm1=data5
    packssdw    mm3, mm5                ; mm3=data3

    movq        MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1
    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3

    add         edx, byte 4*DCTSIZE*SIZEOF_DCTELEM  ; next group of four rows
    dec         ecx
    jnz         near .rowloop

    ; ---- Pass 2: process columns.
    ;
    ; The loop again runs DCTSIZE/4 times, but edx now advances by one
    ; mmword (4*SIZEOF_DCTELEM bytes) per iteration and the eight loads come
    ; from MMBLOCK rows 0-7.  The arithmetic is the same as in pass 1, except
    ; that results are descaled by DESCALE_P2 (data0/data4: rounded right
    ; shift by PASS1_BITS), which removes the scaling left by pass 1.
    ; eax still holds the pre-alignment esp; no instruction shown above
    ; writes it.

    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
    mov         ecx, DCTSIZE/4
    ALIGNX      16, 7
.columnloop:

    movq        mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
    movq        mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
    movq        mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
    movq        mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]

    ; mm0=(02 12 22 32), mm2=(42 52 62 72)
    ; mm1=(03 13 23 33), mm3=(43 53 63 73)

    movq        mm4, mm0                ; transpose coefficients(phase 1)
    punpcklwd   mm0, mm1                ; mm0=(02 03 12 13)
    punpckhwd   mm4, mm1                ; mm4=(22 23 32 33)
    movq        mm5, mm2                ; transpose coefficients(phase 1)
    punpcklwd   mm2, mm3                ; mm2=(42 43 52 53)
    punpckhwd   mm5, mm3                ; mm5=(62 63 72 73)

    movq        mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
    movq        mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
    movq        mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
    movq        mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]

    ; mm6=(00 10 20 30), mm1=(40 50 60 70)
    ; mm7=(01 11 21 31), mm3=(41 51 61 71)

    movq        MMWORD [wk(0)], mm4     ; wk(0)=(22 23 32 33)
    movq        MMWORD [wk(1)], mm2     ; wk(1)=(42 43 52 53)

    movq        mm4, mm6                ; transpose coefficients(phase 1)
    punpcklwd   mm6, mm7                ; mm6=(00 01 10 11)
    punpckhwd   mm4, mm7                ; mm4=(20 21 30 31)
    movq        mm2, mm1                ; transpose coefficients(phase 1)
    punpcklwd   mm1, mm3                ; mm1=(40 41 50 51)
    punpckhwd   mm2, mm3                ; mm2=(60 61 70 71)

    movq        mm7, mm6                ; transpose coefficients(phase 2)
    punpckldq   mm6, mm0                ; mm6=(00 01 02 03)=data0
    punpckhdq   mm7, mm0                ; mm7=(10 11 12 13)=data1
    movq        mm3, mm2                ; transpose coefficients(phase 2)
    punpckldq   mm2, mm5                ; mm2=(60 61 62 63)=data6
    punpckhdq   mm3, mm5                ; mm3=(70 71 72 73)=data7

    movq        mm0, mm7
    movq        mm5, mm6
    psubw       mm7, mm2                ; mm7=data1-data6=tmp6
    psubw       mm6, mm3                ; mm6=data0-data7=tmp7
    paddw       mm0, mm2                ; mm0=data1+data6=tmp1
    paddw       mm5, mm3                ; mm5=data0+data7=tmp0

    movq        mm2, MMWORD [wk(0)]     ; mm2=(22 23 32 33)
    movq        mm3, MMWORD [wk(1)]     ; mm3=(42 43 52 53)
    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp6
    movq        MMWORD [wk(1)], mm6     ; wk(1)=tmp7

    movq        mm7, mm4                ; transpose coefficients(phase 2)
    punpckldq   mm4, mm2                ; mm4=(20 21 22 23)=data2
    punpckhdq   mm7, mm2                ; mm7=(30 31 32 33)=data3
    movq        mm6, mm1                ; transpose coefficients(phase 2)
    punpckldq   mm1, mm3                ; mm1=(40 41 42 43)=data4
    punpckhdq   mm6, mm3                ; mm6=(50 51 52 53)=data5

    movq        mm2, mm7
    movq        mm3, mm4
    paddw       mm7, mm1                ; mm7=data3+data4=tmp3
    paddw       mm4, mm6                ; mm4=data2+data5=tmp2
    psubw       mm2, mm1                ; mm2=data3-data4=tmp4
    psubw       mm3, mm6                ; mm3=data2-data5=tmp5

    ; -- Even part

    movq        mm1, mm5
    movq        mm6, mm0
    paddw       mm5, mm7                ; mm5=tmp10
    paddw       mm0, mm4                ; mm0=tmp11
    psubw       mm1, mm7                ; mm1=tmp13
    psubw       mm6, mm4                ; mm6=tmp12

    movq        mm7, mm5
    paddw       mm5, mm0                ; mm5=tmp10+tmp11
    psubw       mm7, mm0                ; mm7=tmp10-tmp11

    ; Rounded arithmetic right shift by PASS1_BITS, done on 16-bit words.
    paddw       mm5, [GOTOFF(ebx,PW_DESCALE_P2X)]
    paddw       mm7, [GOTOFF(ebx,PW_DESCALE_P2X)]
    psraw       mm5, PASS1_BITS         ; mm5=data0
    psraw       mm7, PASS1_BITS         ; mm7=data4

    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
    movq        MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7

    ; (Original)
    ; z1 = (tmp12 + tmp13) * 0.541196100;
    ; data2 = z1 + tmp13 * 0.765366865;
    ; data6 = z1 + tmp12 * -1.847759065;
    ;
    ; (This implementation)
    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);

    movq        mm4, mm1                ; mm1=tmp13
    movq        mm0, mm1
    punpcklwd   mm4, mm6                ; mm6=tmp12
    punpckhwd   mm0, mm6
    movq        mm1, mm4
    movq        mm6, mm0
    pmaddwd     mm4, [GOTOFF(ebx,PW_F130_F054)]   ; mm4=data2L
    pmaddwd     mm0, [GOTOFF(ebx,PW_F130_F054)]   ; mm0=data2H
    pmaddwd     mm1, [GOTOFF(ebx,PW_F054_MF130)]  ; mm1=data6L
    pmaddwd     mm6, [GOTOFF(ebx,PW_F054_MF130)]  ; mm6=data6H

    ; Round (add 1 << (DESCALE_P2 - 1)) and shift right by DESCALE_P2.
    paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P2)]
    paddd       mm0, [GOTOFF(ebx,PD_DESCALE_P2)]
    psrad       mm4, DESCALE_P2
    psrad       mm0, DESCALE_P2
    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
    paddd       mm6, [GOTOFF(ebx,PD_DESCALE_P2)]
    psrad       mm1, DESCALE_P2
    psrad       mm6, DESCALE_P2

    packssdw    mm4, mm0                ; mm4=data2
    packssdw    mm1, mm6                ; mm1=data6

    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
    movq        MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1

    ; -- Odd part

    movq        mm5, MMWORD [wk(0)]     ; mm5=tmp6
    movq        mm7, MMWORD [wk(1)]     ; mm7=tmp7

    movq        mm0, mm2                ; mm2=tmp4
    movq        mm6, mm3                ; mm3=tmp5
    paddw       mm0, mm5                ; mm0=z3
    paddw       mm6, mm7                ; mm6=z4

    ; (Original)
    ; z5 = (z3 + z4) * 1.175875602;
    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
    ; z3 += z5;  z4 += z5;
    ;
    ; (This implementation)
    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);

    movq        mm4, mm0
    movq        mm1, mm0
    punpcklwd   mm4, mm6
    punpckhwd   mm1, mm6
    movq        mm0, mm4
    movq        mm6, mm1
    pmaddwd     mm4, [GOTOFF(ebx,PW_MF078_F117)]  ; mm4=z3L
    pmaddwd     mm1, [GOTOFF(ebx,PW_MF078_F117)]  ; mm1=z3H
    pmaddwd     mm0, [GOTOFF(ebx,PW_F117_F078)]   ; mm0=z4L
    pmaddwd     mm6, [GOTOFF(ebx,PW_F117_F078)]   ; mm6=z4H

    movq        MMWORD [wk(0)], mm4     ; wk(0)=z3L
    movq        MMWORD [wk(1)], mm1     ; wk(1)=z3H

    ; (Original)
    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
    ;
    ; (This implementation)
    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;

    ; (tmp4, tmp7) pairs -> data7 and data1
    movq        mm4, mm2
    movq        mm1, mm2
    punpcklwd   mm4, mm7
    punpckhwd   mm1, mm7
    movq        mm2, mm4
    movq        mm7, mm1
    pmaddwd     mm4, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm4=tmp4L
    pmaddwd     mm1, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm1=tmp4H
    pmaddwd     mm2, [GOTOFF(ebx,PW_MF089_F060)]   ; mm2=tmp7L
    pmaddwd     mm7, [GOTOFF(ebx,PW_MF089_F060)]   ; mm7=tmp7H

    paddd       mm4, MMWORD [wk(0)]     ; mm4=data7L
    paddd       mm1, MMWORD [wk(1)]     ; mm1=data7H
    paddd       mm2, mm0                ; mm2=data1L
    paddd       mm7, mm6                ; mm7=data1H

    paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P2)]
    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
    psrad       mm4, DESCALE_P2
    psrad       mm1, DESCALE_P2
    paddd       mm2, [GOTOFF(ebx,PD_DESCALE_P2)]
    paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P2)]
    psrad       mm2, DESCALE_P2
    psrad       mm7, DESCALE_P2

    packssdw    mm4, mm1                ; mm4=data7
    packssdw    mm2, mm7                ; mm2=data1

    movq        MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4
    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2

    ; (tmp5, tmp6) pairs -> data5 and data3
    movq        mm1, mm3
    movq        mm7, mm3
    punpcklwd   mm1, mm5
    punpckhwd   mm7, mm5
    movq        mm3, mm1
    movq        mm5, mm7
    pmaddwd     mm1, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm1=tmp5L
    pmaddwd     mm7, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm7=tmp5H
    pmaddwd     mm3, [GOTOFF(ebx,PW_MF256_F050)]   ; mm3=tmp6L
    pmaddwd     mm5, [GOTOFF(ebx,PW_MF256_F050)]   ; mm5=tmp6H

    paddd       mm1, mm0                ; mm1=data5L
    paddd       mm7, mm6                ; mm7=data5H
    paddd       mm3, MMWORD [wk(0)]     ; mm3=data3L
    paddd       mm5, MMWORD [wk(1)]     ; mm5=data3H

    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
    paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P2)]
    psrad       mm1, DESCALE_P2
    psrad       mm7, DESCALE_P2
    paddd       mm3, [GOTOFF(ebx,PD_DESCALE_P2)]
    paddd       mm5, [GOTOFF(ebx,PD_DESCALE_P2)]
    psrad       mm3, DESCALE_P2
    psrad       mm5, DESCALE_P2

    packssdw    mm1, mm7                ; mm1=data5
    packssdw    mm3, mm5                ; mm3=data3

    movq        MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1
    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3

    add         edx, byte 4*SIZEOF_DCTELEM  ; next mmword (four elements)
    dec         ecx
    jnz         near .columnloop

    emms                                ; empty MMX state

;   pop         edi                     ; unused
;   pop         esi                     ; unused
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    POPPIC      ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- pre-alignment esp (-> saved ebp)
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32