jidctfst-mmx.asm (19822B)
;
; jidctfst.asm - fast integer IDCT (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2024, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.
;
; This file contains a fast, not so accurate integer implementation of
; the inverse DCT (Discrete Cosine Transform). The following code is
; based directly on the IJG's original jidctfst.c; see the jidctfst.c
; for more details.

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; Fixed-point configuration.
;
; CONST_BITS is the number of fractional bits in the F_x_xxx multipliers
; below (FIX(x) = x * 2^CONST_BITS, rounded to nearest).
; PASS1_BITS is the extra scaling carried through pass 1; the final descale
; in pass 2 shifts right by (PASS1_BITS + 3).

%define CONST_BITS  8                   ; 14 is also OK.
%define PASS1_BITS  2

%if IFAST_SCALE_BITS != PASS1_BITS
%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
%endif

%if CONST_BITS == 8
F_1_082  equ  277                       ; FIX(1.082392200)
F_1_414  equ  362                       ; FIX(1.414213562)
F_1_847  equ  473                       ; FIX(1.847759065)
F_2_613  equ  669                       ; FIX(2.613125930)
F_1_613  equ  (F_2_613 - (1 << CONST_BITS))  ; FIX(2.613125930) - FIX(1)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
; The literals below are the multipliers pre-scaled by 2^30; DESCALE()
; rounds them to nearest while dropping (30 - CONST_BITS) fractional bits.
%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
F_1_082  equ  DESCALE(1162209775, 30 - CONST_BITS)  ; FIX(1.082392200)
F_1_414  equ  DESCALE(1518500249, 30 - CONST_BITS)  ; FIX(1.414213562)
F_1_847  equ  DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
F_2_613  equ  DESCALE(2805822602, 30 - CONST_BITS)  ; FIX(2.613125930)
F_1_613  equ  (F_2_613 - (1 << CONST_BITS))  ; FIX(2.613125930) - FIX(1)
%endif

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
;
; pmulhw keeps only the high 16 bits of each signed 16x16-bit product.  The
; code shifts the data operand left by PRE_MULTIPLY_SCALE_BITS and the table
; below stores each multiplier shifted left by CONST_SHIFT, so the combined
; scaling is 2^(16 - CONST_BITS) and the high word of the product is
; (data * FIX(x)) >> CONST_BITS.

%define PRE_MULTIPLY_SCALE_BITS  2
%define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)

; Enforce the constraints documented above at assembly time, so that a
; changed CONST_BITS / PRE_MULTIPLY_SCALE_BITS cannot silently produce a
; negative shift count or overflowing intermediates.
%if PRE_MULTIPLY_SCALE_BITS > 2
%error "'PRE_MULTIPLY_SCALE_BITS' must not be greater than 2."
%endif
%if (CONST_BITS + PRE_MULTIPLY_SCALE_BITS) > 16
%error "'CONST_BITS' + 'PRE_MULTIPLY_SCALE_BITS' must not exceed 16."
%endif

    ALIGNZ      32
    GLOBAL_DATA(jconst_idct_ifast_mmx)

; Constant table referenced by jsimd_idct_ifast_mmx via GOTOFF().  Each word
; constant is replicated 4 times (one 64-bit MMX operand of packed words);
; PB_CENTERJSAMP is 8 packed bytes.
EXTN(jconst_idct_ifast_mmx):

PW_F1414       times 4 dw  F_1_414 << CONST_SHIFT
PW_F1847       times 4 dw  F_1_847 << CONST_SHIFT
PW_MF1613      times 4 dw -F_1_613 << CONST_SHIFT
PW_F1082       times 4 dw  F_1_082 << CONST_SHIFT
PB_CENTERJSAMP times 8 db  CENTERJSAMPLE

    ALIGNZ      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_ifast_mmx(void *dct_table, JCOEFPTR coef_block,
;                      JSAMPARRAY output_buf, JDIMENSION output_col)
;
; Overview (32-bit code, all four arguments are read from the stack):
;   Pass 1 dequantizes coef_block with the multipliers at dct_table,
;          runs the 1-D IDCT down the columns (4 columns per iteration)
;          and stores the transposed result in a stack work array.
;   Pass 2 runs the 1-D IDCT on the work array (4 rows per iteration),
;          descales by (PASS1_BITS+3), packs to bytes with signed
;          saturation, adds PB_CENTERJSAMP and writes 8 samples to each
;          of 4 output rows at column offset output_col.
;
; Registers: ebx, esi, edi and ebp are saved/restored; eax, ecx, edx and
; mm0-mm7 are used as scratch.  emms is executed before returning.
;
; Stack frame (ebp is re-aligned to SIZEOF_MMWORD):
;   [ebp + 0]   unaligned frame base saved by the prologue (original_ebp)
;   wk(0..1)    two mmword spill slots directly below ebp
;   workspace   JCOEF workspace[DCTSIZE2] directly below wk(0)
;

; Argument offsets relative to the unaligned frame base (b): b+0 holds the
; caller's ebp and b+4 the return address, so the first argument is at b+8.
%define dct_table(b)   (b) + 8          ; void *dct_table
%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
%define output_col(b)  (b) + 20         ; JDIMENSION output_col

%define original_ebp  ebp + 0
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
                                        ; mmword wk[WK_NUM]
%define WK_NUM        2
%define workspace     wk(0) - DCTSIZE2 * SIZEOF_JCOEF
                                        ; JCOEF workspace[DCTSIZE2]

    align       32
    GLOBAL_FUNCTION(jsimd_idct_ifast_mmx)

EXTN(jsimd_idct_ifast_mmx):
    push        ebp
    mov         eax, esp                    ; eax = unaligned frame base
                                            ;       (-> caller's saved ebp)
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
    mov         [esp], eax                  ; keep frame base at [original_ebp]
    mov         ebp, esp                    ; ebp = aligned ebp
    lea         esp, [workspace]            ; reserve wk[] and workspace[]
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    GET_GOT     ebx                     ; get GOT address

    ; ---- Pass 1: process columns from input, store into work array.

    ; eax still holds the frame base set up in the prologue, hence the
    ; reload below is commented out.
    ; NOTE(review): this relies on GET_GOT (defined in jsimdext.inc, not
    ; visible here) leaving eax untouched - confirm.
;   mov         eax, [original_ebp]
    mov         edx, POINTER [dct_table(eax)]    ; quantptr
    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
    lea         edi, [workspace]                 ; JCOEF *wsptr
    mov         ecx, DCTSIZE/4                   ; ctr
    ALIGNX      16, 7
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
    ; Shortcut: if every AC coefficient of the 4 columns handled by this
    ; iteration is zero, the IDCT output is just the dequantized DC term.
    ; A cheap 32-bit test on rows 1 and 2 rejects most non-zero blocks
    ; before the full 64-bit OR over rows 1..7 is done.
    ; (presumably DWBLOCK/MMBLOCK(m,n,b,s) address row m, dword/mmword n
    ; of the block at b - macros are defined outside this file)
    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
    jnz         short .columnDCT

    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    por         mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    por         mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
    por         mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    por         mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
    por         mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
    por         mm1, mm0
    packsswb    mm1, mm1                ; fold 4 words into 4 bytes; signed
                                        ; saturation keeps non-zero non-zero
    movd        eax, mm1
    test        eax, eax
    jnz         short .columnDCT

    ; -- AC terms all zero

    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
    pmullw      mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]

    ; Broadcast each of the 4 dequantized DC values across a full mmword.
    movq        mm2, mm0                ; mm0=in0=(00 01 02 03)
    punpcklwd   mm0, mm0                ; mm0=(00 00 01 01)
    punpckhwd   mm2, mm2                ; mm2=(02 02 03 03)

    movq        mm1, mm0
    punpckldq   mm0, mm0                ; mm0=(00 00 00 00)
    punpckhdq   mm1, mm1                ; mm1=(01 01 01 01)
    movq        mm3, mm2
    punpckldq   mm2, mm2                ; mm2=(02 02 02 02)
    punpckhdq   mm3, mm3                ; mm3=(03 03 03 03)

    ; Store already transposed: each input column becomes one work row
    ; (two mmwords per row).
    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
    movq        MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
    movq        MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
    jmp         near .nextcolumn
    ALIGNX      16, 7
%endif
.columnDCT:

    ; -- Even part

    ; Load rows 0/2/4/6 and dequantize (pmullw keeps the low 16 bits).
    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    pmullw      mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
    pmullw      mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
    movq        mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
    movq        mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
    pmullw      mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
    pmullw      mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]

    movq        mm4, mm0
    movq        mm5, mm1
    psubw       mm0, mm2                ; mm0=tmp11
    psubw       mm1, mm3
    paddw       mm4, mm2                ; mm4=tmp10
    paddw       mm5, mm3                ; mm5=tmp13

    ; (in2 - in6) * 1.414213562 - tmp13
    psllw       mm1, PRE_MULTIPLY_SCALE_BITS
    pmulhw      mm1, [GOTOFF(ebx,PW_F1414)]
    psubw       mm1, mm5                ; mm1=tmp12

    movq        mm6, mm4
    movq        mm7, mm0
    psubw       mm4, mm5                ; mm4=tmp3
    psubw       mm0, mm1                ; mm0=tmp2
    paddw       mm6, mm5                ; mm6=tmp0
    paddw       mm7, mm1                ; mm7=tmp1

    ; Spill tmp2/tmp3: all eight MMX registers are needed by the odd part.
    movq        MMWORD [wk(1)], mm4     ; wk(1)=tmp3
    movq        MMWORD [wk(0)], mm0     ; wk(0)=tmp2

    ; -- Odd part

    ; Load rows 1/3/5/7 and dequantize.
    movq        mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    movq        mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    pmullw      mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
    pmullw      mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
    movq        mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    movq        mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
    pmullw      mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
    pmullw      mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]

    movq        mm4, mm2
    movq        mm0, mm5
    psubw       mm2, mm1                ; mm2=z12
    psubw       mm5, mm3                ; mm5=z10
    paddw       mm4, mm1                ; mm4=z11
    paddw       mm0, mm3                ; mm0=z13

    movq        mm1, mm5                ; mm1=z10(unscaled)
    psllw       mm2, PRE_MULTIPLY_SCALE_BITS
    psllw       mm5, PRE_MULTIPLY_SCALE_BITS

    movq        mm3, mm4
    psubw       mm4, mm0
    paddw       mm3, mm0                ; mm3=tmp7

    psllw       mm4, PRE_MULTIPLY_SCALE_BITS
    pmulhw      mm4, [GOTOFF(ebx,PW_F1414)]  ; mm4=tmp11

    ; To avoid overflow...
    ;
    ; (Original)
    ; tmp12 = -2.613125930 * z10 + z5;
    ;
    ; (This implementation)
    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
    ;       = -1.613125930 * z10 - z10 + z5;

    movq        mm0, mm5
    paddw       mm5, mm2
    pmulhw      mm5, [GOTOFF(ebx,PW_F1847)]   ; mm5=z5
    pmulhw      mm0, [GOTOFF(ebx,PW_MF1613)]
    pmulhw      mm2, [GOTOFF(ebx,PW_F1082)]
    psubw       mm0, mm1
    psubw       mm2, mm5                ; mm2=tmp10
    paddw       mm0, mm5                ; mm0=tmp12

    ; -- Final output stage

    psubw       mm0, mm3                ; mm0=tmp6
    movq        mm1, mm6
    movq        mm5, mm7
    paddw       mm6, mm3                ; mm6=data0=(00 01 02 03)
    paddw       mm7, mm0                ; mm7=data1=(10 11 12 13)
    psubw       mm1, mm3                ; mm1=data7=(70 71 72 73)
    psubw       mm5, mm0                ; mm5=data6=(60 61 62 63)
    psubw       mm4, mm0                ; mm4=tmp5

    movq        mm3, mm6                ; transpose coefficients(phase 1)
    punpcklwd   mm6, mm7                ; mm6=(00 10 01 11)
    punpckhwd   mm3, mm7                ; mm3=(02 12 03 13)
    movq        mm0, mm5                ; transpose coefficients(phase 1)
    punpcklwd   mm5, mm1                ; mm5=(60 70 61 71)
    punpckhwd   mm0, mm1                ; mm0=(62 72 63 73)

    ; Swap the spill slots: reload tmp2/tmp3 and park the half-transposed
    ; rows 6/7 in their place.
    movq        mm7, MMWORD [wk(0)]     ; mm7=tmp2
    movq        mm1, MMWORD [wk(1)]     ; mm1=tmp3

    movq        MMWORD [wk(0)], mm5     ; wk(0)=(60 70 61 71)
    movq        MMWORD [wk(1)], mm0     ; wk(1)=(62 72 63 73)

    paddw       mm2, mm4                ; mm2=tmp4
    movq        mm5, mm7
    movq        mm0, mm1
    paddw       mm7, mm4                ; mm7=data2=(20 21 22 23)
    paddw       mm1, mm2                ; mm1=data4=(40 41 42 43)
    psubw       mm5, mm4                ; mm5=data5=(50 51 52 53)
    psubw       mm0, mm2                ; mm0=data3=(30 31 32 33)

    movq        mm4, mm7                ; transpose coefficients(phase 1)
    punpcklwd   mm7, mm0                ; mm7=(20 30 21 31)
    punpckhwd   mm4, mm0                ; mm4=(22 32 23 33)
    movq        mm2, mm1                ; transpose coefficients(phase 1)
    punpcklwd   mm1, mm5                ; mm1=(40 50 41 51)
    punpckhwd   mm2, mm5                ; mm2=(42 52 43 53)

    movq        mm0, mm6                ; transpose coefficients(phase 2)
    punpckldq   mm6, mm7                ; mm6=(00 10 20 30)
    punpckhdq   mm0, mm7                ; mm0=(01 11 21 31)
    movq        mm5, mm3                ; transpose coefficients(phase 2)
    punpckldq   mm3, mm4                ; mm3=(02 12 22 32)
    punpckhdq   mm5, mm4                ; mm5=(03 13 23 33)

    movq        mm7, MMWORD [wk(0)]     ; mm7=(60 70 61 71)
    movq        mm4, MMWORD [wk(1)]     ; mm4=(62 72 63 73)

    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6
    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3
    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5

    movq        mm6, mm1                ; transpose coefficients(phase 2)
    punpckldq   mm1, mm7                ; mm1=(40 50 60 70)
    punpckhdq   mm6, mm7                ; mm6=(41 51 61 71)
    movq        mm0, mm2                ; transpose coefficients(phase 2)
    punpckldq   mm2, mm4                ; mm2=(42 52 62 72)
    punpckhdq   mm0, mm4                ; mm0=(43 53 63 73)

    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6
    movq        MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
    movq        MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0

.nextcolumn:
    ; Advance input and quantization pointers by 4 columns and the work
    ; pointer by the 4 transposed rows just written.
    add         esi, byte 4*SIZEOF_JCOEF            ; coef_block
    add         edx, byte 4*SIZEOF_IFAST_MULT_TYPE  ; quantptr
    add         edi, byte 4*DCTSIZE*SIZEOF_JCOEF    ; wsptr
    dec         ecx                                 ; ctr
    jnz         near .columnloop

    ; ---- Pass 2: process rows from work array, store into output array.

    ; eax was clobbered by the zero-column test, so reload the frame base.
    mov         eax, [original_ebp]
    lea         esi, [workspace]                   ; JCOEF *wsptr
    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
    mov         eax, JDIMENSION [output_col(eax)]  ; eax=output_col from here on
    mov         ecx, DCTSIZE/4                     ; ctr
    ALIGNX      16, 7
.rowloop:

    ; -- Even part

    ; Same butterfly as pass 1, but the inputs come from the work array
    ; and are already dequantized.
    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    movq        mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
    movq        mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]

    movq        mm4, mm0
    movq        mm5, mm1
    psubw       mm0, mm2                ; mm0=tmp11
    psubw       mm1, mm3
    paddw       mm4, mm2                ; mm4=tmp10
    paddw       mm5, mm3                ; mm5=tmp13

    psllw       mm1, PRE_MULTIPLY_SCALE_BITS
    pmulhw      mm1, [GOTOFF(ebx,PW_F1414)]
    psubw       mm1, mm5                ; mm1=tmp12

    movq        mm6, mm4
    movq        mm7, mm0
    psubw       mm4, mm5                ; mm4=tmp3
    psubw       mm0, mm1                ; mm0=tmp2
    paddw       mm6, mm5                ; mm6=tmp0
    paddw       mm7, mm1                ; mm7=tmp1

    movq        MMWORD [wk(1)], mm4     ; wk(1)=tmp3
    movq        MMWORD [wk(0)], mm0     ; wk(0)=tmp2

    ; -- Odd part

    movq        mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    movq        mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    movq        mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    movq        mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]

    movq        mm4, mm2
    movq        mm0, mm5
    psubw       mm2, mm1                ; mm2=z12
    psubw       mm5, mm3                ; mm5=z10
    paddw       mm4, mm1                ; mm4=z11
    paddw       mm0, mm3                ; mm0=z13

    movq        mm1, mm5                ; mm1=z10(unscaled)
    psllw       mm2, PRE_MULTIPLY_SCALE_BITS
    psllw       mm5, PRE_MULTIPLY_SCALE_BITS

    movq        mm3, mm4
    psubw       mm4, mm0
    paddw       mm3, mm0                ; mm3=tmp7

    psllw       mm4, PRE_MULTIPLY_SCALE_BITS
    pmulhw      mm4, [GOTOFF(ebx,PW_F1414)]  ; mm4=tmp11

    ; To avoid overflow...
    ;
    ; (Original)
    ; tmp12 = -2.613125930 * z10 + z5;
    ;
    ; (This implementation)
    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
    ;       = -1.613125930 * z10 - z10 + z5;

    movq        mm0, mm5
    paddw       mm5, mm2
    pmulhw      mm5, [GOTOFF(ebx,PW_F1847)]   ; mm5=z5
    pmulhw      mm0, [GOTOFF(ebx,PW_MF1613)]
    pmulhw      mm2, [GOTOFF(ebx,PW_F1082)]
    psubw       mm0, mm1
    psubw       mm2, mm5                ; mm2=tmp10
    paddw       mm0, mm5                ; mm0=tmp12

    ; -- Final output stage

    psubw       mm0, mm3                ; mm0=tmp6
    movq        mm1, mm6
    movq        mm5, mm7
    paddw       mm6, mm3                ; mm6=data0=(00 10 20 30)
    paddw       mm7, mm0                ; mm7=data1=(01 11 21 31)
    psraw       mm6, (PASS1_BITS+3)     ; descale
    psraw       mm7, (PASS1_BITS+3)     ; descale
    psubw       mm1, mm3                ; mm1=data7=(07 17 27 37)
    psubw       mm5, mm0                ; mm5=data6=(06 16 26 36)
    psraw       mm1, (PASS1_BITS+3)     ; descale
    psraw       mm5, (PASS1_BITS+3)     ; descale
    psubw       mm4, mm0                ; mm4=tmp5

    ; Narrow words to bytes with signed saturation (range limiting).
    packsswb    mm6, mm5                ; mm6=(00 10 20 30 06 16 26 36)
    packsswb    mm7, mm1                ; mm7=(01 11 21 31 07 17 27 37)

    movq        mm3, MMWORD [wk(0)]     ; mm3=tmp2
    movq        mm0, MMWORD [wk(1)]     ; mm0=tmp3

    paddw       mm2, mm4                ; mm2=tmp4
    movq        mm5, mm3
    movq        mm1, mm0
    paddw       mm3, mm4                ; mm3=data2=(02 12 22 32)
    paddw       mm0, mm2                ; mm0=data4=(04 14 24 34)
    psraw       mm3, (PASS1_BITS+3)     ; descale
    psraw       mm0, (PASS1_BITS+3)     ; descale
    psubw       mm5, mm4                ; mm5=data5=(05 15 25 35)
    psubw       mm1, mm2                ; mm1=data3=(03 13 23 33)
    psraw       mm5, (PASS1_BITS+3)     ; descale
    psraw       mm1, (PASS1_BITS+3)     ; descale

    movq        mm4, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; mm4=[PB_CENTERJSAMP]

    packsswb    mm3, mm0                ; mm3=(02 12 22 32 04 14 24 34)
    packsswb    mm1, mm5                ; mm1=(03 13 23 33 05 15 25 35)

    ; Re-center: byte-wise wrapping add of CENTERJSAMPLE turns the signed
    ; saturated bytes into sample values.
    paddb       mm6, mm4
    paddb       mm7, mm4
    paddb       mm3, mm4
    paddb       mm1, mm4

    movq        mm2, mm6                ; transpose coefficients(phase 1)
    punpcklbw   mm6, mm7                ; mm6=(00 01 10 11 20 21 30 31)
    punpckhbw   mm2, mm7                ; mm2=(06 07 16 17 26 27 36 37)
    movq        mm0, mm3                ; transpose coefficients(phase 1)
    punpcklbw   mm3, mm1                ; mm3=(02 03 12 13 22 23 32 33)
    punpckhbw   mm0, mm1                ; mm0=(04 05 14 15 24 25 34 35)

    movq        mm5, mm6                ; transpose coefficients(phase 2)
    punpcklwd   mm6, mm3                ; mm6=(00 01 02 03 10 11 12 13)
    punpckhwd   mm5, mm3                ; mm5=(20 21 22 23 30 31 32 33)
    movq        mm4, mm0                ; transpose coefficients(phase 2)
    punpcklwd   mm0, mm2                ; mm0=(04 05 06 07 14 15 16 17)
    punpckhwd   mm4, mm2                ; mm4=(24 25 26 27 34 35 36 37)

    movq        mm7, mm6                ; transpose coefficients(phase 3)
    punpckldq   mm6, mm0                ; mm6=(00 01 02 03 04 05 06 07)
    punpckhdq   mm7, mm0                ; mm7=(10 11 12 13 14 15 16 17)
    movq        mm1, mm5                ; transpose coefficients(phase 3)
    punpckldq   mm5, mm4                ; mm5=(20 21 22 23 24 25 26 27)
    punpckhdq   mm1, mm4                ; mm1=(30 31 32 33 34 35 36 37)

    ; ebx is borrowed as a second row pointer below, so the GOT address is
    ; saved around the stores.
    PUSHPIC     ebx                     ; save GOT address

    ; Fetch 4 row pointers from output_buf and write 8 samples to each row
    ; starting at column offset eax (output_col).
    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
    mov         ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
    mov         edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
    mov         ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1

    POPPIC      ebx                     ; restore GOT address

    add         esi, byte 4*SIZEOF_JCOEF     ; wsptr
    add         edi, byte 4*SIZEOF_JSAMPROW  ; next 4 output row pointers
    dec         ecx                          ; ctr
    jnz         near .rowloop

    emms                                ; empty MMX state

    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- original ebp
                                        ;   (frame base stored at [ebp])
    pop         ebp                     ; restore caller's ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32