; jidctint-sse2.asm
1 ; 2 ; jidctint.asm - accurate integer IDCT (SSE2) 3 ; 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB 5 ; Copyright (C) 2016, 2020, 2024, D. R. Commander. 6 ; 7 ; Based on the x86 SIMD extension for IJG JPEG library 8 ; Copyright (C) 1999-2006, MIYASAKA Masaru. 9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc 10 ; 11 ; This file should be assembled with NASM (Netwide Assembler) or Yasm. 12 ; 13 ; This file contains a slower but more accurate integer implementation of the 14 ; inverse DCT (Discrete Cosine Transform). The following code is based 15 ; directly on the IJG's original jidctint.c; see the jidctint.c for 16 ; more details. 17 18 %include "jsimdext.inc" 19 %include "jdct.inc" 20 21 ; -------------------------------------------------------------------------- 22 23 %define CONST_BITS 13 24 %define PASS1_BITS 2 25 26 %define DESCALE_P1 (CONST_BITS - PASS1_BITS) 27 %define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) 28 29 %if CONST_BITS == 13 30 F_0_298 equ 2446 ; FIX(0.298631336) 31 F_0_390 equ 3196 ; FIX(0.390180644) 32 F_0_541 equ 4433 ; FIX(0.541196100) 33 F_0_765 equ 6270 ; FIX(0.765366865) 34 F_0_899 equ 7373 ; FIX(0.899976223) 35 F_1_175 equ 9633 ; FIX(1.175875602) 36 F_1_501 equ 12299 ; FIX(1.501321110) 37 F_1_847 equ 15137 ; FIX(1.847759065) 38 F_1_961 equ 16069 ; FIX(1.961570560) 39 F_2_053 equ 16819 ; FIX(2.053119869) 40 F_2_562 equ 20995 ; FIX(2.562915447) 41 F_3_072 equ 25172 ; FIX(3.072711026) 42 %else 43 ; NASM cannot do compile-time arithmetic on floating-point constants. 
44 %define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) 45 F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336) 46 F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644) 47 F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100) 48 F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865) 49 F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223) 50 F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602) 51 F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110) 52 F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) 53 F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560) 54 F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869) 55 F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447) 56 F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) 57 %endif 58 59 ; -------------------------------------------------------------------------- 60 SECTION SEG_CONST 61 62 ALIGNZ 32 63 GLOBAL_DATA(jconst_idct_islow_sse2) 64 65 EXTN(jconst_idct_islow_sse2): 66 67 PW_F130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541 68 PW_F054_MF130 times 4 dw F_0_541, (F_0_541 - F_1_847) 69 PW_MF078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175 70 PW_F117_F078 times 4 dw F_1_175, (F_1_175 - F_0_390) 71 PW_MF060_MF089 times 4 dw (F_0_298 - F_0_899), -F_0_899 72 PW_MF089_F060 times 4 dw -F_0_899, (F_1_501 - F_0_899) 73 PW_MF050_MF256 times 4 dw (F_2_053 - F_2_562), -F_2_562 74 PW_MF256_F050 times 4 dw -F_2_562, (F_3_072 - F_2_562) 75 PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1) 76 PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1) 77 PB_CENTERJSAMP times 16 db CENTERJSAMPLE 78 79 ALIGNZ 32 80 81 ; -------------------------------------------------------------------------- 82 SECTION SEG_TEXT 83 BITS 32 84 ; 85 ; Perform dequantization and inverse DCT on one block of coefficients. 
86 ; 87 ; GLOBAL(void) 88 ; jsimd_idct_islow_sse2(void *dct_table, JCOEFPTR coef_block, 89 ; JSAMPARRAY output_buf, JDIMENSION output_col) 90 ; 91 92 %define dct_table(b) (b) + 8 ; jpeg_component_info *compptr 93 %define coef_block(b) (b) + 12 ; JCOEFPTR coef_block 94 %define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf 95 %define output_col(b) (b) + 20 ; JDIMENSION output_col 96 97 %define original_ebp ebp + 0 98 %define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD 99 ; xmmword wk[WK_NUM] 100 %define WK_NUM 12 101 102 align 32 103 GLOBAL_FUNCTION(jsimd_idct_islow_sse2) 104 105 EXTN(jsimd_idct_islow_sse2): 106 push ebp 107 mov eax, esp ; eax = original ebp 108 sub esp, byte 4 109 and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits 110 mov [esp], eax 111 mov ebp, esp ; ebp = aligned ebp 112 lea esp, [wk(0)] 113 PUSHPIC ebx 114 ; push ecx ; unused 115 ; push edx ; need not be preserved 116 push esi 117 push edi 118 119 GET_GOT ebx ; get GOT address 120 121 ; ---- Pass 1: process columns from input. 
122 123 ; mov eax, [original_ebp] 124 mov edx, POINTER [dct_table(eax)] ; quantptr 125 mov esi, JCOEFPTR [coef_block(eax)] ; inptr 126 127 %ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2 128 mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] 129 or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] 130 jnz near .columnDCT 131 132 movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] 133 movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] 134 por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] 135 por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] 136 por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] 137 por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] 138 por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] 139 por xmm1, xmm0 140 packsswb xmm1, xmm1 141 packsswb xmm1, xmm1 142 movd eax, xmm1 143 test eax, eax 144 jnz short .columnDCT 145 146 ; -- AC terms all zero 147 148 movdqa xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] 149 pmullw xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] 150 151 psllw xmm5, PASS1_BITS 152 153 movdqa xmm4, xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07) 154 punpcklwd xmm5, xmm5 ; xmm5=(00 00 01 01 02 02 03 03) 155 punpckhwd xmm4, xmm4 ; xmm4=(04 04 05 05 06 06 07 07) 156 157 pshufd xmm7, xmm5, 0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00) 158 pshufd xmm6, xmm5, 0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01) 159 pshufd xmm1, xmm5, 0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02) 160 pshufd xmm5, xmm5, 0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03) 161 pshufd xmm0, xmm4, 0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04) 162 pshufd xmm3, xmm4, 0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05) 163 pshufd xmm2, xmm4, 0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06) 164 pshufd xmm4, xmm4, 0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07) 165 166 movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1 167 movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3 168 movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 169 movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 170 jmp near .column_end 171 ALIGNX 16, 7 172 %endif 173 .columnDCT: 174 175 
; -- Even part 176 177 movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] 178 movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] 179 pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] 180 pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] 181 movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] 182 movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] 183 pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)] 184 pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] 185 186 ; (Original) 187 ; z1 = (z2 + z3) * 0.541196100; 188 ; tmp2 = z1 + z3 * -1.847759065; 189 ; tmp3 = z1 + z2 * 0.765366865; 190 ; 191 ; (This implementation) 192 ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); 193 ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; 194 195 movdqa xmm4, xmm1 ; xmm1=in2=z2 196 movdqa xmm5, xmm1 197 punpcklwd xmm4, xmm3 ; xmm3=in6=z3 198 punpckhwd xmm5, xmm3 199 movdqa xmm1, xmm4 200 movdqa xmm3, xmm5 201 pmaddwd xmm4, [GOTOFF(ebx,PW_F130_F054)] ; xmm4=tmp3L 202 pmaddwd xmm5, [GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H 203 pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L 204 pmaddwd xmm3, [GOTOFF(ebx,PW_F054_MF130)] ; xmm3=tmp2H 205 206 movdqa xmm6, xmm0 207 paddw xmm0, xmm2 ; xmm0=in0+in4 208 psubw xmm6, xmm2 ; xmm6=in0-in4 209 210 pxor xmm7, xmm7 211 pxor xmm2, xmm2 212 punpcklwd xmm7, xmm0 ; xmm7=tmp0L 213 punpckhwd xmm2, xmm0 ; xmm2=tmp0H 214 psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS 215 psrad xmm2, (16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS 216 217 movdqa xmm0, xmm7 218 paddd xmm7, xmm4 ; xmm7=tmp10L 219 psubd xmm0, xmm4 ; xmm0=tmp13L 220 movdqa xmm4, xmm2 221 paddd xmm2, xmm5 ; xmm2=tmp10H 222 psubd xmm4, xmm5 ; xmm4=tmp13H 223 224 movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L 225 movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H 226 movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L 227 movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H 228 229 pxor xmm5, xmm5 230 
pxor xmm7, xmm7 231 punpcklwd xmm5, xmm6 ; xmm5=tmp1L 232 punpckhwd xmm7, xmm6 ; xmm7=tmp1H 233 psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS 234 psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS 235 236 movdqa xmm2, xmm5 237 paddd xmm5, xmm1 ; xmm5=tmp11L 238 psubd xmm2, xmm1 ; xmm2=tmp12L 239 movdqa xmm0, xmm7 240 paddd xmm7, xmm3 ; xmm7=tmp11H 241 psubd xmm0, xmm3 ; xmm0=tmp12H 242 243 movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L 244 movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H 245 movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L 246 movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H 247 248 ; -- Odd part 249 250 movdqa xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] 251 movdqa xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] 252 pmullw xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] 253 pmullw xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] 254 movdqa xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] 255 movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] 256 pmullw xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] 257 pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] 258 259 movdqa xmm5, xmm6 260 movdqa xmm7, xmm4 261 paddw xmm5, xmm3 ; xmm5=z3 262 paddw xmm7, xmm1 ; xmm7=z4 263 264 ; (Original) 265 ; z5 = (z3 + z4) * 1.175875602; 266 ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; 267 ; z3 += z5; z4 += z5; 268 ; 269 ; (This implementation) 270 ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; 271 ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); 272 273 movdqa xmm2, xmm5 274 movdqa xmm0, xmm5 275 punpcklwd xmm2, xmm7 276 punpckhwd xmm0, xmm7 277 movdqa xmm5, xmm2 278 movdqa xmm7, xmm0 279 pmaddwd xmm2, [GOTOFF(ebx,PW_MF078_F117)] ; xmm2=z3L 280 pmaddwd xmm0, [GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3H 281 pmaddwd xmm5, [GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L 282 pmaddwd xmm7, [GOTOFF(ebx,PW_F117_F078)] ; xmm7=z4H 283 284 movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L 285 movdqa XMMWORD 
[wk(11)], xmm0 ; wk(11)=z3H 286 287 ; (Original) 288 ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; 289 ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; 290 ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; 291 ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; 292 ; tmp0 += z1 + z3; tmp1 += z2 + z4; 293 ; tmp2 += z2 + z3; tmp3 += z1 + z4; 294 ; 295 ; (This implementation) 296 ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; 297 ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; 298 ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); 299 ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); 300 ; tmp0 += z3; tmp1 += z4; 301 ; tmp2 += z3; tmp3 += z4; 302 303 movdqa xmm2, xmm3 304 movdqa xmm0, xmm3 305 punpcklwd xmm2, xmm4 306 punpckhwd xmm0, xmm4 307 movdqa xmm3, xmm2 308 movdqa xmm4, xmm0 309 pmaddwd xmm2, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm2=tmp0L 310 pmaddwd xmm0, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0H 311 pmaddwd xmm3, [GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3L 312 pmaddwd xmm4, [GOTOFF(ebx,PW_MF089_F060)] ; xmm4=tmp3H 313 314 paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L 315 paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H 316 paddd xmm3, xmm5 ; xmm3=tmp3L 317 paddd xmm4, xmm7 ; xmm4=tmp3H 318 319 movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L 320 movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H 321 322 movdqa xmm2, xmm1 323 movdqa xmm0, xmm1 324 punpcklwd xmm2, xmm6 325 punpckhwd xmm0, xmm6 326 movdqa xmm1, xmm2 327 movdqa xmm6, xmm0 328 pmaddwd xmm2, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm2=tmp1L 329 pmaddwd xmm0, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1H 330 pmaddwd xmm1, [GOTOFF(ebx,PW_MF256_F050)] ; xmm1=tmp2L 331 pmaddwd xmm6, [GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H 332 333 paddd xmm2, xmm5 ; xmm2=tmp1L 334 paddd xmm0, xmm7 ; xmm0=tmp1H 335 paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L 336 paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H 337 338 movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L 339 movdqa XMMWORD [wk(11)], xmm0 ; 
wk(11)=tmp1H 340 341 ; -- Final output stage 342 343 movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L 344 movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H 345 346 movdqa xmm2, xmm5 347 movdqa xmm0, xmm7 348 paddd xmm5, xmm3 ; xmm5=data0L 349 paddd xmm7, xmm4 ; xmm7=data0H 350 psubd xmm2, xmm3 ; xmm2=data7L 351 psubd xmm0, xmm4 ; xmm0=data7H 352 353 movdqa xmm3, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm3=[PD_DESCALE_P1] 354 355 paddd xmm5, xmm3 356 paddd xmm7, xmm3 357 psrad xmm5, DESCALE_P1 358 psrad xmm7, DESCALE_P1 359 paddd xmm2, xmm3 360 paddd xmm0, xmm3 361 psrad xmm2, DESCALE_P1 362 psrad xmm0, DESCALE_P1 363 364 packssdw xmm5, xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07) 365 packssdw xmm2, xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77) 366 367 movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L 368 movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H 369 370 movdqa xmm7, xmm4 371 movdqa xmm0, xmm3 372 paddd xmm4, xmm1 ; xmm4=data1L 373 paddd xmm3, xmm6 ; xmm3=data1H 374 psubd xmm7, xmm1 ; xmm7=data6L 375 psubd xmm0, xmm6 ; xmm0=data6H 376 377 movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm1=[PD_DESCALE_P1] 378 379 paddd xmm4, xmm1 380 paddd xmm3, xmm1 381 psrad xmm4, DESCALE_P1 382 psrad xmm3, DESCALE_P1 383 paddd xmm7, xmm1 384 paddd xmm0, xmm1 385 psrad xmm7, DESCALE_P1 386 psrad xmm0, DESCALE_P1 387 388 packssdw xmm4, xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17) 389 packssdw xmm7, xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67) 390 391 movdqa xmm6, xmm5 ; transpose coefficients(phase 1) 392 punpcklwd xmm5, xmm4 ; xmm5=(00 10 01 11 02 12 03 13) 393 punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17) 394 movdqa xmm1, xmm7 ; transpose coefficients(phase 1) 395 punpcklwd xmm7, xmm2 ; xmm7=(60 70 61 71 62 72 63 73) 396 punpckhwd xmm1, xmm2 ; xmm1=(64 74 65 75 66 76 67 77) 397 398 movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L 399 movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H 400 movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L 401 movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H 402 403 movdqa XMMWORD [wk(0)], xmm5 ; 
wk(0)=(00 10 01 11 02 12 03 13) 404 movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17) 405 movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73) 406 movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77) 407 408 movdqa xmm5, xmm3 409 movdqa xmm6, xmm0 410 paddd xmm3, xmm4 ; xmm3=data2L 411 paddd xmm0, xmm2 ; xmm0=data2H 412 psubd xmm5, xmm4 ; xmm5=data5L 413 psubd xmm6, xmm2 ; xmm6=data5H 414 415 movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm7=[PD_DESCALE_P1] 416 417 paddd xmm3, xmm7 418 paddd xmm0, xmm7 419 psrad xmm3, DESCALE_P1 420 psrad xmm0, DESCALE_P1 421 paddd xmm5, xmm7 422 paddd xmm6, xmm7 423 psrad xmm5, DESCALE_P1 424 psrad xmm6, DESCALE_P1 425 426 packssdw xmm3, xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27) 427 packssdw xmm5, xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57) 428 429 movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L 430 movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H 431 movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L 432 movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H 433 434 movdqa xmm0, xmm1 435 movdqa xmm6, xmm4 436 paddd xmm1, xmm2 ; xmm1=data3L 437 paddd xmm4, xmm7 ; xmm4=data3H 438 psubd xmm0, xmm2 ; xmm0=data4L 439 psubd xmm6, xmm7 ; xmm6=data4H 440 441 movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm2=[PD_DESCALE_P1] 442 443 paddd xmm1, xmm2 444 paddd xmm4, xmm2 445 psrad xmm1, DESCALE_P1 446 psrad xmm4, DESCALE_P1 447 paddd xmm0, xmm2 448 paddd xmm6, xmm2 449 psrad xmm0, DESCALE_P1 450 psrad xmm6, DESCALE_P1 451 452 packssdw xmm1, xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37) 453 packssdw xmm0, xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47) 454 455 movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13) 456 movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17) 457 458 movdqa xmm4, xmm3 ; transpose coefficients(phase 1) 459 punpcklwd xmm3, xmm1 ; xmm3=(20 30 21 31 22 32 23 33) 460 punpckhwd xmm4, xmm1 ; xmm4=(24 34 25 35 26 36 27 37) 461 movdqa xmm6, xmm0 ; transpose coefficients(phase 1) 462 punpcklwd xmm0, xmm5 ; xmm0=(40 
50 41 51 42 52 43 53) 463 punpckhwd xmm6, xmm5 ; xmm6=(44 54 45 55 46 56 47 57) 464 465 movdqa xmm1, xmm7 ; transpose coefficients(phase 2) 466 punpckldq xmm7, xmm3 ; xmm7=(00 10 20 30 01 11 21 31) 467 punpckhdq xmm1, xmm3 ; xmm1=(02 12 22 32 03 13 23 33) 468 movdqa xmm5, xmm2 ; transpose coefficients(phase 2) 469 punpckldq xmm2, xmm4 ; xmm2=(04 14 24 34 05 15 25 35) 470 punpckhdq xmm5, xmm4 ; xmm5=(06 16 26 36 07 17 27 37) 471 472 movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73) 473 movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77) 474 475 movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35) 476 movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37) 477 478 movdqa xmm2, xmm0 ; transpose coefficients(phase 2) 479 punpckldq xmm0, xmm3 ; xmm0=(40 50 60 70 41 51 61 71) 480 punpckhdq xmm2, xmm3 ; xmm2=(42 52 62 72 43 53 63 73) 481 movdqa xmm5, xmm6 ; transpose coefficients(phase 2) 482 punpckldq xmm6, xmm4 ; xmm6=(44 54 64 74 45 55 65 75) 483 punpckhdq xmm5, xmm4 ; xmm5=(46 56 66 76 47 57 67 77) 484 485 movdqa xmm3, xmm7 ; transpose coefficients(phase 3) 486 punpcklqdq xmm7, xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70) 487 punpckhqdq xmm3, xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71) 488 movdqa xmm4, xmm1 ; transpose coefficients(phase 3) 489 punpcklqdq xmm1, xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72) 490 punpckhqdq xmm4, xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73) 491 492 movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35) 493 movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37) 494 495 movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1 496 movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3 497 498 movdqa xmm3, xmm0 ; transpose coefficients(phase 3) 499 punpcklqdq xmm0, xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74) 500 punpckhqdq xmm3, xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75) 501 movdqa xmm4, xmm2 ; transpose coefficients(phase 3) 502 punpcklqdq xmm2, xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76) 503 punpckhqdq xmm4, xmm5 ; 
xmm4=col7=(07 17 27 37 47 57 67 77) 504 505 movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 506 movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 507 .column_end: 508 509 ; -- Prefetch the next coefficient block 510 511 prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] 512 prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] 513 prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] 514 prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] 515 516 ; ---- Pass 2: process rows from work array, store into output array. 517 518 mov eax, [original_ebp] 519 mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) 520 mov eax, JDIMENSION [output_col(eax)] 521 522 ; -- Even part 523 524 ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6 525 526 ; (Original) 527 ; z1 = (z2 + z3) * 0.541196100; 528 ; tmp2 = z1 + z3 * -1.847759065; 529 ; tmp3 = z1 + z2 * 0.765366865; 530 ; 531 ; (This implementation) 532 ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); 533 ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; 534 535 movdqa xmm6, xmm1 ; xmm1=in2=z2 536 movdqa xmm5, xmm1 537 punpcklwd xmm6, xmm2 ; xmm2=in6=z3 538 punpckhwd xmm5, xmm2 539 movdqa xmm1, xmm6 540 movdqa xmm2, xmm5 541 pmaddwd xmm6, [GOTOFF(ebx,PW_F130_F054)] ; xmm6=tmp3L 542 pmaddwd xmm5, [GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H 543 pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L 544 pmaddwd xmm2, [GOTOFF(ebx,PW_F054_MF130)] ; xmm2=tmp2H 545 546 movdqa xmm3, xmm7 547 paddw xmm7, xmm0 ; xmm7=in0+in4 548 psubw xmm3, xmm0 ; xmm3=in0-in4 549 550 pxor xmm4, xmm4 551 pxor xmm0, xmm0 552 punpcklwd xmm4, xmm7 ; xmm4=tmp0L 553 punpckhwd xmm0, xmm7 ; xmm0=tmp0H 554 psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS 555 psrad xmm0, (16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS 556 557 movdqa xmm7, xmm4 558 paddd xmm4, xmm6 ; xmm4=tmp10L 559 psubd xmm7, xmm6 ; xmm7=tmp13L 560 movdqa xmm6, xmm0 561 paddd xmm0, xmm5 ; xmm0=tmp10H 562 psubd xmm6, xmm5 ; xmm6=tmp13H 563 564 movdqa XMMWORD [wk(0)], xmm4 ; 
wk(0)=tmp10L 565 movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H 566 movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L 567 movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H 568 569 pxor xmm5, xmm5 570 pxor xmm4, xmm4 571 punpcklwd xmm5, xmm3 ; xmm5=tmp1L 572 punpckhwd xmm4, xmm3 ; xmm4=tmp1H 573 psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS 574 psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS 575 576 movdqa xmm0, xmm5 577 paddd xmm5, xmm1 ; xmm5=tmp11L 578 psubd xmm0, xmm1 ; xmm0=tmp12L 579 movdqa xmm7, xmm4 580 paddd xmm4, xmm2 ; xmm4=tmp11H 581 psubd xmm7, xmm2 ; xmm7=tmp12H 582 583 movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L 584 movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H 585 movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L 586 movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H 587 588 ; -- Odd part 589 590 movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3 591 movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1 592 movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7 593 movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5 594 595 movdqa xmm5, xmm6 596 movdqa xmm4, xmm3 597 paddw xmm5, xmm1 ; xmm5=z3 598 paddw xmm4, xmm2 ; xmm4=z4 599 600 ; (Original) 601 ; z5 = (z3 + z4) * 1.175875602; 602 ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; 603 ; z3 += z5; z4 += z5; 604 ; 605 ; (This implementation) 606 ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; 607 ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); 608 609 movdqa xmm0, xmm5 610 movdqa xmm7, xmm5 611 punpcklwd xmm0, xmm4 612 punpckhwd xmm7, xmm4 613 movdqa xmm5, xmm0 614 movdqa xmm4, xmm7 615 pmaddwd xmm0, [GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3L 616 pmaddwd xmm7, [GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3H 617 pmaddwd xmm5, [GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L 618 pmaddwd xmm4, [GOTOFF(ebx,PW_F117_F078)] ; xmm4=z4H 619 620 movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L 621 movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H 622 623 ; (Original) 624 ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; 625 ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 
2.053119869; 626 ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; 627 ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; 628 ; tmp0 += z1 + z3; tmp1 += z2 + z4; 629 ; tmp2 += z2 + z3; tmp3 += z1 + z4; 630 ; 631 ; (This implementation) 632 ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; 633 ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; 634 ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); 635 ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); 636 ; tmp0 += z3; tmp1 += z4; 637 ; tmp2 += z3; tmp3 += z4; 638 639 movdqa xmm0, xmm1 640 movdqa xmm7, xmm1 641 punpcklwd xmm0, xmm3 642 punpckhwd xmm7, xmm3 643 movdqa xmm1, xmm0 644 movdqa xmm3, xmm7 645 pmaddwd xmm0, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0L 646 pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp0H 647 pmaddwd xmm1, [GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp3L 648 pmaddwd xmm3, [GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3H 649 650 paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L 651 paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H 652 paddd xmm1, xmm5 ; xmm1=tmp3L 653 paddd xmm3, xmm4 ; xmm3=tmp3H 654 655 movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L 656 movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H 657 658 movdqa xmm0, xmm2 659 movdqa xmm7, xmm2 660 punpcklwd xmm0, xmm6 661 punpckhwd xmm7, xmm6 662 movdqa xmm2, xmm0 663 movdqa xmm6, xmm7 664 pmaddwd xmm0, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1L 665 pmaddwd xmm7, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm7=tmp1H 666 pmaddwd xmm2, [GOTOFF(ebx,PW_MF256_F050)] ; xmm2=tmp2L 667 pmaddwd xmm6, [GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H 668 669 paddd xmm0, xmm5 ; xmm0=tmp1L 670 paddd xmm7, xmm4 ; xmm7=tmp1H 671 paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L 672 paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H 673 674 movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L 675 movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H 676 677 ; -- Final output stage 678 679 movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L 680 movdqa xmm4, XMMWORD [wk(1)] ; 
xmm4=tmp10H 681 682 movdqa xmm0, xmm5 683 movdqa xmm7, xmm4 684 paddd xmm5, xmm1 ; xmm5=data0L 685 paddd xmm4, xmm3 ; xmm4=data0H 686 psubd xmm0, xmm1 ; xmm0=data7L 687 psubd xmm7, xmm3 ; xmm7=data7H 688 689 movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm1=[PD_DESCALE_P2] 690 691 paddd xmm5, xmm1 692 paddd xmm4, xmm1 693 psrad xmm5, DESCALE_P2 694 psrad xmm4, DESCALE_P2 695 paddd xmm0, xmm1 696 paddd xmm7, xmm1 697 psrad xmm0, DESCALE_P2 698 psrad xmm7, DESCALE_P2 699 700 packssdw xmm5, xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70) 701 packssdw xmm0, xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77) 702 703 movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L 704 movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H 705 706 movdqa xmm4, xmm3 707 movdqa xmm7, xmm1 708 paddd xmm3, xmm2 ; xmm3=data1L 709 paddd xmm1, xmm6 ; xmm1=data1H 710 psubd xmm4, xmm2 ; xmm4=data6L 711 psubd xmm7, xmm6 ; xmm7=data6H 712 713 movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm2=[PD_DESCALE_P2] 714 715 paddd xmm3, xmm2 716 paddd xmm1, xmm2 717 psrad xmm3, DESCALE_P2 718 psrad xmm1, DESCALE_P2 719 paddd xmm4, xmm2 720 paddd xmm7, xmm2 721 psrad xmm4, DESCALE_P2 722 psrad xmm7, DESCALE_P2 723 724 packssdw xmm3, xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71) 725 packssdw xmm4, xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76) 726 727 packsswb xmm5, xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) 728 packsswb xmm3, xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) 729 730 movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L 731 movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H 732 movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L 733 movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H 734 735 movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) 736 movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) 737 738 movdqa xmm4, xmm6 739 movdqa xmm0, xmm2 740 paddd xmm6, xmm1 ; xmm6=data2L 741 paddd xmm2, xmm7 ; xmm2=data2H 742 psubd xmm4, xmm1 ; xmm4=data5L 743 psubd xmm0, xmm7 
; xmm0=data5H 744 745 movdqa xmm5, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm5=[PD_DESCALE_P2] 746 747 paddd xmm6, xmm5 748 paddd xmm2, xmm5 749 psrad xmm6, DESCALE_P2 750 psrad xmm2, DESCALE_P2 751 paddd xmm4, xmm5 752 paddd xmm0, xmm5 753 psrad xmm4, DESCALE_P2 754 psrad xmm0, DESCALE_P2 755 756 packssdw xmm6, xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72) 757 packssdw xmm4, xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75) 758 759 movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L 760 movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H 761 movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L 762 movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H 763 764 movdqa xmm2, xmm3 765 movdqa xmm0, xmm1 766 paddd xmm3, xmm7 ; xmm3=data3L 767 paddd xmm1, xmm5 ; xmm1=data3H 768 psubd xmm2, xmm7 ; xmm2=data4L 769 psubd xmm0, xmm5 ; xmm0=data4H 770 771 movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm7=[PD_DESCALE_P2] 772 773 paddd xmm3, xmm7 774 paddd xmm1, xmm7 775 psrad xmm3, DESCALE_P2 776 psrad xmm1, DESCALE_P2 777 paddd xmm2, xmm7 778 paddd xmm0, xmm7 779 psrad xmm2, DESCALE_P2 780 psrad xmm0, DESCALE_P2 781 782 movdqa xmm5, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm5=[PB_CENTERJSAMP] 783 784 packssdw xmm3, xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73) 785 packssdw xmm2, xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74) 786 787 movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) 788 movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) 789 790 packsswb xmm6, xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) 791 packsswb xmm3, xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) 792 793 paddb xmm7, xmm5 794 paddb xmm1, xmm5 795 paddb xmm6, xmm5 796 paddb xmm3, xmm5 797 798 movdqa xmm0, xmm7 ; transpose coefficients(phase 1) 799 punpcklbw xmm7, xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) 800 punpckhbw xmm0, xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) 801 movdqa xmm2, xmm6 ; transpose coefficients(phase 1) 802 punpcklbw xmm6, 
xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) 803 punpckhbw xmm2, xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) 804 805 movdqa xmm4, xmm7 ; transpose coefficients(phase 2) 806 punpcklwd xmm7, xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) 807 punpckhwd xmm4, xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) 808 movdqa xmm5, xmm2 ; transpose coefficients(phase 2) 809 punpcklwd xmm2, xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) 810 punpckhwd xmm5, xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) 811 812 movdqa xmm1, xmm7 ; transpose coefficients(phase 3) 813 punpckldq xmm7, xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) 814 punpckhdq xmm1, xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) 815 movdqa xmm3, xmm4 ; transpose coefficients(phase 3) 816 punpckldq xmm4, xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) 817 punpckhdq xmm3, xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) 818 819 pshufd xmm6, xmm7, 0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) 820 pshufd xmm0, xmm1, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) 821 pshufd xmm2, xmm4, 0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) 822 pshufd xmm5, xmm3, 0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) 823 824 mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] 825 mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] 826 movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7 827 movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1 828 mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW] 829 mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW] 830 movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 831 movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 832 833 mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] 834 mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] 835 movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 836 movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0 837 mov edx, JSAMPROW 
[edi+5*SIZEOF_JSAMPROW] 838 mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW] 839 movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2 840 movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5 841 842 pop edi 843 pop esi 844 ; pop edx ; need not be preserved 845 ; pop ecx ; unused 846 POPPIC ebx 847 mov esp, ebp ; esp <- aligned ebp 848 pop esp ; esp <- original ebp 849 pop ebp 850 ret 851 852 ; For some reason, the OS X linker does not honor the request to align the 853 ; segment unless we do this. 854 align 32