jidctflt-sse2.asm (21570B)
;
; jidctflt.asm - floating-point IDCT (SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2024, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.
;
; This file contains a floating-point implementation of the inverse DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jidctflt.c; see the jidctflt.c for more details.

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; Helpers for the second transpose phase.  Each expands to one SHUFPS that
; keeps a 64-bit half of %1 in the low half of the result and places the same
; half of %2 in the high half.

%macro UNPCKLPS2 2                      ; %1=(0 1 2 3), %2=(4 5 6 7) -> %1=(0 1 4 5)
        shufps      %1, %2, 0x44
%endmacro

%macro UNPCKHPS2 2                      ; %1=(0 1 2 3), %2=(4 5 6 7) -> %1=(2 3 6 7)
        shufps      %1, %2, 0xEE
%endmacro

; --------------------------------------------------------------------------
; Read-only constants.  Every entry is replicated across a full XMM register
; so that it can be used directly as a packed memory operand.
        SECTION     SEG_CONST

        ALIGNZ      32
        GLOBAL_DATA(jconst_idct_float_sse2)

EXTN(jconst_idct_float_sse2):

PD_1_414        times 4  dd  1.414213562373095048801689    ; sqrt(2)
PD_1_847        times 4  dd  1.847759065022573512256366    ; 2*cos(pi/8)
PD_1_082        times 4  dd  1.082392200292393968799446    ; 2*(cos(pi/8)-cos(3*pi/8))
PD_M2_613       times 4  dd -2.613125929752753055713286    ; -2*(cos(pi/8)+cos(3*pi/8))
PD_RNDINT_MAGIC times 4  dd  100663296.0                   ; (float)(0x00C00000 << 3)
PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE

        ALIGNZ      32

; --------------------------------------------------------------------------
        SECTION     SEG_TEXT
        BITS        32
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
;                       JSAMPARRAY output_buf, JDIMENSION output_col)
;
; Arguments are read from the stack relative to the address of the saved
; ebp (return address at +4, first argument at +8).
;
; Stack frame (ebp is the 16-byte-aligned frame base set up in the prologue):
;   [ebp + 0]    esp value captured right after "push ebp"
;   wk(0..1)     two XMM-sized scratch slots directly below ebp
;   workspace    DCTSIZE2 FAST_FLOATs directly below wk(0)
;
; Register roles:
;   ebx          GOT address for GOTOFF() (borrowed briefly as a row pointer
;                in pass 2, bracketed by PUSHPIC/POPPIC)
;   pass 1:      edx = quantptr, esi = inptr, edi = wsptr, ecx = loop counter,
;                eax = scratch for the zero-column test
;   pass 2:      esi = wsptr, edi = output row-pointer cursor,
;                eax = output_col, ecx = loop counter, edx = row pointer
;   xmm0-xmm7    all used as scratch
;
; ebx, esi and edi are saved and restored; ecx and edx are not.
;
; Lane annotations list the lowest lane first; a two-digit tag "RC" stands
; for row R, column C of the group of elements currently being processed.
;

%define dct_table(b)   (b) + 8          ; void *dct_table
%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
%define output_col(b)  (b) + 20         ; JDIMENSION output_col

%define WK_NUM         2
%define original_ebp   ebp + 0
%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
                                        ; xmmword wk[WK_NUM]
%define workspace      wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
                                        ; FAST_FLOAT workspace[DCTSIZE2]

        align       32
        GLOBAL_FUNCTION(jsimd_idct_float_sse2)

EXTN(jsimd_idct_float_sse2):
        ; -- Prologue: build an XMM-aligned frame and remember how to undo it.
        push        ebp
        mov         eax, esp                        ; eax -> saved ebp (argument base)
        sub         esp, byte 4                     ; room for the saved pointer
        and         esp, byte (-SIZEOF_XMMWORD)     ; round down to an XMM boundary
        mov         [esp], eax                      ; [original_ebp] = eax
        mov         ebp, esp                        ; ebp = aligned frame base
        lea         esp, [workspace]                ; reserve wk[] and workspace[]
        push        ebx
        ; ecx and edx need not be preserved, so they are not pushed.
        push        esi
        push        edi

        GET_GOT     ebx                             ; ebx = GOT address

        ; ==== Pass 1: columns of the coefficient block -> workspace ========
        ; Each iteration handles four adjacent columns, one per SSE lane.

        ; eax still holds the argument base from the prologue, so it is not
        ; reloaded from [original_ebp] here.
        mov         edx, POINTER [dct_table(eax)]   ; quantptr
        mov         esi, JCOEFPTR [coef_block(eax)] ; inptr
        lea         edi, [workspace]                ; FAST_FLOAT *wsptr
        mov         ecx, DCTSIZE/4                  ; ctr
        ALIGNX      16, 7
.col_loop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
        ; Cheap early-out: if the leading dword of row 1 or row 2 is
        ; non-zero, the full transform is needed.
        mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
        or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
        jnz         near .col_idct

        ; Otherwise OR together rows 1..7 of these four columns.
        movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
        movq        xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
        movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
        movq        xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
        movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
        movq        xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
        movq        xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
        por         xmm1, xmm2
        por         xmm3, xmm4
        por         xmm5, xmm6
        por         xmm1, xmm3
        por         xmm5, xmm7
        por         xmm1, xmm5
        packsswb    xmm1, xmm1                      ; saturating pack: non-zero word -> non-zero byte
        movd        eax, xmm1                       ; four bytes, one per column
        test        eax, eax
        jnz         short .col_idct

        ; -- Every AC term of the four columns is zero: each column's output
        ;    is its dequantized DC value repeated eight times.

        movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]

        punpcklwd   xmm0, xmm0                      ; duplicate each word ...
        psrad       xmm0, (DWORD_BIT-WORD_BIT)      ; ... and shift: sign-extended dwords
        cvtdq2ps    xmm0, xmm0                      ; xmm0 = (float) in0 = (00 01 02 03)

        mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

        movaps      xmm1, xmm0
        movaps      xmm2, xmm0
        movaps      xmm3, xmm0

        shufps      xmm0, xmm0, 0x00                ; broadcast lane 0: (00 00 00 00)
        shufps      xmm1, xmm1, 0x55                ; broadcast lane 1: (01 01 01 01)
        shufps      xmm2, xmm2, 0xAA                ; broadcast lane 2: (02 02 02 02)
        shufps      xmm3, xmm3, 0xFF                ; broadcast lane 3: (03 03 03 03)

        movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
        movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
        movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
        movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
        movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
        movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
        movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
        movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
        jmp         near .col_next
        ALIGNX      16, 7
%endif
.col_idct:

        ; -- Even part: rows 0, 2, 4, 6

        movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
        movq        xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
        movq        xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
        movq        xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]

        ; Sign-extend the 16-bit coefficients to dwords and convert to float.
        punpcklwd   xmm0, xmm0                      ; (00 00 01 01 02 02 03 03)
        punpcklwd   xmm1, xmm1                      ; (20 20 21 21 22 22 23 23)
        psrad       xmm0, (DWORD_BIT-WORD_BIT)      ; in0 as dwords (00 01 02 03)
        psrad       xmm1, (DWORD_BIT-WORD_BIT)      ; in2 as dwords (20 21 22 23)
        cvtdq2ps    xmm0, xmm0                      ; xmm0 = (float) in0
        cvtdq2ps    xmm1, xmm1                      ; xmm1 = (float) in2

        punpcklwd   xmm2, xmm2                      ; (40 40 41 41 42 42 43 43)
        punpcklwd   xmm3, xmm3                      ; (60 60 61 61 62 62 63 63)
        psrad       xmm2, (DWORD_BIT-WORD_BIT)      ; in4 as dwords (40 41 42 43)
        psrad       xmm3, (DWORD_BIT-WORD_BIT)      ; in6 as dwords (60 61 62 63)
        cvtdq2ps    xmm2, xmm2                      ; xmm2 = (float) in4
        cvtdq2ps    xmm3, xmm3                      ; xmm3 = (float) in6

        ; Dequantize: multiply by the matching rows of the multiplier table.
        mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps       xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps       xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps       xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

        movaps      xmm4, xmm0
        movaps      xmm5, xmm1
        subps       xmm0, xmm2                      ; tmp11 = in0 - in4
        subps       xmm1, xmm3                      ; in2 - in6
        addps       xmm4, xmm2                      ; tmp10 = in0 + in4
        addps       xmm5, xmm3                      ; tmp13 = in2 + in6

        mulps       xmm1, [GOTOFF(ebx,PD_1_414)]
        subps       xmm1, xmm5                      ; tmp12 = (in2 - in6) * 1.414 - tmp13

        movaps      xmm6, xmm4
        movaps      xmm7, xmm0
        subps       xmm4, xmm5                      ; tmp3 = tmp10 - tmp13
        subps       xmm0, xmm1                      ; tmp2 = tmp11 - tmp12
        addps       xmm6, xmm5                      ; tmp0 = tmp10 + tmp13
        addps       xmm7, xmm1                      ; tmp1 = tmp11 + tmp12

        ; tmp2/tmp3 are parked in memory to free registers for the odd part.
        movaps      XMMWORD [wk(1)], xmm4           ; wk(1) = tmp3
        movaps      XMMWORD [wk(0)], xmm0           ; wk(0) = tmp2

        ; -- Odd part: rows 1, 3, 5, 7

        movq        xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
        movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
        movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
        movq        xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]

        punpcklwd   xmm2, xmm2                      ; (10 10 11 11 12 12 13 13)
        punpcklwd   xmm3, xmm3                      ; (30 30 31 31 32 32 33 33)
        psrad       xmm2, (DWORD_BIT-WORD_BIT)      ; in1 as dwords (10 11 12 13)
        psrad       xmm3, (DWORD_BIT-WORD_BIT)      ; in3 as dwords (30 31 32 33)
        cvtdq2ps    xmm2, xmm2                      ; xmm2 = (float) in1
        cvtdq2ps    xmm3, xmm3                      ; xmm3 = (float) in3

        punpcklwd   xmm5, xmm5                      ; (50 50 51 51 52 52 53 53)
        punpcklwd   xmm1, xmm1                      ; (70 70 71 71 72 72 73 73)
        psrad       xmm5, (DWORD_BIT-WORD_BIT)      ; in5 as dwords (50 51 52 53)
        psrad       xmm1, (DWORD_BIT-WORD_BIT)      ; in7 as dwords (70 71 72 73)
        cvtdq2ps    xmm5, xmm5                      ; xmm5 = (float) in5
        cvtdq2ps    xmm1, xmm1                      ; xmm1 = (float) in7

        mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps       xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps       xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps       xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

        movaps      xmm4, xmm2
        movaps      xmm0, xmm5
        addps       xmm2, xmm1                      ; z11 = in1 + in7
        addps       xmm5, xmm3                      ; z13 = in5 + in3
        subps       xmm4, xmm1                      ; z12 = in1 - in7
        subps       xmm0, xmm3                      ; z10 = in5 - in3

        movaps      xmm1, xmm2
        subps       xmm2, xmm5                      ; z11 - z13
        addps       xmm1, xmm5                      ; tmp7 = z11 + z13

        mulps       xmm2, [GOTOFF(ebx,PD_1_414)]    ; tmp11 = (z11 - z13) * 1.414

        movaps      xmm3, xmm0
        addps       xmm0, xmm4                      ; z10 + z12
        mulps       xmm0, [GOTOFF(ebx,PD_1_847)]    ; z5 = (z10 + z12) * 1.847
        mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]   ; z10 * -2.613
        mulps       xmm4, [GOTOFF(ebx,PD_1_082)]    ; z12 * 1.082
        addps       xmm3, xmm0                      ; tmp12 = z10 * -2.613 + z5
        subps       xmm4, xmm0                      ; tmp10 = z12 * 1.082 - z5

        ; -- Combine even and odd parts, transposing on the way out

        subps       xmm3, xmm1                      ; tmp6 = tmp12 - tmp7
        movaps      xmm5, xmm6
        movaps      xmm0, xmm7
        addps       xmm6, xmm1                      ; data0 = tmp0 + tmp7 = (00 01 02 03)
        addps       xmm7, xmm3                      ; data1 = tmp1 + tmp6 = (10 11 12 13)
        subps       xmm5, xmm1                      ; data7 = tmp0 - tmp7 = (70 71 72 73)
        subps       xmm0, xmm3                      ; data6 = tmp1 - tmp6 = (60 61 62 63)
        subps       xmm2, xmm3                      ; tmp5 = tmp11 - tmp6

        movaps      xmm1, xmm6                      ; transpose, phase 1 (rows 0/1)
        unpcklps    xmm6, xmm7                      ; xmm6 = (00 10 01 11)
        unpckhps    xmm1, xmm7                      ; xmm1 = (02 12 03 13)
        movaps      xmm3, xmm0                      ; transpose, phase 1 (rows 6/7)
        unpcklps    xmm0, xmm5                      ; xmm0 = (60 70 61 71)
        unpckhps    xmm3, xmm5                      ; xmm3 = (62 72 63 73)

        ; Swap the parked even-part values for the rows-6/7 intermediates.
        movaps      xmm7, XMMWORD [wk(0)]           ; xmm7 = tmp2
        movaps      xmm5, XMMWORD [wk(1)]           ; xmm5 = tmp3

        movaps      XMMWORD [wk(0)], xmm0           ; wk(0) = (60 70 61 71)
        movaps      XMMWORD [wk(1)], xmm3           ; wk(1) = (62 72 63 73)

        addps       xmm4, xmm2                      ; tmp4 = tmp10 + tmp5
        movaps      xmm0, xmm7
        movaps      xmm3, xmm5
        addps       xmm7, xmm2                      ; data2 = tmp2 + tmp5 = (20 21 22 23)
        addps       xmm5, xmm4                      ; data4 = tmp3 + tmp4 = (40 41 42 43)
        subps       xmm0, xmm2                      ; data5 = tmp2 - tmp5 = (50 51 52 53)
        subps       xmm3, xmm4                      ; data3 = tmp3 - tmp4 = (30 31 32 33)

        movaps      xmm2, xmm7                      ; transpose, phase 1 (rows 2/3)
        unpcklps    xmm7, xmm3                      ; xmm7 = (20 30 21 31)
        unpckhps    xmm2, xmm3                      ; xmm2 = (22 32 23 33)
        movaps      xmm4, xmm5                      ; transpose, phase 1 (rows 4/5)
        unpcklps    xmm5, xmm0                      ; xmm5 = (40 50 41 51)
        unpckhps    xmm4, xmm0                      ; xmm4 = (42 52 43 53)

        movaps      xmm3, xmm6                      ; transpose, phase 2 (rows 0..3)
        UNPCKLPS2   xmm6, xmm7                      ; xmm6 = (00 10 20 30)
        UNPCKHPS2   xmm3, xmm7                      ; xmm3 = (01 11 21 31)
        movaps      xmm0, xmm1
        UNPCKLPS2   xmm1, xmm2                      ; xmm1 = (02 12 22 32)
        UNPCKHPS2   xmm0, xmm2                      ; xmm0 = (03 13 23 33)

        movaps      xmm7, XMMWORD [wk(0)]           ; xmm7 = (60 70 61 71)
        movaps      xmm2, XMMWORD [wk(1)]           ; xmm2 = (62 72 63 73)

        movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
        movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
        movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
        movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0

        movaps      xmm6, xmm5                      ; transpose, phase 2 (rows 4..7)
        UNPCKLPS2   xmm5, xmm7                      ; xmm5 = (40 50 60 70)
        UNPCKHPS2   xmm6, xmm7                      ; xmm6 = (41 51 61 71)
        movaps      xmm3, xmm4
        UNPCKLPS2   xmm4, xmm2                      ; xmm4 = (42 52 62 72)
        UNPCKHPS2   xmm3, xmm2                      ; xmm3 = (43 53 63 73)

        movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
        movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
        movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
        movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3

.col_next:
        ; Step all three cursors past the four columns just processed.
        add         esi, byte 4*SIZEOF_JCOEF                ; coef_block
        add         edx, byte 4*SIZEOF_FLOAT_MULT_TYPE      ; quantptr
        add         edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT   ; wsptr
        dec         ecx                                     ; ctr
        jnz         near .col_loop

        ; -- Prefetch the next coefficient block (non-temporal hint)

        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]

        ; ==== Pass 2: rows of the workspace -> output samples ==============
        ; Each iteration produces four output rows of eight samples.

        mov         eax, [original_ebp]             ; eax was clobbered in pass 1
        lea         esi, [workspace]                ; FAST_FLOAT *wsptr
        mov         edi, JSAMPARRAY [output_buf(eax)]   ; (JSAMPROW *)
        mov         eax, JDIMENSION [output_col(eax)]   ; eax = output_col from here on
        mov         ecx, DCTSIZE/4                  ; ctr
        ALIGNX      16, 7
.row_loop:

        ; -- Even part (same arithmetic as pass 1, inputs already float)

        movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
        movaps      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
        movaps      xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
        movaps      xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]

        movaps      xmm4, xmm0
        movaps      xmm5, xmm1
        subps       xmm0, xmm2                      ; tmp11 = ws0 - ws4
        subps       xmm1, xmm3                      ; ws2 - ws6
        addps       xmm4, xmm2                      ; tmp10 = ws0 + ws4
        addps       xmm5, xmm3                      ; tmp13 = ws2 + ws6

        mulps       xmm1, [GOTOFF(ebx,PD_1_414)]
        subps       xmm1, xmm5                      ; tmp12 = (ws2 - ws6) * 1.414 - tmp13

        movaps      xmm6, xmm4
        movaps      xmm7, xmm0
        subps       xmm4, xmm5                      ; tmp3 = tmp10 - tmp13
        subps       xmm0, xmm1                      ; tmp2 = tmp11 - tmp12
        addps       xmm6, xmm5                      ; tmp0 = tmp10 + tmp13
        addps       xmm7, xmm1                      ; tmp1 = tmp11 + tmp12

        movaps      XMMWORD [wk(1)], xmm4           ; wk(1) = tmp3
        movaps      XMMWORD [wk(0)], xmm0           ; wk(0) = tmp2

        ; -- Odd part

        movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
        movaps      xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
        movaps      xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
        movaps      xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]

        movaps      xmm4, xmm2
        movaps      xmm0, xmm5
        addps       xmm2, xmm1                      ; z11 = ws1 + ws7
        addps       xmm5, xmm3                      ; z13 = ws5 + ws3
        subps       xmm4, xmm1                      ; z12 = ws1 - ws7
        subps       xmm0, xmm3                      ; z10 = ws5 - ws3

        movaps      xmm1, xmm2
        subps       xmm2, xmm5                      ; z11 - z13
        addps       xmm1, xmm5                      ; tmp7 = z11 + z13

        mulps       xmm2, [GOTOFF(ebx,PD_1_414)]    ; tmp11 = (z11 - z13) * 1.414

        movaps      xmm3, xmm0
        addps       xmm0, xmm4                      ; z10 + z12
        mulps       xmm0, [GOTOFF(ebx,PD_1_847)]    ; z5 = (z10 + z12) * 1.847
        mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]   ; z10 * -2.613
        mulps       xmm4, [GOTOFF(ebx,PD_1_082)]    ; z12 * 1.082
        addps       xmm3, xmm0                      ; tmp12 = z10 * -2.613 + z5
        subps       xmm4, xmm0                      ; tmp10 = z12 * 1.082 - z5

        ; -- Final output stage, first half: columns 0, 1, 6, 7

        subps       xmm3, xmm1                      ; tmp6 = tmp12 - tmp7
        movaps      xmm5, xmm6
        movaps      xmm0, xmm7
        addps       xmm6, xmm1                      ; data0 = tmp0 + tmp7 = (00 10 20 30)
        addps       xmm7, xmm3                      ; data1 = tmp1 + tmp6 = (01 11 21 31)
        subps       xmm5, xmm1                      ; data7 = tmp0 - tmp7 = (07 17 27 37)
        subps       xmm0, xmm3                      ; data6 = tmp1 - tmp6 = (06 16 26 36)
        subps       xmm2, xmm3                      ; tmp5 = tmp11 - tmp6

        ; Float -> integer without a convert instruction: after adding the
        ; magic bias, the low word of each dword holds roundint(data / 8).
        movaps      xmm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)]
        pcmpeqd     xmm3, xmm3                      ; all ones
        psrld       xmm3, WORD_BIT                  ; per-dword mask 0x0000FFFF

        addps       xmm6, xmm1                      ; roundint(data0/8) = (00 ** 10 ** 20 ** 30 **)
        addps       xmm7, xmm1                      ; roundint(data1/8) = (01 ** 11 ** 21 ** 31 **)
        addps       xmm0, xmm1                      ; roundint(data6/8) = (06 ** 16 ** 26 ** 36 **)
        addps       xmm5, xmm1                      ; roundint(data7/8) = (07 ** 17 ** 27 ** 37 **)

        ; Interleave two columns per register: keep the low word of one,
        ; shift the other's low word into the high-word position, then merge.
        pand        xmm6, xmm3                      ; (00 -- 10 -- 20 -- 30 --)
        pslld       xmm7, WORD_BIT                  ; (-- 01 -- 11 -- 21 -- 31)
        pand        xmm0, xmm3                      ; (06 -- 16 -- 26 -- 36 --)
        pslld       xmm5, WORD_BIT                  ; (-- 07 -- 17 -- 27 -- 37)
        por         xmm6, xmm7                      ; xmm6 = (00 01 10 11 20 21 30 31)
        por         xmm0, xmm5                      ; xmm0 = (06 07 16 17 26 27 36 37)

        ; -- Final output stage, second half: columns 2, 3, 4, 5

        movaps      xmm1, XMMWORD [wk(0)]           ; xmm1 = tmp2
        movaps      xmm3, XMMWORD [wk(1)]           ; xmm3 = tmp3

        addps       xmm4, xmm2                      ; tmp4 = tmp10 + tmp5
        movaps      xmm7, xmm1
        movaps      xmm5, xmm3
        addps       xmm1, xmm2                      ; data2 = tmp2 + tmp5 = (02 12 22 32)
        addps       xmm3, xmm4                      ; data4 = tmp3 + tmp4 = (04 14 24 34)
        subps       xmm7, xmm2                      ; data5 = tmp2 - tmp5 = (05 15 25 35)
        subps       xmm5, xmm4                      ; data3 = tmp3 - tmp4 = (03 13 23 33)

        movaps      xmm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)]
        pcmpeqd     xmm4, xmm4                      ; all ones
        psrld       xmm4, WORD_BIT                  ; per-dword mask 0x0000FFFF

        addps       xmm3, xmm2                      ; roundint(data4/8) = (04 ** 14 ** 24 ** 34 **)
        addps       xmm7, xmm2                      ; roundint(data5/8) = (05 ** 15 ** 25 ** 35 **)
        addps       xmm1, xmm2                      ; roundint(data2/8) = (02 ** 12 ** 22 ** 32 **)
        addps       xmm5, xmm2                      ; roundint(data3/8) = (03 ** 13 ** 23 ** 33 **)

        pand        xmm3, xmm4                      ; (04 -- 14 -- 24 -- 34 --)
        pslld       xmm7, WORD_BIT                  ; (-- 05 -- 15 -- 25 -- 35)
        pand        xmm1, xmm4                      ; (02 -- 12 -- 22 -- 32 --)
        pslld       xmm5, WORD_BIT                  ; (-- 03 -- 13 -- 23 -- 33)
        por         xmm3, xmm7                      ; xmm3 = (04 05 14 15 24 25 34 35)
        por         xmm1, xmm5                      ; xmm1 = (02 03 12 13 22 23 32 33)

        ; -- Narrow to bytes, add the sample-centre offset, and transpose

        movdqa      xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; 16 x CENTERJSAMPLE

        ; Signed-saturating pack of the 16-bit values down to bytes.
        packsswb    xmm6, xmm3                      ; (00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
        packsswb    xmm1, xmm0                      ; (02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
        paddb       xmm6, xmm2
        paddb       xmm1, xmm2

        movdqa      xmm4, xmm6                      ; transpose, phase 2
        punpcklwd   xmm6, xmm1                      ; (00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
        punpckhwd   xmm4, xmm1                      ; (04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)

        movdqa      xmm7, xmm6                      ; transpose, phase 3
        punpckldq   xmm6, xmm4                      ; (00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
        punpckhdq   xmm7, xmm4                      ; (20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)

        ; Swap the 64-bit halves so rows 1 and 3 sit in the low qwords,
        ; where MOVQ can store them.
        pshufd      xmm5, xmm6, 0x4E                ; (10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
        pshufd      xmm3, xmm7, 0x4E                ; (30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)

        ; ebx doubles as a second row pointer for the stores, so the GOT
        ; address is saved first and restored afterwards.
        PUSHPIC     ebx

        mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
        mov         ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
        movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6   ; row 0
        movq        XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7   ; row 2
        mov         edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
        mov         ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
        movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5   ; row 1
        movq        XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3   ; row 3

        POPPIC      ebx

        add         esi, byte 4*SIZEOF_FAST_FLOAT   ; wsptr
        add         edi, byte 4*SIZEOF_JSAMPROW     ; next four row pointers
        dec         ecx                             ; ctr
        jnz         near .row_loop

        ; -- Epilogue: mirror of the prologue
        pop         edi
        pop         esi
        ; edx and ecx were never saved, so there is nothing to pop for them.
        pop         ebx
        mov         esp, ebp                        ; esp = aligned frame base
        pop         esp                             ; esp = value stored at [original_ebp]
        pop         ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align       32