jidctflt-sse2.asm (20576B)
;
; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.
;
; This file contains a floating-point implementation of the inverse DCT
; (Discrete Cosine Transform).  The following code is based directly on
; the IJG's original jidctflt.c; see the jidctflt.c for more details.

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; Helper macros for the 4x4 float transposes.
;
; Both macros expand to a single SHUFPS.  With SHUFPS, the two low result
; lanes are selected from the destination (%1) and the two high result lanes
; from the source (%2); each 2-bit field of the immediate picks one lane.
;   0x44 = 01 00 01 00b -> lanes 0,1 of %1 followed by lanes 0,1 of %2
;   0xEE = 11 10 11 10b -> lanes 2,3 of %1 followed by lanes 2,3 of %2
; i.e. they behave like 64-bit-wide "unpack low" / "unpack high" on pairs of
; floats.  %1 is overwritten; %2 is left unchanged.

%macro UNPCKLPS2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
    shufps      %1, %2, 0x44
%endmacro

%macro UNPCKHPS2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
    shufps      %1, %2, 0xEE
%endmacro

; --------------------------------------------------------------------------
; Constant table.
;
; Every PD_* entry is one 32-bit float repeated four times ("times 4 dd"),
; so a single 16-byte memory operand supplies the same multiplier to all
; four lanes of an XMM register.  PB_CENTERJSAMP is CENTERJSAMPLE repeated
; in all 16 bytes.  The table is exported under the single symbol
; jconst_idct_float_sse2 and is padded with ALIGNZ 32 on both sides.

    SECTION     SEG_CONST

    ALIGNZ      32
    GLOBAL_DATA(jconst_idct_float_sse2)

EXTN(jconst_idct_float_sse2):

PD_1_414        times 4  dd  1.414213562373095048801689   ; ~= sqrt(2)
PD_1_847        times 4  dd  1.847759065022573512256366
PD_1_082        times 4  dd  1.082392200292393968799446
PD_M2_613       times 4  dd -2.613125929752753055713286   ; note: negative
PD_RNDINT_MAGIC times 4  dd  100663296.0  ; (float)(0x00C00000 << 3)
PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE

    ALIGNZ      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
;                       JSAMPARRAY output_buf, JDIMENSION output_col)
;
; Overview (derived from the code below):
;   Pass 1 walks the coefficient block four columns at a time, converts the
;          16-bit coefficients to float, multiplies them by the matching
;          entries of dct_table, runs the 1-D IDCT, transposes the 4x8
;          result and stores it in the on-stack float workspace.
;   Pass 2 walks the workspace four rows at a time, runs the 1-D IDCT again,
;          converts to integers (descaled by 8, per the roundint(x/8) notes
;          below), saturates to signed bytes, adds CENTERJSAMPLE and writes
;          eight samples to each of four output rows at column output_col.
;
; Stack frame (addresses grow upward):
;   [rbp]           saved rbp
;   [rbp-8]         saved r15
;   r15             16-byte-aligned anchor for the locals
;   wk(0)..wk(1)    WK_NUM xmmword scratch slots directly below r15
;   workspace       DCTSIZE2 FAST_FLOATs directly below wk(0); rsp is set
;                   to this address before anything else is pushed
;
; Registers written here: rax, rcx, rdx, rsi, rdi, xmm0-xmm7, plus rbx and
; r15 which are saved and restored.  COLLECT_ARGS / UNCOLLECT_ARGS are
; defined outside this file (presumably in jsimdext.inc); per the register
; list below they are expected to make the four arguments available in
; r10-r13 -- NOTE(review): confirm which registers they save/restore.
;

; r10 = void *dct_table
; r11 = JCOEFPTR coef_block
; r12 = JSAMPARRAY output_buf
; r13d = JDIMENSION output_col

%define wk(i)         r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD
                                        ; xmmword wk[WK_NUM]
%define WK_NUM        2
%define workspace     wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
                                        ; FAST_FLOAT workspace[DCTSIZE2]

    align       32
    GLOBAL_FUNCTION(jsimd_idct_float_sse2)

EXTN(jsimd_idct_float_sse2):
    ENDBR64
    push        rbp
    mov         rbp, rsp
    push        r15                     ; saved at [rbp-8]; see epilogue
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    ; Allocate stack space for wk array.  r15 is used to access it.
    mov         r15, rsp
    lea         rsp, [workspace]        ; reserve wk[] and workspace[] in one step
    COLLECT_ARGS 4
    push        rbx                     ; rbx is used as a row pointer in pass 2

    ; ---- Pass 1: process columns from input, store into work array.

    mov         rdx, r10                ; quantptr
    mov         rsi, r11                ; inptr
    lea         rdi, [workspace]        ; FAST_FLOAT *wsptr
    mov         rcx, DCTSIZE/4          ; ctr
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
    ; Shortcut for columns whose AC terms are all zero.
    ; First a cheap scalar check on the dwords at DWBLOCK(1,0) and
    ; DWBLOCK(2,0) (presumably the first two coefficients of rows 1 and 2):
    ; if either is non-zero, go straight to the full IDCT.
    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    jnz         near .columnDCT

    ; Full check: OR together the four coefficients of rows 1..7.
    movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    movq        xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
    movq        xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
    movq        xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
    movq        xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
    por         xmm1, xmm2
    por         xmm3, xmm4
    por         xmm5, xmm6
    por         xmm1, xmm3
    por         xmm5, xmm7
    por         xmm1, xmm5              ; xmm1 = OR of rows 1..7
    packsswb    xmm1, xmm1              ; signed saturation keeps any
                                        ; non-zero word non-zero as a byte
    movd        eax, xmm1               ; low 4 bytes = the 4 packed words
    test        rax, rax
    jnz         short .columnDCT

    ; -- AC terms all zero

    ; With only the DC term present, the column IDCT output is the
    ; dequantized DC value replicated down the column, so each of the four
    ; transposed workspace rows is one value broadcast across all 8 slots.

    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]

    ; punpcklwd x,x + arithmetic right shift = sign-extend words to dwords
    punpcklwd   xmm0, xmm0              ; xmm0=(00 00 01 01 02 02 03 03)
    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
    cvtdq2ps    xmm0, xmm0              ; xmm0=in0=(00 01 02 03)

    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

    movaps      xmm1, xmm0
    movaps      xmm2, xmm0
    movaps      xmm3, xmm0

    shufps      xmm0, xmm0, 0x00        ; xmm0=(00 00 00 00)
    shufps      xmm1, xmm1, 0x55        ; xmm1=(01 01 01 01)
    shufps      xmm2, xmm2, 0xAA        ; xmm2=(02 02 02 02)
    shufps      xmm3, xmm3, 0xFF        ; xmm3=(03 03 03 03)

    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
    movaps      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
    movaps      XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
    movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
    movaps      XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
    jmp         near .nextcolumn
%endif
.columnDCT:

    ; -- Even part

    ; Load rows 0/2/4/6 (four coefficients each), sign-extend to 32 bits,
    ; convert to float and dequantize with the matching dct_table rows.

    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
    movq        xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    movq        xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
    movq        xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]

    punpcklwd   xmm0, xmm0              ; xmm0=(00 00 01 01 02 02 03 03)
    punpcklwd   xmm1, xmm1              ; xmm1=(20 20 21 21 22 22 23 23)
    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in2=(20 21 22 23)
    cvtdq2ps    xmm0, xmm0              ; xmm0=in0=(00 01 02 03)
    cvtdq2ps    xmm1, xmm1              ; xmm1=in2=(20 21 22 23)

    punpcklwd   xmm2, xmm2              ; xmm2=(40 40 41 41 42 42 43 43)
    punpcklwd   xmm3, xmm3              ; xmm3=(60 60 61 61 62 62 63 63)
    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in4=(40 41 42 43)
    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in6=(60 61 62 63)
    cvtdq2ps    xmm2, xmm2              ; xmm2=in4=(40 41 42 43)
    cvtdq2ps    xmm3, xmm3              ; xmm3=in6=(60 61 62 63)

    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

    movaps      xmm4, xmm0
    movaps      xmm5, xmm1
    subps       xmm0, xmm2              ; xmm0=tmp11
    subps       xmm1, xmm3
    addps       xmm4, xmm2              ; xmm4=tmp10
    addps       xmm5, xmm3              ; xmm5=tmp13

    mulps       xmm1, [rel PD_1_414]
    subps       xmm1, xmm5              ; xmm1=tmp12

    movaps      xmm6, xmm4
    movaps      xmm7, xmm0
    subps       xmm4, xmm5              ; xmm4=tmp3
    subps       xmm0, xmm1              ; xmm0=tmp2
    addps       xmm6, xmm5              ; xmm6=tmp0
    addps       xmm7, xmm1              ; xmm7=tmp1

    ; Spill tmp2/tmp3: all eight XMM registers used here are needed for the
    ; odd part.  xmm6 (tmp0) and xmm7 (tmp1) stay live in registers.
    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
    movaps      XMMWORD [wk(0)], xmm0   ; tmp2

    ; -- Odd part

    movq        xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
    movq        xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]

    punpcklwd   xmm2, xmm2              ; xmm2=(10 10 11 11 12 12 13 13)
    punpcklwd   xmm3, xmm3              ; xmm3=(30 30 31 31 32 32 33 33)
    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in1=(10 11 12 13)
    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in3=(30 31 32 33)
    cvtdq2ps    xmm2, xmm2              ; xmm2=in1=(10 11 12 13)
    cvtdq2ps    xmm3, xmm3              ; xmm3=in3=(30 31 32 33)

    punpcklwd   xmm5, xmm5              ; xmm5=(50 50 51 51 52 52 53 53)
    punpcklwd   xmm1, xmm1              ; xmm1=(70 70 71 71 72 72 73 73)
    psrad       xmm5, (DWORD_BIT-WORD_BIT)  ; xmm5=in5=(50 51 52 53)
    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in7=(70 71 72 73)
    cvtdq2ps    xmm5, xmm5              ; xmm5=in5=(50 51 52 53)
    cvtdq2ps    xmm1, xmm1              ; xmm1=in7=(70 71 72 73)

    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

    movaps      xmm4, xmm2
    movaps      xmm0, xmm5
    addps       xmm2, xmm1              ; xmm2=z11
    addps       xmm5, xmm3              ; xmm5=z13
    subps       xmm4, xmm1              ; xmm4=z12
    subps       xmm0, xmm3              ; xmm0=z10

    movaps      xmm1, xmm2
    subps       xmm2, xmm5
    addps       xmm1, xmm5              ; xmm1=tmp7

    mulps       xmm2, [rel PD_1_414]    ; xmm2=tmp11

    movaps      xmm3, xmm0
    addps       xmm0, xmm4
    mulps       xmm0, [rel PD_1_847]    ; xmm0=z5
    mulps       xmm3, [rel PD_M2_613]   ; xmm3=(z10 * -2.613125930)
    mulps       xmm4, [rel PD_1_082]    ; xmm4=(z12 * 1.082392200)
    addps       xmm3, xmm0              ; xmm3=tmp12
    subps       xmm4, xmm0              ; xmm4=tmp10

    ; -- Final output stage

    subps       xmm3, xmm1              ; xmm3=tmp6
    movaps      xmm5, xmm6
    movaps      xmm0, xmm7
    addps       xmm6, xmm1              ; xmm6=data0=(00 01 02 03)
    addps       xmm7, xmm3              ; xmm7=data1=(10 11 12 13)
    subps       xmm5, xmm1              ; xmm5=data7=(70 71 72 73)
    subps       xmm0, xmm3              ; xmm0=data6=(60 61 62 63)
    subps       xmm2, xmm3              ; xmm2=tmp5

    movaps      xmm1, xmm6              ; transpose coefficients(phase 1)
    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
    unpckhps    xmm1, xmm7              ; xmm1=(02 12 03 13)
    movaps      xmm3, xmm0              ; transpose coefficients(phase 1)
    unpcklps    xmm0, xmm5              ; xmm0=(60 70 61 71)
    unpckhps    xmm3, xmm5              ; xmm3=(62 72 63 73)

    ; Reload tmp2/tmp3, then reuse the same two wk slots to park the
    ; half-transposed rows 6/7 until phase 2 below.  The loads must come
    ; before the stores.
    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
    movaps      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3

    movaps      XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
    movaps      XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)

    addps       xmm4, xmm2              ; xmm4=tmp4
    movaps      xmm0, xmm7
    movaps      xmm3, xmm5
    addps       xmm7, xmm2              ; xmm7=data2=(20 21 22 23)
    addps       xmm5, xmm4              ; xmm5=data4=(40 41 42 43)
    subps       xmm0, xmm2              ; xmm0=data5=(50 51 52 53)
    subps       xmm3, xmm4              ; xmm3=data3=(30 31 32 33)

    movaps      xmm2, xmm7              ; transpose coefficients(phase 1)
    unpcklps    xmm7, xmm3              ; xmm7=(20 30 21 31)
    unpckhps    xmm2, xmm3              ; xmm2=(22 32 23 33)
    movaps      xmm4, xmm5              ; transpose coefficients(phase 1)
    unpcklps    xmm5, xmm0              ; xmm5=(40 50 41 51)
    unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)

    movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
    UNPCKLPS2   xmm6, xmm7              ; xmm6=(00 10 20 30)
    UNPCKHPS2   xmm3, xmm7              ; xmm3=(01 11 21 31)
    movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
    UNPCKLPS2   xmm1, xmm2              ; xmm1=(02 12 22 32)
    UNPCKHPS2   xmm0, xmm2              ; xmm0=(03 13 23 33)

    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
    movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)

    ; Left half (elements 0-3) of the four transposed workspace rows.
    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
    movaps      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
    movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0

    movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
    UNPCKLPS2   xmm5, xmm7              ; xmm5=(40 50 60 70)
    UNPCKHPS2   xmm6, xmm7              ; xmm6=(41 51 61 71)
    movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
    UNPCKLPS2   xmm4, xmm2              ; xmm4=(42 52 62 72)
    UNPCKHPS2   xmm3, xmm2              ; xmm3=(43 53 63 73)

    ; Right half (elements 4-7) of the same four workspace rows.
    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
    movaps      XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
    movaps      XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3

.nextcolumn:
    ; Advance by four input columns; because the output is transposed, the
    ; workspace pointer advances by four whole rows.
    add         rsi, byte 4*SIZEOF_JCOEF               ; coef_block
    add         rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
    add         rdi,      4*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
    dec         rcx                                    ; ctr
    jnz         near .columnloop

    ; -- Prefetch the next coefficient block

    ; rsi has been advanced by 4 coefficients per loop iteration above, so
    ; the (DCTSIZE2-8) offset is relative to that advanced pointer
    ; (presumably landing on the block that follows this one when
    ; DCTSIZE == 8 -- confirm against jdct.inc).  Four 32-byte steps.
    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]

    ; ---- Pass 2: process rows from work array, store into output array.

    lea         rsi, [workspace]        ; FAST_FLOAT *wsptr
    mov         rdi, r12                ; (JSAMPROW *)
    mov         eax, r13d               ; output_col (32-bit move zero-extends
                                        ; into rax, used as an index below)
    mov         rcx, DCTSIZE/4          ; ctr
.rowloop:

    ; -- Even part

    ; The workspace holds the pass 1 results transposed, so loading
    ; workspace rows 0/2/4/6 yields elements 0/2/4/6 of four output rows.

    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]

    movaps      xmm4, xmm0
    movaps      xmm5, xmm1
    subps       xmm0, xmm2              ; xmm0=tmp11
    subps       xmm1, xmm3
    addps       xmm4, xmm2              ; xmm4=tmp10
    addps       xmm5, xmm3              ; xmm5=tmp13

    mulps       xmm1, [rel PD_1_414]
    subps       xmm1, xmm5              ; xmm1=tmp12

    movaps      xmm6, xmm4
    movaps      xmm7, xmm0
    subps       xmm4, xmm5              ; xmm4=tmp3
    subps       xmm0, xmm1              ; xmm0=tmp2
    addps       xmm6, xmm5              ; xmm6=tmp0
    addps       xmm7, xmm1              ; xmm7=tmp1

    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
    movaps      XMMWORD [wk(0)], xmm0   ; tmp2

    ; -- Odd part

    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]

    movaps      xmm4, xmm2
    movaps      xmm0, xmm5
    addps       xmm2, xmm1              ; xmm2=z11
    addps       xmm5, xmm3              ; xmm5=z13
    subps       xmm4, xmm1              ; xmm4=z12
    subps       xmm0, xmm3              ; xmm0=z10

    movaps      xmm1, xmm2
    subps       xmm2, xmm5
    addps       xmm1, xmm5              ; xmm1=tmp7

    mulps       xmm2, [rel PD_1_414]    ; xmm2=tmp11

    movaps      xmm3, xmm0
    addps       xmm0, xmm4
    mulps       xmm0, [rel PD_1_847]    ; xmm0=z5
    mulps       xmm3, [rel PD_M2_613]   ; xmm3=(z10 * -2.613125930)
    mulps       xmm4, [rel PD_1_082]    ; xmm4=(z12 * 1.082392200)
    addps       xmm3, xmm0              ; xmm3=tmp12
    subps       xmm4, xmm0              ; xmm4=tmp10

    ; -- Final output stage

    subps       xmm3, xmm1              ; xmm3=tmp6
    movaps      xmm5, xmm6
    movaps      xmm0, xmm7
    addps       xmm6, xmm1              ; xmm6=data0=(00 10 20 30)
    addps       xmm7, xmm3              ; xmm7=data1=(01 11 21 31)
    subps       xmm5, xmm1              ; xmm5=data7=(07 17 27 37)
    subps       xmm0, xmm3              ; xmm0=data6=(06 16 26 36)
    subps       xmm2, xmm3              ; xmm2=tmp5

    ; Float -> integer without a convert instruction: adding the magic
    ; constant leaves the rounded, descaled value in the low word of each
    ; dword (the "**" high words below are don't-care bits).  Even-numbered
    ; elements are then masked to their low word, odd-numbered elements are
    ; shifted into the high word, and the two are ORed into word pairs.
    movaps      xmm1, [rel PD_RNDINT_MAGIC]  ; xmm1=[rel PD_RNDINT_MAGIC]
    pcmpeqd     xmm3, xmm3
    psrld       xmm3, WORD_BIT          ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}

    addps       xmm6, xmm1  ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
    addps       xmm7, xmm1  ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
    addps       xmm0, xmm1  ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
    addps       xmm5, xmm1  ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)

    pand        xmm6, xmm3              ; xmm6=(00 -- 10 -- 20 -- 30 --)
    pslld       xmm7, WORD_BIT          ; xmm7=(-- 01 -- 11 -- 21 -- 31)
    pand        xmm0, xmm3              ; xmm0=(06 -- 16 -- 26 -- 36 --)
    pslld       xmm5, WORD_BIT          ; xmm5=(-- 07 -- 17 -- 27 -- 37)
    por         xmm6, xmm7              ; xmm6=(00 01 10 11 20 21 30 31)
    por         xmm0, xmm5              ; xmm0=(06 07 16 17 26 27 36 37)

    movaps      xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3

    addps       xmm4, xmm2              ; xmm4=tmp4
    movaps      xmm7, xmm1
    movaps      xmm5, xmm3
    addps       xmm1, xmm2              ; xmm1=data2=(02 12 22 32)
    addps       xmm3, xmm4              ; xmm3=data4=(04 14 24 34)
    subps       xmm7, xmm2              ; xmm7=data5=(05 15 25 35)
    subps       xmm5, xmm4              ; xmm5=data3=(03 13 23 33)

    ; Same rounding/packing sequence for data2..data5; the constant and the
    ; mask are rebuilt because xmm1/xmm3 were reused for tmp2/tmp3 above.
    movaps      xmm2, [rel PD_RNDINT_MAGIC]  ; xmm2=[rel PD_RNDINT_MAGIC]
    pcmpeqd     xmm4, xmm4
    psrld       xmm4, WORD_BIT          ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}

    addps       xmm3, xmm2  ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
    addps       xmm7, xmm2  ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
    addps       xmm1, xmm2  ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
    addps       xmm5, xmm2  ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)

    pand        xmm3, xmm4              ; xmm3=(04 -- 14 -- 24 -- 34 --)
    pslld       xmm7, WORD_BIT          ; xmm7=(-- 05 -- 15 -- 25 -- 35)
    pand        xmm1, xmm4              ; xmm1=(02 -- 12 -- 22 -- 32 --)
    pslld       xmm5, WORD_BIT          ; xmm5=(-- 03 -- 13 -- 23 -- 33)
    por         xmm3, xmm7              ; xmm3=(04 05 14 15 24 25 34 35)
    por         xmm1, xmm5              ; xmm1=(02 03 12 13 22 23 32 33)

    movdqa      xmm2, [rel PB_CENTERJSAMP]  ; xmm2=[rel PB_CENTERJSAMP]

    ; Saturate the 16-bit values to signed bytes, then add CENTERJSAMPLE to
    ; every byte (paddb wraps, which re-centers the signed range).
    packsswb    xmm6, xmm3        ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
    packsswb    xmm1, xmm0        ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
    paddb       xmm6, xmm2
    paddb       xmm1, xmm2

    movdqa      xmm4, xmm6        ; transpose coefficients(phase 2)
    punpcklwd   xmm6, xmm1        ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
    punpckhwd   xmm4, xmm1        ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)

    movdqa      xmm7, xmm6        ; transpose coefficients(phase 3)
    punpckldq   xmm6, xmm4        ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
    punpckhdq   xmm7, xmm4        ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)

    ; Swap the 64-bit halves so rows 1 and 3 sit in the low quadword, which
    ; is the part movq stores.
    pshufd      xmm5, xmm6, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
    pshufd      xmm3, xmm7, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)

    ; Fetch each output row pointer and store 8 samples at row[output_col].
    ; Rows are written in the order 0, 2, 1, 3.
    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
    mov         rbxp, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
    movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
    mov         rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
    mov         rbxp, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
    movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3

    ; Next four output rows = next four elements of every workspace row.
    add         rsi, byte 4*SIZEOF_FAST_FLOAT  ; wsptr
    add         rdi, byte 4*SIZEOF_JSAMPROW
    dec         rcx                            ; ctr
    jnz         near .rowloop

    pop         rbx
    UNCOLLECT_ARGS 4
    lea         rsp, [rbp-8]            ; discard locals; rsp -> saved r15
    pop         r15
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32