jidctint-sse2.asm (34577B)
;
; jidctint.asm - accurate integer IDCT (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, 2020, 2024, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.
;
; This file contains a slower but more accurate integer implementation of the
; inverse DCT (Discrete Cosine Transform). The following code is based
; directly on the IJG's original jidctint.c; see jidctint.c for more details.

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------

; Fixed-point parameters.  The F_* multipliers below are scaled by
; 2^CONST_BITS.  Pass 1 results are right-shifted by DESCALE_P1 and pass 2
; results by DESCALE_P2 (see the psrad instructions in the function below).

%define CONST_BITS  13
%define PASS1_BITS  2

%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
%define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)

%if CONST_BITS == 13
F_0_298 equ  2446  ; FIX(0.298631336)
F_0_390 equ  3196  ; FIX(0.390180644)
F_0_541 equ  4433  ; FIX(0.541196100)
F_0_765 equ  6270  ; FIX(0.765366865)
F_0_899 equ  7373  ; FIX(0.899976223)
F_1_175 equ  9633  ; FIX(1.175875602)
F_1_501 equ 12299  ; FIX(1.501321110)
F_1_847 equ 15137  ; FIX(1.847759065)
F_1_961 equ 16069  ; FIX(1.961570560)
F_2_053 equ 16819  ; FIX(2.053119869)
F_2_562 equ 20995  ; FIX(2.562915447)
F_3_072 equ 25172  ; FIX(3.072711026)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
; Each literal below is the constant pre-scaled by 2^30; DESCALE() rounds it
; down to CONST_BITS fractional bits.
%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
%endif

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    ALIGNZ      32
    GLOBAL_DATA(jconst_idct_islow_sse2)

EXTN(jconst_idct_islow_sse2):

; PW_* tables: one (a, b) pair of 16-bit multipliers repeated 4 times.  The
; code applies them with pmaddwd to word-interleaved inputs (x, y), giving
; a*x + b*y in each 32-bit lane.
; PD_DESCALE_P*: rounding bias (half of the divisor) added before the
; arithmetic right shift by DESCALE_P*.
; PB_CENTERJSAMP: CENTERJSAMPLE in every byte; added to the packed output
; samples at the end of pass 2.

PW_F130_F054   times 4 dw  (F_0_541 + F_0_765),  F_0_541
PW_F054_MF130  times 4 dw  F_0_541, (F_0_541 - F_1_847)
PW_MF078_F117  times 4 dw  (F_1_175 - F_1_961),  F_1_175
PW_F117_F078   times 4 dw  F_1_175, (F_1_175 - F_0_390)
PW_MF060_MF089 times 4 dw  (F_0_298 - F_0_899), -F_0_899
PW_MF089_F060  times 4 dw -F_0_899, (F_1_501 - F_0_899)
PW_MF050_MF256 times 4 dw  (F_2_053 - F_2_562), -F_2_562
PW_MF256_F050  times 4 dw -F_2_562, (F_3_072 - F_2_562)
PD_DESCALE_P1  times 4 dd  1 << (DESCALE_P1 - 1)
PD_DESCALE_P2  times 4 dd  1 << (DESCALE_P2 - 1)
PB_CENTERJSAMP times 16 db CENTERJSAMPLE

    ALIGNZ      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_islow_sse2(void *dct_table, JCOEFPTR coef_block,
;                       JSAMPARRAY output_buf, JDIMENSION output_col)
;
; Structure:
;   Pass 1 dequantizes the coefficients (pmullw with the table at rdx),
;          runs the 1-D IDCT down the columns, descales by DESCALE_P1 and
;          transposes the 8x8 word matrix in registers.
;   Pass 2 runs the same 1-D IDCT on the transposed data, descales by
;          DESCALE_P2, packs to bytes, adds PB_CENTERJSAMP, transposes back
;          and stores 8 bytes to each of the 8 output rows.
;
; Stack frame: rbp and r15 are saved, rsp is aligned down to
; SIZEOF_XMMWORD, and WK_NUM XMMWORD spill slots are reserved just below
; r15.  The epilogue reloads rsp from [rbp-8], i.e. the slot where r15 was
; pushed.
;
; NOTE(review): COLLECT_ARGS / UNCOLLECT_ARGS come from jsimdext.inc and are
; not visible here; the register assignment below is what this code relies
; on after COLLECT_ARGS 4.
;

; r10 = void *dct_table
; r11 = JCOEFPTR coef_block
; r12 = JSAMPARRAY output_buf
; r13d = JDIMENSION output_col

; wk(i) addresses slot i of the spill array; slot WK_NUM-1 ends at r15.
%define wk(i)   r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD
                                        ; xmmword wk[WK_NUM]
%define WK_NUM  12

    align       32
    GLOBAL_FUNCTION(jsimd_idct_islow_sse2)

EXTN(jsimd_idct_islow_sse2):
    ENDBR64
    push        rbp
    mov         rbp, rsp
    push        r15
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    ; Allocate stack space for wk array.  r15 is used to access it.
    mov         r15, rsp
    sub         rsp, (SIZEOF_XMMWORD * WK_NUM)
    COLLECT_ARGS 4

    ; ---- Pass 1: process columns from input.

    mov         rdx, r10                ; quantptr
    mov         rsi, r11                ; inptr

%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
    ; Shortcut: if every coefficient in rows 1..7 is zero, each column's
    ; IDCT output is just its scaled DC term.  A cheap scalar test on one
    ; dword from each of two rows comes first; only if those are zero is
    ; the full 7-row OR performed.
    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    jnz         near .columnDCT

    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    por         xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
    por         xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
    por         xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
    por         xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
    por         xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
    por         xmm1, xmm0
    ; Fold the 8 OR-ed words into 4 bytes.  Signed saturation never turns
    ; a nonzero lane into zero, so eax == 0 iff all 8 words were zero.
    packsswb    xmm1, xmm1
    packsswb    xmm1, xmm1
    movd        eax, xmm1               ; also clears the upper half of rax
    test        rax, rax
    jnz         short .columnDCT

    ; -- AC terms all zero

    movdqa      xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
    pmullw      xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]

    psllw       xmm5, PASS1_BITS

    ; Broadcast each DC term across a whole register so that the register
    ; and wk() layout at .column_end matches the full-DCT path.
    movdqa      xmm4, xmm5              ; xmm5=in0=(00 01 02 03 04 05 06 07)
    punpcklwd   xmm5, xmm5              ; xmm5=(00 00 01 01 02 02 03 03)
    punpckhwd   xmm4, xmm4              ; xmm4=(04 04 05 05 06 06 07 07)

    pshufd      xmm7, xmm5, 0x00        ; xmm7=col0=(00 00 00 00 00 00 00 00)
    pshufd      xmm6, xmm5, 0x55        ; xmm6=col1=(01 01 01 01 01 01 01 01)
    pshufd      xmm1, xmm5, 0xAA        ; xmm1=col2=(02 02 02 02 02 02 02 02)
    pshufd      xmm5, xmm5, 0xFF        ; xmm5=col3=(03 03 03 03 03 03 03 03)
    pshufd      xmm0, xmm4, 0x00        ; xmm0=col4=(04 04 04 04 04 04 04 04)
    pshufd      xmm3, xmm4, 0x55        ; xmm3=col5=(05 05 05 05 05 05 05 05)
    pshufd      xmm2, xmm4, 0xAA        ; xmm2=col6=(06 06 06 06 06 06 06 06)
    pshufd      xmm4, xmm4, 0xFF        ; xmm4=col7=(07 07 07 07 07 07 07 07)

    movdqa      XMMWORD [wk(8)], xmm6   ; wk(8)=col1
    movdqa      XMMWORD [wk(9)], xmm5   ; wk(9)=col3
    movdqa      XMMWORD [wk(10)], xmm3  ; wk(10)=col5
    movdqa      XMMWORD [wk(11)], xmm4  ; wk(11)=col7
    jmp         near .column_end
%endif
.columnDCT:

    ; -- Even part

    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
    pmullw      xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
    movdqa      xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
    movdqa      xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
    pmullw      xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
    pmullw      xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]

    ; (Original)
    ; z1 = (z2 + z3) * 0.541196100;
    ; tmp2 = z1 + z3 * -1.847759065;
    ; tmp3 = z1 + z2 * 0.765366865;
    ;
    ; (This implementation)
    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;

    movdqa      xmm4, xmm1              ; xmm1=in2=z2
    movdqa      xmm5, xmm1
    punpcklwd   xmm4, xmm3              ; xmm3=in6=z3
    punpckhwd   xmm5, xmm3
    movdqa      xmm1, xmm4
    movdqa      xmm3, xmm5
    pmaddwd     xmm4, [rel PW_F130_F054]   ; xmm4=tmp3L
    pmaddwd     xmm5, [rel PW_F130_F054]   ; xmm5=tmp3H
    pmaddwd     xmm1, [rel PW_F054_MF130]  ; xmm1=tmp2L
    pmaddwd     xmm3, [rel PW_F054_MF130]  ; xmm3=tmp2H

    movdqa      xmm6, xmm0
    paddw       xmm0, xmm2              ; xmm0=in0+in4
    psubw       xmm6, xmm2              ; xmm6=in0-in4

    ; Widen words to dwords scaled by 2^CONST_BITS: interleaving with a
    ; zeroed register places each word in the upper half of a dword
    ; (value << 16); the arithmetic shift right by (16-CONST_BITS) then
    ; leaves the sign-extended value << CONST_BITS.
    pxor        xmm7, xmm7
    pxor        xmm2, xmm2
    punpcklwd   xmm7, xmm0              ; xmm7=tmp0L
    punpckhwd   xmm2, xmm0              ; xmm2=tmp0H
    psrad       xmm7, (16-CONST_BITS)   ; psrad xmm7,16 & pslld xmm7,CONST_BITS
    psrad       xmm2, (16-CONST_BITS)   ; psrad xmm2,16 & pslld xmm2,CONST_BITS

    movdqa      xmm0, xmm7
    paddd       xmm7, xmm4              ; xmm7=tmp10L
    psubd       xmm0, xmm4              ; xmm0=tmp13L
    movdqa      xmm4, xmm2
    paddd       xmm2, xmm5              ; xmm2=tmp10H
    psubd       xmm4, xmm5              ; xmm4=tmp13H

    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp10L
    movdqa      XMMWORD [wk(1)], xmm2   ; wk(1)=tmp10H
    movdqa      XMMWORD [wk(2)], xmm0   ; wk(2)=tmp13L
    movdqa      XMMWORD [wk(3)], xmm4   ; wk(3)=tmp13H

    pxor        xmm5, xmm5
    pxor        xmm7, xmm7
    punpcklwd   xmm5, xmm6              ; xmm5=tmp1L
    punpckhwd   xmm7, xmm6              ; xmm7=tmp1H
    psrad       xmm5, (16-CONST_BITS)   ; psrad xmm5,16 & pslld xmm5,CONST_BITS
    psrad       xmm7, (16-CONST_BITS)   ; psrad xmm7,16 & pslld xmm7,CONST_BITS

    movdqa      xmm2, xmm5
    paddd       xmm5, xmm1              ; xmm5=tmp11L
    psubd       xmm2, xmm1              ; xmm2=tmp12L
    movdqa      xmm0, xmm7
    paddd       xmm7, xmm3              ; xmm7=tmp11H
    psubd       xmm0, xmm3              ; xmm0=tmp12H

    movdqa      XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
    movdqa      XMMWORD [wk(5)], xmm7   ; wk(5)=tmp11H
    movdqa      XMMWORD [wk(6)], xmm2   ; wk(6)=tmp12L
    movdqa      XMMWORD [wk(7)], xmm0   ; wk(7)=tmp12H

    ; -- Odd part

    movdqa      xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    movdqa      xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
    pmullw      xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
    pmullw      xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
    movdqa      xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
    pmullw      xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
    pmullw      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]

    movdqa      xmm5, xmm6
    movdqa      xmm7, xmm4
    paddw       xmm5, xmm3              ; xmm5=z3
    paddw       xmm7, xmm1              ; xmm7=z4

    ; (Original)
    ; z5 = (z3 + z4) * 1.175875602;
    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
    ; z3 += z5;  z4 += z5;
    ;
    ; (This implementation)
    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);

    movdqa      xmm2, xmm5
    movdqa      xmm0, xmm5
    punpcklwd   xmm2, xmm7
    punpckhwd   xmm0, xmm7
    movdqa      xmm5, xmm2
    movdqa      xmm7, xmm0
    pmaddwd     xmm2, [rel PW_MF078_F117]  ; xmm2=z3L
    pmaddwd     xmm0, [rel PW_MF078_F117]  ; xmm0=z3H
    pmaddwd     xmm5, [rel PW_F117_F078]   ; xmm5=z4L
    pmaddwd     xmm7, [rel PW_F117_F078]   ; xmm7=z4H

    movdqa      XMMWORD [wk(10)], xmm2  ; wk(10)=z3L
    movdqa      XMMWORD [wk(11)], xmm0  ; wk(11)=z3H

    ; (Original)
    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
    ;
    ; (This implementation)
    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
    ; tmp0 += z3;  tmp1 += z4;
    ; tmp2 += z3;  tmp3 += z4;

    movdqa      xmm2, xmm3
    movdqa      xmm0, xmm3
    punpcklwd   xmm2, xmm4
    punpckhwd   xmm0, xmm4
    movdqa      xmm3, xmm2
    movdqa      xmm4, xmm0
    pmaddwd     xmm2, [rel PW_MF060_MF089]  ; xmm2=tmp0L
    pmaddwd     xmm0, [rel PW_MF060_MF089]  ; xmm0=tmp0H
    pmaddwd     xmm3, [rel PW_MF089_F060]   ; xmm3=tmp3L
    pmaddwd     xmm4, [rel PW_MF089_F060]   ; xmm4=tmp3H

    paddd       xmm2, XMMWORD [wk(10)]  ; xmm2=tmp0L
    paddd       xmm0, XMMWORD [wk(11)]  ; xmm0=tmp0H
    paddd       xmm3, xmm5              ; xmm3=tmp3L
    paddd       xmm4, xmm7              ; xmm4=tmp3H

    movdqa      XMMWORD [wk(8)], xmm2   ; wk(8)=tmp0L
    movdqa      XMMWORD [wk(9)], xmm0   ; wk(9)=tmp0H

    movdqa      xmm2, xmm1
    movdqa      xmm0, xmm1
    punpcklwd   xmm2, xmm6
    punpckhwd   xmm0, xmm6
    movdqa      xmm1, xmm2
    movdqa      xmm6, xmm0
    pmaddwd     xmm2, [rel PW_MF050_MF256]  ; xmm2=tmp1L
    pmaddwd     xmm0, [rel PW_MF050_MF256]  ; xmm0=tmp1H
    pmaddwd     xmm1, [rel PW_MF256_F050]   ; xmm1=tmp2L
    pmaddwd     xmm6, [rel PW_MF256_F050]   ; xmm6=tmp2H

    paddd       xmm2, xmm5              ; xmm2=tmp1L
    paddd       xmm0, xmm7              ; xmm0=tmp1H
    paddd       xmm1, XMMWORD [wk(10)]  ; xmm1=tmp2L
    paddd       xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H

    movdqa      XMMWORD [wk(10)], xmm2  ; wk(10)=tmp1L
    movdqa      XMMWORD [wk(11)], xmm0  ; wk(11)=tmp1H

    ; -- Final output stage

    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
    movdqa      xmm7, XMMWORD [wk(1)]   ; xmm7=tmp10H

    movdqa      xmm2, xmm5
    movdqa      xmm0, xmm7
    paddd       xmm5, xmm3              ; xmm5=data0L
    paddd       xmm7, xmm4              ; xmm7=data0H
    psubd       xmm2, xmm3              ; xmm2=data7L
    psubd       xmm0, xmm4              ; xmm0=data7H

    ; The rounding constant is reloaded for every row pair because all
    ; eight XMM registers used here are otherwise occupied.
    movdqa      xmm3, [rel PD_DESCALE_P1]  ; xmm3=[rel PD_DESCALE_P1]

    paddd       xmm5, xmm3
    paddd       xmm7, xmm3
    psrad       xmm5, DESCALE_P1
    psrad       xmm7, DESCALE_P1
    paddd       xmm2, xmm3
    paddd       xmm0, xmm3
    psrad       xmm2, DESCALE_P1
    psrad       xmm0, DESCALE_P1

    packssdw    xmm5, xmm7              ; xmm5=data0=(00 01 02 03 04 05 06 07)
    packssdw    xmm2, xmm0              ; xmm2=data7=(70 71 72 73 74 75 76 77)

    movdqa      xmm4, XMMWORD [wk(4)]   ; xmm4=tmp11L
    movdqa      xmm3, XMMWORD [wk(5)]   ; xmm3=tmp11H

    movdqa      xmm7, xmm4
    movdqa      xmm0, xmm3
    paddd       xmm4, xmm1              ; xmm4=data1L
    paddd       xmm3, xmm6              ; xmm3=data1H
    psubd       xmm7, xmm1              ; xmm7=data6L
    psubd       xmm0, xmm6              ; xmm0=data6H

    movdqa      xmm1, [rel PD_DESCALE_P1]  ; xmm1=[rel PD_DESCALE_P1]

    paddd       xmm4, xmm1
    paddd       xmm3, xmm1
    psrad       xmm4, DESCALE_P1
    psrad       xmm3, DESCALE_P1
    paddd       xmm7, xmm1
    paddd       xmm0, xmm1
    psrad       xmm7, DESCALE_P1
    psrad       xmm0, DESCALE_P1

    packssdw    xmm4, xmm3              ; xmm4=data1=(10 11 12 13 14 15 16 17)
    packssdw    xmm7, xmm0              ; xmm7=data6=(60 61 62 63 64 65 66 67)

    movdqa      xmm6, xmm5              ; transpose coefficients(phase 1)
    punpcklwd   xmm5, xmm4              ; xmm5=(00 10 01 11 02 12 03 13)
    punpckhwd   xmm6, xmm4              ; xmm6=(04 14 05 15 06 16 07 17)
    movdqa      xmm1, xmm7              ; transpose coefficients(phase 1)
    punpcklwd   xmm7, xmm2              ; xmm7=(60 70 61 71 62 72 63 73)
    punpckhwd   xmm1, xmm2              ; xmm1=(64 74 65 75 66 76 67 77)

    movdqa      xmm3, XMMWORD [wk(6)]   ; xmm3=tmp12L
    movdqa      xmm0, XMMWORD [wk(7)]   ; xmm0=tmp12H
    movdqa      xmm4, XMMWORD [wk(10)]  ; xmm4=tmp1L
    movdqa      xmm2, XMMWORD [wk(11)]  ; xmm2=tmp1H

    ; wk(0), wk(1), wk(4), wk(5) are free again (tmp10/tmp11 consumed
    ; above), so they are reused to park partially transposed rows.
    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 01 11 02 12 03 13)
    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=(04 14 05 15 06 16 07 17)
    movdqa      XMMWORD [wk(4)], xmm7   ; wk(4)=(60 70 61 71 62 72 63 73)
    movdqa      XMMWORD [wk(5)], xmm1   ; wk(5)=(64 74 65 75 66 76 67 77)

    movdqa      xmm5, xmm3
    movdqa      xmm6, xmm0
    paddd       xmm3, xmm4              ; xmm3=data2L
    paddd       xmm0, xmm2              ; xmm0=data2H
    psubd       xmm5, xmm4              ; xmm5=data5L
    psubd       xmm6, xmm2              ; xmm6=data5H

    movdqa      xmm7, [rel PD_DESCALE_P1]  ; xmm7=[rel PD_DESCALE_P1]

    paddd       xmm3, xmm7
    paddd       xmm0, xmm7
    psrad       xmm3, DESCALE_P1
    psrad       xmm0, DESCALE_P1
    paddd       xmm5, xmm7
    paddd       xmm6, xmm7
    psrad       xmm5, DESCALE_P1
    psrad       xmm6, DESCALE_P1

    packssdw    xmm3, xmm0              ; xmm3=data2=(20 21 22 23 24 25 26 27)
    packssdw    xmm5, xmm6              ; xmm5=data5=(50 51 52 53 54 55 56 57)

    movdqa      xmm1, XMMWORD [wk(2)]   ; xmm1=tmp13L
    movdqa      xmm4, XMMWORD [wk(3)]   ; xmm4=tmp13H
    movdqa      xmm2, XMMWORD [wk(8)]   ; xmm2=tmp0L
    movdqa      xmm7, XMMWORD [wk(9)]   ; xmm7=tmp0H

    movdqa      xmm0, xmm1
    movdqa      xmm6, xmm4
    paddd       xmm1, xmm2              ; xmm1=data3L
    paddd       xmm4, xmm7              ; xmm4=data3H
    psubd       xmm0, xmm2              ; xmm0=data4L
    psubd       xmm6, xmm7              ; xmm6=data4H

    movdqa      xmm2, [rel PD_DESCALE_P1]  ; xmm2=[rel PD_DESCALE_P1]

    paddd       xmm1, xmm2
    paddd       xmm4, xmm2
    psrad       xmm1, DESCALE_P1
    psrad       xmm4, DESCALE_P1
    paddd       xmm0, xmm2
    paddd       xmm6, xmm2
    psrad       xmm0, DESCALE_P1
    psrad       xmm6, DESCALE_P1

    packssdw    xmm1, xmm4              ; xmm1=data3=(30 31 32 33 34 35 36 37)
    packssdw    xmm0, xmm6              ; xmm0=data4=(40 41 42 43 44 45 46 47)

    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=(00 10 01 11 02 12 03 13)
    movdqa      xmm2, XMMWORD [wk(1)]   ; xmm2=(04 14 05 15 06 16 07 17)

    movdqa      xmm4, xmm3              ; transpose coefficients(phase 1)
    punpcklwd   xmm3, xmm1              ; xmm3=(20 30 21 31 22 32 23 33)
    punpckhwd   xmm4, xmm1              ; xmm4=(24 34 25 35 26 36 27 37)
    movdqa      xmm6, xmm0              ; transpose coefficients(phase 1)
    punpcklwd   xmm0, xmm5              ; xmm0=(40 50 41 51 42 52 43 53)
    punpckhwd   xmm6, xmm5              ; xmm6=(44 54 45 55 46 56 47 57)

    movdqa      xmm1, xmm7              ; transpose coefficients(phase 2)
    punpckldq   xmm7, xmm3              ; xmm7=(00 10 20 30 01 11 21 31)
    punpckhdq   xmm1, xmm3              ; xmm1=(02 12 22 32 03 13 23 33)
    movdqa      xmm5, xmm2              ; transpose coefficients(phase 2)
    punpckldq   xmm2, xmm4              ; xmm2=(04 14 24 34 05 15 25 35)
    punpckhdq   xmm5, xmm4              ; xmm5=(06 16 26 36 07 17 27 37)

    movdqa      xmm3, XMMWORD [wk(4)]   ; xmm3=(60 70 61 71 62 72 63 73)
    movdqa      xmm4, XMMWORD [wk(5)]   ; xmm4=(64 74 65 75 66 76 67 77)

    movdqa      XMMWORD [wk(6)], xmm2   ; wk(6)=(04 14 24 34 05 15 25 35)
    movdqa      XMMWORD [wk(7)], xmm5   ; wk(7)=(06 16 26 36 07 17 27 37)

    movdqa      xmm2, xmm0              ; transpose coefficients(phase 2)
    punpckldq   xmm0, xmm3              ; xmm0=(40 50 60 70 41 51 61 71)
    punpckhdq   xmm2, xmm3              ; xmm2=(42 52 62 72 43 53 63 73)
    movdqa      xmm5, xmm6              ; transpose coefficients(phase 2)
    punpckldq   xmm6, xmm4              ; xmm6=(44 54 64 74 45 55 65 75)
    punpckhdq   xmm5, xmm4              ; xmm5=(46 56 66 76 47 57 67 77)

    movdqa      xmm3, xmm7              ; transpose coefficients(phase 3)
    punpcklqdq  xmm7, xmm0              ; xmm7=col0=(00 10 20 30 40 50 60 70)
    punpckhqdq  xmm3, xmm0              ; xmm3=col1=(01 11 21 31 41 51 61 71)
    movdqa      xmm4, xmm1              ; transpose coefficients(phase 3)
    punpcklqdq  xmm1, xmm2              ; xmm1=col2=(02 12 22 32 42 52 62 72)
    punpckhqdq  xmm4, xmm2              ; xmm4=col3=(03 13 23 33 43 53 63 73)

    movdqa      xmm0, XMMWORD [wk(6)]   ; xmm0=(04 14 24 34 05 15 25 35)
    movdqa      xmm2, XMMWORD [wk(7)]   ; xmm2=(06 16 26 36 07 17 27 37)

    movdqa      XMMWORD [wk(8)], xmm3   ; wk(8)=col1
    movdqa      XMMWORD [wk(9)], xmm4   ; wk(9)=col3

    movdqa      xmm3, xmm0              ; transpose coefficients(phase 3)
    punpcklqdq  xmm0, xmm6              ; xmm0=col4=(04 14 24 34 44 54 64 74)
    punpckhqdq  xmm3, xmm6              ; xmm3=col5=(05 15 25 35 45 55 65 75)
    movdqa      xmm4, xmm2              ; transpose coefficients(phase 3)
    punpcklqdq  xmm2, xmm5              ; xmm2=col6=(06 16 26 36 46 56 66 76)
    punpckhqdq  xmm4, xmm5              ; xmm4=col7=(07 17 27 37 47 57 67 77)

    movdqa      XMMWORD [wk(10)], xmm3  ; wk(10)=col5
    movdqa      XMMWORD [wk(11)], xmm4  ; wk(11)=col7
.column_end:

    ; Hand-off to pass 2 (same on both paths into this label):
    ;   xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
    ;   wk(8)=col1, wk(9)=col3, wk(10)=col5, wk(11)=col7

    ; -- Prefetch the next coefficient block

    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]

    ; ---- Pass 2: process rows from work array, store into output array.

    mov         rdi, r12                ; (JSAMPROW *)
    mov         eax, r13d               ; rax = output_col (zero-extended)

    ; -- Even part

    ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6

    ; (Original)
    ; z1 = (z2 + z3) * 0.541196100;
    ; tmp2 = z1 + z3 * -1.847759065;
    ; tmp3 = z1 + z2 * 0.765366865;
    ;
    ; (This implementation)
    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;

    movdqa      xmm6, xmm1              ; xmm1=in2=z2
    movdqa      xmm5, xmm1
    punpcklwd   xmm6, xmm2              ; xmm2=in6=z3
    punpckhwd   xmm5, xmm2
    movdqa      xmm1, xmm6
    movdqa      xmm2, xmm5
    pmaddwd     xmm6, [rel PW_F130_F054]   ; xmm6=tmp3L
    pmaddwd     xmm5, [rel PW_F130_F054]   ; xmm5=tmp3H
    pmaddwd     xmm1, [rel PW_F054_MF130]  ; xmm1=tmp2L
    pmaddwd     xmm2, [rel PW_F054_MF130]  ; xmm2=tmp2H

    movdqa      xmm3, xmm7
    paddw       xmm7, xmm0              ; xmm7=in0+in4
    psubw       xmm3, xmm0              ; xmm3=in0-in4

    pxor        xmm4, xmm4
    pxor        xmm0, xmm0
    punpcklwd   xmm4, xmm7              ; xmm4=tmp0L
    punpckhwd   xmm0, xmm7              ; xmm0=tmp0H
    psrad       xmm4, (16-CONST_BITS)   ; psrad xmm4,16 & pslld xmm4,CONST_BITS
    psrad       xmm0, (16-CONST_BITS)   ; psrad xmm0,16 & pslld xmm0,CONST_BITS

    movdqa      xmm7, xmm4
    paddd       xmm4, xmm6              ; xmm4=tmp10L
    psubd       xmm7, xmm6              ; xmm7=tmp13L
    movdqa      xmm6, xmm0
    paddd       xmm0, xmm5              ; xmm0=tmp10H
    psubd       xmm6, xmm5              ; xmm6=tmp13H

    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=tmp10L
    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=tmp10H
    movdqa      XMMWORD [wk(2)], xmm7   ; wk(2)=tmp13L
    movdqa      XMMWORD [wk(3)], xmm6   ; wk(3)=tmp13H

    pxor        xmm5, xmm5
    pxor        xmm4, xmm4
    punpcklwd   xmm5, xmm3              ; xmm5=tmp1L
    punpckhwd   xmm4, xmm3              ; xmm4=tmp1H
    psrad       xmm5, (16-CONST_BITS)   ; psrad xmm5,16 & pslld xmm5,CONST_BITS
    psrad       xmm4, (16-CONST_BITS)   ; psrad xmm4,16 & pslld xmm4,CONST_BITS

    movdqa      xmm0, xmm5
    paddd       xmm5, xmm1              ; xmm5=tmp11L
    psubd       xmm0, xmm1              ; xmm0=tmp12L
    movdqa      xmm7, xmm4
    paddd       xmm4, xmm2              ; xmm4=tmp11H
    psubd       xmm7, xmm2              ; xmm7=tmp12H

    movdqa      XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
    movdqa      XMMWORD [wk(5)], xmm4   ; wk(5)=tmp11H
    movdqa      XMMWORD [wk(6)], xmm0   ; wk(6)=tmp12L
    movdqa      XMMWORD [wk(7)], xmm7   ; wk(7)=tmp12H

    ; -- Odd part

    movdqa      xmm6, XMMWORD [wk(9)]   ; xmm6=col3
    movdqa      xmm3, XMMWORD [wk(8)]   ; xmm3=col1
    movdqa      xmm1, XMMWORD [wk(11)]  ; xmm1=col7
    movdqa      xmm2, XMMWORD [wk(10)]  ; xmm2=col5

    movdqa      xmm5, xmm6
    movdqa      xmm4, xmm3
    paddw       xmm5, xmm1              ; xmm5=z3
    paddw       xmm4, xmm2              ; xmm4=z4

    ; (Original)
    ; z5 = (z3 + z4) * 1.175875602;
    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
    ; z3 += z5;  z4 += z5;
    ;
    ; (This implementation)
    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);

    movdqa      xmm0, xmm5
    movdqa      xmm7, xmm5
    punpcklwd   xmm0, xmm4
    punpckhwd   xmm7, xmm4
    movdqa      xmm5, xmm0
    movdqa      xmm4, xmm7
    pmaddwd     xmm0, [rel PW_MF078_F117]  ; xmm0=z3L
    pmaddwd     xmm7, [rel PW_MF078_F117]  ; xmm7=z3H
    pmaddwd     xmm5, [rel PW_F117_F078]   ; xmm5=z4L
    pmaddwd     xmm4, [rel PW_F117_F078]   ; xmm4=z4H

    movdqa      XMMWORD [wk(10)], xmm0  ; wk(10)=z3L
    movdqa      XMMWORD [wk(11)], xmm7  ; wk(11)=z3H

    ; (Original)
    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
    ;
    ; (This implementation)
    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
    ; tmp0 += z3;  tmp1 += z4;
    ; tmp2 += z3;  tmp3 += z4;

    movdqa      xmm0, xmm1
    movdqa      xmm7, xmm1
    punpcklwd   xmm0, xmm3
    punpckhwd   xmm7, xmm3
    movdqa      xmm1, xmm0
    movdqa      xmm3, xmm7
    pmaddwd     xmm0, [rel PW_MF060_MF089]  ; xmm0=tmp0L
    pmaddwd     xmm7, [rel PW_MF060_MF089]  ; xmm7=tmp0H
    pmaddwd     xmm1, [rel PW_MF089_F060]   ; xmm1=tmp3L
    pmaddwd     xmm3, [rel PW_MF089_F060]   ; xmm3=tmp3H

    paddd       xmm0, XMMWORD [wk(10)]  ; xmm0=tmp0L
    paddd       xmm7, XMMWORD [wk(11)]  ; xmm7=tmp0H
    paddd       xmm1, xmm5              ; xmm1=tmp3L
    paddd       xmm3, xmm4              ; xmm3=tmp3H

    movdqa      XMMWORD [wk(8)], xmm0   ; wk(8)=tmp0L
    movdqa      XMMWORD [wk(9)], xmm7   ; wk(9)=tmp0H

    movdqa      xmm0, xmm2
    movdqa      xmm7, xmm2
    punpcklwd   xmm0, xmm6
    punpckhwd   xmm7, xmm6
    movdqa      xmm2, xmm0
    movdqa      xmm6, xmm7
    pmaddwd     xmm0, [rel PW_MF050_MF256]  ; xmm0=tmp1L
    pmaddwd     xmm7, [rel PW_MF050_MF256]  ; xmm7=tmp1H
    pmaddwd     xmm2, [rel PW_MF256_F050]   ; xmm2=tmp2L
    pmaddwd     xmm6, [rel PW_MF256_F050]   ; xmm6=tmp2H

    paddd       xmm0, xmm5              ; xmm0=tmp1L
    paddd       xmm7, xmm4              ; xmm7=tmp1H
    paddd       xmm2, XMMWORD [wk(10)]  ; xmm2=tmp2L
    paddd       xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H

    movdqa      XMMWORD [wk(10)], xmm0  ; wk(10)=tmp1L
    movdqa      XMMWORD [wk(11)], xmm7  ; wk(11)=tmp1H

    ; -- Final output stage

    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
    movdqa      xmm4, XMMWORD [wk(1)]   ; xmm4=tmp10H

    movdqa      xmm0, xmm5
    movdqa      xmm7, xmm4
    paddd       xmm5, xmm1              ; xmm5=data0L
    paddd       xmm4, xmm3              ; xmm4=data0H
    psubd       xmm0, xmm1              ; xmm0=data7L
    psubd       xmm7, xmm3              ; xmm7=data7H

    movdqa      xmm1, [rel PD_DESCALE_P2]  ; xmm1=[rel PD_DESCALE_P2]

    paddd       xmm5, xmm1
    paddd       xmm4, xmm1
    psrad       xmm5, DESCALE_P2
    psrad       xmm4, DESCALE_P2
    paddd       xmm0, xmm1
    paddd       xmm7, xmm1
    psrad       xmm0, DESCALE_P2
    psrad       xmm7, DESCALE_P2

    packssdw    xmm5, xmm4              ; xmm5=data0=(00 10 20 30 40 50 60 70)
    packssdw    xmm0, xmm7              ; xmm0=data7=(07 17 27 37 47 57 67 77)

    movdqa      xmm3, XMMWORD [wk(4)]   ; xmm3=tmp11L
    movdqa      xmm1, XMMWORD [wk(5)]   ; xmm1=tmp11H

    movdqa      xmm4, xmm3
    movdqa      xmm7, xmm1
    paddd       xmm3, xmm2              ; xmm3=data1L
    paddd       xmm1, xmm6              ; xmm1=data1H
    psubd       xmm4, xmm2              ; xmm4=data6L
    psubd       xmm7, xmm6              ; xmm7=data6H

    movdqa      xmm2, [rel PD_DESCALE_P2]  ; xmm2=[rel PD_DESCALE_P2]

    paddd       xmm3, xmm2
    paddd       xmm1, xmm2
    psrad       xmm3, DESCALE_P2
    psrad       xmm1, DESCALE_P2
    paddd       xmm4, xmm2
    paddd       xmm7, xmm2
    psrad       xmm4, DESCALE_P2
    psrad       xmm7, DESCALE_P2

    packssdw    xmm3, xmm1              ; xmm3=data1=(01 11 21 31 41 51 61 71)
    packssdw    xmm4, xmm7              ; xmm4=data6=(06 16 26 36 46 56 66 76)

    ; Narrow to signed bytes (with saturation), two data rows per register.
    packsswb    xmm5, xmm4              ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
    packsswb    xmm3, xmm0              ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)

    movdqa      xmm6, XMMWORD [wk(6)]   ; xmm6=tmp12L
    movdqa      xmm2, XMMWORD [wk(7)]   ; xmm2=tmp12H
    movdqa      xmm1, XMMWORD [wk(10)]  ; xmm1=tmp1L
    movdqa      xmm7, XMMWORD [wk(11)]  ; xmm7=tmp1H

    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)

    movdqa      xmm4, xmm6
    movdqa      xmm0, xmm2
    paddd       xmm6, xmm1              ; xmm6=data2L
    paddd       xmm2, xmm7              ; xmm2=data2H
    psubd       xmm4, xmm1              ; xmm4=data5L
    psubd       xmm0, xmm7              ; xmm0=data5H

    movdqa      xmm5, [rel PD_DESCALE_P2]  ; xmm5=[rel PD_DESCALE_P2]

    paddd       xmm6, xmm5
    paddd       xmm2, xmm5
    psrad       xmm6, DESCALE_P2
    psrad       xmm2, DESCALE_P2
    paddd       xmm4, xmm5
    paddd       xmm0, xmm5
    psrad       xmm4, DESCALE_P2
    psrad       xmm0, DESCALE_P2

    packssdw    xmm6, xmm2              ; xmm6=data2=(02 12 22 32 42 52 62 72)
    packssdw    xmm4, xmm0              ; xmm4=data5=(05 15 25 35 45 55 65 75)

    movdqa      xmm3, XMMWORD [wk(2)]   ; xmm3=tmp13L
    movdqa      xmm1, XMMWORD [wk(3)]   ; xmm1=tmp13H
    movdqa      xmm7, XMMWORD [wk(8)]   ; xmm7=tmp0L
    movdqa      xmm5, XMMWORD [wk(9)]   ; xmm5=tmp0H

    movdqa      xmm2, xmm3
    movdqa      xmm0, xmm1
    paddd       xmm3, xmm7              ; xmm3=data3L
    paddd       xmm1, xmm5              ; xmm1=data3H
    psubd       xmm2, xmm7              ; xmm2=data4L
    psubd       xmm0, xmm5              ; xmm0=data4H

    movdqa      xmm7, [rel PD_DESCALE_P2]  ; xmm7=[rel PD_DESCALE_P2]

    paddd       xmm3, xmm7
    paddd       xmm1, xmm7
    psrad       xmm3, DESCALE_P2
    psrad       xmm1, DESCALE_P2
    paddd       xmm2, xmm7
    paddd       xmm0, xmm7
    psrad       xmm2, DESCALE_P2
    psrad       xmm0, DESCALE_P2

    movdqa      xmm5, [rel PB_CENTERJSAMP]  ; xmm5=[rel PB_CENTERJSAMP]

    packssdw    xmm3, xmm1              ; xmm3=data3=(03 13 23 33 43 53 63 73)
    packssdw    xmm2, xmm0              ; xmm2=data4=(04 14 24 34 44 54 64 74)

    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)

    packsswb    xmm6, xmm2              ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
    packsswb    xmm3, xmm4              ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)

    ; Level shift: add CENTERJSAMPLE to every (signed, saturated) byte.
    ; NOTE(review): presumably CENTERJSAMPLE is 128, which would map the
    ; signed range onto 0..255 -- confirm in the included headers.
    paddb       xmm7, xmm5
    paddb       xmm1, xmm5
    paddb       xmm6, xmm5
    paddb       xmm3, xmm5

    movdqa      xmm0, xmm7              ; transpose coefficients(phase 1)
    punpcklbw   xmm7, xmm1              ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
    punpckhbw   xmm0, xmm1              ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
    movdqa      xmm2, xmm6              ; transpose coefficients(phase 1)
    punpcklbw   xmm6, xmm3              ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
    punpckhbw   xmm2, xmm3              ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)

    movdqa      xmm4, xmm7              ; transpose coefficients(phase 2)
    punpcklwd   xmm7, xmm6              ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
    punpckhwd   xmm4, xmm6              ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
    movdqa      xmm5, xmm2              ; transpose coefficients(phase 2)
    punpcklwd   xmm2, xmm0              ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
    punpckhwd   xmm5, xmm0              ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)

    movdqa      xmm1, xmm7              ; transpose coefficients(phase 3)
    punpckldq   xmm7, xmm2              ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
    punpckhdq   xmm1, xmm2              ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
    movdqa      xmm3, xmm4              ; transpose coefficients(phase 3)
    punpckldq   xmm4, xmm5              ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
    punpckhdq   xmm3, xmm5              ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)

    ; Swap the 64-bit halves so the odd rows sit in the low quadword,
    ; which is the part movq stores below.
    pshufd      xmm6, xmm7, 0x4E        ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
    pshufd      xmm0, xmm1, 0x4E        ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
    pshufd      xmm2, xmm4, 0x4E        ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
    pshufd      xmm5, xmm3, 0x4E        ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)

    ; Store 8 samples to each output row at column offset rax.
    ; NOTE(review): rdxp/rsip are register aliases supplied by jsimdext.inc
    ; (not visible here); presumably pointer-width views of rdx/rsi.
    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
    mov         rsip, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7
    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
    mov         rdxp, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
    mov         rsip, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3

    mov         rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
    mov         rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
    mov         rdxp, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
    mov         rsip, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5

    UNCOLLECT_ARGS 4
    lea         rsp, [rbp-8]            ; rsp -> saved r15 (undoes align + wk)
    pop         r15
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32