jfdctflt-sse.asm (14924B)
;
; jfdctflt.asm - floating-point FDCT (SSE)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2024, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.
;
; This file contains a floating-point implementation of the forward DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; Two-float ("64-bit") interleave helpers built on shufps.
;
; shufps takes lanes 0-1 of the result from the first operand and lanes 2-3
; from the second operand; each 2-bit field of the immediate selects the
; source lane:
;   0x44 = 01_00_01_00b -> (dst[0], dst[1], src[0], src[1])   low halves
;   0xEE = 11_10_11_10b -> (dst[2], dst[3], src[2], src[3])   high halves
;
; Both macros overwrite %1 and leave %2 unchanged.

%macro unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
    shufps      %1, %2, 0x44
%endmacro

%macro unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
    shufps      %1, %2, 0xEE
%endmacro

; --------------------------------------------------------------------------
; Constant table.
;
; Each entry is one value replicated four times ("times 4 dd"), i.e. one
; 16-byte packed-single operand, so that it can be used directly as the
; memory operand of a mulps.  Numerically:
;   PD_0_382 = sin(pi/8)            (= cos(3*pi/8))
;   PD_0_707 = cos(pi/4)            (= 1/sqrt(2))
;   PD_0_541 = cos(pi/8) - sin(pi/8)
;   PD_1_306 = cos(pi/8) + sin(pi/8)
;
; SEG_CONST, ALIGNZ, GLOBAL_DATA and EXTN are not defined in this file
; (presumably they come from jsimdext.inc).
; NOTE(review): mulps with a memory operand needs a 16-byte-aligned address;
; ALIGNZ 32 presumably provides that -- confirm against its definition.
    SECTION     SEG_CONST

    ALIGNZ      32
    GLOBAL_DATA(jconst_fdct_float_sse)

EXTN(jconst_fdct_float_sse):

PD_0_382 times 4 dd 0.382683432365089771728460
PD_0_707 times 4 dd 0.707106781186547524400844
PD_0_541 times 4 dd 0.541196100146196984399723
PD_1_306 times 4 dd 1.306562964876376527856643

    ALIGNZ      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Perform the forward DCT on one block of samples.
;
; GLOBAL(void)
; jsimd_fdct_float_sse(FAST_FLOAT *data)
;
; The block addressed by 'data' is transformed in place: every XMMBLOCK
; address that is loaded in a pass is stored back to in the same pass.
;
; In:     one stack argument, read below as [eax + 8] where eax = esp right
;         after "push ebp" (i.e. [esp + 4] on entry).
; Out:    nothing in registers; results are written through 'data'.
; Writes: eax, ecx, edx, xmm0-xmm7, flags.  ebx is bracketed by
;         PUSHPIC/POPPIC; ebp and esp are restored by the epilogue.
; Stack:  WK_NUM xmmword scratch slots (wk) inside a frame that is realigned
;         to SIZEOF_XMMWORD, so movaps can be used on them.
;
; NOTE(review): every access to the block uses movaps, so the addresses that
; XMMBLOCK produces -- and therefore presumably 'data' itself -- must be
; 16-byte aligned.  XMMBLOCK, POINTER, XMMWORD, ALIGNX, PUSHPIC/POPPIC,
; GET_GOT and GOTOFF are not defined in this file; confirm their exact
; semantics in the included headers.
;
; Structure: two passes of DCTSIZE/4 iterations each.  Per the lane
; annotations below, an iteration gathers eight xmmwords, transposes them
; with unpcklps/unpckhps (phase 1) and unpcklps2/unpckhps2 (phase 2) into
; data0..data7, runs the 8-point butterfly (even part, odd part) on four
; lanes at once, and stores data0..data7 back.  Pass 1 stores its results
; in transposed 4x4 tiles; pass 2 transposes again while loading, so its
; stores land in row k = coefficient k order.
;

; Argument accessor: 'b' is the post-"push ebp" stack pointer, so
; [b + 0] = saved ebp, [b + 4] = return address, [b + 8] = first argument.
%define data(b)       (b) + 8           ; FAST_FLOAT *data

; [ebp + 0] holds the pre-alignment esp saved by the prologue.  This name is
; not referenced in the code shown here; the epilogue reads the slot with
; "pop esp" instead.
%define original_ebp  ebp + 0
; wk(0) = ebp - WK_NUM*SIZEOF_XMMWORD (lowest slot) ... wk(WK_NUM-1) =
; ebp - SIZEOF_XMMWORD.  WK_NUM is defined after this line; that is fine
; because the body of a %define is expanded at the point of use.
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
                                        ; xmmword wk[WK_NUM]
%define WK_NUM        2

    align       32
    GLOBAL_FUNCTION(jsimd_fdct_float_sse)

EXTN(jsimd_fdct_float_sse):
    ; ---- Prologue: build an xmmword-aligned frame.
    push        ebp
    mov         eax, esp                     ; eax = esp after the push
                                             ;   ([eax] = caller's ebp)
    sub         esp, byte 4                  ; room for the saved-esp slot
    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [esp], eax                   ; remember pre-alignment esp
    mov         ebp, esp                     ; ebp = aligned frame base
    lea         esp, [wk(0)]                 ; reserve wk[0..WK_NUM-1]
    PUSHPIC     ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
;   push        esi                     ; unused
;   push        edi                     ; unused

    GET_GOT     ebx                     ; get GOT address

    ; ---- Pass 1: process rows.
    ;
    ; edx = base of the current band, ecx = bands remaining.  Each iteration
    ; handles a band of four rows (edx advances by 4*DCTSIZE elements).
    ; Lane annotations "rc" mean row r, column c of the band.

    mov         edx, POINTER [data(eax)]  ; (FAST_FLOAT *)
    mov         ecx, DCTSIZE/4
    ALIGNX      16, 7
.rowloop:

    movaps      xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
    movaps      xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)]

    ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
    ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)

    movaps      xmm4, xmm0              ; transpose coefficients(phase 1)
    unpcklps    xmm0, xmm1              ; xmm0=(20 30 21 31)
    unpckhps    xmm4, xmm1              ; xmm4=(22 32 23 33)
    movaps      xmm5, xmm2              ; transpose coefficients(phase 1)
    unpcklps    xmm2, xmm3              ; xmm2=(24 34 25 35)
    unpckhps    xmm5, xmm3              ; xmm5=(26 36 27 37)

    movaps      xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
    movaps      xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]

    ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
    ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)

    ; All eight xmm registers are live; park two phase-1 results in wk so
    ; the remaining phase-1 interleaves have registers to work in.
    movaps      XMMWORD [wk(0)], xmm4   ; wk(0)=(22 32 23 33)
    movaps      XMMWORD [wk(1)], xmm2   ; wk(1)=(24 34 25 35)

    movaps      xmm4, xmm6              ; transpose coefficients(phase 1)
    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
    unpckhps    xmm4, xmm7              ; xmm4=(02 12 03 13)
    movaps      xmm2, xmm1              ; transpose coefficients(phase 1)
    unpcklps    xmm1, xmm3              ; xmm1=(04 14 05 15)
    unpckhps    xmm2, xmm3              ; xmm2=(06 16 07 17)

    movaps      xmm7, xmm6              ; transpose coefficients(phase 2)
    unpcklps2   xmm6, xmm0              ; xmm6=(00 10 20 30)=data0
    unpckhps2   xmm7, xmm0              ; xmm7=(01 11 21 31)=data1
    movaps      xmm3, xmm2              ; transpose coefficients(phase 2)
    unpcklps2   xmm2, xmm5              ; xmm2=(06 16 26 36)=data6
    unpckhps2   xmm3, xmm5              ; xmm3=(07 17 27 37)=data7

    movaps      xmm0, xmm7
    movaps      xmm5, xmm6
    subps       xmm7, xmm2              ; xmm7=data1-data6=tmp6
    subps       xmm6, xmm3              ; xmm6=data0-data7=tmp7
    addps       xmm0, xmm2              ; xmm0=data1+data6=tmp1
    addps       xmm5, xmm3              ; xmm5=data0+data7=tmp0

    ; Swap roles of the wk slots: fetch the parked phase-1 halves and park
    ; tmp6/tmp7 (needed only by the odd part) in their place.
    movaps      xmm2, XMMWORD [wk(0)]   ; xmm2=(22 32 23 33)
    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=(24 34 25 35)
    movaps      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
    movaps      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7

    movaps      xmm7, xmm4              ; transpose coefficients(phase 2)
    unpcklps2   xmm4, xmm2              ; xmm4=(02 12 22 32)=data2
    unpckhps2   xmm7, xmm2              ; xmm7=(03 13 23 33)=data3
    movaps      xmm6, xmm1              ; transpose coefficients(phase 2)
    unpcklps2   xmm1, xmm3              ; xmm1=(04 14 24 34)=data4
    unpckhps2   xmm6, xmm3              ; xmm6=(05 15 25 35)=data5

    movaps      xmm2, xmm7
    movaps      xmm3, xmm4
    addps       xmm7, xmm1              ; xmm7=data3+data4=tmp3
    addps       xmm4, xmm6              ; xmm4=data2+data5=tmp2
    subps       xmm2, xmm1              ; xmm2=data3-data4=tmp4
    subps       xmm3, xmm6              ; xmm3=data2-data5=tmp5

    ; -- Even part
    ; Consumes tmp0..tmp3 (xmm5, xmm0, xmm4, xmm7); tmp4/tmp5 stay in
    ; xmm2/xmm3 for the odd part.

    movaps      xmm1, xmm5
    movaps      xmm6, xmm0
    subps       xmm5, xmm7              ; xmm5=tmp13
    subps       xmm0, xmm4              ; xmm0=tmp12
    addps       xmm1, xmm7              ; xmm1=tmp10
    addps       xmm6, xmm4              ; xmm6=tmp11

    addps       xmm0, xmm5
    mulps       xmm0, [GOTOFF(ebx,PD_0_707)]  ; xmm0=z1

    movaps      xmm7, xmm1
    movaps      xmm4, xmm5
    subps       xmm1, xmm6              ; xmm1=data4
    subps       xmm5, xmm0              ; xmm5=data6
    addps       xmm7, xmm6              ; xmm7=data0
    addps       xmm4, xmm0              ; xmm4=data2

    ; Output k of this band goes to row (k mod 4), xmmword (k / 4).
    movaps      XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1
    movaps      XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5
    movaps      XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
    movaps      XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4

    ; -- Odd part

    movaps      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
    movaps      xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7

    addps       xmm2, xmm3              ; xmm2=tmp10
    addps       xmm3, xmm6              ; xmm3=tmp11
    addps       xmm6, xmm0              ; xmm6=tmp12, xmm0=tmp7

    mulps       xmm3, [GOTOFF(ebx,PD_0_707)]  ; xmm3=z3

    movaps      xmm1, xmm2              ; xmm1=tmp10
    subps       xmm2, xmm6              ; xmm2=tmp10-tmp12
    mulps       xmm2, [GOTOFF(ebx,PD_0_382)]  ; xmm2=z5
    mulps       xmm1, [GOTOFF(ebx,PD_0_541)]  ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
    mulps       xmm6, [GOTOFF(ebx,PD_1_306)]  ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
    addps       xmm1, xmm2              ; xmm1=z2
    addps       xmm6, xmm2              ; xmm6=z4

    movaps      xmm5, xmm0
    subps       xmm0, xmm3              ; xmm0=z13
    addps       xmm5, xmm3              ; xmm5=z11

    movaps      xmm7, xmm0
    movaps      xmm4, xmm5
    subps       xmm0, xmm1              ; xmm0=data3
    subps       xmm5, xmm6              ; xmm5=data7
    addps       xmm7, xmm1              ; xmm7=data5
    addps       xmm4, xmm6              ; xmm4=data1

    movaps      XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5
    movaps      XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7
    movaps      XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4

    add         edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT  ; next band of four rows
    dec         ecx
    jnz         near .rowloop

    ; ---- Pass 2: process columns.
    ;
    ; eax still holds the post-"push ebp" stack pointer: no instruction
    ; visible in pass 1 writes eax, so the argument can be re-read here.
    ; Each iteration handles four columns of all eight rows (edx advances
    ; by four elements).  The lane annotations reflect the transposed tiles
    ; written by pass 1.

    mov         edx, POINTER [data(eax)]  ; (FAST_FLOAT *)
    mov         ecx, DCTSIZE/4
    ALIGNX      16, 7
.columnloop:

    movaps      xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
    movaps      xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]

    ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
    ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)

    movaps      xmm4, xmm0              ; transpose coefficients(phase 1)
    unpcklps    xmm0, xmm1              ; xmm0=(02 03 12 13)
    unpckhps    xmm4, xmm1              ; xmm4=(22 23 32 33)
    movaps      xmm5, xmm2              ; transpose coefficients(phase 1)
    unpcklps    xmm2, xmm3              ; xmm2=(42 43 52 53)
    unpckhps    xmm5, xmm3              ; xmm5=(62 63 72 73)

    movaps      xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
    movaps      xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]

    ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
    ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)

    ; Same spill scheme as pass 1.
    movaps      XMMWORD [wk(0)], xmm4   ; wk(0)=(22 23 32 33)
    movaps      XMMWORD [wk(1)], xmm2   ; wk(1)=(42 43 52 53)

    movaps      xmm4, xmm6              ; transpose coefficients(phase 1)
    unpcklps    xmm6, xmm7              ; xmm6=(00 01 10 11)
    unpckhps    xmm4, xmm7              ; xmm4=(20 21 30 31)
    movaps      xmm2, xmm1              ; transpose coefficients(phase 1)
    unpcklps    xmm1, xmm3              ; xmm1=(40 41 50 51)
    unpckhps    xmm2, xmm3              ; xmm2=(60 61 70 71)

    movaps      xmm7, xmm6              ; transpose coefficients(phase 2)
    unpcklps2   xmm6, xmm0              ; xmm6=(00 01 02 03)=data0
    unpckhps2   xmm7, xmm0              ; xmm7=(10 11 12 13)=data1
    movaps      xmm3, xmm2              ; transpose coefficients(phase 2)
    unpcklps2   xmm2, xmm5              ; xmm2=(60 61 62 63)=data6
    unpckhps2   xmm3, xmm5              ; xmm3=(70 71 72 73)=data7

    movaps      xmm0, xmm7
    movaps      xmm5, xmm6
    subps       xmm7, xmm2              ; xmm7=data1-data6=tmp6
    subps       xmm6, xmm3              ; xmm6=data0-data7=tmp7
    addps       xmm0, xmm2              ; xmm0=data1+data6=tmp1
    addps       xmm5, xmm3              ; xmm5=data0+data7=tmp0

    movaps      xmm2, XMMWORD [wk(0)]   ; xmm2=(22 23 32 33)
    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53)
    movaps      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
    movaps      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7

    movaps      xmm7, xmm4              ; transpose coefficients(phase 2)
    unpcklps2   xmm4, xmm2              ; xmm4=(20 21 22 23)=data2
    unpckhps2   xmm7, xmm2              ; xmm7=(30 31 32 33)=data3
    movaps      xmm6, xmm1              ; transpose coefficients(phase 2)
    unpcklps2   xmm1, xmm3              ; xmm1=(40 41 42 43)=data4
    unpckhps2   xmm6, xmm3              ; xmm6=(50 51 52 53)=data5

    movaps      xmm2, xmm7
    movaps      xmm3, xmm4
    addps       xmm7, xmm1              ; xmm7=data3+data4=tmp3
    addps       xmm4, xmm6              ; xmm4=data2+data5=tmp2
    subps       xmm2, xmm1              ; xmm2=data3-data4=tmp4
    subps       xmm3, xmm6              ; xmm3=data2-data5=tmp5

    ; -- Even part

    movaps      xmm1, xmm5
    movaps      xmm6, xmm0
    subps       xmm5, xmm7              ; xmm5=tmp13
    subps       xmm0, xmm4              ; xmm0=tmp12
    addps       xmm1, xmm7              ; xmm1=tmp10
    addps       xmm6, xmm4              ; xmm6=tmp11

    addps       xmm0, xmm5
    mulps       xmm0, [GOTOFF(ebx,PD_0_707)]  ; xmm0=z1

    movaps      xmm7, xmm1
    movaps      xmm4, xmm5
    subps       xmm1, xmm6              ; xmm1=data4
    subps       xmm5, xmm0              ; xmm5=data6
    addps       xmm7, xmm6              ; xmm7=data0
    addps       xmm4, xmm0              ; xmm4=data2

    ; Output k goes to row k of the current four-column strip.
    movaps      XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1
    movaps      XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5
    movaps      XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
    movaps      XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4

    ; -- Odd part

    movaps      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
    movaps      xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7

    addps       xmm2, xmm3              ; xmm2=tmp10
    addps       xmm3, xmm6              ; xmm3=tmp11
    addps       xmm6, xmm0              ; xmm6=tmp12, xmm0=tmp7

    mulps       xmm3, [GOTOFF(ebx,PD_0_707)]  ; xmm3=z3

    movaps      xmm1, xmm2              ; xmm1=tmp10
    subps       xmm2, xmm6              ; xmm2=tmp10-tmp12
    mulps       xmm2, [GOTOFF(ebx,PD_0_382)]  ; xmm2=z5
    mulps       xmm1, [GOTOFF(ebx,PD_0_541)]  ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
    mulps       xmm6, [GOTOFF(ebx,PD_1_306)]  ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
    addps       xmm1, xmm2              ; xmm1=z2
    addps       xmm6, xmm2              ; xmm6=z4

    movaps      xmm5, xmm0
    subps       xmm0, xmm3              ; xmm0=z13
    addps       xmm5, xmm3              ; xmm5=z11

    movaps      xmm7, xmm0
    movaps      xmm4, xmm5
    subps       xmm0, xmm1              ; xmm0=data3
    subps       xmm5, xmm6              ; xmm5=data7
    addps       xmm7, xmm1              ; xmm7=data5
    addps       xmm4, xmm6              ; xmm4=data1

    movaps      XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5
    movaps      XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7
    movaps      XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4

    add         edx, byte 4*SIZEOF_FAST_FLOAT  ; next strip of four columns
    dec         ecx
    jnz         near .columnloop

    ; ---- Epilogue: undo the aligned frame.
;   pop         edi                     ; unused
;   pop         esi                     ; unused
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    POPPIC      ebx
    mov         esp, ebp                ; esp <- aligned frame base
    pop         esp                     ; esp <- pre-alignment esp saved at
                                        ;   [ebp] (points at caller's ebp)
    pop         ebp                     ; restore caller's ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32