jfdctflt-sse.asm (14126B)
;
; jfdctflt.asm - floating-point FDCT (64-bit SSE)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.
;
; This file contains a floating-point implementation of the forward DCT
; (Discrete Cosine Transform).  The following code is based directly on
; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; 64-bit-granular interleave helpers, implemented with SHUFPS.
;
; With %1 = (0 1 2 3) and %2 = (4 5 6 7), each helper overwrites %1 with one
; half of %1 followed by the same half of %2.

; unpcklps2 %1, %2: %1 = (0 1 4 5), i.e. the low halves of both operands.
%macro unpcklps2 2
    shufps      %1, %2, 0x44
%endmacro

; unpckhps2 %1, %2: %1 = (2 3 6 7), i.e. the high halves of both operands.
%macro unpckhps2 2
    shufps      %1, %2, 0xEE
%endmacro

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    ALIGNZ      32
    GLOBAL_DATA(jconst_fdct_float_sse)

EXTN(jconst_fdct_float_sse):

; Multipliers for the transform.  "times 4" replicates each value into all
; four single-precision lanes so that it can be used directly as a MULPS
; memory operand.
PD_0_382    times 4 dd 0.382683432365089771728460   ; cos(3*pi/8)
PD_0_707    times 4 dd 0.707106781186547524400844   ; 1/sqrt(2)
PD_0_541    times 4 dd 0.541196100146196984399723   ; sqrt(2)*cos(3*pi/8)
PD_1_306    times 4 dd 1.306562964876376527856643   ; sqrt(2)*cos(pi/8)

    ALIGNZ      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Perform the forward DCT on one block of samples.
;
; GLOBAL(void)
; jsimd_fdct_float_sse(FAST_FLOAT *data)
;
; The block is transformed in place: every vector loaded through
; XMMBLOCK(...) is later stored back through XMMBLOCK(...) relative to the
; same cursor (rdx).  All block accesses use MOVAPS.
; NOTE(review): MOVAPS faults on operands that are not 16-byte aligned, so
; data is presumably required to be 16-byte aligned -- confirm against the
; XMMBLOCK definition and the callers, neither of which is visible here.
;
; Registers written by this routine: rcx, rdx, xmm0-xmm7 and flags.  rbp and
; r15 are saved on entry and restored on exit.  COLLECT_ARGS and
; UNCOLLECT_ARGS are defined in an include file; whatever they save or
; clobber is not visible from this file.
;

; r10 = FAST_FLOAT *data

; wk[] is a scratch array of WK_NUM xmmwords that ends at the address held
; in r15, so wk(i) is the i-th slot counted from the lowest address.
%define WK_NUM  2
%define wk(i)   r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD

    align       32
    GLOBAL_FUNCTION(jsimd_fdct_float_sse)

EXTN(jsimd_fdct_float_sse):
    ENDBR64
    push        rbp
    mov         rbp, rsp
    push        r15                     ; saved copy lives at [rbp-8]
    and         rsp, byte (-SIZEOF_XMMWORD)  ; round rsp down to an xmmword boundary
    mov         r15, rsp                ; r15 = end of wk[] (see wk(i) above)
    sub         rsp, byte (SIZEOF_XMMWORD * WK_NUM)  ; reserve the wk[] slots
    COLLECT_ARGS 1

    ; ---- Pass 1: process rows.
    ; Each iteration handles a band of four rows.  The eight vectors of the
    ; band are transposed so that dataN holds element N of all four rows,
    ; which lets one packed operation serve four rows at once.

    mov         rdx, r10                ; rdx = cursor into the data block
    mov         rcx, DCTSIZE/4          ; rcx = bands still to process
.row_pass:

    movaps      xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
    movaps      xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)]

    ; xmm0 = (20 21 22 23), xmm2 = (24 25 26 27)
    ; xmm1 = (30 31 32 33), xmm3 = (34 35 36 37)

    ; Transpose, phase 1 (rows 2 and 3): interleave 32-bit lanes.
    movaps      xmm4, xmm0
    unpcklps    xmm0, xmm1              ; xmm0 = (20 30 21 31)
    unpckhps    xmm4, xmm1              ; xmm4 = (22 32 23 33)
    movaps      xmm5, xmm2
    unpcklps    xmm2, xmm3              ; xmm2 = (24 34 25 35)
    unpckhps    xmm5, xmm3              ; xmm5 = (26 36 27 37)

    movaps      xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
    movaps      xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]

    ; xmm6 = (00 01 02 03), xmm1 = (04 05 06 07)
    ; xmm7 = (10 11 12 13), xmm3 = (14 15 16 17)

    ; Spill two partial results; xmm4 and xmm2 are reused just below.
    movaps      XMMWORD [wk(0)], xmm4   ; wk(0) = (22 32 23 33)
    movaps      XMMWORD [wk(1)], xmm2   ; wk(1) = (24 34 25 35)

    ; Transpose, phase 1 (rows 0 and 1).
    movaps      xmm4, xmm6
    unpcklps    xmm6, xmm7              ; xmm6 = (00 10 01 11)
    unpckhps    xmm4, xmm7              ; xmm4 = (02 12 03 13)
    movaps      xmm2, xmm1
    unpcklps    xmm1, xmm3              ; xmm1 = (04 14 05 15)
    unpckhps    xmm2, xmm3              ; xmm2 = (06 16 07 17)

    ; Transpose, phase 2: combine 64-bit halves into data0/1 and data6/7.
    movaps      xmm7, xmm6
    unpcklps2   xmm6, xmm0              ; xmm6 = (00 10 20 30) = data0
    unpckhps2   xmm7, xmm0              ; xmm7 = (01 11 21 31) = data1
    movaps      xmm3, xmm2
    unpcklps2   xmm2, xmm5              ; xmm2 = (06 16 26 36) = data6
    unpckhps2   xmm3, xmm5              ; xmm3 = (07 17 27 37) = data7

    ; First butterflies on the outer pairs.
    movaps      xmm0, xmm7
    movaps      xmm5, xmm6
    subps       xmm7, xmm2              ; xmm7 = data1 - data6 = tmp6
    subps       xmm6, xmm3              ; xmm6 = data0 - data7 = tmp7
    addps       xmm0, xmm2              ; xmm0 = data1 + data6 = tmp1
    addps       xmm5, xmm3              ; xmm5 = data0 + data7 = tmp0

    ; Swap: fetch the spilled phase-1 vectors, park tmp6/tmp7 in their slots.
    movaps      xmm2, XMMWORD [wk(0)]   ; xmm2 = (22 32 23 33)
    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3 = (24 34 25 35)
    movaps      XMMWORD [wk(0)], xmm7   ; wk(0) = tmp6
    movaps      XMMWORD [wk(1)], xmm6   ; wk(1) = tmp7

    ; Transpose, phase 2: data2/3 and data4/5.
    movaps      xmm7, xmm4
    unpcklps2   xmm4, xmm2              ; xmm4 = (02 12 22 32) = data2
    unpckhps2   xmm7, xmm2              ; xmm7 = (03 13 23 33) = data3
    movaps      xmm6, xmm1
    unpcklps2   xmm1, xmm3              ; xmm1 = (04 14 24 34) = data4
    unpckhps2   xmm6, xmm3              ; xmm6 = (05 15 25 35) = data5

    ; First butterflies on the inner pairs.
    movaps      xmm2, xmm7
    movaps      xmm3, xmm4
    addps       xmm7, xmm1              ; xmm7 = data3 + data4 = tmp3
    addps       xmm4, xmm6              ; xmm4 = data2 + data5 = tmp2
    subps       xmm2, xmm1              ; xmm2 = data3 - data4 = tmp4
    subps       xmm3, xmm6              ; xmm3 = data2 - data5 = tmp5

    ; -- Even part (inputs: tmp0..tmp3 in xmm5, xmm0, xmm4, xmm7)

    movaps      xmm1, xmm5
    movaps      xmm6, xmm0
    subps       xmm5, xmm7              ; xmm5 = tmp0 - tmp3 = tmp13
    subps       xmm0, xmm4              ; xmm0 = tmp1 - tmp2 = tmp12
    addps       xmm1, xmm7              ; xmm1 = tmp0 + tmp3 = tmp10
    addps       xmm6, xmm4              ; xmm6 = tmp1 + tmp2 = tmp11

    addps       xmm0, xmm5              ; xmm0 = tmp12 + tmp13
    mulps       xmm0, [rel PD_0_707]    ; xmm0 = z1

    movaps      xmm7, xmm1
    movaps      xmm4, xmm5
    subps       xmm1, xmm6              ; xmm1 = tmp10 - tmp11 = data4
    subps       xmm5, xmm0              ; xmm5 = tmp13 - z1    = data6
    addps       xmm7, xmm6              ; xmm7 = tmp10 + tmp11 = data0
    addps       xmm4, xmm0              ; xmm4 = tmp13 + z1    = data2

    movaps      XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1
    movaps      XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
    movaps      XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
    movaps      XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4

    ; -- Odd part (inputs: tmp4, tmp5 in xmm2, xmm3; tmp6, tmp7 in wk[])

    movaps      xmm6, XMMWORD [wk(0)]   ; xmm6 = tmp6
    movaps      xmm0, XMMWORD [wk(1)]   ; xmm0 = tmp7

    addps       xmm2, xmm3              ; xmm2 = tmp4 + tmp5 = tmp10
    addps       xmm3, xmm6              ; xmm3 = tmp5 + tmp6 = tmp11
    addps       xmm6, xmm0              ; xmm6 = tmp6 + tmp7 = tmp12, xmm0 = tmp7

    mulps       xmm3, [rel PD_0_707]    ; xmm3 = z3

    movaps      xmm1, xmm2              ; xmm1 = tmp10
    subps       xmm2, xmm6              ; xmm2 = tmp10 - tmp12
    mulps       xmm2, [rel PD_0_382]    ; xmm2 = z5
    mulps       xmm1, [rel PD_0_541]    ; xmm1 = MULTIPLY(tmp10,FIX_0_541196)
    mulps       xmm6, [rel PD_1_306]    ; xmm6 = MULTIPLY(tmp12,FIX_1_306562)
    addps       xmm1, xmm2              ; xmm1 = z2
    addps       xmm6, xmm2              ; xmm6 = z4

    movaps      xmm5, xmm0
    subps       xmm0, xmm3              ; xmm0 = tmp7 - z3 = z13
    addps       xmm5, xmm3              ; xmm5 = tmp7 + z3 = z11

    movaps      xmm7, xmm0
    movaps      xmm4, xmm5
    subps       xmm0, xmm1              ; xmm0 = z13 - z2 = data3
    subps       xmm5, xmm6              ; xmm5 = z11 - z4 = data7
    addps       xmm7, xmm1              ; xmm7 = z13 + z2 = data5
    addps       xmm4, xmm6              ; xmm4 = z11 + z4 = data1

    movaps      XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
    movaps      XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7
    movaps      XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4

    add         rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT  ; step the cursor down four rows
    dec         rcx
    jnz         near .row_pass

    ; ---- Pass 2: process columns.
    ; Same arithmetic as pass 1.  Each iteration reads and writes one xmmword
    ; from each of the eight XMMBLOCK(0..7,0) positions, and the cursor then
    ; moves right by four elements.

    mov         rdx, r10                ; rdx = cursor into the data block
    mov         rcx, DCTSIZE/4          ; rcx = column groups still to process
.column_pass:

    movaps      xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
    movaps      xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)]

    ; xmm0 = (02 12 22 32), xmm2 = (42 52 62 72)
    ; xmm1 = (03 13 23 33), xmm3 = (43 53 63 73)

    ; Transpose, phase 1: interleave 32-bit lanes.
    movaps      xmm4, xmm0
    unpcklps    xmm0, xmm1              ; xmm0 = (02 03 12 13)
    unpckhps    xmm4, xmm1              ; xmm4 = (22 23 32 33)
    movaps      xmm5, xmm2
    unpcklps    xmm2, xmm3              ; xmm2 = (42 43 52 53)
    unpckhps    xmm5, xmm3              ; xmm5 = (62 63 72 73)

    movaps      xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
    movaps      xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)]

    ; xmm6 = (00 10 20 30), xmm1 = (40 50 60 70)
    ; xmm7 = (01 11 21 31), xmm3 = (41 51 61 71)

    ; Spill two partial results; xmm4 and xmm2 are reused just below.
    movaps      XMMWORD [wk(0)], xmm4   ; wk(0) = (22 23 32 33)
    movaps      XMMWORD [wk(1)], xmm2   ; wk(1) = (42 43 52 53)

    ; Transpose, phase 1 (remaining four vectors).
    movaps      xmm4, xmm6
    unpcklps    xmm6, xmm7              ; xmm6 = (00 01 10 11)
    unpckhps    xmm4, xmm7              ; xmm4 = (20 21 30 31)
    movaps      xmm2, xmm1
    unpcklps    xmm1, xmm3              ; xmm1 = (40 41 50 51)
    unpckhps    xmm2, xmm3              ; xmm2 = (60 61 70 71)

    ; Transpose, phase 2: combine 64-bit halves into data0/1 and data6/7.
    movaps      xmm7, xmm6
    unpcklps2   xmm6, xmm0              ; xmm6 = (00 01 02 03) = data0
    unpckhps2   xmm7, xmm0              ; xmm7 = (10 11 12 13) = data1
    movaps      xmm3, xmm2
    unpcklps2   xmm2, xmm5              ; xmm2 = (60 61 62 63) = data6
    unpckhps2   xmm3, xmm5              ; xmm3 = (70 71 72 73) = data7

    ; First butterflies on the outer pairs.
    movaps      xmm0, xmm7
    movaps      xmm5, xmm6
    subps       xmm7, xmm2              ; xmm7 = data1 - data6 = tmp6
    subps       xmm6, xmm3              ; xmm6 = data0 - data7 = tmp7
    addps       xmm0, xmm2              ; xmm0 = data1 + data6 = tmp1
    addps       xmm5, xmm3              ; xmm5 = data0 + data7 = tmp0

    ; Swap: fetch the spilled phase-1 vectors, park tmp6/tmp7 in their slots.
    movaps      xmm2, XMMWORD [wk(0)]   ; xmm2 = (22 23 32 33)
    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3 = (42 43 52 53)
    movaps      XMMWORD [wk(0)], xmm7   ; wk(0) = tmp6
    movaps      XMMWORD [wk(1)], xmm6   ; wk(1) = tmp7

    ; Transpose, phase 2: data2/3 and data4/5.
    movaps      xmm7, xmm4
    unpcklps2   xmm4, xmm2              ; xmm4 = (20 21 22 23) = data2
    unpckhps2   xmm7, xmm2              ; xmm7 = (30 31 32 33) = data3
    movaps      xmm6, xmm1
    unpcklps2   xmm1, xmm3              ; xmm1 = (40 41 42 43) = data4
    unpckhps2   xmm6, xmm3              ; xmm6 = (50 51 52 53) = data5

    ; First butterflies on the inner pairs.
    movaps      xmm2, xmm7
    movaps      xmm3, xmm4
    addps       xmm7, xmm1              ; xmm7 = data3 + data4 = tmp3
    addps       xmm4, xmm6              ; xmm4 = data2 + data5 = tmp2
    subps       xmm2, xmm1              ; xmm2 = data3 - data4 = tmp4
    subps       xmm3, xmm6              ; xmm3 = data2 - data5 = tmp5

    ; -- Even part (inputs: tmp0..tmp3 in xmm5, xmm0, xmm4, xmm7)

    movaps      xmm1, xmm5
    movaps      xmm6, xmm0
    subps       xmm5, xmm7              ; xmm5 = tmp0 - tmp3 = tmp13
    subps       xmm0, xmm4              ; xmm0 = tmp1 - tmp2 = tmp12
    addps       xmm1, xmm7              ; xmm1 = tmp0 + tmp3 = tmp10
    addps       xmm6, xmm4              ; xmm6 = tmp1 + tmp2 = tmp11

    addps       xmm0, xmm5              ; xmm0 = tmp12 + tmp13
    mulps       xmm0, [rel PD_0_707]    ; xmm0 = z1

    movaps      xmm7, xmm1
    movaps      xmm4, xmm5
    subps       xmm1, xmm6              ; xmm1 = tmp10 - tmp11 = data4
    subps       xmm5, xmm0              ; xmm5 = tmp13 - z1    = data6
    addps       xmm7, xmm6              ; xmm7 = tmp10 + tmp11 = data0
    addps       xmm4, xmm0              ; xmm4 = tmp13 + z1    = data2

    movaps      XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1
    movaps      XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
    movaps      XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
    movaps      XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4

    ; -- Odd part (inputs: tmp4, tmp5 in xmm2, xmm3; tmp6, tmp7 in wk[])

    movaps      xmm6, XMMWORD [wk(0)]   ; xmm6 = tmp6
    movaps      xmm0, XMMWORD [wk(1)]   ; xmm0 = tmp7

    addps       xmm2, xmm3              ; xmm2 = tmp4 + tmp5 = tmp10
    addps       xmm3, xmm6              ; xmm3 = tmp5 + tmp6 = tmp11
    addps       xmm6, xmm0              ; xmm6 = tmp6 + tmp7 = tmp12, xmm0 = tmp7

    mulps       xmm3, [rel PD_0_707]    ; xmm3 = z3

    movaps      xmm1, xmm2              ; xmm1 = tmp10
    subps       xmm2, xmm6              ; xmm2 = tmp10 - tmp12
    mulps       xmm2, [rel PD_0_382]    ; xmm2 = z5
    mulps       xmm1, [rel PD_0_541]    ; xmm1 = MULTIPLY(tmp10,FIX_0_541196)
    mulps       xmm6, [rel PD_1_306]    ; xmm6 = MULTIPLY(tmp12,FIX_1_306562)
    addps       xmm1, xmm2              ; xmm1 = z2
    addps       xmm6, xmm2              ; xmm6 = z4

    movaps      xmm5, xmm0
    subps       xmm0, xmm3              ; xmm0 = tmp7 - z3 = z13
    addps       xmm5, xmm3              ; xmm5 = tmp7 + z3 = z11

    movaps      xmm7, xmm0
    movaps      xmm4, xmm5
    subps       xmm0, xmm1              ; xmm0 = z13 - z2 = data3
    subps       xmm5, xmm6              ; xmm5 = z11 - z4 = data7
    addps       xmm7, xmm1              ; xmm7 = z13 + z2 = data5
    addps       xmm4, xmm6              ; xmm4 = z11 + z4 = data1

    movaps      XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
    movaps      XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
    movaps      XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4

    add         rdx, byte 4*SIZEOF_FAST_FLOAT  ; step the cursor right four elements
    dec         rcx
    jnz         near .column_pass

    UNCOLLECT_ARGS 1
    lea         rsp, [rbp-8]            ; rsp -> saved r15; releases wk[] and padding
    pop         r15
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32