; jfdctfst-sse2.asm (16893B)
;
; jfdctfst.asm - fast integer FDCT (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.
;
; This file contains a fast, not so accurate integer implementation of
; the forward DCT (Discrete Cosine Transform). The following code is
; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
; for more details.

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------

; Number of fractional bits carried by the FIX() multiplier constants below.
%define CONST_BITS  8  ; 14 is also OK.

%if CONST_BITS == 8
F_0_382 equ  98  ; FIX(0.382683433)
F_0_541 equ 139  ; FIX(0.541196100)
F_0_707 equ 181  ; FIX(0.707106781)
F_1_306 equ 334  ; FIX(1.306562965)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
; The literals below are the same multipliers expressed with 30 fractional
; bits; DESCALE rounds them down to CONST_BITS fractional bits.
%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS)  ; FIX(0.382683433)
F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS)  ; FIX(0.707106781)
F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS)  ; FIX(1.306562965)
%endif

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
;
; pmulhw keeps the high 16 bits of each signed 16x16 product.  With the
; operand pre-shifted left by PRE_MULTIPLY_SCALE_BITS and the constant stored
; as F << CONST_SHIFT, the result is (x * F) >> CONST_BITS, i.e. x multiplied
; by the fixed-point constant.

%define PRE_MULTIPLY_SCALE_BITS  2
%define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)

    ALIGNZ      32
    GLOBAL_DATA(jconst_fdct_ifast_sse2)

EXTN(jconst_fdct_ifast_sse2):

; Each multiplier is replicated across the 8 word lanes of one XMMWORD.
PW_F0707 times 8 dw F_0_707 << CONST_SHIFT
PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
PW_F1306 times 8 dw F_1_306 << CONST_SHIFT

    ALIGNZ      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Perform the forward DCT on one block of samples.
;
; GLOBAL(void)
; jsimd_fdct_ifast_sse2(DCTELEM *data)
;
; In:     r10 = DCTELEM *data after COLLECT_ARGS 1.  The block is read and
;         written with movdqa, so every accessed XMMWORD must be 16-byte
;         aligned.  Rows are addressed through XMMBLOCK(row,0,rdx,...);
;         presumably one row is 8 DCTELEMs = one XMMWORD -- confirm against
;         the XMMBLOCK definition in the include files.
; Out:    The block at *data is overwritten in place with the result.
; Stack:  rbp-based frame; rsp is realigned to SIZEOF_XMMWORD and WK_NUM
;         XMMWORD scratch slots are reserved directly below r15 (see wk(i)).
; Uses:   rdx, r15 (pushed/popped here), xmm0-xmm7, and the two wk slots.
;         NOTE(review): any ABI-specific argument marshalling or XMM register
;         preservation is assumed to be done by COLLECT_ARGS/UNCOLLECT_ARGS
;         -- verify in jsimdext.inc.
;

; r10 = DCTELEM *data

; wk(i) addresses scratch slot i; the slots sit immediately below r15.
%define wk(i)   r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
%define WK_NUM  2

    align       32
    GLOBAL_FUNCTION(jsimd_fdct_ifast_sse2)

EXTN(jsimd_fdct_ifast_sse2):
    ENDBR64
    push        rbp
    mov         rbp, rsp
    push        r15
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    ; Allocate stack space for wk array.  r15 is used to access it.
    mov         r15, rsp
    sub         rsp, byte (SIZEOF_XMMWORD * WK_NUM)
    COLLECT_ARGS 1

    ; ---- Pass 1: process rows.
    ;
    ; The 8x8 block of words is transposed in registers (three interleave
    ; phases: word, dword, qword) so that each register holds one "dataN"
    ; vector, then the butterfly network is applied to all 8 lanes at once.

    mov         rdx, r10                ; (DCTELEM *)

    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
    movdqa      xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
    movdqa      xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]

    ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
    ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)

    movdqa      xmm4, xmm0              ; transpose coefficients(phase 1)
    punpcklwd   xmm0, xmm1              ; xmm0=(00 10 01 11 02 12 03 13)
    punpckhwd   xmm4, xmm1              ; xmm4=(04 14 05 15 06 16 07 17)
    movdqa      xmm5, xmm2              ; transpose coefficients(phase 1)
    punpcklwd   xmm2, xmm3              ; xmm2=(20 30 21 31 22 32 23 33)
    punpckhwd   xmm5, xmm3              ; xmm5=(24 34 25 35 26 36 27 37)

    movdqa      xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
    movdqa      xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
    movdqa      xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]

    ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
    ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)

    ; All eight XMM registers are live; spill two partial results to wk[].
    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
    movdqa      XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)

    movdqa      xmm2, xmm6              ; transpose coefficients(phase 1)
    punpcklwd   xmm6, xmm7              ; xmm6=(40 50 41 51 42 52 43 53)
    punpckhwd   xmm2, xmm7              ; xmm2=(44 54 45 55 46 56 47 57)
    movdqa      xmm5, xmm1              ; transpose coefficients(phase 1)
    punpcklwd   xmm1, xmm3              ; xmm1=(60 70 61 71 62 72 63 73)
    punpckhwd   xmm5, xmm3              ; xmm5=(64 74 65 75 66 76 67 77)

    movdqa      xmm7, xmm6              ; transpose coefficients(phase 2)
    punpckldq   xmm6, xmm1              ; xmm6=(40 50 60 70 41 51 61 71)
    punpckhdq   xmm7, xmm1              ; xmm7=(42 52 62 72 43 53 63 73)
    movdqa      xmm3, xmm2              ; transpose coefficients(phase 2)
    punpckldq   xmm2, xmm5              ; xmm2=(44 54 64 74 45 55 65 75)
    punpckhdq   xmm3, xmm5              ; xmm3=(46 56 66 76 47 57 67 77)

    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=(42 52 62 72 43 53 63 73)
    movdqa      XMMWORD [wk(1)], xmm2   ; wk(1)=(44 54 64 74 45 55 65 75)

    movdqa      xmm7, xmm0              ; transpose coefficients(phase 2)
    punpckldq   xmm0, xmm1              ; xmm0=(00 10 20 30 01 11 21 31)
    punpckhdq   xmm7, xmm1              ; xmm7=(02 12 22 32 03 13 23 33)
    movdqa      xmm2, xmm4              ; transpose coefficients(phase 2)
    punpckldq   xmm4, xmm5              ; xmm4=(04 14 24 34 05 15 25 35)
    punpckhdq   xmm2, xmm5              ; xmm2=(06 16 26 36 07 17 27 37)

    movdqa      xmm1, xmm0              ; transpose coefficients(phase 3)
    punpcklqdq  xmm0, xmm6              ; xmm0=(00 10 20 30 40 50 60 70)=data0
    punpckhqdq  xmm1, xmm6              ; xmm1=(01 11 21 31 41 51 61 71)=data1
    movdqa      xmm5, xmm2              ; transpose coefficients(phase 3)
    punpcklqdq  xmm2, xmm3              ; xmm2=(06 16 26 36 46 56 66 76)=data6
    punpckhqdq  xmm5, xmm3              ; xmm5=(07 17 27 37 47 57 67 77)=data7

    movdqa      xmm6, xmm1
    movdqa      xmm3, xmm0
    psubw       xmm1, xmm2              ; xmm1=data1-data6=tmp6
    psubw       xmm0, xmm5              ; xmm0=data0-data7=tmp7
    paddw       xmm6, xmm2              ; xmm6=data1+data6=tmp1
    paddw       xmm3, xmm5              ; xmm3=data0+data7=tmp0

    ; tmp6/tmp7 are only needed by the odd part; park them in wk[] while the
    ; remaining half of the transpose is finished.
    movdqa      xmm2, XMMWORD [wk(0)]   ; xmm2=(42 52 62 72 43 53 63 73)
    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=(44 54 64 74 45 55 65 75)
    movdqa      XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7

    movdqa      xmm1, xmm7              ; transpose coefficients(phase 3)
    punpcklqdq  xmm7, xmm2              ; xmm7=(02 12 22 32 42 52 62 72)=data2
    punpckhqdq  xmm1, xmm2              ; xmm1=(03 13 23 33 43 53 63 73)=data3
    movdqa      xmm0, xmm4              ; transpose coefficients(phase 3)
    punpcklqdq  xmm4, xmm5              ; xmm4=(04 14 24 34 44 54 64 74)=data4
    punpckhqdq  xmm0, xmm5              ; xmm0=(05 15 25 35 45 55 65 75)=data5

    movdqa      xmm2, xmm1
    movdqa      xmm5, xmm7
    paddw       xmm1, xmm4              ; xmm1=data3+data4=tmp3
    paddw       xmm7, xmm0              ; xmm7=data2+data5=tmp2
    psubw       xmm2, xmm4              ; xmm2=data3-data4=tmp4
    psubw       xmm5, xmm0              ; xmm5=data2-data5=tmp5

    ; -- Even part

    movdqa      xmm4, xmm3
    movdqa      xmm0, xmm6
    psubw       xmm3, xmm1              ; xmm3=tmp13
    psubw       xmm6, xmm7              ; xmm6=tmp12
    paddw       xmm4, xmm1              ; xmm4=tmp10
    paddw       xmm0, xmm7              ; xmm0=tmp11

    paddw       xmm6, xmm3              ; xmm6=tmp12+tmp13
    psllw       xmm6, PRE_MULTIPLY_SCALE_BITS
    pmulhw      xmm6, [rel PW_F0707]    ; xmm6=z1

    movdqa      xmm1, xmm4
    movdqa      xmm7, xmm3
    psubw       xmm4, xmm0              ; xmm4=data4
    psubw       xmm3, xmm6              ; xmm3=data6
    paddw       xmm1, xmm0              ; xmm1=data0
    paddw       xmm7, xmm6              ; xmm7=data2

    movdqa      xmm0, XMMWORD [wk(0)]   ; xmm0=tmp6
    movdqa      xmm6, XMMWORD [wk(1)]   ; xmm6=tmp7
    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=data4
    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=data6

    ; -- Odd part

    paddw       xmm2, xmm5              ; xmm2=tmp10
    paddw       xmm5, xmm0              ; xmm5=tmp11
    paddw       xmm0, xmm6              ; xmm0=tmp12, xmm6=tmp7

    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
    psllw       xmm0, PRE_MULTIPLY_SCALE_BITS

    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
    pmulhw      xmm5, [rel PW_F0707]    ; xmm5=z3

    movdqa      xmm4, xmm2              ; xmm4=tmp10
    psubw       xmm2, xmm0              ; xmm2=tmp10-tmp12
    pmulhw      xmm2, [rel PW_F0382]    ; xmm2=z5
    pmulhw      xmm4, [rel PW_F0541]    ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
    pmulhw      xmm0, [rel PW_F1306]    ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
    paddw       xmm4, xmm2              ; xmm4=z2
    paddw       xmm0, xmm2              ; xmm0=z4

    movdqa      xmm3, xmm6
    psubw       xmm6, xmm5              ; xmm6=z13
    paddw       xmm3, xmm5              ; xmm3=z11

    movdqa      xmm2, xmm6
    movdqa      xmm5, xmm3
    psubw       xmm6, xmm4              ; xmm6=data3
    psubw       xmm3, xmm0              ; xmm3=data7
    paddw       xmm2, xmm4              ; xmm2=data5
    paddw       xmm5, xmm0              ; xmm5=data1

    ; ---- Pass 2: process columns.
    ;
    ; Pass 1 results stay in registers / wk[]; they are transposed again and
    ; run through the same butterfly network, then stored back to *data.

    ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
    ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)

    movdqa      xmm4, xmm1              ; transpose coefficients(phase 1)
    punpcklwd   xmm1, xmm5              ; xmm1=(00 01 10 11 20 21 30 31)
    punpckhwd   xmm4, xmm5              ; xmm4=(40 41 50 51 60 61 70 71)
    movdqa      xmm0, xmm7              ; transpose coefficients(phase 1)
    punpcklwd   xmm7, xmm6              ; xmm7=(02 03 12 13 22 23 32 33)
    punpckhwd   xmm0, xmm6              ; xmm0=(42 43 52 53 62 63 72 73)

    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=col4 (data4 saved by pass 1)
    movdqa      xmm6, XMMWORD [wk(1)]   ; xmm6=col6 (data6 saved by pass 1)

    ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
    ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)

    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=(02 03 12 13 22 23 32 33)
    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(42 43 52 53 62 63 72 73)

    movdqa      xmm7, xmm5              ; transpose coefficients(phase 1)
    punpcklwd   xmm5, xmm2              ; xmm5=(04 05 14 15 24 25 34 35)
    punpckhwd   xmm7, xmm2              ; xmm7=(44 45 54 55 64 65 74 75)
    movdqa      xmm0, xmm6              ; transpose coefficients(phase 1)
    punpcklwd   xmm6, xmm3              ; xmm6=(06 07 16 17 26 27 36 37)
    punpckhwd   xmm0, xmm3              ; xmm0=(46 47 56 57 66 67 76 77)

    movdqa      xmm2, xmm5              ; transpose coefficients(phase 2)
    punpckldq   xmm5, xmm6              ; xmm5=(04 05 06 07 14 15 16 17)
    punpckhdq   xmm2, xmm6              ; xmm2=(24 25 26 27 34 35 36 37)
    movdqa      xmm3, xmm7              ; transpose coefficients(phase 2)
    punpckldq   xmm7, xmm0              ; xmm7=(44 45 46 47 54 55 56 57)
    punpckhdq   xmm3, xmm0              ; xmm3=(64 65 66 67 74 75 76 77)

    movdqa      xmm6, XMMWORD [wk(0)]   ; xmm6=(02 03 12 13 22 23 32 33)
    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(42 43 52 53 62 63 72 73)
    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=(24 25 26 27 34 35 36 37)
    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=(44 45 46 47 54 55 56 57)

    movdqa      xmm2, xmm1              ; transpose coefficients(phase 2)
    punpckldq   xmm1, xmm6              ; xmm1=(00 01 02 03 10 11 12 13)
    punpckhdq   xmm2, xmm6              ; xmm2=(20 21 22 23 30 31 32 33)
    movdqa      xmm7, xmm4              ; transpose coefficients(phase 2)
    punpckldq   xmm4, xmm0              ; xmm4=(40 41 42 43 50 51 52 53)
    punpckhdq   xmm7, xmm0              ; xmm7=(60 61 62 63 70 71 72 73)

    movdqa      xmm6, xmm1              ; transpose coefficients(phase 3)
    punpcklqdq  xmm1, xmm5              ; xmm1=(00 01 02 03 04 05 06 07)=data0
    punpckhqdq  xmm6, xmm5              ; xmm6=(10 11 12 13 14 15 16 17)=data1
    movdqa      xmm0, xmm7              ; transpose coefficients(phase 3)
    punpcklqdq  xmm7, xmm3              ; xmm7=(60 61 62 63 64 65 66 67)=data6
    punpckhqdq  xmm0, xmm3              ; xmm0=(70 71 72 73 74 75 76 77)=data7

    movdqa      xmm5, xmm6
    movdqa      xmm3, xmm1
    psubw       xmm6, xmm7              ; xmm6=data1-data6=tmp6
    psubw       xmm1, xmm0              ; xmm1=data0-data7=tmp7
    paddw       xmm5, xmm7              ; xmm5=data1+data6=tmp1
    paddw       xmm3, xmm0              ; xmm3=data0+data7=tmp0

    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=(24 25 26 27 34 35 36 37)
    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(44 45 46 47 54 55 56 57)
    movdqa      XMMWORD [wk(0)], xmm6   ; wk(0)=tmp6
    movdqa      XMMWORD [wk(1)], xmm1   ; wk(1)=tmp7

    movdqa      xmm6, xmm2              ; transpose coefficients(phase 3)
    punpcklqdq  xmm2, xmm7              ; xmm2=(20 21 22 23 24 25 26 27)=data2
    punpckhqdq  xmm6, xmm7              ; xmm6=(30 31 32 33 34 35 36 37)=data3
    movdqa      xmm1, xmm4              ; transpose coefficients(phase 3)
    punpcklqdq  xmm4, xmm0              ; xmm4=(40 41 42 43 44 45 46 47)=data4
    punpckhqdq  xmm1, xmm0              ; xmm1=(50 51 52 53 54 55 56 57)=data5

    movdqa      xmm7, xmm6
    movdqa      xmm0, xmm2
    paddw       xmm6, xmm4              ; xmm6=data3+data4=tmp3
    paddw       xmm2, xmm1              ; xmm2=data2+data5=tmp2
    psubw       xmm7, xmm4              ; xmm7=data3-data4=tmp4
    psubw       xmm0, xmm1              ; xmm0=data2-data5=tmp5

    ; -- Even part

    movdqa      xmm4, xmm3
    movdqa      xmm1, xmm5
    psubw       xmm3, xmm6              ; xmm3=tmp13
    psubw       xmm5, xmm2              ; xmm5=tmp12
    paddw       xmm4, xmm6              ; xmm4=tmp10
    paddw       xmm1, xmm2              ; xmm1=tmp11

    paddw       xmm5, xmm3              ; xmm5=tmp12+tmp13
    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
    pmulhw      xmm5, [rel PW_F0707]    ; xmm5=z1

    movdqa      xmm6, xmm4
    movdqa      xmm2, xmm3
    psubw       xmm4, xmm1              ; xmm4=data4
    psubw       xmm3, xmm5              ; xmm3=data6
    paddw       xmm6, xmm1              ; xmm6=data0
    paddw       xmm2, xmm5              ; xmm2=data2

    ; Even-indexed results are final; write them back to the block.
    movdqa      XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4
    movdqa      XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3
    movdqa      XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6
    movdqa      XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2

    ; -- Odd part

    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=tmp6
    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7

    paddw       xmm7, xmm0              ; xmm7=tmp10
    paddw       xmm0, xmm1              ; xmm0=tmp11
    paddw       xmm1, xmm5              ; xmm1=tmp12, xmm5=tmp7

    psllw       xmm7, PRE_MULTIPLY_SCALE_BITS
    psllw       xmm1, PRE_MULTIPLY_SCALE_BITS

    psllw       xmm0, PRE_MULTIPLY_SCALE_BITS
    pmulhw      xmm0, [rel PW_F0707]    ; xmm0=z3

    movdqa      xmm4, xmm7              ; xmm4=tmp10
    psubw       xmm7, xmm1              ; xmm7=tmp10-tmp12
    pmulhw      xmm7, [rel PW_F0382]    ; xmm7=z5
    pmulhw      xmm4, [rel PW_F0541]    ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
    pmulhw      xmm1, [rel PW_F1306]    ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
    paddw       xmm4, xmm7              ; xmm4=z2
    paddw       xmm1, xmm7              ; xmm1=z4

    movdqa      xmm3, xmm5
    psubw       xmm5, xmm0              ; xmm5=z13
    paddw       xmm3, xmm0              ; xmm3=z11

    movdqa      xmm6, xmm5
    movdqa      xmm2, xmm3
    psubw       xmm5, xmm4              ; xmm5=data3
    psubw       xmm3, xmm1              ; xmm3=data7
    paddw       xmm6, xmm4              ; xmm6=data5
    paddw       xmm2, xmm1              ; xmm2=data1

    movdqa      XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5
    movdqa      XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3
    movdqa      XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
    movdqa      XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2

    UNCOLLECT_ARGS 1
    lea         rsp, [rbp-8]            ; discard wk[] and alignment padding;
                                        ; rsp now points at the saved r15
    pop         r15
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32