; simple_idct.asm (37993B)
;
; Simple IDCT MMX
;
; Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
;
; Conversion from gcc syntax to x264asm syntax with minimal modifications
; by James Darnley <jdarnley@obe.tv>.
;
; This file is part of FFmpeg.
;
; FFmpeg is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public
; License as published by the Free Software Foundation; either
; version 2.1 of the License, or (at your option) any later version.
;
; FFmpeg is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with FFmpeg; if not, write to the Free Software
; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;/

; Syntax: NASM with the x264asm (x86inc/x86util) abstraction layer.
; Everything below the include is assembled only when ARCH_X86_32 is set.
;
; Overview of the code in this file:
;   * A row pass (DC_COND_IDCT / Z_COND_IDCT) reads the 128-byte coefficient
;     block at blockq in four 32-byte groups and writes intermediates to a
;     128-byte scratch area at [rsp + 0 .. rsp + 127].
;   * A column pass (IDCT1 .. IDCT8) reads the scratch area and writes the
;     result back into the block at blockq.  The variant is selected by
;     which 32-byte input groups were entirely zero.
;   * The lane comments (R4 R0 r4 r0, ...) list the four 16-bit words of a
;     qword from the highest word to the lowest; lower-case and upper-case
;     names refer to the two rows interleaved in the same qword.

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

%if ARCH_X86_32
cextern pb_80

; Mask that clears words 0 and 2 (r0 / R0 in the lane comments) and keeps
; words 1 and 3; used by DC_COND_IDCT for its DC-only test.
wm1010: dw 0, 0xffff, 0, 0xffff
; Bias added in the DC-only shortcut: 4 << 16 in the low dword, 0 in the high.
d40000: dd 4 << 16, 0

; 23170.475006
; 22725.260826
; 21406.727617
; 19265.545870
; 16384.000000
; 12872.826198
; 8866.956905
; 4520.335430

%define C0 23170 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C1 22725 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C2 21407 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C3 19266 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C4 16383 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
%define C5 12873 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C6 8867  ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C7 4520  ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5

; Right shifts applied after the row pass and after the column pass.
; The DC-only shortcut in DC_COND_IDCT uses its own literal shift (13)
; together with d40000 and does not follow ROW_SHIFT.
%define ROW_SHIFT 11
%define COL_SHIFT 20 ; 6

; Byte offsets into this table are hard-coded throughout the macros below:
;   +0   row-pass rounding constant used by Z_COND_IDCT
;   +8   row-pass rounding constant used by DC_COND_IDCT (word 1 is 1, so the
;        low dword is (1 << 16) larger than the one at +0)
;   +16 .. +104  cosine coefficient pairs laid out for pmaddwd
coeffs:
    dw 1 << (ROW_SHIFT - 1), 0
    dw 1 << (ROW_SHIFT - 1), 0
    dw 1 << (ROW_SHIFT - 1), 1
    dw 1 << (ROW_SHIFT - 1), 0

    dw C4,  C4,  C4,  C4                ; +16
    dw C4, -C4,  C4, -C4                ; +24

    dw C2,  C6,  C2,  C6                ; +32
    dw C6, -C2,  C6, -C2                ; +40

    dw C1,  C3,  C1,  C3                ; +48
    dw C5,  C7,  C5,  C7                ; +56

    dw C3, -C7,  C3, -C7                ; +64
    dw -C1, -C5, -C1, -C5               ; +72

    dw C5, -C1,  C5, -C1                ; +80
    dw C7,  C3,  C7,  C3                ; +88

    dw C7, -C5,  C7, -C5                ; +96
    dw C3, -C1,  C3, -C1                ; +104

SECTION .text

;-----------------------------------------------------------------------
; SIMPLE_IDCT_ROW_CORE r3r1_off, dst, shift, rounder
;   Private helper shared by DC_COND_IDCT and Z_COND_IDCT.
;   In:    mm0..mm3 already hold the four input qwords (see lane comments
;          in the callers); %1 = block byte offset of the R3/R1 qword,
;          which is reloaded from [blockq + %1] half-way through;
;          %2 = destination address expression (32 bytes are written);
;          %3 = right-shift count; %4 = address expression of the 8-byte
;          rounding constant added to the two even-part C4 products.
;   Out:   [%2 + 0]  = (A0+B0, A1+B1)   [%2 + 8]  = (A2+B2, A3+B3)
;          [%2 + 16] = (A3-B3, A2-B2)   [%2 + 24] = (A1-B1, A0-B0)
;          each pair stored for both interleaved rows, low words first.
;   Clobb: mm0..mm7
;-----------------------------------------------------------------------
%macro SIMPLE_IDCT_ROW_CORE 4
    movq            mm4, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd         mm4, mm0            ; C4R4+C4R0 C4r4+C4r0
    movq            mm5, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd         mm0, mm5            ; -C4R4+C4R0 -C4r4+C4r0
    movq            mm5, [coeffs + 32]  ; C6 C2 C6 C2
    pmaddwd         mm5, mm1            ; C6R6+C2R2 C6r6+C2r2
    movq            mm6, [coeffs + 40]  ; -C2 C6 -C2 C6
    pmaddwd         mm1, mm6            ; -C2R6+C6R2 -C2r6+C6r2
    movq            mm7, [coeffs + 48]  ; C3 C1 C3 C1
    pmaddwd         mm7, mm2            ; C3R3+C1R1 C3r3+C1r1
    paddd           mm4, [%4]           ; add rounding constant
    movq            mm6, mm4            ; C4R4+C4R0 C4r4+C4r0
    paddd           mm4, mm5            ; A0 a0
    psubd           mm6, mm5            ; A3 a3
    movq            mm5, [coeffs + 56]  ; C7 C5 C7 C5
    pmaddwd         mm5, mm3            ; C7R7+C5R5 C7r7+C5r5
    paddd           mm0, [%4]           ; add rounding constant
    paddd           mm1, mm0            ; A1 a1
    paddd           mm0, mm0
    psubd           mm0, mm1            ; A2 a2
    pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1 -C7r3+C3r1
    paddd           mm7, mm5            ; B0 b0
    movq            mm5, [coeffs + 72]  ; -C5 -C1 -C5 -C1
    pmaddwd         mm5, mm3            ; -C5R7-C1R5 -C5r7-C1r5
    paddd           mm7, mm4            ; A0+B0 a0+b0
    paddd           mm4, mm4            ; 2A0 2a0
    psubd           mm4, mm7            ; A0-B0 a0-b0
    paddd           mm5, mm2            ; B1 b1
    psrad           mm7, %3
    psrad           mm4, %3
    movq            mm2, mm1            ; A1 a1
    paddd           mm1, mm5            ; A1+B1 a1+b1
    psubd           mm2, mm5            ; A1-B1 a1-b1
    psrad           mm1, %3
    psrad           mm2, %3
    packssdw        mm7, mm1            ; A1+B1 a1+b1 A0+B0 a0+b0
    packssdw        mm2, mm4            ; A0-B0 a0-b0 A1-B1 a1-b1
    movq            [%2], mm7
    movq            mm1, [blockq + %1]  ; R3 R1 r3 r1 (mm2 was consumed above)
    movq            mm4, [coeffs + 80]  ; -C1 C5 -C1 C5
    movq            [24 + %2], mm2
    pmaddwd         mm4, mm1            ; -C1R3+C5R1 -C1r3+C5r1
    movq            mm7, [coeffs + 88]  ; C3 C7 C3 C7
    pmaddwd         mm1, [coeffs + 96]  ; -C5R3+C7R1 -C5r3+C7r1
    pmaddwd         mm7, mm3            ; C3R7+C7R5 C3r7+C7r5
    movq            mm2, mm0            ; A2 a2
    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
    paddd           mm4, mm7            ; B2 b2
    paddd           mm2, mm4            ; A2+B2 a2+b2
    psubd           mm0, mm4            ; A2-B2 a2-b2
    psrad           mm2, %3
    psrad           mm0, %3
    movq            mm4, mm6            ; A3 a3
    paddd           mm3, mm1            ; B3 b3
    paddd           mm6, mm3            ; A3+B3 a3+b3
    psubd           mm4, mm3            ; A3-B3 a3-b3
    psrad           mm6, %3
    packssdw        mm2, mm6            ; A3+B3 a3+b3 A2+B2 a2+b2
    movq            [8 + %2], mm2
    psrad           mm4, %3
    packssdw        mm4, mm0            ; A2-B2 a2-b2 A3-B3 a3-b3
    movq            [16 + %2], mm4
%endmacro

;-----------------------------------------------------------------------
; DC_COND_IDCT src0, src1, src2, src3, dst, unused, shift
;   Row pass over the four qwords at [blockq + %1..%4] with a DC-only
;   shortcut.  If every word except words 0 and 2 of the first qword is
;   zero, the shortcut stores the same qword to all four destination
;   slots: the first qword shifted left by 16 per dword, plus d40000,
;   arithmetically shifted right by 13 and packed with saturation.
;   Otherwise the full row pass runs with the rounding constant at
;   [coeffs + 8].  %6 is accepted but never referenced.
;   Out:   32 bytes at [%5]
;   Clobb: mm0..mm7, t0, flags
;-----------------------------------------------------------------------
%macro DC_COND_IDCT 7
    movq            mm0, [blockq + %1]  ; R4 R0 r4 r0
    movq            mm1, [blockq + %2]  ; R6 R2 r6 r2
    movq            mm2, [blockq + %3]  ; R3 R1 r3 r1
    movq            mm3, [blockq + %4]  ; R7 R5 r7 r5
    movq            mm4, [wm1010]
    pand            mm4, mm0            ; ignore r0 / R0 for the test
    por             mm4, mm1
    por             mm4, mm2
    por             mm4, mm3
    packssdw        mm4, mm4            ; fold both dwords into 32 bits
    movd            t0d, mm4
    test            t0d, t0d
    jz              %%1                 ; only r0 / R0 may be non-zero
    SIMPLE_IDCT_ROW_CORE %3, %5, %7, coeffs + 8
    jmp             %%2
%%1:
    pslld           mm0, 16             ; move r0 / R0 into the high words
    paddd           mm0, [d40000]       ; +4 << 16 on the low dword only
    psrad           mm0, 13
    packssdw        mm0, mm0
    movq            [%5], mm0
    movq            [8 + %5], mm0
    movq            [16 + %5], mm0
    movq            [24 + %5], mm0
%%2:
%endmacro

;-----------------------------------------------------------------------
; Z_COND_IDCT src0, src1, src2, src3, dst, unused, shift, zero_label
;   Row pass over the four qwords at [blockq + %1..%4].  If all 32 input
;   bytes are zero, jumps to %8 WITHOUT writing anything to [%5]; the
;   code at that label must therefore not read this destination group.
;   Otherwise runs the full row pass with the rounding constant at
;   [coeffs].  %6 is accepted but never referenced.
;   Out:   32 bytes at [%5] (only on the fall-through path)
;   Clobb: mm0..mm7, t0, flags
;-----------------------------------------------------------------------
%macro Z_COND_IDCT 8
    movq            mm0, [blockq + %1]  ; R4 R0 r4 r0
    movq            mm1, [blockq + %2]  ; R6 R2 r6 r2
    movq            mm2, [blockq + %3]  ; R3 R1 r3 r1
    movq            mm3, [blockq + %4]  ; R7 R5 r7 r5
    movq            mm4, mm0
    por             mm4, mm1
    por             mm4, mm2
    por             mm4, mm3
    packssdw        mm4, mm4            ; fold both dwords into 32 bits
    movd            t0d, mm4
    test            t0d, t0d
    jz              %8                  ; whole group is zero
    SIMPLE_IDCT_ROW_CORE %3, %5, %7, coeffs
%endmacro

;-----------------------------------------------------------------------
; IDCT1 .. IDCT8 — column-pass variants.
;   Common parameters: %1..%4 = the four scratch-area sources in the order
;   (R4/R0, R6/R2, R3/R1, R7/R5) per the lane comments, %5 = destination
;   address expression, %6 = right-shift count.
;   IDCT1/2/3/4/5/7 take %1..%4 as memory operands, process one qword of
;   each source and store two int16 per output row with movd.
;   IDCT6/8 take %1..%4 as bare address expressions, process the qwords at
;   +0 and +8 and store four int16 per output row with movq.
;   Output rows are written at [%5 + 16*k], k = 0..7:
;     k=0 A0+B0  k=1 A1+B1  k=2 A2+B2  k=3 A3+B3
;     k=4 A3-B3  k=5 A2-B2  k=6 A1-B1  k=7 A0-B0
;   A variant only reads the sources listed in its header; the others are
;   treated as zero (see the IDCT macro for which variant is used when).
;   All variants clobber MMX registers (none preserves mm0..mm7 for callers).
;-----------------------------------------------------------------------

; IDCT1: reads %1, %2, %3, %4 (no source group assumed zero).
%macro IDCT1 6
    movq            mm0, %1             ; R4 R0 r4 r0
    movq            mm1, %2             ; R6 R2 r6 r2
    movq            mm2, %3             ; R3 R1 r3 r1
    movq            mm3, %4             ; R7 R5 r7 r5
    movq            mm4, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd         mm4, mm0            ; C4R4+C4R0 C4r4+C4r0
    movq            mm5, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd         mm0, mm5            ; -C4R4+C4R0 -C4r4+C4r0
    movq            mm5, [coeffs + 32]  ; C6 C2 C6 C2
    pmaddwd         mm5, mm1            ; C6R6+C2R2 C6r6+C2r2
    movq            mm6, [coeffs + 40]  ; -C2 C6 -C2 C6
    pmaddwd         mm1, mm6            ; -C2R6+C6R2 -C2r6+C6r2
    movq            mm6, mm4            ; C4R4+C4R0 C4r4+C4r0
    movq            mm7, [coeffs + 48]  ; C3 C1 C3 C1
    pmaddwd         mm7, mm2            ; C3R3+C1R1 C3r3+C1r1
    paddd           mm4, mm5            ; A0 a0
    psubd           mm6, mm5            ; A3 a3
    movq            mm5, mm0            ; -C4R4+C4R0 -C4r4+C4r0
    paddd           mm0, mm1            ; A1 a1
    psubd           mm5, mm1            ; A2 a2
    movq            mm1, [coeffs + 56]  ; C7 C5 C7 C5
    pmaddwd         mm1, mm3            ; C7R7+C5R5 C7r7+C5r5
    pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1 -C7r3+C3r1
    paddd           mm7, mm1            ; B0 b0
    movq            mm1, [coeffs + 72]  ; -C5 -C1 -C5 -C1
    pmaddwd         mm1, mm3            ; -C5R7-C1R5 -C5r7-C1r5
    paddd           mm7, mm4            ; A0+B0 a0+b0
    paddd           mm4, mm4            ; 2A0 2a0
    psubd           mm4, mm7            ; A0-B0 a0-b0
    paddd           mm1, mm2            ; B1 b1
    psrad           mm7, %6
    psrad           mm4, %6
    movq            mm2, mm0            ; A1 a1
    paddd           mm0, mm1            ; A1+B1 a1+b1
    psubd           mm2, mm1            ; A1-B1 a1-b1
    psrad           mm0, %6
    psrad           mm2, %6
    packssdw        mm7, mm7            ; A0+B0 a0+b0
    movd            [%5], mm7
    packssdw        mm0, mm0            ; A1+B1 a1+b1
    movd            [16 + %5], mm0
    packssdw        mm2, mm2            ; A1-B1 a1-b1
    movd            [96 + %5], mm2
    packssdw        mm4, mm4            ; A0-B0 a0-b0
    movd            [112 + %5], mm4
    movq            mm0, %3             ; R3 R1 r3 r1
    movq            mm4, [coeffs + 80]  ; -C1 C5 -C1 C5
    pmaddwd         mm4, mm0            ; -C1R3+C5R1 -C1r3+C5r1
    movq            mm7, [coeffs + 88]  ; C3 C7 C3 C7
    pmaddwd         mm0, [coeffs + 96]  ; -C5R3+C7R1 -C5r3+C7r1
    pmaddwd         mm7, mm3            ; C3R7+C7R5 C3r7+C7r5
    movq            mm2, mm5            ; A2 a2
    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
    paddd           mm4, mm7            ; B2 b2
    paddd           mm2, mm4            ; A2+B2 a2+b2
    psubd           mm5, mm4            ; A2-B2 a2-b2
    psrad           mm2, %6
    psrad           mm5, %6
    movq            mm4, mm6            ; A3 a3
    paddd           mm3, mm0            ; B3 b3
    paddd           mm6, mm3            ; A3+B3 a3+b3
    psubd           mm4, mm3            ; A3-B3 a3-b3
    psrad           mm6, %6
    psrad           mm4, %6
    packssdw        mm2, mm2            ; A2+B2 a2+b2
    packssdw        mm6, mm6            ; A3+B3 a3+b3
    movd            [32 + %5], mm2
    packssdw        mm4, mm4            ; A3-B3 a3-b3
    packssdw        mm5, mm5            ; A2-B2 a2-b2
    movd            [48 + %5], mm6
    movd            [64 + %5], mm4
    movd            [80 + %5], mm5
%endmacro

; IDCT2: reads %1, %2, %4 (%3, the R3/R1 source, is not read).
%macro IDCT2 6
    movq            mm0, %1             ; R4 R0 r4 r0
    movq            mm1, %2             ; R6 R2 r6 r2
    movq            mm3, %4             ; R7 R5 r7 r5
    movq            mm4, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd         mm4, mm0            ; C4R4+C4R0 C4r4+C4r0
    movq            mm5, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd         mm0, mm5            ; -C4R4+C4R0 -C4r4+C4r0
    movq            mm5, [coeffs + 32]  ; C6 C2 C6 C2
    pmaddwd         mm5, mm1            ; C6R6+C2R2 C6r6+C2r2
    movq            mm6, [coeffs + 40]  ; -C2 C6 -C2 C6
    pmaddwd         mm1, mm6            ; -C2R6+C6R2 -C2r6+C6r2
    movq            mm6, mm4            ; C4R4+C4R0 C4r4+C4r0
    paddd           mm4, mm5            ; A0 a0
    psubd           mm6, mm5            ; A3 a3
    movq            mm5, mm0            ; -C4R4+C4R0 -C4r4+C4r0
    paddd           mm0, mm1            ; A1 a1
    psubd           mm5, mm1            ; A2 a2
    movq            mm1, [coeffs + 56]  ; C7 C5 C7 C5
    pmaddwd         mm1, mm3            ; C7R7+C5R5 C7r7+C5r5
    movq            mm7, [coeffs + 72]  ; -C5 -C1 -C5 -C1
    pmaddwd         mm7, mm3            ; -C5R7-C1R5 -C5r7-C1r5
    paddd           mm1, mm4            ; A0+B0 a0+b0
    paddd           mm4, mm4            ; 2A0 2a0
    psubd           mm4, mm1            ; A0-B0 a0-b0
    psrad           mm1, %6
    psrad           mm4, %6
    movq            mm2, mm0            ; A1 a1
    paddd           mm0, mm7            ; A1+B1 a1+b1
    psubd           mm2, mm7            ; A1-B1 a1-b1
    psrad           mm0, %6
    psrad           mm2, %6
    packssdw        mm1, mm1            ; A0+B0 a0+b0
    movd            [%5], mm1
    packssdw        mm0, mm0            ; A1+B1 a1+b1
    movd            [16 + %5], mm0
    packssdw        mm2, mm2            ; A1-B1 a1-b1
    movd            [96 + %5], mm2
    packssdw        mm4, mm4            ; A0-B0 a0-b0
    movd            [112 + %5], mm4
    movq            mm1, [coeffs + 88]  ; C3 C7 C3 C7
    pmaddwd         mm1, mm3            ; C3R7+C7R5 C3r7+C7r5
    movq            mm2, mm5            ; A2 a2
    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
    paddd           mm2, mm1            ; A2+B2 a2+b2
    psubd           mm5, mm1            ; A2-B2 a2-b2
    psrad           mm2, %6
    psrad           mm5, %6
    movq            mm1, mm6            ; A3 a3
    paddd           mm6, mm3            ; A3+B3 a3+b3
    psubd           mm1, mm3            ; A3-B3 a3-b3
    psrad           mm6, %6
    psrad           mm1, %6
    packssdw        mm2, mm2            ; A2+B2 a2+b2
    packssdw        mm6, mm6            ; A3+B3 a3+b3
    movd            [32 + %5], mm2
    packssdw        mm1, mm1            ; A3-B3 a3-b3
    packssdw        mm5, mm5            ; A2-B2 a2-b2
    movd            [48 + %5], mm6
    movd            [64 + %5], mm1
    movd            [80 + %5], mm5
%endmacro

; IDCT3: reads %1 and %4 only (%2 and %3 are not read).
%macro IDCT3 6
    movq            mm0, %1             ; R4 R0 r4 r0
    movq            mm3, %4             ; R7 R5 r7 r5
    movq            mm4, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd         mm4, mm0            ; C4R4+C4R0 C4r4+C4r0
    movq            mm5, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd         mm0, mm5            ; -C4R4+C4R0 -C4r4+C4r0
    movq            mm6, mm4            ; C4R4+C4R0 C4r4+C4r0
    movq            mm5, mm0            ; -C4R4+C4R0 -C4r4+C4r0
    movq            mm1, [coeffs + 56]  ; C7 C5 C7 C5
    pmaddwd         mm1, mm3            ; C7R7+C5R5 C7r7+C5r5
    movq            mm7, [coeffs + 72]  ; -C5 -C1 -C5 -C1
    pmaddwd         mm7, mm3            ; -C5R7-C1R5 -C5r7-C1r5
    paddd           mm1, mm4            ; A0+B0 a0+b0
    paddd           mm4, mm4            ; 2A0 2a0
    psubd           mm4, mm1            ; A0-B0 a0-b0
    psrad           mm1, %6
    psrad           mm4, %6
    movq            mm2, mm0            ; A1 a1
    paddd           mm0, mm7            ; A1+B1 a1+b1
    psubd           mm2, mm7            ; A1-B1 a1-b1
    psrad           mm0, %6
    psrad           mm2, %6
    packssdw        mm1, mm1            ; A0+B0 a0+b0
    movd            [%5], mm1
    packssdw        mm0, mm0            ; A1+B1 a1+b1
    movd            [16 + %5], mm0
    packssdw        mm2, mm2            ; A1-B1 a1-b1
    movd            [96 + %5], mm2
    packssdw        mm4, mm4            ; A0-B0 a0-b0
    movd            [112 + %5], mm4
    movq            mm1, [coeffs + 88]  ; C3 C7 C3 C7
    pmaddwd         mm1, mm3            ; C3R7+C7R5 C3r7+C7r5
    movq            mm2, mm5            ; A2 a2
    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
    paddd           mm2, mm1            ; A2+B2 a2+b2
    psubd           mm5, mm1            ; A2-B2 a2-b2
    psrad           mm2, %6
    psrad           mm5, %6
    movq            mm1, mm6            ; A3 a3
    paddd           mm6, mm3            ; A3+B3 a3+b3
    psubd           mm1, mm3            ; A3-B3 a3-b3
    psrad           mm6, %6
    psrad           mm1, %6
    packssdw        mm2, mm2            ; A2+B2 a2+b2
    packssdw        mm6, mm6            ; A3+B3 a3+b3
    movd            [32 + %5], mm2
    packssdw        mm1, mm1            ; A3-B3 a3-b3
    packssdw        mm5, mm5            ; A2-B2 a2-b2
    movd            [48 + %5], mm6
    movd            [64 + %5], mm1
    movd            [80 + %5], mm5
%endmacro

; IDCT4: reads %1, %3, %4 (%2, the R6/R2 source, is not read).
%macro IDCT4 6
    movq            mm0, %1             ; R4 R0 r4 r0
    movq            mm2, %3             ; R3 R1 r3 r1
    movq            mm3, %4             ; R7 R5 r7 r5
    movq            mm4, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd         mm4, mm0            ; C4R4+C4R0 C4r4+C4r0
    movq            mm5, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd         mm0, mm5            ; -C4R4+C4R0 -C4r4+C4r0
    movq            mm6, mm4            ; C4R4+C4R0 C4r4+C4r0
    movq            mm7, [coeffs + 48]  ; C3 C1 C3 C1
    pmaddwd         mm7, mm2            ; C3R3+C1R1 C3r3+C1r1
    movq            mm5, mm0            ; -C4R4+C4R0 -C4r4+C4r0
    movq            mm1, [coeffs + 56]  ; C7 C5 C7 C5
    pmaddwd         mm1, mm3            ; C7R7+C5R5 C7r7+C5r5
    pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1 -C7r3+C3r1
    paddd           mm7, mm1            ; B0 b0
    movq            mm1, [coeffs + 72]  ; -C5 -C1 -C5 -C1
    pmaddwd         mm1, mm3            ; -C5R7-C1R5 -C5r7-C1r5
    paddd           mm7, mm4            ; A0+B0 a0+b0
    paddd           mm4, mm4            ; 2A0 2a0
    psubd           mm4, mm7            ; A0-B0 a0-b0
    paddd           mm1, mm2            ; B1 b1
    psrad           mm7, %6
    psrad           mm4, %6
    movq            mm2, mm0            ; A1 a1
    paddd           mm0, mm1            ; A1+B1 a1+b1
    psubd           mm2, mm1            ; A1-B1 a1-b1
    psrad           mm0, %6
    psrad           mm2, %6
    packssdw        mm7, mm7            ; A0+B0 a0+b0
    movd            [%5], mm7
    packssdw        mm0, mm0            ; A1+B1 a1+b1
    movd            [16 + %5], mm0
    packssdw        mm2, mm2            ; A1-B1 a1-b1
    movd            [96 + %5], mm2
    packssdw        mm4, mm4            ; A0-B0 a0-b0
    movd            [112 + %5], mm4
    movq            mm0, %3             ; R3 R1 r3 r1
    movq            mm4, [coeffs + 80]  ; -C1 C5 -C1 C5
    pmaddwd         mm4, mm0            ; -C1R3+C5R1 -C1r3+C5r1
    movq            mm7, [coeffs + 88]  ; C3 C7 C3 C7
    pmaddwd         mm0, [coeffs + 96]  ; -C5R3+C7R1 -C5r3+C7r1
    pmaddwd         mm7, mm3            ; C3R7+C7R5 C3r7+C7r5
    movq            mm2, mm5            ; A2 a2
    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
    paddd           mm4, mm7            ; B2 b2
    paddd           mm2, mm4            ; A2+B2 a2+b2
    psubd           mm5, mm4            ; A2-B2 a2-b2
    psrad           mm2, %6
    psrad           mm5, %6
    movq            mm4, mm6            ; A3 a3
    paddd           mm3, mm0            ; B3 b3
    paddd           mm6, mm3            ; A3+B3 a3+b3
    psubd           mm4, mm3            ; A3-B3 a3-b3
    psrad           mm6, %6
    psrad           mm4, %6
    packssdw        mm2, mm2            ; A2+B2 a2+b2
    packssdw        mm6, mm6            ; A3+B3 a3+b3
    movd            [32 + %5], mm2
    packssdw        mm4, mm4            ; A3-B3 a3-b3
    packssdw        mm5, mm5            ; A2-B2 a2-b2
    movd            [48 + %5], mm6
    movd            [64 + %5], mm4
    movd            [80 + %5], mm5
%endmacro

; IDCT5: reads %1 and %3 only (%2 and %4 are not read).
%macro IDCT5 6
    movq            mm0, %1             ; R4 R0 r4 r0
    movq            mm2, %3             ; R3 R1 r3 r1
    movq            mm4, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd         mm4, mm0            ; C4R4+C4R0 C4r4+C4r0
    movq            mm5, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd         mm0, mm5            ; -C4R4+C4R0 -C4r4+C4r0
    movq            mm6, mm4            ; C4R4+C4R0 C4r4+C4r0
    movq            mm7, [coeffs + 48]  ; C3 C1 C3 C1
    pmaddwd         mm7, mm2            ; C3R3+C1R1 C3r3+C1r1
    movq            mm5, mm0            ; -C4R4+C4R0 -C4r4+C4r0
    movq            mm3, [coeffs + 64]
    pmaddwd         mm3, mm2            ; -C7R3+C3R1 -C7r3+C3r1
    paddd           mm7, mm4            ; A0+B0 a0+b0
    paddd           mm4, mm4            ; 2A0 2a0
    psubd           mm4, mm7            ; A0-B0 a0-b0
    psrad           mm7, %6
    psrad           mm4, %6
    movq            mm1, mm0            ; A1 a1
    paddd           mm0, mm3            ; A1+B1 a1+b1
    psubd           mm1, mm3            ; A1-B1 a1-b1
    psrad           mm0, %6
    psrad           mm1, %6
    packssdw        mm7, mm7            ; A0+B0 a0+b0
    movd            [%5], mm7
    packssdw        mm0, mm0            ; A1+B1 a1+b1
    movd            [16 + %5], mm0
    packssdw        mm1, mm1            ; A1-B1 a1-b1
    movd            [96 + %5], mm1
    packssdw        mm4, mm4            ; A0-B0 a0-b0
    movd            [112 + %5], mm4
    movq            mm4, [coeffs + 80]  ; -C1 C5 -C1 C5
    pmaddwd         mm4, mm2            ; -C1R3+C5R1 -C1r3+C5r1
    pmaddwd         mm2, [coeffs + 96]  ; -C5R3+C7R1 -C5r3+C7r1
    movq            mm1, mm5            ; A2 a2
    paddd           mm1, mm4            ; A2+B2 a2+b2
    psubd           mm5, mm4            ; A2-B2 a2-b2
    psrad           mm1, %6
    psrad           mm5, %6
    movq            mm4, mm6            ; A3 a3
    paddd           mm6, mm2            ; A3+B3 a3+b3
    psubd           mm4, mm2            ; A3-B3 a3-b3
    psrad           mm6, %6
    psrad           mm4, %6
    packssdw        mm1, mm1            ; A2+B2 a2+b2
    packssdw        mm6, mm6            ; A3+B3 a3+b3
    movd            [32 + %5], mm1
    packssdw        mm4, mm4            ; A3-B3 a3-b3
    packssdw        mm5, mm5            ; A2-B2 a2-b2
    movd            [48 + %5], mm6
    movd            [64 + %5], mm4
    movd            [80 + %5], mm5
%endmacro

; IDCT6: reads [%1], [8 + %1], [%2], [8 + %2] (%3 and %4 are not read).
; %1/%2 are bare address expressions here, not bracketed operands.
; With no odd-part inputs the B terms are zero, so rows k and 7-k receive
; the same value.
%macro IDCT6 6
    movq            mm0, [%1]           ; R4 R0 r4 r0
    movq            mm1, [%2]           ; R6 R2 r6 r2
    movq            mm4, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd         mm4, mm0            ; C4R4+C4R0 C4r4+C4r0
    movq            mm5, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd         mm0, mm5            ; -C4R4+C4R0 -C4r4+C4r0
    movq            mm5, [coeffs + 32]  ; C6 C2 C6 C2
    pmaddwd         mm5, mm1            ; C6R6+C2R2 C6r6+C2r2
    movq            mm6, [coeffs + 40]  ; -C2 C6 -C2 C6
    pmaddwd         mm1, mm6            ; -C2R6+C6R2 -C2r6+C6r2
    movq            mm6, mm4            ; C4R4+C4R0 C4r4+C4r0
    paddd           mm4, mm5            ; A0 a0
    psubd           mm6, mm5            ; A3 a3
    movq            mm5, mm0            ; -C4R4+C4R0 -C4r4+C4r0
    paddd           mm0, mm1            ; A1 a1
    psubd           mm5, mm1            ; A2 a2
    movq            mm2, [8 + %1]       ; R4 R0 r4 r0
    movq            mm3, [8 + %2]       ; R6 R2 r6 r2
    movq            mm1, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd         mm1, mm2            ; C4R4+C4R0 C4r4+C4r0
    movq            mm7, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd         mm2, mm7            ; -C4R4+C4R0 -C4r4+C4r0
    movq            mm7, [coeffs + 32]  ; C6 C2 C6 C2
    pmaddwd         mm7, mm3            ; C6R6+C2R2 C6r6+C2r2
    pmaddwd         mm3, [coeffs + 40]  ; -C2R6+C6R2 -C2r6+C6r2
    paddd           mm7, mm1            ; A0 a0
    paddd           mm1, mm1            ; 2C0 2c0
    psubd           mm1, mm7            ; A3 a3
    paddd           mm3, mm2            ; A1 a1
    paddd           mm2, mm2            ; 2C1 2c1
    psubd           mm2, mm3            ; A2 a2
    psrad           mm4, %6
    psrad           mm7, %6
    psrad           mm3, %6
    packssdw        mm4, mm7            ; A0 a0
    movq            [%5], mm4
    psrad           mm0, %6
    packssdw        mm0, mm3            ; A1 a1
    movq            [16 + %5], mm0
    movq            [96 + %5], mm0
    movq            [112 + %5], mm4
    psrad           mm5, %6
    psrad           mm6, %6
    psrad           mm2, %6
    packssdw        mm5, mm2            ; A2 a2
    movq            [32 + %5], mm5
    psrad           mm1, %6
    packssdw        mm6, mm1            ; A3 a3
    movq            [48 + %5], mm6
    movq            [64 + %5], mm6
    movq            [80 + %5], mm5
%endmacro

; IDCT7: reads %1, %2, %3 (%4, the R7/R5 source, is not read).
%macro IDCT7 6
    movq            mm0, %1             ; R4 R0 r4 r0
    movq            mm1, %2             ; R6 R2 r6 r2
    movq            mm2, %3             ; R3 R1 r3 r1
    movq            mm4, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd         mm4, mm0            ; C4R4+C4R0 C4r4+C4r0
    movq            mm5, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd         mm0, mm5            ; -C4R4+C4R0 -C4r4+C4r0
    movq            mm5, [coeffs + 32]  ; C6 C2 C6 C2
    pmaddwd         mm5, mm1            ; C6R6+C2R2 C6r6+C2r2
    movq            mm6, [coeffs + 40]  ; -C2 C6 -C2 C6
    pmaddwd         mm1, mm6            ; -C2R6+C6R2 -C2r6+C6r2
    movq            mm6, mm4            ; C4R4+C4R0 C4r4+C4r0
    movq            mm7, [coeffs + 48]  ; C3 C1 C3 C1
    pmaddwd         mm7, mm2            ; C3R3+C1R1 C3r3+C1r1
    paddd           mm4, mm5            ; A0 a0
    psubd           mm6, mm5            ; A3 a3
    movq            mm5, mm0            ; -C4R4+C4R0 -C4r4+C4r0
    paddd           mm0, mm1            ; A1 a1
    psubd           mm5, mm1            ; A2 a2
    movq            mm1, [coeffs + 64]
    pmaddwd         mm1, mm2            ; -C7R3+C3R1 -C7r3+C3r1
    paddd           mm7, mm4            ; A0+B0 a0+b0
    paddd           mm4, mm4            ; 2A0 2a0
    psubd           mm4, mm7            ; A0-B0 a0-b0
    psrad           mm7, %6
    psrad           mm4, %6
    movq            mm3, mm0            ; A1 a1
    paddd           mm0, mm1            ; A1+B1 a1+b1
    psubd           mm3, mm1            ; A1-B1 a1-b1
    psrad           mm0, %6
    psrad           mm3, %6
    packssdw        mm7, mm7            ; A0+B0 a0+b0
    movd            [%5], mm7
    packssdw        mm0, mm0            ; A1+B1 a1+b1
    movd            [16 + %5], mm0
    packssdw        mm3, mm3            ; A1-B1 a1-b1
    movd            [96 + %5], mm3
    packssdw        mm4, mm4            ; A0-B0 a0-b0
    movd            [112 + %5], mm4
    movq            mm4, [coeffs + 80]  ; -C1 C5 -C1 C5
    pmaddwd         mm4, mm2            ; -C1R3+C5R1 -C1r3+C5r1
    pmaddwd         mm2, [coeffs + 96]  ; -C5R3+C7R1 -C5r3+C7r1
    movq            mm3, mm5            ; A2 a2
    paddd           mm3, mm4            ; A2+B2 a2+b2
    psubd           mm5, mm4            ; A2-B2 a2-b2
    psrad           mm3, %6
    psrad           mm5, %6
    movq            mm4, mm6            ; A3 a3
    paddd           mm6, mm2            ; A3+B3 a3+b3
    psubd           mm4, mm2            ; A3-B3 a3-b3
    psrad           mm6, %6
    packssdw        mm3, mm3            ; A2+B2 a2+b2
    movd            [32 + %5], mm3
    psrad           mm4, %6
    packssdw        mm6, mm6            ; A3+B3 a3+b3
    movd            [48 + %5], mm6
    packssdw        mm4, mm4            ; A3-B3 a3-b3
    packssdw        mm5, mm5            ; A2-B2 a2-b2
    movd            [64 + %5], mm4
    movd            [80 + %5], mm5
%endmacro

; IDCT8: reads [%1] and [8 + %1] only (%2, %3, %4 are not read).
; %1 is a bare address expression.  Only the two C4 products exist, so the
; eight output rows take one of two values:
;   rows 0, 3, 4, 7 <- C4R4+C4R0      rows 1, 2, 5, 6 <- -C4R4+C4R0
%macro IDCT8 6
    movq            mm0, [%1]           ; R4 R0 r4 r0
    movq            mm4, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd         mm4, mm0            ; C4R4+C4R0 C4r4+C4r0
    movq            mm5, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd         mm0, mm5            ; -C4R4+C4R0 -C4r4+C4r0
    psrad           mm4, %6
    psrad           mm0, %6
    movq            mm2, [8 + %1]       ; R4 R0 r4 r0
    movq            mm1, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd         mm1, mm2            ; C4R4+C4R0 C4r4+C4r0
    movq            mm7, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd         mm2, mm7            ; -C4R4+C4R0 -C4r4+C4r0
    psrad           mm1, %6
    packssdw        mm4, mm1            ; A0 a0
    movq            [%5], mm4
    psrad           mm2, %6
    packssdw        mm0, mm2            ; A1 a1
    movq            [16 + %5], mm0
    movq            [96 + %5], mm0
    movq            [112 + %5], mm4
    movq            [32 + %5], mm0
    movq            [48 + %5], mm4
    movq            [64 + %5], mm4
    movq            [80 + %5], mm0
%endmacro

;-----------------------------------------------------------------------
; IDCT — in-place transform of the 128-byte block at blockq.
;   Needs: blockq, t0, and 128 bytes of scratch at [rsp + 0 .. rsp + 127].
;   Clobb: mm0..mm7, t0, flags.  Does not execute emms.
;
;   Row pass: the block is read as four 32-byte groups g0..g3 at byte
;   offsets 0, 32, 64, 96; group gN is written to [rsp + 32*N].  g0 always
;   produces output (DC_COND_IDCT); g1..g3 are skipped when all-zero
;   (Z_COND_IDCT) and the branch taken selects a column-pass variant that
;   never reads the skipped scratch groups:
;
;     zero groups      label   column pass
;     (none)           -       IDCT1
;     g1               %%4     IDCT2
;     g1, g2           %%6     IDCT3
;     g2               %%2     IDCT4
;     g2, g3           %%3     IDCT5
;     g1, g3           %%5     IDCT6
;     g3               %%1     IDCT7
;     g1, g2, g3       %%7     IDCT8
;-----------------------------------------------------------------------
%macro IDCT 0
    DC_COND_IDCT  0,   8,  16,  24, rsp +  0, null, ROW_SHIFT
    Z_COND_IDCT  32,  40,  48,  56, rsp + 32, null, ROW_SHIFT, %%4
    Z_COND_IDCT  64,  72,  80,  88, rsp + 64, null, ROW_SHIFT, %%2
    Z_COND_IDCT  96, 104, 112, 120, rsp + 96, null, ROW_SHIFT, %%1

    IDCT1 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, COL_SHIFT
    IDCT1 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, COL_SHIFT
    IDCT1 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, COL_SHIFT
    IDCT1 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, COL_SHIFT
    jmp %%9

    ALIGN 16
    %%4:
    Z_COND_IDCT 64,  72,  80,  88, rsp + 64, null, ROW_SHIFT, %%6
    Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, ROW_SHIFT, %%5

    IDCT2 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, COL_SHIFT
    IDCT2 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, COL_SHIFT
    IDCT2 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, COL_SHIFT
    IDCT2 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, COL_SHIFT
    jmp %%9

    ALIGN 16
    %%6:
    Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, ROW_SHIFT, %%7

    IDCT3 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, COL_SHIFT
    IDCT3 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, COL_SHIFT
    IDCT3 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, COL_SHIFT
    IDCT3 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, COL_SHIFT
    jmp %%9

    ALIGN 16
    %%2:
    Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, ROW_SHIFT, %%3

    IDCT4 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, COL_SHIFT
    IDCT4 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, COL_SHIFT
    IDCT4 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, COL_SHIFT
    IDCT4 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, COL_SHIFT
    jmp %%9

    ALIGN 16
    %%3:

    IDCT5 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, COL_SHIFT
    IDCT5 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, COL_SHIFT
    IDCT5 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, COL_SHIFT
    IDCT5 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, COL_SHIFT
    jmp %%9

    ALIGN 16
    %%5:

    IDCT6 rsp +  0, rsp + 64, rsp + 32, rsp +  96, blockq + 0, COL_SHIFT
    IDCT6 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq + 8, COL_SHIFT
    jmp %%9

    ALIGN 16
    %%1:

    IDCT7 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, COL_SHIFT
    IDCT7 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, COL_SHIFT
    IDCT7 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, COL_SHIFT
    IDCT7 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, COL_SHIFT
    jmp %%9

    ALIGN 16
    %%7:

    IDCT8 rsp +  0, rsp + 64, rsp + 32, rsp +  96, blockq + 0, COL_SHIFT
    IDCT8 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq + 8, COL_SHIFT

    %%9:
%endmacro

;-----------------------------------------------------------------------
; PUT_PIXELS_CLAMPED_HALF block_offset
;   Loads four vectors of int16 from [blockq + %1 + mmsize*{0..3}], packs
;   them to unsigned bytes with saturation (packuswb) and stores 8 bytes to
;   each of [pixelsq], [pixelsq + lsizeq], [pixelsq + 2*lsizeq] and
;   [pixelsq + lsize3q].
;   Needs: lsize3q set by the caller (simple_idct_put loads lsizeq*3).
;   Clobb: m0, m1
;   NOTE(review): the callers' 0 / 64 offsets imply mmsize == 16 here, and
;   mova / the packuswb memory operand presumably need blockq 16-byte
;   aligned — confirm against x86inc and the callers.
;-----------------------------------------------------------------------
%macro PUT_PIXELS_CLAMPED_HALF 1
    mova            m0, [blockq+mmsize*0+%1]
    mova            m1, [blockq+mmsize*2+%1]
    packuswb        m0, [blockq+mmsize*1+%1]
    packuswb        m1, [blockq+mmsize*3+%1]
    movq            [pixelsq], m0
    movhps          [lsizeq+pixelsq], m0
    movq            [2*lsizeq+pixelsq], m1
    movhps          [lsize3q+pixelsq], m1
%endmacro

;-----------------------------------------------------------------------
; ADD_PIXELS_CLAMPED block_offset
;   Adds two vectors of int16 from [blockq + %1 + mmsize*{0,1}] to the 8
;   bytes at [pixelsq] and at [pixelsq + lsizeq] (zero-extended to words),
;   using signed-saturating adds, then packs to unsigned bytes with
;   saturation and stores the two 8-byte rows back.
;   Needs: m4 == 0 (the caller clears it with pxor before the first use).
;   Clobb: m0..m3
;-----------------------------------------------------------------------
%macro ADD_PIXELS_CLAMPED 1
    mova            m0, [blockq+mmsize*0+%1]
    mova            m1, [blockq+mmsize*1+%1]
    movq            m2, [pixelsq]
    movq            m3, [pixelsq+lsizeq]
    punpcklbw       m2, m4              ; bytes -> words (m4 is zero)
    punpcklbw       m3, m4
    paddsw          m0, m2
    paddsw          m1, m3
    packuswb        m0, m1
    movq            [pixelsq], m0
    movhps          [pixelsq+lsizeq], m0
%endmacro

INIT_MMX mmx

;-----------------------------------------------------------------------
; simple_idct(block)
;   In-place IDCT of the 128-byte block; clears the MMX state with emms
;   before returning.
;   cglobal arguments (per the x86inc convention — confirm against
;   x86inc.asm): 1 argument, 2 GPRs, 8 vector registers, 128 bytes of
;   stack (the scratch area used by IDCT), names block and t0.
;-----------------------------------------------------------------------
cglobal simple_idct, 1, 2, 8, 128, block, t0
    IDCT
    emms
    RET

INIT_XMM sse2

;-----------------------------------------------------------------------
; simple_idct_put(pixels, lsize, block)
;   Runs IDCT on block, then writes the clamped result to eight pixel rows
;   starting at pixels with a stride of lsize bytes (two calls of four
;   rows each).
;   NOTE(review): IDCT uses MMX registers and this function contains no
;   emms; presumably the caller is responsible for it — confirm.
;-----------------------------------------------------------------------
cglobal simple_idct_put, 3, 5, 8, 128, pixels, lsize, block, lsize3, t0
    IDCT
    lea             lsize3q, [lsizeq*3]
    PUT_PIXELS_CLAMPED_HALF 0
    lea             pixelsq, [pixelsq+lsizeq*4]
    PUT_PIXELS_CLAMPED_HALF 64
    RET

;-----------------------------------------------------------------------
; simple_idct_add(pixels, lsize, block)
;   Runs IDCT on block, then adds the result to eight pixel rows starting
;   at pixels with a stride of lsize bytes (four calls of two rows each),
;   clamping to unsigned bytes.
;   NOTE(review): IDCT uses MMX registers and this function contains no
;   emms; presumably the caller is responsible for it — confirm.
;-----------------------------------------------------------------------
cglobal simple_idct_add, 3, 4, 8, 128, pixels, lsize, block, t0
    IDCT
    pxor            m4, m4              ; zero for punpcklbw in ADD_PIXELS_CLAMPED
    ADD_PIXELS_CLAMPED 0
    lea             pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 32
    lea             pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 64
    lea             pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 96
    RET
%endif