fdct.c (11407B)
1 /* 2 * SIMD-optimized forward DCT 3 * The gcc porting is Copyright (c) 2001 Fabrice Bellard. 4 * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> 5 * SSE2 optimization is Copyright (c) 2004 Denes Balatoni. 6 * 7 * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT 8 * 9 * Intel Application Note AP-922 - fast, precise implementation of DCT 10 * http://developer.intel.com/vtune/cbts/appnotes.htm 11 * 12 * Also of inspiration: 13 * a page about fdct at http://www.geocities.com/ssavekar/dct.htm 14 * Skal's fdct at http://skal.planet-d.net/coding/dct.html 15 * 16 * This file is part of FFmpeg. 17 * 18 * FFmpeg is free software; you can redistribute it and/or 19 * modify it under the terms of the GNU Lesser General Public 20 * License as published by the Free Software Foundation; either 21 * version 2.1 of the License, or (at your option) any later version. 22 * 23 * FFmpeg is distributed in the hope that it will be useful, 24 * but WITHOUT ANY WARRANTY; without even the implied warranty of 25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 26 * Lesser General Public License for more details. 27 * 28 * You should have received a copy of the GNU Lesser General Public 29 * License along with FFmpeg; if not, write to the Free Software 30 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 31 */ 32 33 #include "config.h" 34 #include "libavutil/attributes.h" 35 #include "libavutil/macros.h" 36 #include "libavutil/mem_internal.h" 37 #include "libavutil/x86/asm.h" 38 #include "fdct.h" 39 40 #if HAVE_SSE2_INLINE 41 42 ////////////////////////////////////////////////////////////////////// 43 // 44 // constants for the forward DCT 45 // ----------------------------- 46 // 47 // Be sure to check that your compiler is aligning all constants to QWORD 48 // (8-byte) memory boundaries! Otherwise the unaligned memory access will 49 // severely stall MMX execution. 50 // 51 ////////////////////////////////////////////////////////////////////// 52 53 #define BITS_FRW_ACC 3 //; 2 or 3 for accuracy 54 #define SHIFT_FRW_COL BITS_FRW_ACC 55 #define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3) 56 #define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1)) 57 //#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1)) 58 59 #define X8(x) x,x,x,x,x,x,x,x 60 61 //concatenated table, for forward DCT transformation 62 DECLARE_ALIGNED(16, static const int16_t, fdct_tg_all_16)[24] = { 63 X8(13036), // tg * (2<<16) + 0.5 64 X8(27146), // tg * (2<<16) + 0.5 65 X8(-21746) // tg * (2<<16) + 0.5 66 }; 67 68 DECLARE_ALIGNED(16, static const int16_t, ocos_4_16)[8] = { 69 X8(23170) //cos * (2<<15) + 0.5 70 }; 71 72 DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) }; 73 74 static const struct 75 { 76 DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4]; 77 } fdct_r_row_sse2 = 78 {{ 79 RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW 80 }}; 81 //DECLARE_ALIGNED(16, static const long, fdct_r_row_sse2)[4] = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW}; 82 83 static const struct 84 { 85 DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256]; 86 } tab_frw_01234567_sse2 = 87 {{ 88 //DECLARE_ALIGNED(16, static const int16_t, tab_frw_01234567_sse2)[] = { // forward_dct coeff table 89 #define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \ 90 C4, C4, C5, C7, C2, C6, C3, -C7, \ 91 -C4, C4, C7, C3, C6, -C2, C7, -C5, \ 92 C4, -C4, C5, -C1, C2, -C6, C3, -C1, 93 // c1..c7 * cos(pi/4) * 2^15 94 #define C1 22725 95 #define C2 21407 96 #define C3 19266 97 #define C4 16384 98 #define C5 12873 99 #define C6 8867 100 #define C7 4520 101 TABLE_SSE2 102 103 #undef C1 104 #undef C2 105 #undef C3 106 #undef C4 107 #undef C5 108 #undef C6 109 #undef C7 110 #define C1 31521 111 #define C2 29692 112 #define C3 26722 113 #define C4 22725 114 #define C5 17855 115 #define C6 12299 116 #define C7 6270 117 TABLE_SSE2 118 119 #undef C1 120 #undef C2 121 #undef C3 122 #undef C4 123 #undef C5 124 #undef C6 125 #undef C7 126 #define C1 29692 127 #define C2 27969 128 #define C3 25172 129 #define C4 21407 130 #define C5 16819 131 #define C6 11585 132 #define C7 5906 133 TABLE_SSE2 134 135 #undef C1 136 #undef C2 137 #undef C3 138 #undef C4 139 #undef C5 140 #undef C6 141 #undef C7 142 #define C1 26722 143 #define C2 25172 144 #define C3 22654 145 #define C4 19266 146 #define C5 15137 147 #define C6 10426 148 #define C7 5315 149 TABLE_SSE2 150 151 #undef C1 152 #undef C2 153 #undef C3 154 #undef C4 155 #undef C5 156 #undef C6 157 #undef C7 158 #define C1 22725 159 #define C2 21407 160 #define C3 19266 161 #define C4 16384 162 #define C5 12873 163 #define C6 8867 164 #define C7 4520 165 TABLE_SSE2 166 167 #undef C1 168 #undef C2 169 #undef C3 170 #undef C4 171 #undef C5 172 #undef C6 173 #undef C7 174 #define C1 26722 175 #define C2 25172 176 #define C3 22654 177 #define C4 19266 178 #define C5 15137 179 #define C6 10426 180 #define C7 5315 181 TABLE_SSE2 182 183 #undef C1 184 #undef C2 185 #undef C3 186 #undef C4 187 #undef C5 188 #undef C6 189 #undef C7 190 #define C1 29692 191 #define C2 27969 192 #define C3 25172 193 #define C4 21407 194 #define C5 16819 195 #define C6 11585 196 #define C7 5906 197 TABLE_SSE2 198 199 #undef C1 200 #undef C2 201 #undef C3 202 #undef C4 203 #undef C5 204 #undef C6 205 #undef C7 206 #define C1 31521 207 #define C2 29692 208 #define C3 26722 209 #define C4 22725 210 #define C5 17855 211 #define C6 12299 212 #define C7 6270 213 TABLE_SSE2 214 }}; 215 216 #define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long 217 218 #define FDCT_COL(cpu, mm, mov)\ 219 static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\ 220 {\ 221 __asm__ volatile (\ 222 #mov" 16(%0), %%"#mm"0 \n\t" \ 223 #mov" 96(%0), %%"#mm"1 \n\t" \ 224 #mov" %%"#mm"0, %%"#mm"2 \n\t" \ 225 #mov" 32(%0), %%"#mm"3 \n\t" \ 226 "paddsw %%"#mm"1, %%"#mm"0 \n\t" \ 227 #mov" 80(%0), %%"#mm"4 \n\t" \ 228 "psllw $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \ 229 #mov" (%0), %%"#mm"5 \n\t" \ 230 "paddsw %%"#mm"3, %%"#mm"4 \n\t" \ 231 "paddsw 112(%0), %%"#mm"5 \n\t" \ 232 "psllw $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \ 233 #mov" %%"#mm"0, %%"#mm"6 \n\t" \ 234 "psubsw %%"#mm"1, %%"#mm"2 \n\t" \ 235 #mov" 16(%1), %%"#mm"1 \n\t" \ 236 "psubsw %%"#mm"4, %%"#mm"0 \n\t" \ 237 #mov" 48(%0), %%"#mm"7 \n\t" \ 238 "pmulhw %%"#mm"0, %%"#mm"1 \n\t" \ 239 "paddsw 64(%0), %%"#mm"7 \n\t" \ 240 "psllw $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \ 241 "paddsw %%"#mm"4, %%"#mm"6 \n\t" \ 242 "psllw $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \ 243 #mov" %%"#mm"5, %%"#mm"4 \n\t" \ 244 "psubsw %%"#mm"7, %%"#mm"5 \n\t" \ 245 "paddsw %%"#mm"5, %%"#mm"1 \n\t" \ 246 "paddsw %%"#mm"7, %%"#mm"4 \n\t" \ 247 "por (%2), %%"#mm"1 \n\t" \ 248 "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \ 249 "pmulhw 16(%1), %%"#mm"5 \n\t" \ 250 #mov" %%"#mm"4, %%"#mm"7 \n\t" \ 251 "psubsw 80(%0), %%"#mm"3 \n\t" \ 252 "psubsw %%"#mm"6, %%"#mm"4 \n\t" \ 253 #mov" %%"#mm"1, 32(%3) \n\t" \ 254 "paddsw %%"#mm"6, %%"#mm"7 \n\t" \ 255 #mov" 48(%0), %%"#mm"1 \n\t" \ 256 "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \ 257 "psubsw 64(%0), %%"#mm"1 \n\t" \ 258 #mov" %%"#mm"2, %%"#mm"6 \n\t" \ 259 #mov" %%"#mm"4, 64(%3) \n\t" \ 260 "paddsw %%"#mm"3, %%"#mm"2 \n\t" \ 261 "pmulhw (%4), %%"#mm"2 \n\t" \ 262 "psubsw %%"#mm"3, %%"#mm"6 \n\t" \ 263 "pmulhw (%4), %%"#mm"6 \n\t" \ 264 "psubsw %%"#mm"0, %%"#mm"5 \n\t" \ 265 "por (%2), %%"#mm"5 \n\t" \ 266 "psllw $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \ 267 "por (%2), %%"#mm"2 \n\t" \ 268 #mov" %%"#mm"1, %%"#mm"4 \n\t" \ 269 #mov" (%0), %%"#mm"3 \n\t" \ 270 "paddsw %%"#mm"6, %%"#mm"1 \n\t" \ 271 "psubsw 112(%0), %%"#mm"3 \n\t" \ 272 "psubsw %%"#mm"6, %%"#mm"4 \n\t" \ 273 #mov" (%1), %%"#mm"0 \n\t" \ 274 "psllw $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \ 275 #mov" 32(%1), %%"#mm"6 \n\t" \ 276 "pmulhw %%"#mm"1, %%"#mm"0 \n\t" \ 277 #mov" %%"#mm"7, (%3) \n\t" \ 278 "pmulhw %%"#mm"4, %%"#mm"6 \n\t" \ 279 #mov" %%"#mm"5, 96(%3) \n\t" \ 280 #mov" %%"#mm"3, %%"#mm"7 \n\t" \ 281 #mov" 32(%1), %%"#mm"5 \n\t" \ 282 "psubsw %%"#mm"2, %%"#mm"7 \n\t" \ 283 "paddsw %%"#mm"2, %%"#mm"3 \n\t" \ 284 "pmulhw %%"#mm"7, %%"#mm"5 \n\t" \ 285 "paddsw %%"#mm"3, %%"#mm"0 \n\t" \ 286 "paddsw %%"#mm"4, %%"#mm"6 \n\t" \ 287 "pmulhw (%1), %%"#mm"3 \n\t" \ 288 "por (%2), %%"#mm"0 \n\t" \ 289 "paddsw %%"#mm"7, %%"#mm"5 \n\t" \ 290 "psubsw %%"#mm"6, %%"#mm"7 \n\t" \ 291 #mov" %%"#mm"0, 16(%3) \n\t" \ 292 "paddsw %%"#mm"4, %%"#mm"5 \n\t" \ 293 #mov" %%"#mm"7, 48(%3) \n\t" \ 294 "psubsw %%"#mm"1, %%"#mm"3 \n\t" \ 295 #mov" %%"#mm"5, 80(%3) \n\t" \ 296 #mov" %%"#mm"3, 112(%3) \n\t" \ 297 : \ 298 : "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \ 299 "r" (out + offset), "r" (ocos_4_16)); \ 300 } 301 302 FDCT_COL(sse2, xmm, movdqa) 303 304 static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out) 305 { 306 __asm__ volatile( 307 #define FDCT_ROW_SSE2_H1(i,t) \ 308 "movq " #i "(%0), %%xmm2 \n\t" \ 309 "movq " #i "+8(%0), %%xmm0 \n\t" \ 310 "movdqa " #t "+32(%1), %%xmm3 \n\t" \ 311 "movdqa " #t "+48(%1), %%xmm7 \n\t" \ 312 "movdqa " #t "(%1), %%xmm4 \n\t" \ 313 "movdqa " #t "+16(%1), %%xmm5 \n\t" 314 315 #define FDCT_ROW_SSE2_H2(i,t) \ 316 "movq " #i "(%0), %%xmm2 \n\t" \ 317 "movq " #i "+8(%0), %%xmm0 \n\t" \ 318 "movdqa " #t "+32(%1), %%xmm3 \n\t" \ 319 "movdqa " #t "+48(%1), %%xmm7 \n\t" 320 321 #define FDCT_ROW_SSE2(i) \ 322 "movq %%xmm2, %%xmm1 \n\t" \ 323 "pshuflw $27, %%xmm0, %%xmm0 \n\t" \ 324 "paddsw %%xmm0, %%xmm1 \n\t" \ 325 "psubsw %%xmm0, %%xmm2 \n\t" \ 326 "punpckldq %%xmm2, %%xmm1 \n\t" \ 327 "pshufd $78, %%xmm1, %%xmm2 \n\t" \ 328 "pmaddwd %%xmm2, %%xmm3 \n\t" \ 329 "pmaddwd %%xmm1, %%xmm7 \n\t" \ 330 "pmaddwd %%xmm5, %%xmm2 \n\t" \ 331 "pmaddwd %%xmm4, %%xmm1 \n\t" \ 332 "paddd %%xmm7, %%xmm3 \n\t" \ 333 "paddd %%xmm2, %%xmm1 \n\t" \ 334 "paddd %%xmm6, %%xmm3 \n\t" \ 335 "paddd %%xmm6, %%xmm1 \n\t" \ 336 "psrad %3, %%xmm3 \n\t" \ 337 "psrad %3, %%xmm1 \n\t" \ 338 "packssdw %%xmm3, %%xmm1 \n\t" \ 339 "movdqa %%xmm1, " #i "(%4) \n\t" 340 341 "movdqa (%2), %%xmm6 \n\t" 342 FDCT_ROW_SSE2_H1(0,0) 343 FDCT_ROW_SSE2(0) 344 FDCT_ROW_SSE2_H2(64,0) 345 FDCT_ROW_SSE2(64) 346 347 FDCT_ROW_SSE2_H1(16,64) 348 FDCT_ROW_SSE2(16) 349 FDCT_ROW_SSE2_H2(112,64) 350 FDCT_ROW_SSE2(112) 351 352 FDCT_ROW_SSE2_H1(32,128) 353 FDCT_ROW_SSE2(32) 354 FDCT_ROW_SSE2_H2(96,128) 355 FDCT_ROW_SSE2(96) 356 357 FDCT_ROW_SSE2_H1(48,192) 358 FDCT_ROW_SSE2(48) 359 FDCT_ROW_SSE2_H2(80,192) 360 FDCT_ROW_SSE2(80) 361 : 362 : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2), 363 "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out) 364 XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", 365 "%xmm4", "%xmm5", "%xmm6", "%xmm7") 366 ); 367 } 368 369 void ff_fdct_sse2(int16_t *block) 370 { 371 DECLARE_ALIGNED(16, int64_t, align_tmp)[16]; 372 int16_t * const block1= (int16_t*)align_tmp; 373 374 fdct_col_sse2(block, block1, 0); 375 fdct_row_sse2(block1, block); 376 } 377 378 #endif /* HAVE_SSE2_INLINE */