tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

fdct.c (11407B)


      1 /*
      2 * SIMD-optimized forward DCT
      3 * The gcc porting is Copyright (c) 2001 Fabrice Bellard.
      4 * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
      5 * SSE2 optimization is Copyright (c) 2004 Denes Balatoni.
      6 *
      7 * from  fdctam32.c - AP922 MMX(3D-Now) forward-DCT
      8 *
      9 *  Intel Application Note AP-922 - fast, precise implementation of DCT
     10 *        http://developer.intel.com/vtune/cbts/appnotes.htm
     11 *
     12 * Also of inspiration:
     13 * a page about fdct at http://www.geocities.com/ssavekar/dct.htm
     14 * Skal's fdct at http://skal.planet-d.net/coding/dct.html
     15 *
     16 * This file is part of FFmpeg.
     17 *
     18 * FFmpeg is free software; you can redistribute it and/or
     19 * modify it under the terms of the GNU Lesser General Public
     20 * License as published by the Free Software Foundation; either
     21 * version 2.1 of the License, or (at your option) any later version.
     22 *
     23 * FFmpeg is distributed in the hope that it will be useful,
     24 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     26 * Lesser General Public License for more details.
     27 *
     28 * You should have received a copy of the GNU Lesser General Public
     29 * License along with FFmpeg; if not, write to the Free Software
     30 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     31 */
     32 
     33 #include "config.h"
     34 #include "libavutil/attributes.h"
     35 #include "libavutil/macros.h"
     36 #include "libavutil/mem_internal.h"
     37 #include "libavutil/x86/asm.h"
     38 #include "fdct.h"
     39 
     40 #if HAVE_SSE2_INLINE
     41 
     42 //////////////////////////////////////////////////////////////////////
     43 //
     44 // constants for the forward DCT
     45 // -----------------------------
     46 //
     47 // Be sure to check that your compiler is aligning all constants to 16-byte
     48 // memory boundaries (hence DECLARE_ALIGNED(16, ...)): the SSE2 code reads them
     49 // with movdqa and as direct memory operands, which require that alignment.
     50 //
     51 //////////////////////////////////////////////////////////////////////
     52 
        // Fixed-point precision of the two passes:
        //  - SHIFT_FRW_COL is the psllw left shift applied to the inputs of the
        //    column pass (fdct_col_*),
        //  - SHIFT_FRW_ROW is the psrad right shift applied to the 32-bit sums of
        //    the row pass (fdct_row_sse2); it evaluates to 17 for BITS_FRW_ACC == 3,
        //  - RND_FRW_ROW is half of (1 << SHIFT_FRW_ROW); the row pass adds it to
        //    every sum before that shift, so the shift rounds instead of truncating.
     53 #define BITS_FRW_ACC   3 //; 2 or 3 for accuracy
     54 #define SHIFT_FRW_COL  BITS_FRW_ACC
     55 #define SHIFT_FRW_ROW  (BITS_FRW_ACC + 17 - 3)
     56 #define RND_FRW_ROW    (1 << (SHIFT_FRW_ROW-1))
     57 //#define RND_FRW_COL    (1 << (SHIFT_FRW_COL-1))
     58 
        // Expands to eight comma-separated copies of x, i.e. one value for each
        // 16-bit lane of a 128-bit vector constant.
     59 #define X8(x) x,x,x,x,x,x,x,x
     60 
     61 //concatenated table, for forward DCT transformation
        // Three 8-lane vectors of tangent constants, read by fdct_col_* at byte
        // offsets 0, 16 and 32 and used as pmulhw multipliers (pmulhw keeps the high
        // 16 bits of the product, hence the 1<<16 scale).
        // tan(3*pi/16) * (1<<16) does not fit in an int16_t, so the third entry
        // stores the value minus 1.0; fdct_col_* adds the multiplied operand back
        // with paddsw after each pmulhw by this entry.
     62 DECLARE_ALIGNED(16, static const int16_t, fdct_tg_all_16)[24] = {
     63    X8(13036),  // tan(1*pi/16) * (1<<16) + 0.5
     64    X8(27146),  // tan(2*pi/16) * (1<<16) + 0.5
     65    X8(-21746)  // tan(3*pi/16) * (1<<16) + 0.5 - (1<<16)
     66 };
     67 
        // cos(pi/4) in all eight lanes, scaled by 1<<15. fdct_col_* uses it as a
        // pmulhw multiplier; the operands of those multiplies are shifted left by
        // SHIFT_FRW_COL+1 (one extra bit) beforehand.
     68 DECLARE_ALIGNED(16, static const int16_t, ocos_4_16)[8] = {
     69    X8(23170)   //cos(pi/4) * (1<<15) + 0.5
     70 };
     71 
        // The value 1 in all eight 16-bit lanes. fdct_col_* ORs it (por) into
        // several of its results, which forces their least significant bit to 1;
        // judging by the name this is a rounding correction (assumption).
     72 DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) };
     73 
        // Rounding constant of the row pass: RND_FRW_ROW in each of the four 32-bit
        // lanes. fdct_row_sse2 loads it once into xmm6 and adds it to every pmaddwd
        // sum before the psrad by SHIFT_FRW_ROW.
        // The array is wrapped in an anonymous struct of the same name; the plain
        // array declaration it replaced is kept commented out below.
     74 static const struct
     75 {
     76 DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4];
     77 } fdct_r_row_sse2 =
     78 {{
     79 RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
     80 }};
     81 //DECLARE_ALIGNED(16, static const long, fdct_r_row_sse2)[4] = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};
     82 
        // Coefficient table of the row pass (pmaddwd multipliers).
        //
        // Layout: eight groups of 32 int16_t (64 bytes each, 256 entries in total).
        // Every group is one expansion of TABLE_SSE2, which arranges the C1..C7
        // values that are #defined immediately before it into four 8-lane vectors;
        // C1..C7 are then #undef'ed and redefined for the next group.
        // TABLE_SSE2 ends with a comma, so the expansions can simply follow each
        // other inside the initializer.
        //
        // Groups 5, 6, 7 and 8 repeat the values of groups 1, 4, 3 and 2.
        // fdct_row_sse2 in this file only addresses byte offsets 0..255, i.e. the
        // first four groups, and uses each of them for two rows.
     83 static const struct
     84 {
     85 DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256];
     86 } tab_frw_01234567_sse2 =
     87 {{
     88 //DECLARE_ALIGNED(16, static const int16_t, tab_frw_01234567_sse2)[] = {  // forward_dct coeff table
     89 #define TABLE_SSE2 C4,  C4,  C1,  C3, -C6, -C2, -C1, -C5, \
     90                   C4,  C4,  C5,  C7,  C2,  C6,  C3, -C7, \
     91                  -C4,  C4,  C7,  C3,  C6, -C2,  C7, -C5, \
     92                   C4, -C4,  C5, -C1,  C2, -C6,  C3, -C1,
     93 // c1..c7 * cos(pi/4) * 2^15
        // group 1 (byte offset 0)
     94 #define C1 22725
     95 #define C2 21407
     96 #define C3 19266
     97 #define C4 16384
     98 #define C5 12873
     99 #define C6 8867
    100 #define C7 4520
    101 TABLE_SSE2
    102 
        // group 2 (byte offset 64)
    103 #undef C1
    104 #undef C2
    105 #undef C3
    106 #undef C4
    107 #undef C5
    108 #undef C6
    109 #undef C7
    110 #define C1 31521
    111 #define C2 29692
    112 #define C3 26722
    113 #define C4 22725
    114 #define C5 17855
    115 #define C6 12299
    116 #define C7 6270
    117 TABLE_SSE2
    118 
        // group 3 (byte offset 128)
    119 #undef C1
    120 #undef C2
    121 #undef C3
    122 #undef C4
    123 #undef C5
    124 #undef C6
    125 #undef C7
    126 #define C1 29692
    127 #define C2 27969
    128 #define C3 25172
    129 #define C4 21407
    130 #define C5 16819
    131 #define C6 11585
    132 #define C7 5906
    133 TABLE_SSE2
    134 
        // group 4 (byte offset 192)
    135 #undef C1
    136 #undef C2
    137 #undef C3
    138 #undef C4
    139 #undef C5
    140 #undef C6
    141 #undef C7
    142 #define C1 26722
    143 #define C2 25172
    144 #define C3 22654
    145 #define C4 19266
    146 #define C5 15137
    147 #define C6 10426
    148 #define C7 5315
    149 TABLE_SSE2
    150 
        // group 5 (byte offset 256), same values as group 1
    151 #undef C1
    152 #undef C2
    153 #undef C3
    154 #undef C4
    155 #undef C5
    156 #undef C6
    157 #undef C7
    158 #define C1 22725
    159 #define C2 21407
    160 #define C3 19266
    161 #define C4 16384
    162 #define C5 12873
    163 #define C6 8867
    164 #define C7 4520
    165 TABLE_SSE2
    166 
        // group 6 (byte offset 320), same values as group 4
    167 #undef C1
    168 #undef C2
    169 #undef C3
    170 #undef C4
    171 #undef C5
    172 #undef C6
    173 #undef C7
    174 #define C1 26722
    175 #define C2 25172
    176 #define C3 22654
    177 #define C4 19266
    178 #define C5 15137
    179 #define C6 10426
    180 #define C7 5315
    181 TABLE_SSE2
    182 
        // group 7 (byte offset 384), same values as group 3
    183 #undef C1
    184 #undef C2
    185 #undef C3
    186 #undef C4
    187 #undef C5
    188 #undef C6
    189 #undef C7
    190 #define C1 29692
    191 #define C2 27969
    192 #define C3 25172
    193 #define C4 21407
    194 #define C5 16819
    195 #define C6 11585
    196 #define C7 5906
    197 TABLE_SSE2
    198 
        // group 8 (byte offset 448), same values as group 2
    199 #undef C1
    200 #undef C2
    201 #undef C3
    202 #undef C4
    203 #undef C5
    204 #undef C6
    205 #undef C7
    206 #define C1 31521
    207 #define C2 29692
    208 #define C3 26722
    209 #define C4 22725
    210 #define C5 17855
    211 #define C6 12299
    212 #define C7 6270
    213 TABLE_SSE2
    214 }};
    215 
        // Short alias used to paste the value of a macro (here SHIFT_FRW_COL) into
        // the asm text as a string literal. AV_TOSTRING is not defined in this file;
        // presumably it is the expand-then-stringify helper from
        // libavutil/macros.h (confirm).
    216 #define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long
    217 
        // FDCT_COL(cpu, mm, mov) defines
        //     static av_always_inline void fdct_col_<cpu>(const int16_t *in,
        //                                                 int16_t *out, int offset)
        // which is the column pass of the forward DCT.
        //
        // Macro parameters:
        //   cpu - suffix of the generated function name,
        //   mm  - register name prefix used in the asm text (register <mm>0..<mm>7),
        //   mov - mnemonic used for full-register loads, stores and copies.
        // It is instantiated once below as FDCT_COL(sse2, xmm, movdqa).
        //
        // Function parameters:
        //   in     - source coefficients; rows are read at byte offsets
        //            0, 16, ..., 112 from (in + offset),
        //   out    - destination; rows are written at byte offsets 0, 16, ..., 112
        //            from (out + offset),
        //   offset - element offset (in int16_t units) added to both pointers.
        // With the movdqa/xmm instantiation each access covers 16 bytes, i.e. eight
        // int16_t lanes, so all eight columns are transformed at once, and both
        // (in + offset) and (out + offset) have to be 16-byte aligned.
        //
        // asm operands:
        //   %0 = in + offset         %1 = fdct_tg_all_16     %2 = fdct_one_corr
        //   %3 = out + offset        %4 = ocos_4_16
        //
        // The inputs are scaled up with psllw by SHIFT_FRW_COL (SHIFT_FRW_COL+1 in
        // front of the ocos_4_16 multiplies); additions and subtractions saturate
        // (paddsw/psubsw). Loads, shifts and arithmetic on independent values are
        // interleaved, presumably for instruction scheduling, so the statement
        // order should not be changed casually.
        //
        // NOTE(review): the asm statement declares no clobbers although it
        // overwrites <mm>0..<mm>7 and stores through %3, whereas fdct_row_sse2
        // declares its XMM clobbers via XMM_CLOBBERS_ONLY. Check against the
        // compiler's extended-asm rules whether register and "memory" clobbers
        // should be added here.
    218 #define FDCT_COL(cpu, mm, mov)\
    219 static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\
    220 {\
    221    __asm__ volatile (\
    222        #mov"      16(%0),  %%"#mm"0 \n\t" \
    223        #mov"      96(%0),  %%"#mm"1 \n\t" \
    224        #mov"    %%"#mm"0,  %%"#mm"2 \n\t" \
    225        #mov"      32(%0),  %%"#mm"3 \n\t" \
    226        "paddsw  %%"#mm"1,  %%"#mm"0 \n\t" \
    227        #mov"      80(%0),  %%"#mm"4 \n\t" \
    228        "psllw  $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \
    229        #mov"        (%0),  %%"#mm"5 \n\t" \
    230        "paddsw  %%"#mm"3,  %%"#mm"4 \n\t" \
    231        "paddsw   112(%0),  %%"#mm"5 \n\t" \
    232        "psllw  $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \
    233        #mov"    %%"#mm"0,  %%"#mm"6 \n\t" \
    234        "psubsw  %%"#mm"1,  %%"#mm"2 \n\t" \
    235        #mov"      16(%1),  %%"#mm"1 \n\t" \
    236        "psubsw  %%"#mm"4,  %%"#mm"0 \n\t" \
    237        #mov"      48(%0),  %%"#mm"7 \n\t" \
    238        "pmulhw  %%"#mm"0,  %%"#mm"1 \n\t" \
    239        "paddsw    64(%0),  %%"#mm"7 \n\t" \
    240        "psllw  $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \
    241        "paddsw  %%"#mm"4,  %%"#mm"6 \n\t" \
    242        "psllw  $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \
    243        #mov"    %%"#mm"5,  %%"#mm"4 \n\t" \
    244        "psubsw  %%"#mm"7,  %%"#mm"5 \n\t" \
    245        "paddsw  %%"#mm"5,  %%"#mm"1 \n\t" \
    246        "paddsw  %%"#mm"7,  %%"#mm"4 \n\t" \
    247        "por         (%2),  %%"#mm"1 \n\t" \
    248        "psllw  $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \
    249        "pmulhw    16(%1),  %%"#mm"5 \n\t" \
    250        #mov"    %%"#mm"4,  %%"#mm"7 \n\t" \
    251        "psubsw    80(%0),  %%"#mm"3 \n\t" \
    252        "psubsw  %%"#mm"6,  %%"#mm"4 \n\t" \
    253        #mov"    %%"#mm"1,    32(%3) \n\t" \
    254        "paddsw  %%"#mm"6,  %%"#mm"7 \n\t" \
    255        #mov"      48(%0),  %%"#mm"1 \n\t" \
    256        "psllw  $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \
    257        "psubsw    64(%0),  %%"#mm"1 \n\t" \
    258        #mov"    %%"#mm"2,  %%"#mm"6 \n\t" \
    259        #mov"    %%"#mm"4,    64(%3) \n\t" \
    260        "paddsw  %%"#mm"3,  %%"#mm"2 \n\t" \
    261        "pmulhw      (%4),  %%"#mm"2 \n\t" \
    262        "psubsw  %%"#mm"3,  %%"#mm"6 \n\t" \
    263        "pmulhw      (%4),  %%"#mm"6 \n\t" \
    264        "psubsw  %%"#mm"0,  %%"#mm"5 \n\t" \
    265        "por         (%2),  %%"#mm"5 \n\t" \
    266        "psllw  $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \
    267        "por         (%2),  %%"#mm"2 \n\t" \
    268        #mov"    %%"#mm"1,  %%"#mm"4 \n\t" \
    269        #mov"        (%0),  %%"#mm"3 \n\t" \
    270        "paddsw  %%"#mm"6,  %%"#mm"1 \n\t" \
    271        "psubsw   112(%0),  %%"#mm"3 \n\t" \
    272        "psubsw  %%"#mm"6,  %%"#mm"4 \n\t" \
    273        #mov"        (%1),  %%"#mm"0 \n\t" \
    274        "psllw  $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \
    275        #mov"      32(%1),  %%"#mm"6 \n\t" \
    276        "pmulhw  %%"#mm"1,  %%"#mm"0 \n\t" \
    277        #mov"    %%"#mm"7,      (%3) \n\t" \
    278        "pmulhw  %%"#mm"4,  %%"#mm"6 \n\t" \
    279        #mov"    %%"#mm"5,    96(%3) \n\t" \
    280        #mov"    %%"#mm"3,  %%"#mm"7 \n\t" \
    281        #mov"      32(%1),  %%"#mm"5 \n\t" \
    282        "psubsw  %%"#mm"2,  %%"#mm"7 \n\t" \
    283        "paddsw  %%"#mm"2,  %%"#mm"3 \n\t" \
    284        "pmulhw  %%"#mm"7,  %%"#mm"5 \n\t" \
    285        "paddsw  %%"#mm"3,  %%"#mm"0 \n\t" \
    286        "paddsw  %%"#mm"4,  %%"#mm"6 \n\t" \
    287        "pmulhw      (%1),  %%"#mm"3 \n\t" \
    288        "por         (%2),  %%"#mm"0 \n\t" \
    289        "paddsw  %%"#mm"7,  %%"#mm"5 \n\t" \
    290        "psubsw  %%"#mm"6,  %%"#mm"7 \n\t" \
    291        #mov"    %%"#mm"0,    16(%3) \n\t" \
    292        "paddsw  %%"#mm"4,  %%"#mm"5 \n\t" \
    293        #mov"    %%"#mm"7,    48(%3) \n\t" \
    294        "psubsw  %%"#mm"1,  %%"#mm"3 \n\t" \
    295        #mov"    %%"#mm"5,    80(%3) \n\t" \
    296        #mov"    %%"#mm"3,   112(%3) \n\t" \
    297        : \
    298        : "r" (in  + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \
    299          "r" (out + offset), "r" (ocos_4_16)); \
    300 }
    301 
    302 FDCT_COL(sse2, xmm, movdqa)
    303 
        // Row pass of the forward DCT.
        //
        // in  - eight rows of eight int16_t, read 8 bytes at a time (movq) at byte
        //       offsets i and i+8 of each row,
        // out - eight rows of eight int16_t, each written with one movdqa store, so
        //       out has to be 16-byte aligned.
        //
        // asm operands:
        //   %0 = in
        //   %1 = tab_frw_01234567_sse2 (coefficient table)
        //   %2 = fdct_r_row_sse2       (rounding constant, kept in xmm6 throughout)
        //   %3 = SHIFT_FRW_ROW         (immediate shift count)
        //   %4 = out
        //
        // Rows are handled in pairs that use the same 64-byte table group t:
        // byte offsets (0, 64) with t = 0, (16, 112) with t = 64, (32, 96) with
        // t = 128 and (48, 80) with t = 192.
        //
        // NOTE(review): the asm stores to memory through %4 but lists neither a
        // memory output operand nor a "memory" clobber; only the XMM registers are
        // declared as clobbered. Confirm this is intended.
    304 static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
    305 {
    306    __asm__ volatile(
        /* Start of the first row of a pair: load the row at byte offset i (low half
         * into xmm2, high half into xmm0) and all four coefficient vectors of table
         * group t (xmm3, xmm7, xmm4, xmm5). */
    307 #define FDCT_ROW_SSE2_H1(i,t)                    \
    308        "movq      " #i "(%0), %%xmm2      \n\t" \
    309        "movq      " #i "+8(%0), %%xmm0    \n\t" \
    310        "movdqa    " #t "+32(%1), %%xmm3   \n\t" \
    311        "movdqa    " #t "+48(%1), %%xmm7   \n\t" \
    312        "movdqa    " #t "(%1), %%xmm4      \n\t" \
    313        "movdqa    " #t "+16(%1), %%xmm5   \n\t"
    314 
        /* Start of the second row of a pair: load the row and reload only xmm3 and
         * xmm7. FDCT_ROW_SSE2 overwrites those two (they are pmaddwd destinations),
         * while xmm4 and xmm5 are only read there and still hold group t. */
    315 #define FDCT_ROW_SSE2_H2(i,t)                    \
    316        "movq      " #i "(%0), %%xmm2      \n\t" \
    317        "movq      " #i "+8(%0), %%xmm0    \n\t" \
    318        "movdqa    " #t "+32(%1), %%xmm3   \n\t" \
    319        "movdqa    " #t "+48(%1), %%xmm7   \n\t"
    320 
        /* Transform the row loaded by H1/H2 and store it at byte offset i of out:
         * pshuflw $27 reverses the four words of the high half, paddsw/psubsw form
         * the saturating sums and differences with the low half, punpckldq
         * interleaves them, pshufd $78 makes a copy with the 64-bit halves swapped,
         * four pmaddwd multiply by the coefficient vectors, the products are summed,
         * the rounding constant in xmm6 is added, psrad shifts right by %3 and
         * packssdw packs the eight results back to int16_t with saturation. */
    321 #define FDCT_ROW_SSE2(i)                      \
    322        "movq      %%xmm2, %%xmm1       \n\t" \
    323        "pshuflw   $27, %%xmm0, %%xmm0  \n\t" \
    324        "paddsw    %%xmm0, %%xmm1       \n\t" \
    325        "psubsw    %%xmm0, %%xmm2       \n\t" \
    326        "punpckldq %%xmm2, %%xmm1       \n\t" \
    327        "pshufd    $78, %%xmm1, %%xmm2  \n\t" \
    328        "pmaddwd   %%xmm2, %%xmm3       \n\t" \
    329        "pmaddwd   %%xmm1, %%xmm7       \n\t" \
    330        "pmaddwd   %%xmm5, %%xmm2       \n\t" \
    331        "pmaddwd   %%xmm4, %%xmm1       \n\t" \
    332        "paddd     %%xmm7, %%xmm3       \n\t" \
    333        "paddd     %%xmm2, %%xmm1       \n\t" \
    334        "paddd     %%xmm6, %%xmm3       \n\t" \
    335        "paddd     %%xmm6, %%xmm1       \n\t" \
    336        "psrad     %3, %%xmm3           \n\t" \
    337        "psrad     %3, %%xmm1           \n\t" \
    338        "packssdw  %%xmm3, %%xmm1       \n\t" \
    339        "movdqa    %%xmm1, " #i "(%4)   \n\t"
    340 
        /* xmm6 = rounding constant, loaded once for all eight rows */
    341        "movdqa    (%2), %%xmm6         \n\t"
    342        FDCT_ROW_SSE2_H1(0,0)
    343        FDCT_ROW_SSE2(0)
    344        FDCT_ROW_SSE2_H2(64,0)
    345        FDCT_ROW_SSE2(64)
    346 
    347        FDCT_ROW_SSE2_H1(16,64)
    348        FDCT_ROW_SSE2(16)
    349        FDCT_ROW_SSE2_H2(112,64)
    350        FDCT_ROW_SSE2(112)
    351 
    352        FDCT_ROW_SSE2_H1(32,128)
    353        FDCT_ROW_SSE2(32)
    354        FDCT_ROW_SSE2_H2(96,128)
    355        FDCT_ROW_SSE2(96)
    356 
    357        FDCT_ROW_SSE2_H1(48,192)
    358        FDCT_ROW_SSE2(48)
    359        FDCT_ROW_SSE2_H2(80,192)
    360        FDCT_ROW_SSE2(80)
    361        :
    362        : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2),
    363          "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
    364          XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
    365                            "%xmm4", "%xmm5", "%xmm6", "%xmm7")
    366    );
    367 }
    368 
        /**
         * In-place forward DCT of one block using the SSE2 column and row passes.
         *
         * The column pass reads block and writes a 16-byte aligned scratch buffer on
         * the stack; the row pass reads that buffer and writes the result back into
         * block.
         *
         * @param block 64 coefficients, addressed as eight rows of 16 bytes. It is
         *              read with movdqa by the column pass and written with movdqa by
         *              the row pass, so it has to be 16-byte aligned.
         */
    369 void ff_fdct_sse2(int16_t *block)
    370 {
        /* 16 * sizeof(int64_t) = 128 bytes: room for the 64 int16_t intermediates */
    371    DECLARE_ALIGNED(16, int64_t, align_tmp)[16];
    372    int16_t * const block1= (int16_t*)align_tmp;
    373 
    374    fdct_col_sse2(block, block1, 0);
    375    fdct_row_sse2(block1, block);
    376 }
    377 
    378 #endif /* HAVE_SSE2_INLINE */