[ tor-browser ].git.dasho

mpegaudiodsp.c (10688B)
      1 /*
      2 * SIMD-optimized MP3 decoding functions
      3 * Copyright (c) 2010 Vitor Sessak
      4 *
      5 * This file is part of FFmpeg.
      6 *
      7 * FFmpeg is free software; you can redistribute it and/or
      8 * modify it under the terms of the GNU Lesser General Public
      9 * License as published by the Free Software Foundation; either
     10 * version 2.1 of the License, or (at your option) any later version.
     11 *
     12 * FFmpeg is distributed in the hope that it will be useful,
     13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     15 * Lesser General Public License for more details.
     16 *
     17 * You should have received a copy of the GNU Lesser General Public
     18 * License along with FFmpeg; if not, write to the Free Software
     19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     20 */
     21 
     22 #include <stddef.h>
     23 
     24 #include "config.h"
     25 #include "libavutil/attributes.h"
     26 #include "libavutil/cpu.h"
     27 #include "libavutil/mem_internal.h"
     28 #include "libavutil/x86/asm.h"
     29 #include "libavutil/x86/cpu.h"
     30 #include "libavcodec/mpegaudiodsp.h"
     31 
     /*
      * DECL(ISA) emits two prototypes for one instruction-set suffix:
      *   - imdct36_blocks_<ISA>():   file-local wrapper, defined further down
      *                               by DECL_IMDCT_BLOCKS();
      *   - ff_imdct36_float_<ISA>(): non-static single-block IMDCT36 routine
      *                               (no definition in this file).
      */
     #define DECL(ISA)                                                         \
     static void imdct36_blocks_ ## ISA(float *out, float *buf, float *in,     \
                                        int count, int switch_point,           \
                                        int block_type);                       \
     void ff_imdct36_float_ ## ISA(float *out, float *buf, float *in,          \
                                   float *win);

     #if HAVE_X86ASM
     /* One pair of prototypes per SIMD flavour wired up in ff_mpadsp_init_x86(). */
     DECL(sse2)
     DECL(sse3)
     DECL(ssse3)
     DECL(avx)
     #endif /* HAVE_X86ASM */
     42 
     43 void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
     44                               float *tmpbuf);
     45 void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
     46                               float *tmpbuf);
     47 
     48 void ff_dct32_float_sse2(float *out, const float *in);
     49 void ff_dct32_float_avx (float *out, const float *in);
     50 
     51 DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
     52 
     53 #if HAVE_6REGS && HAVE_SSE_INLINE
     54 
     /* Multiply-accumulate: rt += ra * rb. */
     #define MACS(rt, ra, rb) ((rt) += (ra) * (rb))
     /* Multiply-subtract: rt -= ra * rb. */
     #define MLSS(rt, ra, rb) ((rt) -= (ra) * (rb))

     /*
      * Apply `op` (MACS or MLSS) to `sum` for eight coefficient/sample pairs
      * taken from `w` and `p` at element offsets 0, 64, 128, ..., 448.
      *
      * Wrapped in do { } while (0) so that "SUM8(...);" is a single statement
      * and can be used safely as the body of an unbraced if/else.
      */
     #define SUM8(op, sum, w, p)                   \
     do {                                          \
         op(sum, (w)[0 * 64], (p)[0 * 64]);        \
         op(sum, (w)[1 * 64], (p)[1 * 64]);        \
         op(sum, (w)[2 * 64], (p)[2 * 64]);        \
         op(sum, (w)[3 * 64], (p)[3 * 64]);        \
         op(sum, (w)[4 * 64], (p)[4 * 64]);        \
         op(sum, (w)[5 * 64], (p)[5 * 64]);        \
         op(sum, (w)[6 * 64], (p)[6 * 64]);        \
         op(sum, (w)[7 * 64], (p)[7 * 64]);        \
     } while (0)
     69 
     70 static void apply_window(const float *buf, const float *win1,
     71                         const float *win2, float *sum1, float *sum2, int len)
     72 {
     73    x86_reg count = - 4*len;
     74    const float *win1a = win1+len;
     75    const float *win2a = win2+len;
     76    const float *bufa  = buf+len;
     77    float *sum1a = sum1+len;
     78    float *sum2a = sum2+len;
     79 
     80 
     81 #define MULT(a, b)                                 \
     82    "movaps " #a "(%1,%0), %%xmm1           \n\t"  \
     83    "movaps " #a "(%3,%0), %%xmm2           \n\t"  \
     84    "mulps         %%xmm2, %%xmm1           \n\t"  \
     85    "subps         %%xmm1, %%xmm0           \n\t"  \
     86    "mulps  " #b "(%2,%0), %%xmm2           \n\t"  \
     87    "subps         %%xmm2, %%xmm4           \n\t"  \
     88 
     89    __asm__ volatile(
     90            "1:                                   \n\t"
     91            "xorps       %%xmm0, %%xmm0           \n\t"
     92            "xorps       %%xmm4, %%xmm4           \n\t"
     93 
     94            MULT(   0,   0)
     95            MULT( 256,  64)
     96            MULT( 512, 128)
     97            MULT( 768, 192)
     98            MULT(1024, 256)
     99            MULT(1280, 320)
    100            MULT(1536, 384)
    101            MULT(1792, 448)
    102 
    103            "movaps      %%xmm0, (%4,%0)          \n\t"
    104            "movaps      %%xmm4, (%5,%0)          \n\t"
    105            "add            $16,  %0              \n\t"
    106            "jl              1b                   \n\t"
    107            :"+&r"(count)
    108            :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
    109            );
    110 
    111 #undef MULT
    112 }
    113 
    114 static void apply_window_mp3(float *in, float *win, int *unused, float *out,
    115                             ptrdiff_t incr)
    116 {
    117    LOCAL_ALIGNED_16(float, suma, [17]);
    118    LOCAL_ALIGNED_16(float, sumb, [17]);
    119    LOCAL_ALIGNED_16(float, sumc, [17]);
    120    LOCAL_ALIGNED_16(float, sumd, [17]);
    121 
    122    float sum;
    123 
    124    /* copy to avoid wrap */
    125    __asm__ volatile(
    126            "movaps    0(%0), %%xmm0   \n\t" \
    127            "movaps   16(%0), %%xmm1   \n\t" \
    128            "movaps   32(%0), %%xmm2   \n\t" \
    129            "movaps   48(%0), %%xmm3   \n\t" \
    130            "movaps   %%xmm0,   0(%1) \n\t" \
    131            "movaps   %%xmm1,  16(%1) \n\t" \
    132            "movaps   %%xmm2,  32(%1) \n\t" \
    133            "movaps   %%xmm3,  48(%1) \n\t" \
    134            "movaps   64(%0), %%xmm0   \n\t" \
    135            "movaps   80(%0), %%xmm1   \n\t" \
    136            "movaps   96(%0), %%xmm2   \n\t" \
    137            "movaps  112(%0), %%xmm3   \n\t" \
    138            "movaps   %%xmm0,  64(%1) \n\t" \
    139            "movaps   %%xmm1,  80(%1) \n\t" \
    140            "movaps   %%xmm2,  96(%1) \n\t" \
    141            "movaps   %%xmm3, 112(%1) \n\t"
    142            ::"r"(in), "r"(in+512)
    143            :"memory"
    144            );
    145 
    146    apply_window(in + 16, win     , win + 512, suma, sumc, 16);
    147    apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
    148 
    149    SUM8(MACS, suma[0], win + 32, in + 48);
    150 
    151    sumc[ 0] = 0;
    152    sumb[16] = 0;
    153    sumd[16] = 0;
    154 
    155 #define SUMS(suma, sumb, sumc, sumd, out1, out2)               \
    156            "movups " #sumd "(%4),       %%xmm0          \n\t" \
    157            "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
    158            "subps  " #suma "(%1),       %%xmm0          \n\t" \
    159            "movaps        %%xmm0," #out1 "(%0)          \n\t" \
    160 \
    161            "movups " #sumc "(%3),       %%xmm0          \n\t" \
    162            "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
    163            "addps  " #sumb "(%2),       %%xmm0          \n\t" \
    164            "movaps        %%xmm0," #out2 "(%0)          \n\t"
    165 
    166    if (incr == 1) {
    167        __asm__ volatile(
    168            SUMS( 0, 48,  4, 52,  0, 112)
    169            SUMS(16, 32, 20, 36, 16,  96)
    170            SUMS(32, 16, 36, 20, 32,  80)
    171            SUMS(48,  0, 52,  4, 48,  64)
    172 
    173            :"+&r"(out)
    174            :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
    175            :"memory"
    176            );
    177        out += 16*incr;
    178    } else {
    179        int j;
    180        float *out2 = out + 32 * incr;
    181        out[0  ]  = -suma[   0];
    182        out += incr;
    183        out2 -= incr;
    184        for(j=1;j<16;j++) {
    185            *out  = -suma[   j] + sumd[16-j];
    186            *out2 =  sumb[16-j] + sumc[   j];
    187            out  += incr;
    188            out2 -= incr;
    189        }
    190    }
    191 
    192    sum = 0;
    193    SUM8(MLSS, sum, win + 16 + 32, in + 32);
    194    *out = sum;
    195 }
    196 
    197 #endif /* HAVE_6REGS && HAVE_SSE_INLINE */
    198 
    #if HAVE_X86ASM
    /*
     * DECL_IMDCT_BLOCKS(CPU1, CPU2) defines imdct36_blocks_<CPU1>(), which
     * runs IMDCT36 over `count` blocks:
     *   - as many groups of four blocks as fit go through
     *     ff_four_imdct36_float_<CPU2>() with a window from mdct_win_sse;
     *   - the remaining 0..3 blocks go one at a time through
     *     ff_imdct36_float_<CPU1>() with a window from ff_mdct_win_float.
     *
     * Pointer steps per block: in advances by 18 and out by 1; buf advances
     * by 4*18 per group of four and by 1 per leftover block.
     */
    #define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                       \
    static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,      \
                                   int count, int switch_point, int block_type) \
    {                                                                           \
        /* count rounded down to a multiple of four */                          \
        const int quads_end = count & ~3;                                       \
        int blk = 0;                                                            \
                                                                                \
        while (blk < quads_end) {                                               \
            LOCAL_ALIGNED_16(float, tmpbuf, [1024]);                            \
            /* variant 1 of the table only for the first group when */          \
            /* switch_point is set */                                           \
            const int variant = switch_point && blk < 4;                        \
            float *win = mdct_win_sse[variant][block_type];                     \
                                                                                \
            ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf);          \
                                                                                \
            in  += 4*18;                                                        \
            buf += 4*18;                                                        \
            out += 4;                                                           \
            blk += 4;                                                           \
        }                                                                       \
                                                                                \
        for (; blk < count; blk++, in += 18, buf++, out++) {                    \
            /* window type 0 for the first two blocks when switch_point is */   \
            /* set; odd blocks use the rows starting at index 4 */              \
            const int win_idx = (switch_point && blk < 2) ? 0 : block_type;     \
            const int odd_row = (blk & 1) ? 4 : 0;                              \
            float *win = ff_mdct_win_float[win_idx + odd_row];                  \
                                                                                \
            ff_imdct36_float_ ## CPU1(out, buf, in, win);                       \
        }                                                                       \
    }

    #if HAVE_SSE
    /* The SSE2/SSE3/SSSE3 wrappers all share the SSE four-block routine. */
    DECL_IMDCT_BLOCKS(sse2,sse)
    DECL_IMDCT_BLOCKS(sse3,sse)
    DECL_IMDCT_BLOCKS(ssse3,sse)
    #endif
    #if HAVE_AVX_EXTERNAL
    DECL_IMDCT_BLOCKS(avx,avx)
    #endif
    #endif /* HAVE_X86ASM */
    241 
    242 av_cold void ff_mpadsp_init_x86_tabs(void)
    243 {
    244    int i, j;
    245    for (j = 0; j < 4; j++) {
    246        for (i = 0; i < 40; i ++) {
    247            mdct_win_sse[0][j][4*i    ] = ff_mdct_win_float[j    ][i];
    248            mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
    249            mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j    ][i];
    250            mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
    251            mdct_win_sse[1][j][4*i    ] = ff_mdct_win_float[0    ][i];
    252            mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4    ][i];
    253            mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j    ][i];
    254            mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
    255        }
    256    }
    257 }
    258 
    259 av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
    260 {
    261    av_unused int cpu_flags = av_get_cpu_flags();
    262 
    263 #if HAVE_6REGS && HAVE_SSE_INLINE
    264    if (INLINE_SSE(cpu_flags)) {
    265        s->apply_window_float = apply_window_mp3;
    266    }
    267 #endif /* HAVE_SSE_INLINE */
    268 
    269 #if HAVE_X86ASM
    270 #if HAVE_SSE
    271    if (EXTERNAL_SSE2(cpu_flags)) {
    272        s->imdct36_blocks_float = imdct36_blocks_sse2;
    273        s->dct32_float          = ff_dct32_float_sse2;
    274    }
    275    if (EXTERNAL_SSE3(cpu_flags)) {
    276        s->imdct36_blocks_float = imdct36_blocks_sse3;
    277    }
    278    if (EXTERNAL_SSSE3(cpu_flags)) {
    279        s->imdct36_blocks_float = imdct36_blocks_ssse3;
    280    }
    281 #endif
    282 #if HAVE_AVX_EXTERNAL
    283    if (EXTERNAL_AVX(cpu_flags)) {
    284        s->imdct36_blocks_float = imdct36_blocks_avx;
    285    }
    286    if (EXTERNAL_AVX_FAST(cpu_flags))
    287        s->dct32_float          = ff_dct32_float_avx;
    288 #endif
    289 #endif /* HAVE_X86ASM */
    290 }
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE