[ tor-browser ].git.dasho

02_simd-detect-runtime.patch (12856B)
      1 diff --git a/src/resample.c b/src/resample.c
      2 --- a/src/resample.c
      3 +++ b/src/resample.c
      4 @@ -91,23 +91,17 @@ static void speex_free(void *ptr) {free(
      5 #ifndef NULL
      6 #define NULL 0
      7 #endif
      8 
      9 #ifndef UINT32_MAX
     10 #define UINT32_MAX 4294967295U
     11 #endif
     12 
     13 -#ifdef USE_SSE
     14 -#include "resample_sse.h"
     15 -#endif
     16 -
     17 -#ifdef USE_NEON
     18 -#include "resample_neon.h"
     19 -#endif
     20 +#include "simd_detect.h"
     21 
     22 /* Number of elements to allocate on the stack */
     23 #ifdef VAR_ARRAYS
     24 #define FIXED_STACK_ALLOC 8192
     25 #else
     26 #define FIXED_STACK_ALLOC 1024
     27 #endif
     28 
     29 @@ -341,17 +335,19 @@ static int resampler_basic_direct_single
     30    const spx_uint32_t den_rate = st->den_rate;
     31    spx_word32_t sum;
     32 
     33    while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
     34    {
     35       const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
     36       const spx_word16_t *iptr = & in[last_sample];
     37 
     38 -#ifndef OVERRIDE_INNER_PRODUCT_SINGLE
     39 +#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
     40 +      if (!moz_speex_have_single_simd()) {
     41 +#endif
     42       int j;
     43       sum = 0;
     44       for(j=0;j<N;j++) sum += MULT16_16(sinct[j], iptr[j]);
     45 
     46 /*    This code is slower on most DSPs which have only 2 accumulators.
     47       Plus this this forces truncation to 32 bits and you lose the HW guard bits.
     48       I think we can trust the compiler and let it vectorize and/or unroll itself.
     49       spx_word32_t accum[4] = {0,0,0,0};
     50 @@ -359,18 +355,20 @@ static int resampler_basic_direct_single
     51         accum[0] += MULT16_16(sinct[j], iptr[j]);
     52         accum[1] += MULT16_16(sinct[j+1], iptr[j+1]);
     53         accum[2] += MULT16_16(sinct[j+2], iptr[j+2]);
     54         accum[3] += MULT16_16(sinct[j+3], iptr[j+3]);
     55       }
     56       sum = accum[0] + accum[1] + accum[2] + accum[3];
     57 */
     58       sum = SATURATE32PSHR(sum, 15, 32767);
     59 -#else
     60 +#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
     61 +      } else {
     62       sum = inner_product_single(sinct, iptr, N);
     63 +      }
     64 #endif
     65 
     66       out[out_stride * out_sample++] = sum;
     67       last_sample += int_advance;
     68       samp_frac_num += frac_advance;
     69       if (samp_frac_num >= den_rate)
     70       {
     71          samp_frac_num -= den_rate;
     72 @@ -399,29 +397,33 @@ static int resampler_basic_direct_double
     73    const spx_uint32_t den_rate = st->den_rate;
     74    double sum;
     75 
     76    while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
     77    {
     78       const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
     79       const spx_word16_t *iptr = & in[last_sample];
     80 
     81 -#ifndef OVERRIDE_INNER_PRODUCT_DOUBLE
     82 +#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
     83 +      if(!moz_speex_have_double_simd()) { /* scalar C loop only when the runtime check reports no double SIMD; else-branch calls inner_product_double */
     84 +#endif
     85       int j;
     86       double accum[4] = {0,0,0,0};
     87 
     88       for(j=0;j<N;j+=4) {
     89         accum[0] += sinct[j]*iptr[j];
     90         accum[1] += sinct[j+1]*iptr[j+1];
     91         accum[2] += sinct[j+2]*iptr[j+2];
     92         accum[3] += sinct[j+3]*iptr[j+3];
     93       }
     94       sum = accum[0] + accum[1] + accum[2] + accum[3];
     95 -#else
     96 +#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
     97 +      } else {
     98       sum = inner_product_double(sinct, iptr, N);
     99 +      }
    100 #endif
    101 
    102       out[out_stride * out_sample++] = PSHR32(sum, 15);
    103       last_sample += int_advance;
    104       samp_frac_num += frac_advance;
    105       if (samp_frac_num >= den_rate)
    106       {
    107          samp_frac_num -= den_rate;
    108 @@ -455,34 +457,38 @@ static int resampler_basic_interpolate_s
    109 #ifdef FIXED_POINT
    110       const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
    111 #else
    112       const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
    113 #endif
    114       spx_word16_t interp[4];
    115 
    116 
    117 -#ifndef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
    118 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
    119 +      if (!moz_speex_have_single_simd()) {
    120 +#endif
    121       int j;
    122       spx_word32_t accum[4] = {0,0,0,0};
    123 
    124       for(j=0;j<N;j++) {
    125         const spx_word16_t curr_in=iptr[j];
    126         accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
    127         accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
    128         accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
    129         accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
    130       }
    131 
    132       cubic_coef(frac, interp);
    133       sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]);
    134       sum = SATURATE32PSHR(sum, 15, 32767);
    135 -#else
    136 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
    137 +      } else {
    138       cubic_coef(frac, interp);
    139       sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
    140 +      }
    141 #endif
    142 
    143       out[out_stride * out_sample++] = sum;
    144       last_sample += int_advance;
    145       samp_frac_num += frac_advance;
    146       if (samp_frac_num >= den_rate)
    147       {
    148          samp_frac_num -= den_rate;
    149 @@ -518,33 +524,37 @@ static int resampler_basic_interpolate_d
    150 #ifdef FIXED_POINT
    151       const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
    152 #else
    153       const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
    154 #endif
    155       spx_word16_t interp[4];
    156 
    157 
    158 -#ifndef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
    159 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
    160 +      if (!moz_speex_have_double_simd()) {
    161 +#endif
    162       int j;
    163       double accum[4] = {0,0,0,0};
    164 
    165       for(j=0;j<N;j++) {
    166         const double curr_in=iptr[j];
    167         accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
    168         accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
    169         accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
    170         accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
    171       }
    172 
    173       cubic_coef(frac, interp);
    174       sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]);
    175 -#else
    176 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
    177 +      } else {
    178       cubic_coef(frac, interp);
    179       sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
    180 +      }
    181 #endif
    182 
    183       out[out_stride * out_sample++] = PSHR32(sum,15);
    184       last_sample += int_advance;
    185       samp_frac_num += frac_advance;
    186       if (samp_frac_num >= den_rate)
    187       {
    188          samp_frac_num -= den_rate;
    189 diff --git a/src/resample_neon.c b/src/resample_neon.c
    190 --- a/src/resample_neon.c
    191 +++ b/src/resample_neon.c
    192 @@ -32,16 +32,17 @@
    193    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
    194    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
    195    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
    196    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
    197    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    198 */
    199 
    200 #include <stdint.h>
    201 +#include "simd_detect.h"
    202 
    203 #ifdef FIXED_POINT
    204 #if defined(__aarch64__)
    205 static inline int32_t saturate_32bit_to_16bit(int32_t a) {
    206     int32_t ret;
    207     asm ("fmov s0, %w[a]\n"
    208          "sqxtn h0, s0\n"
    209          "sxtl v0.4s, v0.4h\n"
    210 @@ -73,17 +74,17 @@
    211 }
    212 #endif
    213 #undef WORD2INT
    214 #define WORD2INT(x) (saturate_32bit_to_16bit(x))
    215 
    216 #define OVERRIDE_INNER_PRODUCT_SINGLE
    217 /* Only works when len % 4 == 0 and len >= 4 */
    218 #if defined(__aarch64__)
    219 -static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
    220 +inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
    221 {
    222     int32_t ret;
    223     uint32_t remainder = len % 16;
    224     len = len - remainder;
    225 
    226     asm volatile ("	 cmp %w[len], #0\n"
    227 		  "	 b.ne 1f\n"
    228 		  "	 ld1 {v16.4h}, [%[b]], #8\n"
    229 @@ -128,17 +129,17 @@
    230 		  : [ret] "=r" (ret), [a] "+r" (a), [b] "+r" (b),
    231 		    [len] "+r" (len), [remainder] "+r" (remainder)
    232 		  :
    233 		  : "cc", "v0",
    234 		    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
    235     return ret;
    236 }
    237 #else
    238 -static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
    239 +inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
    240 {
    241     int32_t ret;
    242     uint32_t remainder = len % 16;
    243     len = len - remainder;
    244 
    245     asm volatile ("	 cmp %[len], #0\n"
    246 		  "	 bne 1f\n"
    247 		  "	 vld1.16 {d16}, [%[b]]!\n"
    248 @@ -218,17 +219,17 @@
    249 #endif
    250 
    251 #undef WORD2INT
    252 #define WORD2INT(x) (saturate_float_to_16bit(x))
    253 
    254 #define OVERRIDE_INNER_PRODUCT_SINGLE
    255 /* Only works when len % 4 == 0 and len >= 4 */
    256 #if defined(__aarch64__)
    257 -static inline float inner_product_single(const float *a, const float *b, unsigned int len)
    258 +inline float inner_product_single(const float *a, const float *b, unsigned int len)
    259 {
    260     float ret;
    261     uint32_t remainder = len % 16;
    262     len = len - remainder;
    263 
    264     asm volatile ("	 cmp %w[len], #0\n"
    265 		  "	 b.ne 1f\n"
    266 		  "	 ld1 {v16.4s}, [%[b]], #16\n"
    267 @@ -273,17 +274,17 @@
    268 		  : [ret] "=w" (ret), [a] "+r" (a), [b] "+r" (b),
    269 		    [len] "+r" (len), [remainder] "+r" (remainder)
    270 		  :
    271 		  : "cc", "v1", "v2", "v3", "v4",
    272 		    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
    273     return ret;
    274 }
    275 #else
    276 -static inline float inner_product_single(const float *a, const float *b, unsigned int len)
    277 +inline float inner_product_single(const float *a, const float *b, unsigned int len)
    278 {
    279     float ret;
    280     uint32_t remainder = len % 16;
    281     len = len - remainder;
    282 
    283     asm volatile ("	 cmp %[len], #0\n"
    284 		  "	 bne 1f\n"
    285 		  "	 vld1.32 {q4}, [%[b]]!\n"
    286 diff --git a/src/resample_sse.c b/src/resample_sse.c
    287 --- a/src/resample_sse.c
    288 +++ b/src/resample_sse.c
    289 @@ -29,37 +29,39 @@
    290    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
    291    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
    292    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
    293    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
    294    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
    295    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    296 */
    297 
    298 +#include "simd_detect.h"
    299 +
    300 #include <xmmintrin.h>
    301 
    302 #define OVERRIDE_INNER_PRODUCT_SINGLE
    303 -static inline float inner_product_single(const float *a, const float *b, unsigned int len)
    304 +float inner_product_single(const float *a, const float *b, unsigned int len)
    305 {
    306    int i;
    307    float ret;
    308    __m128 sum = _mm_setzero_ps();
    309    for (i=0;i<len;i+=8)
    310    {
    311       sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
    312       sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i+4), _mm_loadu_ps(b+i+4)));
    313    }
    314    sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
    315    sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
    316    _mm_store_ss(&ret, sum);
    317    return ret;
    318 }
    319 
    320 #define OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
    321 -static inline float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
    322 +float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
    323   int i;
    324   float ret;
    325   __m128 sum = _mm_setzero_ps();
    326   __m128 f = _mm_loadu_ps(frac);
    327   for(i=0;i<len;i+=2)
    328   {
    329     sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i), _mm_loadu_ps(b+i*oversample)));
    330     sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i+1), _mm_loadu_ps(b+(i+1)*oversample)));
    331 @@ -70,17 +72,17 @@ static inline float interpolate_product_
    332    _mm_store_ss(&ret, sum);
    333    return ret;
    334 }
    335 
    336 #ifdef USE_SSE2
    337 #include <emmintrin.h>
    338 #define OVERRIDE_INNER_PRODUCT_DOUBLE
    339 
    340 -static inline double inner_product_double(const float *a, const float *b, unsigned int len)
    341 +double inner_product_double(const float *a, const float *b, unsigned int len)
    342 {
    343    int i;
    344    double ret;
    345    __m128d sum = _mm_setzero_pd();
    346    __m128 t;
    347    for (i=0;i<len;i+=8)
    348    {
    349       t = _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i));
    350 @@ -92,17 +94,17 @@ static inline double inner_product_doubl
    351       sum = _mm_add_pd(sum, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
    352    }
    353    sum = _mm_add_sd(sum, _mm_unpackhi_pd(sum, sum));
    354    _mm_store_sd(&ret, sum);
    355    return ret;
    356 }
    357 
    358 #define OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
    359 -static inline double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
    360 +double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
    361   int i;
    362   double ret;
    363   __m128d sum;
    364   __m128d sum1 = _mm_setzero_pd();
    365   __m128d sum2 = _mm_setzero_pd();
    366   __m128 f = _mm_loadu_ps(frac);
    367   __m128d f1 = _mm_cvtps_pd(f);
    368   __m128d f2 = _mm_cvtps_pd(_mm_movehl_ps(f,f));
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE