02_simd-detect-runtime.patch (12856B)
1 diff --git a/src/resample.c b/src/resample.c 2 --- a/src/resample.c 3 +++ b/src/resample.c 4 @@ -91,23 +91,17 @@ static void speex_free(void *ptr) {free( 5 #ifndef NULL 6 #define NULL 0 7 #endif 8 9 #ifndef UINT32_MAX 10 #define UINT32_MAX 4294967295U 11 #endif 12 13 -#ifdef USE_SSE 14 -#include "resample_sse.h" 15 -#endif 16 - 17 -#ifdef USE_NEON 18 -#include "resample_neon.h" 19 -#endif 20 +#include "simd_detect.h" 21 22 /* Number of elements to allocate on the stack */ 23 #ifdef VAR_ARRAYS 24 #define FIXED_STACK_ALLOC 8192 25 #else 26 #define FIXED_STACK_ALLOC 1024 27 #endif 28 29 @@ -341,17 +335,19 @@ static int resampler_basic_direct_single 30 const spx_uint32_t den_rate = st->den_rate; 31 spx_word32_t sum; 32 33 while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len)) 34 { 35 const spx_word16_t *sinct = & sinc_table[samp_frac_num*N]; 36 const spx_word16_t *iptr = & in[last_sample]; 37 38 -#ifndef OVERRIDE_INNER_PRODUCT_SINGLE 39 +#ifdef OVERRIDE_INNER_PRODUCT_SINGLE 40 + if (!moz_speex_have_single_simd()) { 41 +#endif 42 int j; 43 sum = 0; 44 for(j=0;j<N;j++) sum += MULT16_16(sinct[j], iptr[j]); 45 46 /* This code is slower on most DSPs which have only 2 accumulators. 47 Plus this this forces truncation to 32 bits and you lose the HW guard bits. 48 I think we can trust the compiler and let it vectorize and/or unroll itself. 
49 spx_word32_t accum[4] = {0,0,0,0}; 50 @@ -359,18 +355,20 @@ static int resampler_basic_direct_single 51 accum[0] += MULT16_16(sinct[j], iptr[j]); 52 accum[1] += MULT16_16(sinct[j+1], iptr[j+1]); 53 accum[2] += MULT16_16(sinct[j+2], iptr[j+2]); 54 accum[3] += MULT16_16(sinct[j+3], iptr[j+3]); 55 } 56 sum = accum[0] + accum[1] + accum[2] + accum[3]; 57 */ 58 sum = SATURATE32PSHR(sum, 15, 32767); 59 -#else 60 +#ifdef OVERRIDE_INNER_PRODUCT_SINGLE 61 + } else { 62 sum = inner_product_single(sinct, iptr, N); 63 + } 64 #endif 65 66 out[out_stride * out_sample++] = sum; 67 last_sample += int_advance; 68 samp_frac_num += frac_advance; 69 if (samp_frac_num >= den_rate) 70 { 71 samp_frac_num -= den_rate; 72 @@ -399,29 +397,33 @@ static int resampler_basic_direct_double 73 const spx_uint32_t den_rate = st->den_rate; 74 double sum; 75 76 while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len)) 77 { 78 const spx_word16_t *sinct = & sinc_table[samp_frac_num*N]; 79 const spx_word16_t *iptr = & in[last_sample]; 80 81 -#ifndef OVERRIDE_INNER_PRODUCT_DOUBLE 82 +#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE 83 + if(!moz_speex_have_double_simd()) { 84 +#endif 85 int j; 86 double accum[4] = {0,0,0,0}; 87 88 for(j=0;j<N;j+=4) { 89 accum[0] += sinct[j]*iptr[j]; 90 accum[1] += sinct[j+1]*iptr[j+1]; 91 accum[2] += sinct[j+2]*iptr[j+2]; 92 accum[3] += sinct[j+3]*iptr[j+3]; 93 } 94 sum = accum[0] + accum[1] + accum[2] + accum[3]; 95 -#else 96 +#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE 97 + } else { 98 sum = inner_product_double(sinct, iptr, N); 99 + } 100 #endif 101 102 out[out_stride * out_sample++] = PSHR32(sum, 15); 103 last_sample += int_advance; 104 samp_frac_num += frac_advance; 105 if (samp_frac_num >= den_rate) 106 { 107 samp_frac_num -= den_rate; 108 @@ -455,34 +457,38 @@ static int resampler_basic_interpolate_s 109 #ifdef FIXED_POINT 110 const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate); 111 #else 112 const 
spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate; 113 #endif 114 spx_word16_t interp[4]; 115 116 117 -#ifndef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE 118 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE 119 + if (!moz_speex_have_single_simd()) { 120 +#endif 121 int j; 122 spx_word32_t accum[4] = {0,0,0,0}; 123 124 for(j=0;j<N;j++) { 125 const spx_word16_t curr_in=iptr[j]; 126 accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]); 127 accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]); 128 accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]); 129 accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]); 130 } 131 132 cubic_coef(frac, interp); 133 sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]); 134 sum = SATURATE32PSHR(sum, 15, 32767); 135 -#else 136 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE 137 + } else { 138 cubic_coef(frac, interp); 139 sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp); 140 + } 141 #endif 142 143 out[out_stride * out_sample++] = sum; 144 last_sample += int_advance; 145 samp_frac_num += frac_advance; 146 if (samp_frac_num >= den_rate) 147 { 148 samp_frac_num -= den_rate; 149 @@ -518,33 +524,37 @@ static int resampler_basic_interpolate_d 150 #ifdef FIXED_POINT 151 const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate); 152 #else 153 const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate; 154 #endif 155 spx_word16_t interp[4]; 156 157 158 -#ifndef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE 159 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE 160 + if (!moz_speex_have_double_simd()) { 161 +#endif 162 int j; 163 double accum[4] = {0,0,0,0}; 164 165 for(j=0;j<N;j++) { 166 const 
double curr_in=iptr[j]; 167 accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]); 168 accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]); 169 accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]); 170 accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]); 171 } 172 173 cubic_coef(frac, interp); 174 sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]); 175 -#else 176 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE 177 + } else { 178 cubic_coef(frac, interp); 179 sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp); 180 + } 181 #endif 182 183 out[out_stride * out_sample++] = PSHR32(sum,15); 184 last_sample += int_advance; 185 samp_frac_num += frac_advance; 186 if (samp_frac_num >= den_rate) 187 { 188 samp_frac_num -= den_rate; 189 diff --git a/src/resample_neon.c b/src/resample_neon.c 190 --- a/src/resample_neon.c 191 +++ b/src/resample_neon.c 192 @@ -32,16 +32,17 @@ 193 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 194 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 195 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 196 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 197 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
198 */ 199 200 #include <stdint.h> 201 +#include "simd_detect.h" 202 203 #ifdef FIXED_POINT 204 #if defined(__aarch64__) 205 static inline int32_t saturate_32bit_to_16bit(int32_t a) { 206 int32_t ret; 207 asm ("fmov s0, %w[a]\n" 208 "sqxtn h0, s0\n" 209 "sxtl v0.4s, v0.4h\n" 210 @@ -73,17 +74,17 @@ 211 } 212 #endif 213 #undef WORD2INT 214 #define WORD2INT(x) (saturate_32bit_to_16bit(x)) 215 216 #define OVERRIDE_INNER_PRODUCT_SINGLE 217 /* Only works when len % 4 == 0 and len >= 4 */ 218 #if defined(__aarch64__) 219 -static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len) 220 +inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len) 221 { 222 int32_t ret; 223 uint32_t remainder = len % 16; 224 len = len - remainder; 225 226 asm volatile (" cmp %w[len], #0\n" 227 " b.ne 1f\n" 228 " ld1 {v16.4h}, [%[b]], #8\n" 229 @@ -128,17 +129,17 @@ 230 : [ret] "=r" (ret), [a] "+r" (a), [b] "+r" (b), 231 [len] "+r" (len), [remainder] "+r" (remainder) 232 : 233 : "cc", "v0", 234 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); 235 return ret; 236 } 237 #else 238 -static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len) 239 +inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len) 240 { 241 int32_t ret; 242 uint32_t remainder = len % 16; 243 len = len - remainder; 244 245 asm volatile (" cmp %[len], #0\n" 246 " bne 1f\n" 247 " vld1.16 {d16}, [%[b]]!\n" 248 @@ -218,17 +219,17 @@ 249 #endif 250 251 #undef WORD2INT 252 #define WORD2INT(x) (saturate_float_to_16bit(x)) 253 254 #define OVERRIDE_INNER_PRODUCT_SINGLE 255 /* Only works when len % 4 == 0 and len >= 4 */ 256 #if defined(__aarch64__) 257 -static inline float inner_product_single(const float *a, const float *b, unsigned int len) 258 +inline float inner_product_single(const float *a, const float *b, unsigned int len) 259 { 260 float ret; 261 uint32_t remainder = len % 16; 262 
len = len - remainder; 263 264 asm volatile (" cmp %w[len], #0\n" 265 " b.ne 1f\n" 266 " ld1 {v16.4s}, [%[b]], #16\n" 267 @@ -273,17 +274,17 @@ 268 : [ret] "=w" (ret), [a] "+r" (a), [b] "+r" (b), 269 [len] "+r" (len), [remainder] "+r" (remainder) 270 : 271 : "cc", "v1", "v2", "v3", "v4", 272 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); 273 return ret; 274 } 275 #else 276 -static inline float inner_product_single(const float *a, const float *b, unsigned int len) 277 +inline float inner_product_single(const float *a, const float *b, unsigned int len) 278 { 279 float ret; 280 uint32_t remainder = len % 16; 281 len = len - remainder; 282 283 asm volatile (" cmp %[len], #0\n" 284 " bne 1f\n" 285 " vld1.32 {q4}, [%[b]]!\n" 286 diff --git a/src/resample_sse.c b/src/resample_sse.c 287 --- a/src/resample_sse.c 288 +++ b/src/resample_sse.c 289 @@ -29,37 +29,39 @@ 290 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 291 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 292 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 293 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 294 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 295 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
296 */ 297 298 +#include "simd_detect.h" 299 + 300 #include <xmmintrin.h> 301 302 #define OVERRIDE_INNER_PRODUCT_SINGLE 303 -static inline float inner_product_single(const float *a, const float *b, unsigned int len) 304 +float inner_product_single(const float *a, const float *b, unsigned int len) 305 { 306 int i; 307 float ret; 308 __m128 sum = _mm_setzero_ps(); 309 for (i=0;i<len;i+=8) 310 { 311 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i))); 312 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i+4), _mm_loadu_ps(b+i+4))); 313 } 314 sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum)); 315 sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55)); 316 _mm_store_ss(&ret, sum); 317 return ret; 318 } 319 320 #define OVERRIDE_INTERPOLATE_PRODUCT_SINGLE 321 -static inline float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) { 322 +float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) { 323 int i; 324 float ret; 325 __m128 sum = _mm_setzero_ps(); 326 __m128 f = _mm_loadu_ps(frac); 327 for(i=0;i<len;i+=2) 328 { 329 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i), _mm_loadu_ps(b+i*oversample))); 330 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i+1), _mm_loadu_ps(b+(i+1)*oversample))); 331 @@ -70,17 +72,17 @@ static inline float interpolate_product_ 332 _mm_store_ss(&ret, sum); 333 return ret; 334 } 335 336 #ifdef USE_SSE2 337 #include <emmintrin.h> 338 #define OVERRIDE_INNER_PRODUCT_DOUBLE 339 340 -static inline double inner_product_double(const float *a, const float *b, unsigned int len) 341 +double inner_product_double(const float *a, const float *b, unsigned int len) 342 { 343 int i; 344 double ret; 345 __m128d sum = _mm_setzero_pd(); 346 __m128 t; 347 for (i=0;i<len;i+=8) 348 { 349 t = _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)); 350 @@ -92,17 +94,17 @@ static inline double inner_product_doubl 351 
sum = _mm_add_pd(sum, _mm_cvtps_pd(_mm_movehl_ps(t, t))); 352 } 353 sum = _mm_add_sd(sum, _mm_unpackhi_pd(sum, sum)); 354 _mm_store_sd(&ret, sum); 355 return ret; 356 } 357 358 #define OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE 359 -static inline double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) { 360 +double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) { 361 int i; 362 double ret; 363 __m128d sum; 364 __m128d sum1 = _mm_setzero_pd(); 365 __m128d sum2 = _mm_setzero_pd(); 366 __m128 f = _mm_loadu_ps(frac); 367 __m128d f1 = _mm_cvtps_pd(f); 368 __m128d f2 = _mm_cvtps_pd(_mm_movehl_ps(f,f));