[ tor-browser ].git.dasho

sse_optimized.cpp (12896B)
      1 ////////////////////////////////////////////////////////////////////////////////
      2 ///
      3 /// SSE optimized routines for Pentium-III, Athlon-XP and later CPUs. All SSE 
      4 /// optimized functions have been gathered into this single source 
      5 /// code file, regardless to their class or original source code file, in order 
      6 /// to ease porting the library to other compiler and processor platforms.
      7 ///
      8 /// The SSE-optimizations are programmed using SSE compiler intrinsics that
      9 /// are supported both by Microsoft Visual C++ and GCC compilers, so this file
     10 /// should compile with both toolsets.
     11 ///
     12 /// NOTICE: If using Visual Studio 6.0, you'll need to install the "Visual C++ 
     13 /// 6.0 processor pack" update to support SSE instruction set. The update is 
     14 /// available for download at Microsoft Developers Network, see here:
     15 /// http://msdn.microsoft.com/en-us/vstudio/aa718349.aspx
     16 ///
     17 /// If the above URL is expired or removed, go to "http://msdn.microsoft.com" and 
     18 /// perform a search with keywords "processor pack".
     19 ///
     20 /// Author        : Copyright (c) Olli Parviainen
     21 /// Author e-mail : oparviai 'at' iki.fi
     22 /// SoundTouch WWW: http://www.surina.net/soundtouch
     23 ///
     24 ////////////////////////////////////////////////////////////////////////////////
     25 //
     26 // License :
     27 //
     28 //  SoundTouch audio processing library
     29 //  Copyright (c) Olli Parviainen
     30 //
     31 //  This library is free software; you can redistribute it and/or
     32 //  modify it under the terms of the GNU Lesser General Public
     33 //  License as published by the Free Software Foundation; either
     34 //  version 2.1 of the License, or (at your option) any later version.
     35 //
     36 //  This library is distributed in the hope that it will be useful,
     37 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
     38 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     39 //  Lesser General Public License for more details.
     40 //
     41 //  You should have received a copy of the GNU Lesser General Public
     42 //  License along with this library; if not, write to the Free Software
     43 //  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
     44 //
     45 ////////////////////////////////////////////////////////////////////////////////
     46 
     47 #include "cpu_detect.h"
     48 #include "STTypes.h"
     49 
     50 using namespace soundtouch;
     51 
     52 #ifdef SOUNDTOUCH_ALLOW_SSE
     53 
     54 // SSE routines available only with float sample type    
     55 
     56 //////////////////////////////////////////////////////////////////////////////
     57 //
     58 // implementation of SSE optimized functions of class 'TDStretchSSE'
     59 //
     60 //////////////////////////////////////////////////////////////////////////////
     61 
     62 #include "TDStretch.h"
     63 
     64 #ifdef SOUNDTOUCH_WASM_SIMD
     65 #include "simde/x86/avx2.h"
     66 #else
     67 #include <xmmintrin.h>
     68 #endif
     69 
     70 #include <math.h>
     71 
     72 // Calculates cross correlation of two buffers
     73 double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2, double &anorm)
     74 {
     75    int i;
     76    const float *pVec1;
     77    const __m128 *pVec2;
     78    __m128 vSum, vNorm;
     79 
     80    // Note. It means a major slow-down if the routine needs to tolerate 
     81    // unaligned __m128 memory accesses. It's way faster if we can skip 
     82    // unaligned slots and use _mm_load_ps instruction instead of _mm_loadu_ps.
     83    // This can mean up to ~ 10-fold difference (incl. part of which is
     84    // due to skipping every second round for stereo sound though).
     85    //
     86    // Compile-time define SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION is provided
     87    // for choosing if this little cheating is allowed.
     88 
     89 #ifdef ST_SIMD_AVOID_UNALIGNED
     90    // Little cheating allowed, return valid correlation only for 
     91    // aligned locations, meaning every second round for stereo sound.
     92 
     93    #define _MM_LOAD    _mm_load_ps
     94 
     95    if (((ulongptr)pV1) & 15) return -1e50;    // skip unaligned locations
     96 
     97 #else
     98    // No cheating allowed, use unaligned load & take the resulting
     99    // performance hit.
    100    #define _MM_LOAD    _mm_loadu_ps
    101 #endif 
    102 
    103    // ensure overlapLength is divisible by 8
    104    assert((overlapLength % 8) == 0);
    105 
    106    // Calculates the cross-correlation value between 'pV1' and 'pV2' vectors
    107    // Note: pV2 _must_ be aligned to 16-bit boundary, pV1 need not.
    108    pVec1 = (const float*)pV1;
    109    pVec2 = (const __m128*)pV2;
    110    vSum = vNorm = _mm_setzero_ps();
    111 
    112    // Unroll the loop by factor of 4 * 4 operations. Use same routine for
    113    // stereo & mono, for mono it just means twice the amount of unrolling.
    114    for (i = 0; i < channels * overlapLength / 16; i ++) 
    115    {
    116        __m128 vTemp;
    117        // vSum += pV1[0..3] * pV2[0..3]
    118        vTemp = _MM_LOAD(pVec1);
    119        vSum  = _mm_add_ps(vSum,  _mm_mul_ps(vTemp ,pVec2[0]));
    120        vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));
    121 
    122        // vSum += pV1[4..7] * pV2[4..7]
    123        vTemp = _MM_LOAD(pVec1 + 4);
    124        vSum  = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[1]));
    125        vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));
    126 
    127        // vSum += pV1[8..11] * pV2[8..11]
    128        vTemp = _MM_LOAD(pVec1 + 8);
    129        vSum  = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[2]));
    130        vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));
    131 
    132        // vSum += pV1[12..15] * pV2[12..15]
    133        vTemp = _MM_LOAD(pVec1 + 12);
    134        vSum  = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[3]));
    135        vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));
    136 
    137        pVec1 += 16;
    138        pVec2 += 4;
    139    }
    140 
    141    // return value = vSum[0] + vSum[1] + vSum[2] + vSum[3]
    142    float *pvNorm = (float*)&vNorm;
    143    float norm = (pvNorm[0] + pvNorm[1] + pvNorm[2] + pvNorm[3]);
    144    anorm = norm;
    145 
    146    float *pvSum = (float*)&vSum;
    147    return (double)(pvSum[0] + pvSum[1] + pvSum[2] + pvSum[3]) / sqrt(norm < 1e-9 ? 1.0 : norm);
    148 
    149    /* This is approximately corresponding routine in C-language yet without normalization:
    150    double corr, norm;
    151    uint i;
    152 
    153    // Calculates the cross-correlation value between 'pV1' and 'pV2' vectors
    154    corr = norm = 0.0;
    155    for (i = 0; i < channels * overlapLength / 16; i ++) 
    156    {
    157        corr += pV1[0] * pV2[0] +
    158                pV1[1] * pV2[1] +
    159                pV1[2] * pV2[2] +
    160                pV1[3] * pV2[3] +
    161                pV1[4] * pV2[4] +
    162                pV1[5] * pV2[5] +
    163                pV1[6] * pV2[6] +
    164                pV1[7] * pV2[7] +
    165                pV1[8] * pV2[8] +
    166                pV1[9] * pV2[9] +
    167                pV1[10] * pV2[10] +
    168                pV1[11] * pV2[11] +
    169                pV1[12] * pV2[12] +
    170                pV1[13] * pV2[13] +
    171                pV1[14] * pV2[14] +
    172                pV1[15] * pV2[15];
    173 
    174    for (j = 0; j < 15; j ++) norm += pV1[j] * pV1[j];
    175 
    176        pV1 += 16;
    177        pV2 += 16;
    178    }
    179    return corr / sqrt(norm);
    180    */
    181 }
    182 
    183 
    184 
    185 double TDStretchSSE::calcCrossCorrAccumulate(const float *pV1, const float *pV2, double &norm)
    186 {
    187    // call usual calcCrossCorr function because SSE does not show big benefit of 
    188    // accumulating "norm" value, and also the "norm" rolling algorithm would get 
    189    // complicated due to SSE-specific alignment-vs-nonexact correlation rules.
    190    return calcCrossCorr(pV1, pV2, norm);
    191 }
    192 
    193 
    194 //////////////////////////////////////////////////////////////////////////////
    195 //
    196 // implementation of SSE optimized functions of class 'FIRFilter'
    197 //
    198 //////////////////////////////////////////////////////////////////////////////
    199 
    200 #include "FIRFilter.h"
    201 
    202 FIRFilterSSE::FIRFilterSSE() : FIRFilter()
    203 {
    204    filterCoeffsAlign = NULL;
    205    filterCoeffsUnalign = NULL;
    206 }
    207 
    208 
    209 FIRFilterSSE::~FIRFilterSSE()
    210 {
    211    delete[] filterCoeffsUnalign;
    212    filterCoeffsAlign = NULL;
    213    filterCoeffsUnalign = NULL;
    214 }
    215 
    216 
    217 // (overloaded) Calculates filter coefficients for SSE routine
    218 void FIRFilterSSE::setCoefficients(const float *coeffs, uint newLength, uint uResultDivFactor)
    219 {
    220    uint i;
    221    float fDivider;
    222 
    223    FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
    224 
    225    // Scale the filter coefficients so that it won't be necessary to scale the filtering result
    226    // also rearrange coefficients suitably for SSE
    227    // Ensure that filter coeffs array is aligned to 16-byte boundary
    228    delete[] filterCoeffsUnalign;
    229    filterCoeffsUnalign = new float[2 * newLength + 4];
    230    filterCoeffsAlign = (float *)SOUNDTOUCH_ALIGN_POINTER_16(filterCoeffsUnalign);
    231 
    232    fDivider = (float)resultDivider;
    233 
    234    // rearrange the filter coefficients for mmx routines 
    235    for (i = 0; i < newLength; i ++)
    236    {
    237        filterCoeffsAlign[2 * i + 0] =
    238        filterCoeffsAlign[2 * i + 1] = coeffs[i + 0] / fDivider;
    239    }
    240 }
    241 
    242 
    243 
    244 // SSE-optimized version of the filter routine for stereo sound
    245 uint FIRFilterSSE::evaluateFilterStereo(float *dest, const float *source, uint numSamples) const
    246 {
    247    int count = (int)((numSamples - length) & (uint)-2);
    248    int j;
    249 
    250    assert(count % 2 == 0);
    251 
    252    if (count < 2) return 0;
    253 
    254    assert(source != NULL);
    255    assert(dest != NULL);
    256    assert((length % 8) == 0);
    257    assert(filterCoeffsAlign != NULL);
    258    assert(((ulongptr)filterCoeffsAlign) % 16 == 0);
    259 
    260    // filter is evaluated for two stereo samples with each iteration, thus use of 'j += 2'
    261    #pragma omp parallel for
    262    for (j = 0; j < count; j += 2)
    263    {
    264        const float *pSrc;
    265        float *pDest;
    266        const __m128 *pFil;
    267        __m128 sum1, sum2;
    268        uint i;
    269 
    270        pSrc = (const float*)source + j * 2;      // source audio data
    271        pDest = dest + j * 2;                     // destination audio data
    272        pFil = (const __m128*)filterCoeffsAlign;  // filter coefficients. NOTE: Assumes coefficients 
    273                                                  // are aligned to 16-byte boundary
    274        sum1 = sum2 = _mm_setzero_ps();
    275 
    276        for (i = 0; i < length / 8; i ++) 
    277        {
    278            // Unroll loop for efficiency & calculate filter for 2*2 stereo samples 
    279            // at each pass
    280 
    281            // sum1 is accu for 2*2 filtered stereo sound data at the primary sound data offset
    282            // sum2 is accu for 2*2 filtered stereo sound data for the next sound sample offset.
    283 
    284            sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc)    , pFil[0]));
    285            sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 2), pFil[0]));
    286 
    287            sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 4), pFil[1]));
    288            sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 6), pFil[1]));
    289 
    290            sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 8) ,  pFil[2]));
    291            sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 10), pFil[2]));
    292 
    293            sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 12), pFil[3]));
    294            sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 14), pFil[3]));
    295 
    296            pSrc += 16;
    297            pFil += 4;
    298        }
    299 
    300        // Now sum1 and sum2 both have a filtered 2-channel sample each, but we still need
    301        // to sum the two hi- and lo-floats of these registers together.
    302 
    303        // post-shuffle & add the filtered values and store to dest.
    304        _mm_storeu_ps(pDest, _mm_add_ps(
    305                    _mm_shuffle_ps(sum1, sum2, _MM_SHUFFLE(1,0,3,2)),   // s2_1 s2_0 s1_3 s1_2
    306                    _mm_shuffle_ps(sum1, sum2, _MM_SHUFFLE(3,2,1,0))    // s2_3 s2_2 s1_1 s1_0
    307                    ));
    308    }
    309 
    310    // Ideas for further improvement:
    311    // 1. If it could be guaranteed that 'source' were always aligned to 16-byte 
    312    //    boundary, a faster aligned '_mm_load_ps' instruction could be used.
    313    // 2. If it could be guaranteed that 'dest' were always aligned to 16-byte 
    314    //    boundary, a faster '_mm_store_ps' instruction could be used.
    315 
    316    return (uint)count;
    317 
    318    /* original routine in C-language. please notice the C-version has differently 
    319       organized coefficients though.
    320    double suml1, suml2;
    321    double sumr1, sumr2;
    322    uint i, j;
    323 
    324    for (j = 0; j < count; j += 2)
    325    {
    326        const float *ptr;
    327        const float *pFil;
    328 
    329        suml1 = sumr1 = 0.0;
    330        suml2 = sumr2 = 0.0;
    331        ptr = src;
    332        pFil = filterCoeffs;
    333        for (i = 0; i < lengthLocal; i ++) 
    334        {
    335            // unroll loop for efficiency.
    336 
    337            suml1 += ptr[0] * pFil[0] + 
    338                     ptr[2] * pFil[2] +
    339                     ptr[4] * pFil[4] +
    340                     ptr[6] * pFil[6];
    341 
    342            sumr1 += ptr[1] * pFil[1] + 
    343                     ptr[3] * pFil[3] +
    344                     ptr[5] * pFil[5] +
    345                     ptr[7] * pFil[7];
    346 
    347            suml2 += ptr[8] * pFil[0] + 
    348                     ptr[10] * pFil[2] +
    349                     ptr[12] * pFil[4] +
    350                     ptr[14] * pFil[6];
    351 
    352            sumr2 += ptr[9] * pFil[1] + 
    353                     ptr[11] * pFil[3] +
    354                     ptr[13] * pFil[5] +
    355                     ptr[15] * pFil[7];
    356 
    357            ptr += 16;
    358            pFil += 8;
    359        }
    360        dest[0] = (float)suml1;
    361        dest[1] = (float)sumr1;
    362        dest[2] = (float)suml2;
    363        dest[3] = (float)sumr2;
    364 
    365        src += 4;
    366        dest += 4;
    367    }
    368    */
    369 }
    370 
    371 #endif  // SOUNDTOUCH_ALLOW_SSE
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE