tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

TDStretch.cpp (34468B)


      1 ///////////////////////////////////////////////////////////////////////////////
      2 /// 
      3 /// Sampled sound tempo changer/time stretch algorithm. Changes the sound tempo 
      4 /// while maintaining the original pitch by using a time domain WSOLA-like 
      5 /// method with several performance-increasing tweaks.
      6 ///
      7 /// Notes : MMX optimized functions reside in a separate, platform-specific 
      8 /// file, e.g. 'mmx_win.cpp' or 'mmx_gcc.cpp'.
      9 ///
     10 /// This source file contains OpenMP optimizations that allow speeding up the
     11 /// corss-correlation algorithm by executing it in several threads / CPU cores 
     12 /// in parallel. See the following article link for more detailed discussion 
     13 /// about SoundTouch OpenMP optimizations:
     14 /// http://www.softwarecoven.com/parallel-computing-in-embedded-mobile-devices
     15 ///
     16 /// Author        : Copyright (c) Olli Parviainen
     17 /// Author e-mail : oparviai 'at' iki.fi
     18 /// SoundTouch WWW: http://www.surina.net/soundtouch
     19 ///
     20 ////////////////////////////////////////////////////////////////////////////////
     21 //
     22 // License :
     23 //
     24 //  SoundTouch audio processing library
     25 //  Copyright (c) Olli Parviainen
     26 //
     27 //  This library is free software; you can redistribute it and/or
     28 //  modify it under the terms of the GNU Lesser General Public
     29 //  License as published by the Free Software Foundation; either
     30 //  version 2.1 of the License, or (at your option) any later version.
     31 //
     32 //  This library is distributed in the hope that it will be useful,
     33 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
     34 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     35 //  Lesser General Public License for more details.
     36 //
     37 //  You should have received a copy of the GNU Lesser General Public
     38 //  License along with this library; if not, write to the Free Software
     39 //  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
     40 //
     41 ////////////////////////////////////////////////////////////////////////////////
     42 
     43 #include <string.h>
     44 #include <limits.h>
     45 #include <assert.h>
     46 #include <math.h>
     47 #include <float.h>
     48 
     49 #include "STTypes.h"
     50 #include "cpu_detect.h"
     51 #include "TDStretch.h"
     52 
     53 using namespace soundtouch;
     54 
     55 #define max(x, y) (((x) > (y)) ? (x) : (y))
     56 
     57 /*****************************************************************************
     58 *
     59 * Constant definitions
     60 *
     61 *****************************************************************************/
     62 
     63 // Table for the hierarchical mixing position seeking algorithm
     64 const short _scanOffsets[5][24]={
     65    { 124,  186,  248,  310,  372,  434,  496,  558,  620,  682,  744, 806,
     66      868,  930,  992, 1054, 1116, 1178, 1240, 1302, 1364, 1426, 1488,   0},
     67    {-100,  -75,  -50,  -25,   25,   50,   75,  100,    0,    0,    0,   0,
     68        0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0},
     69    { -20,  -15,  -10,   -5,    5,   10,   15,   20,    0,    0,    0,   0,
     70        0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0},
     71    {  -4,   -3,   -2,   -1,    1,    2,    3,    4,    0,    0,    0,   0,
     72        0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0},
     73    { 121,  114,   97,  114,   98,  105,  108,   32,  104,   99,  117,  111,
     74      116,  100,  110,  117,  111,  115,    0,    0,    0,    0,    0,   0}};
     75 
     76 /*****************************************************************************
     77 *
     78 * Implementation of the class 'TDStretch'
     79 *
     80 *****************************************************************************/
     81 
     82 
     83 TDStretch::TDStretch() : FIFOProcessor(&outputBuffer)
     84 {
     85    bQuickSeek = false;
     86    channels = 2;
     87 
     88    pMidBuffer = NULL;
     89    pMidBufferUnaligned = NULL;
     90    overlapLength = 0;
     91 
     92    bAutoSeqSetting = true;
     93    bAutoSeekSetting = true;
     94 
     95    tempo = 1.0f;
     96    setParameters(44100, DEFAULT_SEQUENCE_MS, DEFAULT_SEEKWINDOW_MS, DEFAULT_OVERLAP_MS);
     97    setTempo(1.0f);
     98 
     99    clear();
    100 }
    101 
    102 
    103 
    104 TDStretch::~TDStretch()
    105 {
    106    delete[] pMidBufferUnaligned;
    107 }
    108 
    109 
    110 
    111 // Sets routine control parameters. These control are certain time constants
    112 // defining how the sound is stretched to the desired duration.
    113 //
    114 // 'sampleRate' = sample rate of the sound
    115 // 'sequenceMS' = one processing sequence length in milliseconds (default = 82 ms)
    116 // 'seekwindowMS' = seeking window length for scanning the best overlapping 
    117 //      position (default = 28 ms)
    118 // 'overlapMS' = overlapping length (default = 12 ms)
    119 
    120 void TDStretch::setParameters(int aSampleRate, int aSequenceMS, 
    121                              int aSeekWindowMS, int aOverlapMS)
    122 {
    123    // accept only positive parameter values - if zero or negative, use old values instead
    124    if (aSampleRate > 0)
    125    {
    126        if (aSampleRate > 192000) ST_THROW_RT_ERROR("Error: Excessive samplerate");
    127        this->sampleRate = aSampleRate;
    128    }
    129 
    130    if (aOverlapMS > 0) this->overlapMs = aOverlapMS;
    131 
    132    if (aSequenceMS > 0)
    133    {
    134        this->sequenceMs = aSequenceMS;
    135        bAutoSeqSetting = false;
    136    } 
    137    else if (aSequenceMS == 0)
    138    {
    139        // if zero, use automatic setting
    140        bAutoSeqSetting = true;
    141    }
    142 
    143    if (aSeekWindowMS > 0) 
    144    {
    145        this->seekWindowMs = aSeekWindowMS;
    146        bAutoSeekSetting = false;
    147    } 
    148    else if (aSeekWindowMS == 0) 
    149    {
    150        // if zero, use automatic setting
    151        bAutoSeekSetting = true;
    152    }
    153 
    154    calcSeqParameters();
    155 
    156    calculateOverlapLength(overlapMs);
    157 
    158    // set tempo to recalculate 'sampleReq'
    159    setTempo(tempo);
    160 }
    161 
    162 
    163 
    164 /// Get routine control parameters, see setParameters() function.
    165 /// Any of the parameters to this function can be NULL, in such case corresponding parameter
    166 /// value isn't returned.
    167 void TDStretch::getParameters(int *pSampleRate, int *pSequenceMs, int *pSeekWindowMs, int *pOverlapMs) const
    168 {
    169    if (pSampleRate)
    170    {
    171        *pSampleRate = sampleRate;
    172    }
    173 
    174    if (pSequenceMs)
    175    {
    176        *pSequenceMs = (bAutoSeqSetting) ? (USE_AUTO_SEQUENCE_LEN) : sequenceMs;
    177    }
    178 
    179    if (pSeekWindowMs)
    180    {
    181        *pSeekWindowMs = (bAutoSeekSetting) ? (USE_AUTO_SEEKWINDOW_LEN) : seekWindowMs;
    182    }
    183 
    184    if (pOverlapMs)
    185    {
    186        *pOverlapMs = overlapMs;
    187    }
    188 }
    189 
    190 
    191 // Overlaps samples in 'midBuffer' with the samples in 'pInput'
    192 void TDStretch::overlapMono(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput) const
    193 {
    194    int i;
    195    SAMPLETYPE m1, m2;
    196 
    197    m1 = (SAMPLETYPE)0;
    198    m2 = (SAMPLETYPE)overlapLength;
    199 
    200    for (i = 0; i < overlapLength ; i ++)
    201    {
    202        pOutput[i] = (pInput[i] * m1 + pMidBuffer[i] * m2 ) / overlapLength;
    203        m1 += 1;
    204        m2 -= 1;
    205    }
    206 }
    207 
    208 
    209 
    210 void TDStretch::clearMidBuffer()
    211 {
    212    memset(pMidBuffer, 0, channels * sizeof(SAMPLETYPE) * overlapLength);
    213 }
    214 
    215 
    216 void TDStretch::clearInput()
    217 {
    218    inputBuffer.clear();
    219    clearMidBuffer();
    220    isBeginning = true;
    221    maxnorm = 0;
    222    maxnormf = 1e8;
    223    skipFract = 0;
    224 }
    225 
    226 
    227 // Clears the sample buffers
    228 void TDStretch::clear()
    229 {
    230    outputBuffer.clear();
    231    clearInput();
    232 }
    233 
    234 
    235 
    236 // Enables/disables the quick position seeking algorithm. Zero to disable, nonzero
    237 // to enable
    238 void TDStretch::enableQuickSeek(bool enable)
    239 {
    240    bQuickSeek = enable;
    241 }
    242 
    243 
    244 // Returns nonzero if the quick seeking algorithm is enabled.
    245 bool TDStretch::isQuickSeekEnabled() const
    246 {
    247    return bQuickSeek;
    248 }
    249 
    250 
    251 // Seeks for the optimal overlap-mixing position.
    252 int TDStretch::seekBestOverlapPosition(const SAMPLETYPE *refPos)
    253 {
    254    if (bQuickSeek) 
    255    {
    256        return seekBestOverlapPositionQuick(refPos);
    257    }
    258    else 
    259    {
    260        return seekBestOverlapPositionFull(refPos);
    261    }
    262 }
    263 
    264 
    265 // Overlaps samples in 'midBuffer' with the samples in 'pInputBuffer' at position
    266 // of 'ovlPos'.
    267 inline void TDStretch::overlap(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput, uint ovlPos) const
    268 {
    269 #ifndef USE_MULTICH_ALWAYS
    270    if (channels == 1)
    271    {
    272        // mono sound.
    273        overlapMono(pOutput, pInput + ovlPos);
    274    }
    275    else if (channels == 2)
    276    {
    277        // stereo sound
    278        overlapStereo(pOutput, pInput + 2 * ovlPos);
    279    } 
    280    else 
    281 #endif // USE_MULTICH_ALWAYS
    282    {
    283        assert(channels > 0);
    284        overlapMulti(pOutput, pInput + channels * ovlPos);
    285    }
    286 }
    287 
    288 
    289 // Seeks for the optimal overlap-mixing position. The 'stereo' version of the
    290 // routine
    291 //
    292 // The best position is determined as the position where the two overlapped
    293 // sample sequences are 'most alike', in terms of the highest cross-correlation
    294 // value over the overlapping period
    295 int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos) 
    296 {
    297    int bestOffs;
    298    double bestCorr;
    299    int i;
    300    double norm;
    301 
    302    bestCorr = -FLT_MAX;
    303    bestOffs = 0;
    304 
    305    // Scans for the best correlation value by testing each possible position
    306    // over the permitted range.
    307    bestCorr = calcCrossCorr(refPos, pMidBuffer, norm);
    308    bestCorr = (bestCorr + 0.1) * 0.75;
    309 
    310    #pragma omp parallel for
    311    for (i = 1; i < seekLength; i ++)
    312    {
    313        double corr;
    314        // Calculates correlation value for the mixing position corresponding to 'i'
    315 #if defined(_OPENMP) || defined(ST_SIMD_AVOID_UNALIGNED)
    316        // in parallel OpenMP mode, can't use norm accumulator version as parallel executor won't
    317        // iterate the loop in sequential order
    318        // in SIMD mode, avoid accumulator version to allow avoiding unaligned positions
    319        corr = calcCrossCorr(refPos + channels * i, pMidBuffer, norm);
    320 #else
    321        // In non-parallel version call "calcCrossCorrAccumulate" that is otherwise same
    322        // as "calcCrossCorr", but saves time by reusing & updating previously stored 
    323        // "norm" value
    324        corr = calcCrossCorrAccumulate(refPos + channels * i, pMidBuffer, norm);
    325 #endif
    326        // heuristic rule to slightly favour values close to mid of the range
    327        double tmp = (double)(2 * i - seekLength) / (double)seekLength;
    328        corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp));
    329 
    330        // Checks for the highest correlation value
    331        if (corr > bestCorr) 
    332        {
    333            // For optimal performance, enter critical section only in case that best value found.
    334            // in such case repeat 'if' condition as it's possible that parallel execution may have
    335            // updated the bestCorr value in the mean time
    336            #pragma omp critical
    337            if (corr > bestCorr)
    338            {
    339                bestCorr = corr;
    340                bestOffs = i;
    341            }
    342        }
    343    }
    344 
    345 #ifdef SOUNDTOUCH_INTEGER_SAMPLES
    346    adaptNormalizer();
    347 #endif
    348 
    349    // clear cross correlation routine state if necessary (is so e.g. in MMX routines).
    350    clearCrossCorrState();
    351 
    352    return bestOffs;
    353 }
    354 
    355 
    356 // Quick seek algorithm for improved runtime-performance: First roughly scans through the 
    357 // correlation area, and then scan surroundings of two best preliminary correlation candidates
    358 // with improved precision
    359 //
    360 // Based on testing:
    361 // - This algorithm gives on average 99% as good match as the full algorithm
    362 // - this quick seek algorithm finds the best match on ~90% of cases
    363 // - on those 10% of cases when this algorithm doesn't find best match, 
    364 //   it still finds on average ~90% match vs. the best possible match
    365 int TDStretch::seekBestOverlapPositionQuick(const SAMPLETYPE *refPos)
    366 {
    367 #define _MIN(a, b)   (((a) < (b)) ? (a) : (b))
    368 #define SCANSTEP    16
    369 #define SCANWIND    8
    370 
    371    int bestOffs;
    372    int i;
    373    int bestOffs2;
    374    float bestCorr, corr;
    375    float bestCorr2;
    376    double norm;
    377 
    378    // note: 'float' types used in this function in case that the platform would need to use software-fp
    379 
    380    bestCorr =
    381    bestCorr2 = -FLT_MAX;
    382    bestOffs = 
    383    bestOffs2 = SCANWIND;
    384 
    385    // Scans for the best correlation value by testing each possible position
    386    // over the permitted range. Look for two best matches on the first pass to
    387    // increase possibility of ideal match.
    388    //
    389    // Begin from "SCANSTEP" instead of SCANWIND to make the calculation
    390    // catch the 'middlepoint' of seekLength vector as that's the a-priori 
    391    // expected best match position
    392    //
    393    // Roughly:
    394    // - 15% of cases find best result directly on the first round,
    395    // - 75% cases find better match on 2nd round around the best match from 1st round
    396    // - 10% cases find better match on 2nd round around the 2nd-best-match from 1st round
    397    for (i = SCANSTEP; i < seekLength - SCANWIND - 1; i += SCANSTEP)
    398    {
    399        // Calculates correlation value for the mixing position corresponding
    400        // to 'i'
    401        corr = (float)calcCrossCorr(refPos + channels*i, pMidBuffer, norm);
    402        // heuristic rule to slightly favour values close to mid of the seek range
    403        float tmp = (float)(2 * i - seekLength - 1) / (float)seekLength;
    404        corr = ((corr + 0.1f) * (1.0f - 0.25f * tmp * tmp));
    405 
    406        // Checks for the highest correlation value
    407        if (corr > bestCorr)
    408        {
    409            // found new best match. keep the previous best as 2nd best match
    410            bestCorr2 = bestCorr;
    411            bestOffs2 = bestOffs;
    412            bestCorr = corr;
    413            bestOffs = i;
    414        }
    415        else if (corr > bestCorr2)
    416        {
    417            // not new best, but still new 2nd best match
    418            bestCorr2 = corr;
    419            bestOffs2 = i;
    420        }
    421    }
    422 
    423    // Scans surroundings of the found best match with small stepping
    424    int end = _MIN(bestOffs + SCANWIND + 1, seekLength);
    425    for (i = bestOffs - SCANWIND; i < end; i++)
    426    {
    427        if (i == bestOffs) continue;    // this offset already calculated, thus skip
    428 
    429        // Calculates correlation value for the mixing position corresponding
    430        // to 'i'
    431        corr = (float)calcCrossCorr(refPos + channels*i, pMidBuffer, norm);
    432        // heuristic rule to slightly favour values close to mid of the range
    433        float tmp = (float)(2 * i - seekLength - 1) / (float)seekLength;
    434        corr = ((corr + 0.1f) * (1.0f - 0.25f * tmp * tmp));
    435 
    436        // Checks for the highest correlation value
    437        if (corr > bestCorr)
    438        {
    439            bestCorr = corr;
    440            bestOffs = i;
    441        }
    442    }
    443 
    444    // Scans surroundings of the 2nd best match with small stepping
    445    end = _MIN(bestOffs2 + SCANWIND + 1, seekLength);
    446    for (i = bestOffs2 - SCANWIND; i < end; i++)
    447    {
    448        if (i == bestOffs2) continue;    // this offset already calculated, thus skip
    449 
    450        // Calculates correlation value for the mixing position corresponding
    451        // to 'i'
    452        corr = (float)calcCrossCorr(refPos + channels*i, pMidBuffer, norm);
    453        // heuristic rule to slightly favour values close to mid of the range
    454        float tmp = (float)(2 * i - seekLength - 1) / (float)seekLength;
    455        corr = ((corr + 0.1f) * (1.0f - 0.25f * tmp * tmp));
    456 
    457        // Checks for the highest correlation value
    458        if (corr > bestCorr)
    459        {
    460            bestCorr = corr;
    461            bestOffs = i;
    462        }
    463    }
    464 
    465    // clear cross correlation routine state if necessary (is so e.g. in MMX routines).
    466    clearCrossCorrState();
    467 
    468 #ifdef SOUNDTOUCH_INTEGER_SAMPLES
    469    adaptNormalizer();
    470 #endif
    471 
    472    return bestOffs;
    473 }
    474 
    475 
    476 
    477 
    478 /// For integer algorithm: adapt normalization factor divider with music so that 
    479 /// it'll not be pessimistically restrictive that can degrade quality on quieter sections
    480 /// yet won't cause integer overflows either
    481 void TDStretch::adaptNormalizer()
    482 {
    483    // Do not adapt normalizer over too silent sequences to avoid averaging filter depleting to
    484    // too low values during pauses in music
    485    if ((maxnorm > 1000) || (maxnormf > 40000000))
    486    { 
    487        //norm averaging filter
    488        maxnormf = 0.9f * maxnormf + 0.1f * (float)maxnorm;
    489 
    490        if ((maxnorm > 800000000) && (overlapDividerBitsNorm < 16))
    491        {
    492            // large values, so increase divider
    493            overlapDividerBitsNorm++;
    494            if (maxnorm > 1600000000) overlapDividerBitsNorm++; // extra large value => extra increase
    495        }
    496        else if ((maxnormf < 1000000) && (overlapDividerBitsNorm > 0))
    497        {
    498            // extra small values, decrease divider
    499            overlapDividerBitsNorm--;
    500        }
    501    }
    502 
    503    maxnorm = 0;
    504 }
    505 
    506 
    507 /// clear cross correlation routine state if necessary 
    508 void TDStretch::clearCrossCorrState()
    509 {
    510    // default implementation is empty.
    511 }
    512 
    513 
    514 /// Calculates processing sequence length according to tempo setting
    515 void TDStretch::calcSeqParameters()
    516 {
    517    // Adjust tempo param according to tempo, so that variating processing sequence length is used
    518    // at various tempo settings, between the given low...top limits
    519    #define AUTOSEQ_TEMPO_LOW   0.5     // auto setting low tempo range (-50%)
    520    #define AUTOSEQ_TEMPO_TOP   2.0     // auto setting top tempo range (+100%)
    521 
    522    // sequence-ms setting values at above low & top tempo
    523    #define AUTOSEQ_AT_MIN      90.0
    524    #define AUTOSEQ_AT_MAX      40.0
    525    #define AUTOSEQ_K           ((AUTOSEQ_AT_MAX - AUTOSEQ_AT_MIN) / (AUTOSEQ_TEMPO_TOP - AUTOSEQ_TEMPO_LOW))
    526    #define AUTOSEQ_C           (AUTOSEQ_AT_MIN - (AUTOSEQ_K) * (AUTOSEQ_TEMPO_LOW))
    527 
    528    // seek-window-ms setting values at above low & top tempoq
    529    #define AUTOSEEK_AT_MIN     20.0
    530    #define AUTOSEEK_AT_MAX     15.0
    531    #define AUTOSEEK_K          ((AUTOSEEK_AT_MAX - AUTOSEEK_AT_MIN) / (AUTOSEQ_TEMPO_TOP - AUTOSEQ_TEMPO_LOW))
    532    #define AUTOSEEK_C          (AUTOSEEK_AT_MIN - (AUTOSEEK_K) * (AUTOSEQ_TEMPO_LOW))
    533 
    534    #define CHECK_LIMITS(x, mi, ma) (((x) < (mi)) ? (mi) : (((x) > (ma)) ? (ma) : (x)))
    535 
    536    double seq, seek;
    537    
    538    if (bAutoSeqSetting)
    539    {
    540        seq = AUTOSEQ_C + AUTOSEQ_K * tempo;
    541        seq = CHECK_LIMITS(seq, AUTOSEQ_AT_MAX, AUTOSEQ_AT_MIN);
    542        sequenceMs = (int)(seq + 0.5);
    543    }
    544 
    545    if (bAutoSeekSetting)
    546    {
    547        seek = AUTOSEEK_C + AUTOSEEK_K * tempo;
    548        seek = CHECK_LIMITS(seek, AUTOSEEK_AT_MAX, AUTOSEEK_AT_MIN);
    549        seekWindowMs = (int)(seek + 0.5);
    550    }
    551 
    552    // Update seek window lengths
    553    seekWindowLength = (sampleRate * sequenceMs) / 1000;
    554    if (seekWindowLength < 2 * overlapLength) 
    555    {
    556        seekWindowLength = 2 * overlapLength;
    557    }
    558    seekLength = (sampleRate * seekWindowMs) / 1000;
    559 }
    560 
    561 
    562 
    563 // Sets new target tempo. Normal tempo = 'SCALE', smaller values represent slower 
    564 // tempo, larger faster tempo.
    565 void TDStretch::setTempo(double newTempo)
    566 {
    567    int intskip;
    568 
    569    tempo = newTempo;
    570 
    571    // Calculate new sequence duration
    572    calcSeqParameters();
    573 
    574    // Calculate ideal skip length (according to tempo value) 
    575    nominalSkip = tempo * (seekWindowLength - overlapLength);
    576    intskip = (int)(nominalSkip + 0.5);
    577 
    578    // Calculate how many samples are needed in the 'inputBuffer' to 
    579    // process another batch of samples
    580    //sampleReq = max(intskip + overlapLength, seekWindowLength) + seekLength / 2;
    581    sampleReq = max(intskip + overlapLength, seekWindowLength) + seekLength;
    582 }
    583 
    584 
    585 
    586 // Sets the number of channels, 1 = mono, 2 = stereo
    587 void TDStretch::setChannels(int numChannels)
    588 {
    589    if (!verifyNumberOfChannels(numChannels) ||
    590        (channels == numChannels)) return;
    591 
    592    channels = numChannels;
    593    inputBuffer.setChannels(channels);
    594    outputBuffer.setChannels(channels);
    595 
    596    // re-init overlap/buffer
    597    overlapLength=0;
    598    setParameters(sampleRate);
    599 }
    600 
    601 
    602 // nominal tempo, no need for processing, just pass the samples through
    603 // to outputBuffer
    604 /*
    605 void TDStretch::processNominalTempo()
    606 {
    607    assert(tempo == 1.0f);
    608 
    609    if (bMidBufferDirty) 
    610    {
    611        // If there are samples in pMidBuffer waiting for overlapping,
    612        // do a single sliding overlapping with them in order to prevent a 
    613        // clicking distortion in the output sound
    614        if (inputBuffer.numSamples() < overlapLength) 
    615        {
    616            // wait until we've got overlapLength input samples
    617            return;
    618        }
    619        // Mix the samples in the beginning of 'inputBuffer' with the 
    620        // samples in 'midBuffer' using sliding overlapping 
    621        overlap(outputBuffer.ptrEnd(overlapLength), inputBuffer.ptrBegin(), 0);
    622        outputBuffer.putSamples(overlapLength);
    623        inputBuffer.receiveSamples(overlapLength);
    624        clearMidBuffer();
    625        // now we've caught the nominal sample flow and may switch to
    626        // bypass mode
    627    }
    628 
    629    // Simply bypass samples from input to output
    630    outputBuffer.moveSamples(inputBuffer);
    631 }
    632 */
    633 
    634 
    635 // Processes as many processing frames of the samples 'inputBuffer', store
    636 // the result into 'outputBuffer'
    637 void TDStretch::processSamples()
    638 {
    639    int ovlSkip;
    640    int offset = 0;
    641    int temp;
    642 
    643    /* Removed this small optimization - can introduce a click to sound when tempo setting
    644       crosses the nominal value
    645    if (tempo == 1.0f) 
    646    {
    647        // tempo not changed from the original, so bypass the processing
    648        processNominalTempo();
    649        return;
    650    }
    651    */
    652 
    653    // Process samples as long as there are enough samples in 'inputBuffer'
    654    // to form a processing frame.
    655    while ((int)inputBuffer.numSamples() >= sampleReq) 
    656    {
    657        if (isBeginning == false)
    658        {
    659            // apart from the very beginning of the track, 
    660            // scan for the best overlapping position & do overlap-add
    661            offset = seekBestOverlapPosition(inputBuffer.ptrBegin());
    662 
    663            // Mix the samples in the 'inputBuffer' at position of 'offset' with the 
    664            // samples in 'midBuffer' using sliding overlapping
    665            // ... first partially overlap with the end of the previous sequence
    666            // (that's in 'midBuffer')
    667            overlap(outputBuffer.ptrEnd((uint)overlapLength), inputBuffer.ptrBegin(), (uint)offset);
    668            outputBuffer.putSamples((uint)overlapLength);
    669            offset += overlapLength;
    670        }
    671        else
    672        {
    673            // Adjust processing offset at beginning of track by not perform initial overlapping
    674            // and compensating that in the 'input buffer skip' calculation
    675            isBeginning = false;
    676            int skip = (int)(tempo * overlapLength + 0.5 * seekLength + 0.5);
    677 
    678            #ifdef ST_SIMD_AVOID_UNALIGNED
    679            // in SIMD mode, round the skip amount to value corresponding to aligned memory address
    680            if (channels == 1)
    681            {
    682                skip &= -4;
    683            }
    684            else if (channels == 2)
    685            {
    686                skip &= -2;
    687            }
    688            #endif
    689            skipFract -= skip;
    690            if (skipFract <= -nominalSkip)
    691            {
    692                skipFract = -nominalSkip;
    693            }
    694        }
    695 
    696        // ... then copy sequence samples from 'inputBuffer' to output:
    697 
    698        // crosscheck that we don't have buffer overflow...
    699        if ((int)inputBuffer.numSamples() < (offset + seekWindowLength - overlapLength))
    700        {
    701            continue;    // just in case, shouldn't really happen
    702        }
    703 
    704        // length of sequence
    705        temp = (seekWindowLength - 2 * overlapLength);
    706        outputBuffer.putSamples(inputBuffer.ptrBegin() + channels * offset, (uint)temp);
    707 
    708        // Copies the end of the current sequence from 'inputBuffer' to 
    709        // 'midBuffer' for being mixed with the beginning of the next 
    710        // processing sequence and so on
    711        assert((offset + temp + overlapLength) <= (int)inputBuffer.numSamples());
    712        memcpy(pMidBuffer, inputBuffer.ptrBegin() + channels * (offset + temp), 
    713            channels * sizeof(SAMPLETYPE) * overlapLength);
    714 
    715        // Remove the processed samples from the input buffer. Update
    716        // the difference between integer & nominal skip step to 'skipFract'
    717        // in order to prevent the error from accumulating over time.
    718        skipFract += nominalSkip;   // real skip size
    719        ovlSkip = (int)skipFract;   // rounded to integer skip
    720        skipFract -= ovlSkip;       // maintain the fraction part, i.e. real vs. integer skip
    721        inputBuffer.receiveSamples((uint)ovlSkip);
    722    }
    723 }
    724 
    725 
    726 // Adds 'numsamples' pcs of samples from the 'samples' memory position into
    727 // the input of the object.
    728 void TDStretch::putSamples(const SAMPLETYPE *samples, uint nSamples)
    729 {
    730    // Add the samples into the input buffer
    731    inputBuffer.putSamples(samples, nSamples);
    732    // Process the samples in input buffer
    733    processSamples();
    734 }
    735 
    736 
    737 
    738 /// Set new overlap length parameter & reallocate RefMidBuffer if necessary.
    739 void TDStretch::acceptNewOverlapLength(int newOverlapLength)
    740 {
    741    int prevOvl;
    742 
    743    assert(newOverlapLength >= 0);
    744    prevOvl = overlapLength;
    745    overlapLength = newOverlapLength;
    746 
    747    if (overlapLength > prevOvl)
    748    {
    749        delete[] pMidBufferUnaligned;
    750 
    751        pMidBufferUnaligned = new SAMPLETYPE[overlapLength * channels + 16 / sizeof(SAMPLETYPE)];
    752        // ensure that 'pMidBuffer' is aligned to 16 byte boundary for efficiency
    753        pMidBuffer = (SAMPLETYPE *)SOUNDTOUCH_ALIGN_POINTER_16(pMidBufferUnaligned);
    754 
    755        clearMidBuffer();
    756    }
    757 }
    758 
    759 
    760 // Operator 'new' is overloaded so that it automatically creates a suitable instance 
    761 // depending on if we've a MMX/SSE/etc-capable CPU available or not.
    762 void * TDStretch::operator new(size_t s)
    763 {
    764    // Notice! don't use "new TDStretch" directly, use "newInstance" to create a new instance instead!
    765    ST_THROW_RT_ERROR("Error in TDStretch::new: Don't use 'new TDStretch' directly, use 'newInstance' member instead!");
    766    return newInstance();
    767 }
    768 
    769 
    770 TDStretch * TDStretch::newInstance()
    771 {
    772 #if defined(SOUNDTOUCH_ALLOW_MMX) || defined(SOUNDTOUCH_ALLOW_SSE)
    773    uint uExtensions;
    774 
    775    uExtensions = detectCPUextensions();
    776 #endif
    777 
    778    // Check if MMX/SSE instruction set extensions supported by CPU
    779 
    780 #ifdef SOUNDTOUCH_ALLOW_MMX
    781    // MMX routines available only with integer sample types
    782    if (uExtensions & SUPPORT_MMX)
    783    {
    784        return ::new TDStretchMMX;
    785    }
    786    else
    787 #endif // SOUNDTOUCH_ALLOW_MMX
    788 
    789 
    790 #ifdef SOUNDTOUCH_ALLOW_SSE
    791    if (uExtensions & SUPPORT_SSE)
    792    {
    793        // SSE support
    794        return ::new TDStretchSSE;
    795    }
    796    else
    797 #endif // SOUNDTOUCH_ALLOW_SSE
    798 
    799    {
    800        // ISA optimizations not supported, use plain C version
    801        return ::new TDStretch;
    802    }
    803 }
    804 
    805 
    806 //////////////////////////////////////////////////////////////////////////////
    807 //
    808 // Integer arithmetic specific algorithm implementations.
    809 //
    810 //////////////////////////////////////////////////////////////////////////////
    811 
    812 #ifdef SOUNDTOUCH_INTEGER_SAMPLES
    813 
    814 // Overlaps samples in 'midBuffer' with the samples in 'input'. The 'Stereo' 
    815 // version of the routine.
    816 void TDStretch::overlapStereo(short *poutput, const short *input) const
    817 {
    818    int i;
    819    short temp;
    820    int cnt2;
    821 
    822    for (i = 0; i < overlapLength ; i ++)
    823    {
    824        temp = (short)(overlapLength - i);
    825        cnt2 = 2 * i;
    826        poutput[cnt2] = (input[cnt2] * i + pMidBuffer[cnt2] * temp )  / overlapLength;
    827        poutput[cnt2 + 1] = (input[cnt2 + 1] * i + pMidBuffer[cnt2 + 1] * temp ) / overlapLength;
    828    }
    829 }
    830 
    831 
    832 // Overlaps samples in 'midBuffer' with the samples in 'input'. The 'Multi'
    833 // version of the routine.
    834 void TDStretch::overlapMulti(short *poutput, const short *input) const
    835 {
    836    short m1;
    837    int i = 0;
    838 
    839    for (m1 = 0; m1 < overlapLength; m1 ++)
    840    {
    841        short m2 = (short)(overlapLength - m1);
    842        for (int c = 0; c < channels; c ++)
    843        {
    844            poutput[i] = (input[i] * m1 + pMidBuffer[i] * m2)  / overlapLength;
    845            i++;
    846        }
    847    }
    848 }
    849 
    850 // Calculates the x having the closest 2^x value for the given value
    851 static int _getClosest2Power(double value)
    852 {
    853    return (int)(log(value) / log(2.0) + 0.5);
    854 }
    855 
    856 
    857 /// Calculates overlap period length in samples.
    858 /// Integer version rounds overlap length to closest power of 2
    859 /// for a divide scaling operation.
    860 void TDStretch::calculateOverlapLength(int aoverlapMs)
    861 {
    862    int newOvl;
    863 
    864    assert(aoverlapMs >= 0);
    865 
    866    // calculate overlap length so that it's power of 2 - thus it's easy to do
    867    // integer division by right-shifting. Term "-1" at end is to account for 
    868    // the extra most significatnt bit left unused in result by signed multiplication 
    869    overlapDividerBitsPure = _getClosest2Power((sampleRate * aoverlapMs) / 1000.0) - 1;
    870    if (overlapDividerBitsPure > 9) overlapDividerBitsPure = 9;
    871    if (overlapDividerBitsPure < 3) overlapDividerBitsPure = 3;
    872    newOvl = (int)pow(2.0, (int)overlapDividerBitsPure + 1);    // +1 => account for -1 above
    873 
    874    acceptNewOverlapLength(newOvl);
    875 
    876    overlapDividerBitsNorm = overlapDividerBitsPure;
    877 
    878    // calculate sloping divider so that crosscorrelation operation won't 
    879    // overflow 32-bit register. Max. sum of the crosscorrelation sum without 
    880    // divider would be 2^30*(N^3-N)/3, where N = overlap length
    881    slopingDivider = (newOvl * newOvl - 1) / 3;
    882 }
    883 
    884 
    885 double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, double &norm)
    886 {
    887    long corr;
    888    unsigned long lnorm;
    889    int i;
    890 
    891    #ifdef ST_SIMD_AVOID_UNALIGNED
    892        // in SIMD mode skip 'mixingPos' positions that aren't aligned to 16-byte boundary
    893        if (((ulongptr)mixingPos) & 15) return -1e50;
    894    #endif
    895 
    896    // hint compiler autovectorization that loop length is divisible by 8
    897    int ilength = (channels * overlapLength) & -8;
    898 
    899    corr = lnorm = 0;
    900    // Same routine for stereo and mono
    901    for (i = 0; i < ilength; i += 2)
    902    {
    903        corr += (mixingPos[i] * compare[i] + 
    904                 mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm;
    905        lnorm += (mixingPos[i] * mixingPos[i] + 
    906                  mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBitsNorm;
    907        // do intermediate scalings to avoid integer overflow
    908    }
    909 
    910    if (lnorm > maxnorm)
    911    {
    912        // modify 'maxnorm' inside critical section to avoid multi-access conflict if in OpenMP mode
    913        #pragma omp critical
    914        if (lnorm > maxnorm)
    915        {
    916            maxnorm = lnorm;
    917        }
    918    }
    919    // Normalize result by dividing by sqrt(norm) - this step is easiest 
    920    // done using floating point operation
    921    norm = (double)lnorm;
    922    return (double)corr / sqrt((norm < 1e-9) ? 1.0 : norm);
    923 }
    924 
    925 
    926 /// Update cross-correlation by accumulating "norm" coefficient by previously calculated value
    927 double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *compare, double &norm)
    928 {
    929    long corr;
    930    long lnorm;
    931    int i;
    932 
    933    // hint compiler autovectorization that loop length is divisible by 8
    934    int ilength = (channels * overlapLength) & -8;
    935 
    936    // cancel first normalizer tap from previous round
    937    lnorm = 0;
    938    for (i = 1; i <= channels; i ++)
    939    {
    940        lnorm -= (mixingPos[-i] * mixingPos[-i]) >> overlapDividerBitsNorm;
    941    }
    942 
    943    corr = 0;
    944    // Same routine for stereo and mono.
    945    for (i = 0; i < ilength; i += 2) 
    946    {
    947        corr += (mixingPos[i] * compare[i] + 
    948                 mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm;
    949    }
    950 
    951    // update normalizer with last samples of this round
    952    for (int j = 0; j < channels; j ++)
    953    {
    954        i --;
    955        lnorm += (mixingPos[i] * mixingPos[i]) >> overlapDividerBitsNorm;
    956    }
    957 
    958    norm += (double)lnorm;
    959    if (norm > maxnorm)
    960    {
    961        maxnorm = (unsigned long)norm;
    962    }
    963 
    964    // Normalize result by dividing by sqrt(norm) - this step is easiest 
    965    // done using floating point operation
    966    return (double)corr / sqrt((norm < 1e-9) ? 1.0 : norm);
    967 }
    968 
    969 #endif // SOUNDTOUCH_INTEGER_SAMPLES
    970 
    971 //////////////////////////////////////////////////////////////////////////////
    972 //
    973 // Floating point arithmetic specific algorithm implementations.
    974 //
    975 
    976 #ifdef SOUNDTOUCH_FLOAT_SAMPLES
    977 
    978 // Overlaps samples in 'midBuffer' with the samples in 'pInput'
    979 void TDStretch::overlapStereo(float *pOutput, const float *pInput) const
    980 {
    981    int i;
    982    float fScale;
    983    float f1;
    984    float f2;
    985 
    986    fScale = 1.0f / (float)overlapLength;
    987 
    988    f1 = 0;
    989    f2 = 1.0f;
    990 
    991    for (i = 0; i < 2 * (int)overlapLength ; i += 2) 
    992    {
    993        pOutput[i + 0] = pInput[i + 0] * f1 + pMidBuffer[i + 0] * f2;
    994        pOutput[i + 1] = pInput[i + 1] * f1 + pMidBuffer[i + 1] * f2;
    995 
    996        f1 += fScale;
    997        f2 -= fScale;
    998    }
    999 }
   1000 
   1001 
   1002 // Overlaps samples in 'midBuffer' with the samples in 'input'. 
   1003 void TDStretch::overlapMulti(float *pOutput, const float *pInput) const
   1004 {
   1005    int i;
   1006    float fScale;
   1007    float f1;
   1008    float f2;
   1009 
   1010    fScale = 1.0f / (float)overlapLength;
   1011 
   1012    f1 = 0;
   1013    f2 = 1.0f;
   1014 
   1015    i=0;
   1016    for (int i2 = 0; i2 < overlapLength; i2 ++)
   1017    {
   1018        // note: Could optimize this slightly by taking into account that always channels > 2
   1019        for (int c = 0; c < channels; c ++)
   1020        {
   1021            pOutput[i] = pInput[i] * f1 + pMidBuffer[i] * f2;
   1022            i++;
   1023        }
   1024        f1 += fScale;
   1025        f2 -= fScale;
   1026    }
   1027 }
   1028 
   1029 
   1030 /// Calculates overlapInMsec period length in samples.
   1031 void TDStretch::calculateOverlapLength(int overlapInMsec)
   1032 {
   1033    int newOvl;
   1034 
   1035    assert(overlapInMsec >= 0);
   1036    newOvl = (sampleRate * overlapInMsec) / 1000;
   1037    if (newOvl < 16) newOvl = 16;
   1038 
   1039    // must be divisible by 8
   1040    newOvl -= newOvl % 8;
   1041 
   1042    acceptNewOverlapLength(newOvl);
   1043 }
   1044 
   1045 
   1046 /// Calculate cross-correlation
   1047 double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, double &anorm)
   1048 {
   1049    float corr;
   1050    float norm;
   1051    int i;
   1052 
   1053    #ifdef ST_SIMD_AVOID_UNALIGNED
   1054        // in SIMD mode skip 'mixingPos' positions that aren't aligned to 16-byte boundary
   1055        if (((ulongptr)mixingPos) & 15) return -1e50;
   1056    #endif
   1057 
   1058    // hint compiler autovectorization that loop length is divisible by 8
   1059    int ilength = (channels * overlapLength) & -8;
   1060 
   1061    corr = norm = 0;
   1062    // Same routine for stereo and mono
   1063    for (i = 0; i < ilength; i ++)
   1064    {
   1065        corr += mixingPos[i] * compare[i];
   1066        norm += mixingPos[i] * mixingPos[i];
   1067    }
   1068 
   1069    anorm = norm;
   1070    return corr / sqrt((norm < 1e-9 ? 1.0 : norm));
   1071 }
   1072 
   1073 
   1074 /// Update cross-correlation by accumulating "norm" coefficient by previously calculated value
   1075 double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *compare, double &norm)
   1076 {
   1077    float corr;
   1078    int i;
   1079 
   1080    corr = 0;
   1081 
   1082    // cancel first normalizer tap from previous round
   1083    for (i = 1; i <= channels; i ++)
   1084    {
   1085        norm -= mixingPos[-i] * mixingPos[-i];
   1086    }
   1087 
   1088    // hint compiler autovectorization that loop length is divisible by 8
   1089    int ilength = (channels * overlapLength) & -8;
   1090 
   1091    // Same routine for stereo and mono
   1092    for (i = 0; i < ilength; i ++)
   1093    {
   1094        corr += mixingPos[i] * compare[i];
   1095    }
   1096 
   1097    // update normalizer with last samples of this round
   1098    for (int j = 0; j < channels; j ++)
   1099    {
   1100        i --;
   1101        norm += mixingPos[i] * mixingPos[i];
   1102    }
   1103 
   1104    return corr / sqrt((norm < 1e-9 ? 1.0 : norm));
   1105 }
   1106 
   1107 
   1108 #endif // SOUNDTOUCH_FLOAT_SAMPLES