tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

pitch_sse4_1.c (7815B)


      1 /* Copyright (c) 2014, Cisco Systems, INC
      2   Written by XiangMingZhu WeiZhou MinPeng YanWang
      3 
      4   Redistribution and use in source and binary forms, with or without
      5   modification, are permitted provided that the following conditions
      6   are met:
      7 
      8   - Redistributions of source code must retain the above copyright
      9   notice, this list of conditions and the following disclaimer.
     10 
     11   - Redistributions in binary form must reproduce the above copyright
     12   notice, this list of conditions and the following disclaimer in the
     13   documentation and/or other materials provided with the distribution.
     14 
     15   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     16   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     17   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     18   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
     19   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     20   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     21   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     22   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
     23   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
     24   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 #ifdef HAVE_CONFIG_H
     29 #include "config.h"
     30 #endif
     31 
     32 #include <xmmintrin.h>
     33 #include <emmintrin.h>
     34 
     35 #include "macros.h"
     36 #include "celt_lpc.h"
     37 #include "stack_alloc.h"
     38 #include "mathops.h"
     39 #include "pitch.h"
     40 
     41 #if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
     42 #include <smmintrin.h>
     43 #include "x86cpu.h"
     44 
     45 opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y,
     46      int N)
     47 {
     48    opus_int  i, dataSize16;
     49    opus_int32 sum;
     50    __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
     51    __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
     52    __m128i inVec1_3210, inVec2_3210;
     53 
     54    sum = 0;
     55    dataSize16 = N & ~15;
     56 
     57    acc1 = _mm_setzero_si128();
     58    acc2 = _mm_setzero_si128();
     59 
     60    for (i=0;i<dataSize16;i+=16) {
     61        inVec1_76543210 = _mm_loadu_si128((__m128i *)(void*)(&x[i + 0]));
     62        inVec2_76543210 = _mm_loadu_si128((__m128i *)(void*)(&y[i + 0]));
     63 
     64        inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(void*)(&x[i + 8]));
     65        inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(void*)(&y[i + 8]));
     66 
     67        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
     68        inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);
     69 
     70        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
     71        acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
     72    }
     73 
     74    acc1 = _mm_add_epi32(acc1, acc2);
     75 
     76    if (N - i >= 8)
     77    {
     78        inVec1_76543210 = _mm_loadu_si128((__m128i *)(void*)(&x[i + 0]));
     79        inVec2_76543210 = _mm_loadu_si128((__m128i *)(void*)(&y[i + 0]));
     80 
     81        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
     82 
     83        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
     84        i += 8;
     85    }
     86 
     87    if (N - i >= 4)
     88    {
     89        inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]);
     90        inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]);
     91 
     92        inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210);
     93 
     94        acc1 = _mm_add_epi32(acc1, inVec1_3210);
     95        i += 4;
     96    }
     97 
     98    acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1));
     99    acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E));
    100 
    101    sum += _mm_cvtsi128_si32(acc1);
    102 
    103    for (;i<N;i++)
    104    {
    105        sum = silk_SMLABB(sum, x[i], y[i]);
    106    }
    107 
    108    return sum;
    109 }
    110 
    111 void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[ 4 ], int len)
    112 {
    113    int j;
    114 
    115    __m128i vecX, vecX0, vecX1, vecX2, vecX3;
    116    __m128i vecY0, vecY1, vecY2, vecY3;
    117    __m128i sum0, sum1, sum2, sum3, vecSum;
    118    __m128i initSum;
    119 
    120 #ifdef OPUS_CHECK_ASM
    121    opus_val32 sum_c[4];
    122    for (j=0;j<4;j++) {
    123      sum_c[j] = sum[j];
    124    }
    125    xcorr_kernel_c(x, y, sum_c, len);
    126 #endif
    127 
    128    celt_assert(len >= 3);
    129 
    130    sum0 = _mm_setzero_si128();
    131    sum1 = _mm_setzero_si128();
    132    sum2 = _mm_setzero_si128();
    133    sum3 = _mm_setzero_si128();
    134 
    135    for (j=0;j<(len-7);j+=8)
    136    {
    137        vecX = _mm_loadu_si128((__m128i *)(void*)(&x[j + 0]));
    138        vecY0 = _mm_loadu_si128((__m128i *)(void*)(&y[j + 0]));
    139        vecY1 = _mm_loadu_si128((__m128i *)(void*)(&y[j + 1]));
    140        vecY2 = _mm_loadu_si128((__m128i *)(void*)(&y[j + 2]));
    141        vecY3 = _mm_loadu_si128((__m128i *)(void*)(&y[j + 3]));
    142 
    143        sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0));
    144        sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1));
    145        sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2));
    146        sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3));
    147    }
    148 
    149    sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0));
    150    sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E));
    151 
    152    sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1));
    153    sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E));
    154 
    155    sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2));
    156    sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E));
    157 
    158    sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3));
    159    sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E));
    160 
    161    vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1),
    162          _mm_unpacklo_epi32(sum2, sum3));
    163 
    164    for (;j<(len-3);j+=4)
    165    {
    166        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
    167        vecX0 = _mm_shuffle_epi32(vecX, 0x00);
    168        vecX1 = _mm_shuffle_epi32(vecX, 0x55);
    169        vecX2 = _mm_shuffle_epi32(vecX, 0xaa);
    170        vecX3 = _mm_shuffle_epi32(vecX, 0xff);
    171 
    172        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
    173        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
    174        vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
    175        vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]);
    176 
    177        sum0 = _mm_mullo_epi32(vecX0, vecY0);
    178        sum1 = _mm_mullo_epi32(vecX1, vecY1);
    179        sum2 = _mm_mullo_epi32(vecX2, vecY2);
    180        sum3 = _mm_mullo_epi32(vecX3, vecY3);
    181 
    182        sum0 = _mm_add_epi32(sum0, sum1);
    183        sum2 = _mm_add_epi32(sum2, sum3);
    184        vecSum = _mm_add_epi32(vecSum, sum0);
    185        vecSum = _mm_add_epi32(vecSum, sum2);
    186    }
    187 
    188    vecX = OP_CVTEPI16_EPI32_M64(&x[len - 4]);
    189    if (len - j == 3)
    190    {
    191        vecX0 = _mm_shuffle_epi32(vecX, 0x55);
    192        vecX1 = _mm_shuffle_epi32(vecX, 0xaa);
    193        vecX2 = _mm_shuffle_epi32(vecX, 0xff);
    194 
    195        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
    196        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
    197        vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
    198 
    199        sum0 = _mm_mullo_epi32(vecX0, vecY0);
    200        sum1 = _mm_mullo_epi32(vecX1, vecY1);
    201        sum2 = _mm_mullo_epi32(vecX2, vecY2);
    202 
    203        vecSum = _mm_add_epi32(vecSum, sum0);
    204        vecSum = _mm_add_epi32(vecSum, sum1);
    205        vecSum = _mm_add_epi32(vecSum, sum2);
    206    }
    207    else if (len - j == 2)
    208    {
    209        vecX0 = _mm_shuffle_epi32(vecX, 0xaa);
    210        vecX1 = _mm_shuffle_epi32(vecX, 0xff);
    211 
    212        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
    213        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
    214 
    215        sum0 = _mm_mullo_epi32(vecX0, vecY0);
    216        sum1 = _mm_mullo_epi32(vecX1, vecY1);
    217 
    218        vecSum = _mm_add_epi32(vecSum, sum0);
    219        vecSum = _mm_add_epi32(vecSum, sum1);
    220    }
    221    else if (len - j == 1)
    222    {
    223        vecX0 = _mm_shuffle_epi32(vecX, 0xff);
    224 
    225        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
    226 
    227        sum0 = _mm_mullo_epi32(vecX0, vecY0);
    228 
    229        vecSum = _mm_add_epi32(vecSum, sum0);
    230    }
    231 
    232    initSum = _mm_loadu_si128((__m128i *)(void*)(&sum[0]));
    233    initSum = _mm_add_epi32(initSum, vecSum);
    234    _mm_storeu_si128((__m128i *)(void*)sum, initSum);
    235 
    236 #ifdef OPUS_CHECK_ASM
    237    celt_assert(!memcmp(sum_c, sum, sizeof(sum_c)));
    238 #endif
    239 }
    240 #endif