pitch_neon_intr.c (10415B)
1 /*********************************************************************** 2 Copyright (c) 2017 Google Inc. 3 Redistribution and use in source and binary forms, with or without 4 modification, are permitted provided that the following conditions 5 are met: 6 - Redistributions of source code must retain the above copyright notice, 7 this list of conditions and the following disclaimer. 8 - Redistributions in binary form must reproduce the above copyright 9 notice, this list of conditions and the following disclaimer in the 10 documentation and/or other materials provided with the distribution. 11 - Neither the name of Internet Society, IETF or IETF Trust, nor the 12 names of specific contributors, may be used to endorse or promote 13 products derived from this software without specific prior written 14 permission. 15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 POSSIBILITY OF SUCH DAMAGE. 
***********************************************************************/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <arm_neon.h>
#include "pitch.h"

#ifdef FIXED_POINT

/* NEON fixed-point inner product: returns the 32-bit dot product of the
 * 16-bit vectors x and y of length N.  The vld1q_s16() loads show that
 * opus_val16 is a 16-bit integer in FIXED_POINT builds.
 */
opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
{
    int i;
    opus_val32 xy;
    int16x8_t x_s16x8, y_s16x8;
    int32x4_t xy_s32x4 = vdupq_n_s32(0); /* four 32-bit lane accumulators */
    int64x2_t xy_s64x2;
    int64x1_t xy_s64x1;

    /* Main loop: 8 products per iteration, widened 16x16->32 and
     * accumulated into the four lanes (low half first, then high half). */
    for (i = 0; i < N - 7; i += 8) {
        x_s16x8  = vld1q_s16(&x[i]);
        y_s16x8  = vld1q_s16(&y[i]);
        xy_s32x4 = vmlal_s16(xy_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y_s16x8));
        xy_s32x4 = vmlal_s16(xy_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y_s16x8));
    }

    /* One 4-wide step if at least 4 samples remain. */
    if (N - i >= 4) {
        const int16x4_t x_s16x4 = vld1_s16(&x[i]);
        const int16x4_t y_s16x4 = vld1_s16(&y[i]);
        xy_s32x4 = vmlal_s16(xy_s32x4, x_s16x4, y_s16x4);
        i += 4;
    }

    /* Horizontal reduction: pairwise-widen the four 32-bit lanes to two
     * 64-bit lanes, add them, and keep the low 32 bits of the total. */
    xy_s64x2 = vpaddlq_s32(xy_s32x4);
    xy_s64x1 = vadd_s64(vget_low_s64(xy_s64x2), vget_high_s64(xy_s64x2));
    xy = vget_lane_s32(vreinterpret_s32_s64(xy_s64x1), 0);

    /* Scalar tail for the last 0..3 samples. */
    for (; i < N; i++) {
        xy = MAC16_16(xy, x[i], y[i]);
    }

#ifdef OPUS_CHECK_ASM
    /* Debug builds require bit-exact agreement with the plain C version. */
    celt_assert(celt_inner_prod_c(x, y, N) == xy);
#endif

    return xy;
}

/* NEON fixed-point dual inner product: computes, in one pass over x, the
 * two dot products <x, y01> and <x, y02>, returned through *xy1 and *xy2.
 * Same accumulation scheme as celt_inner_prod_neon(), run twice per load
 * of x so x is fetched only once.
 */
void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
        int N, opus_val32 *xy1, opus_val32 *xy2)
{
    int i;
    opus_val32 xy01, xy02;
    int16x8_t x_s16x8, y01_s16x8, y02_s16x8;
    int32x4_t xy01_s32x4 = vdupq_n_s32(0); /* lane accumulators for <x, y01> */
    int32x4_t xy02_s32x4 = vdupq_n_s32(0); /* lane accumulators for <x, y02> */
    int64x2_t xy01_s64x2, xy02_s64x2;
    int64x1_t xy01_s64x1, xy02_s64x1;

    /* Main loop: 8 samples per iteration, both products per x load. */
    for (i = 0; i < N - 7; i += 8) {
        x_s16x8   = vld1q_s16(&x[i]);
        y01_s16x8 = vld1q_s16(&y01[i]);
        y02_s16x8 = vld1q_s16(&y02[i]);
        xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y01_s16x8));
        xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y02_s16x8));
        xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y01_s16x8));
        xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y02_s16x8));
    }

    /* One 4-wide step if at least 4 samples remain. */
    if (N - i >= 4) {
        const int16x4_t x_s16x4   = vld1_s16(&x[i]);
        const int16x4_t y01_s16x4 = vld1_s16(&y01[i]);
        const int16x4_t y02_s16x4 = vld1_s16(&y02[i]);
        xy01_s32x4 = vmlal_s16(xy01_s32x4, x_s16x4, y01_s16x4);
        xy02_s32x4 = vmlal_s16(xy02_s32x4, x_s16x4, y02_s16x4);
        i += 4;
    }

    /* Horizontal reduction of each accumulator, as in celt_inner_prod_neon(). */
    xy01_s64x2 = vpaddlq_s32(xy01_s32x4);
    xy02_s64x2 = vpaddlq_s32(xy02_s32x4);
    xy01_s64x1 = vadd_s64(vget_low_s64(xy01_s64x2), vget_high_s64(xy01_s64x2));
    xy02_s64x1 = vadd_s64(vget_low_s64(xy02_s64x2), vget_high_s64(xy02_s64x2));
    xy01 = vget_lane_s32(vreinterpret_s32_s64(xy01_s64x1), 0);
    xy02 = vget_lane_s32(vreinterpret_s32_s64(xy02_s64x1), 0);

    /* Scalar tail for the last 0..3 samples. */
    for (; i < N; i++) {
        xy01 = MAC16_16(xy01, x[i], y01[i]);
        xy02 = MAC16_16(xy02, x[i], y02[i]);
    }
    *xy1 = xy01;
    *xy2 = xy02;

#ifdef OPUS_CHECK_ASM
    /* Debug builds require bit-exact agreement with the plain C version. */
    {
        opus_val32 xy1_c, xy2_c;
        dual_inner_prod_c(x, y01, y02, N, &xy1_c, &xy2_c);
        celt_assert(xy1_c == *xy1);
        celt_assert(xy2_c == *xy2);
    }
#endif
}

#else /* !FIXED_POINT */

/* ========================================================================== */

#ifdef __ARM_FEATURE_FMA
/* If we can, force the compiler to use an FMA instruction rather than break
   vmlaq_f32() into fmul/fadd. */
#define vmlaq_f32(a,b,c) vfmaq_f32(a,b,c)
#endif


#ifdef OPUS_CHECK_ASM

/* This part of code simulates floating-point NEON operations. */

/* celt_inner_prod_neon_float_c_simulation() simulates the floating-point */
/* operations of celt_inner_prod_neon(), and both functions should have bit */
/* exact output.
*/
/* Mirrors the NEON accumulation order of the float celt_inner_prod_neon():
 * four independent lane-wise partial sums combined pairwise at the end.
 * Also accumulates into *err an absolute tolerance for comparing the two
 * results, derived from the running magnitudes of the partial sums.
 */
static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, float *err, int N)
{
    int i;
    *err = 0;
    opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0;
    for (i = 0; i < N - 3; i += 4) {
        xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]);
        xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]);
        xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]);
        xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]);
        /* Track the running magnitudes to bound accumulated rounding error. */
        *err += ABS32(xy0)+ABS32(xy1)+ABS32(xy2)+ABS32(xy3);
    }
    /* Pairwise combine, matching the NEON vadd_f32/vpadd_f32 reduction. */
    xy0 += xy2;
    xy1 += xy3;
    xy = xy0 + xy1;
    *err += ABS32(xy1)+ABS32(xy0)+ABS32(xy);
    /* Scalar tail for the last 0..3 samples. */
    for (; i < N; i++) {
        xy = MAC16_16(xy, x[i], y[i]);
        *err += ABS32(xy);
    }
    /* Scale into an absolute tolerance; the N*1e-37 term presumably keeps
     * the bound non-zero for all-zero inputs — NOTE(review): confirm. */
    *err = *err*2e-7 + N*1e-37;
    return xy;
}

/* dual_inner_prod_neon_float_c_simulation() simulates the floating-point */
/* operations of dual_inner_prod_neon(), and both functions should have bit */
/* exact output. err[0]/err[1] receive the tolerances for the two products. */
static void dual_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
        int N, opus_val32 *xy1, opus_val32 *xy2, float *err)
{
    *xy1 = celt_inner_prod_neon_float_c_simulation(x, y01, &err[0], N);
    *xy2 = celt_inner_prod_neon_float_c_simulation(x, y02, &err[1], N);
}

#endif /* OPUS_CHECK_ASM */

/* ========================================================================== */

/* NEON float inner product: returns the dot product of the float vectors
 * x and y of length N (opus_val16 is float in this branch, per the
 * vld1q_f32() loads).
 */
opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
{
    int i;
    opus_val32 xy;
    float32x4_t xy_f32x4 = vdupq_n_f32(0); /* four lane accumulators */
    float32x2_t xy_f32x2;

    /* Main loop: 8 samples per iteration via two 4-wide multiply-adds
     * into the same accumulator (FMA when __ARM_FEATURE_FMA is set). */
    for (i = 0; i < N - 7; i += 8) {
        float32x4_t x_f32x4, y_f32x4;
        x_f32x4  = vld1q_f32(&x[i]);
        y_f32x4  = vld1q_f32(&y[i]);
        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
        x_f32x4  = vld1q_f32(&x[i + 4]);
        y_f32x4  = vld1q_f32(&y[i + 4]);
        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
    }

    /* One 4-wide step if at least 4 samples remain. */
    if (N - i >= 4) {
        const float32x4_t x_f32x4 = vld1q_f32(&x[i]);
        const float32x4_t y_f32x4 = vld1q_f32(&y[i]);
        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
        i += 4;
    }

    /* Horizontal reduction: halves added, then a pairwise add. */
    xy_f32x2 = vadd_f32(vget_low_f32(xy_f32x4), vget_high_f32(xy_f32x4));
    xy_f32x2 = vpadd_f32(xy_f32x2, xy_f32x2);
    xy = vget_lane_f32(xy_f32x2, 0);

    /* Scalar tail for the last 0..3 samples. */
    for (; i < N; i++) {
        xy = MAC16_16(xy, x[i], y[i]);
    }

#ifdef OPUS_CHECK_ASM
    /* Debug builds compare against the C simulation within its tolerance. */
    {
        float err, res;
        res = celt_inner_prod_neon_float_c_simulation(x, y, &err, N);
        /*if (ABS32(res - xy) > err) fprintf(stderr, "%g %g %g\n", res, xy, err);*/
        celt_assert(ABS32(res - xy) <= err);
    }
#endif

    return xy;
}

/* NEON float dual inner product: computes <x, y01> and <x, y02> in one
 * pass over x, returned through *xy1 and *xy2.  Same scheme as the float
 * celt_inner_prod_neon(), run twice per load of x.
 */
void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
        int N, opus_val32 *xy1, opus_val32 *xy2)
{
    int i;
    opus_val32 xy01, xy02;
    float32x4_t xy01_f32x4 = vdupq_n_f32(0); /* lane accumulators for <x, y01> */
    float32x4_t xy02_f32x4 = vdupq_n_f32(0); /* lane accumulators for <x, y02> */
    float32x2_t xy01_f32x2, xy02_f32x2;

    /* Main loop: 8 samples per iteration, both products per x load. */
    for (i = 0; i < N - 7; i += 8) {
        float32x4_t x_f32x4, y01_f32x4, y02_f32x4;
        x_f32x4    = vld1q_f32(&x[i]);
        y01_f32x4  = vld1q_f32(&y01[i]);
        y02_f32x4  = vld1q_f32(&y02[i]);
        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
        x_f32x4    = vld1q_f32(&x[i + 4]);
        y01_f32x4  = vld1q_f32(&y01[i + 4]);
        y02_f32x4  = vld1q_f32(&y02[i + 4]);
        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
    }

    /* One 4-wide step if at least 4 samples remain. */
    if (N - i >= 4) {
        const float32x4_t x_f32x4   = vld1q_f32(&x[i]);
        const float32x4_t y01_f32x4 = vld1q_f32(&y01[i]);
        const float32x4_t y02_f32x4 = vld1q_f32(&y02[i]);
        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
        i += 4;
    }

    /* Horizontal reduction of each accumulator. */
    xy01_f32x2 = vadd_f32(vget_low_f32(xy01_f32x4), vget_high_f32(xy01_f32x4));
    xy02_f32x2 = vadd_f32(vget_low_f32(xy02_f32x4), vget_high_f32(xy02_f32x4));
    xy01_f32x2 = vpadd_f32(xy01_f32x2, xy01_f32x2);
    xy02_f32x2 = vpadd_f32(xy02_f32x2, xy02_f32x2);
    xy01 = vget_lane_f32(xy01_f32x2, 0);
    xy02 = vget_lane_f32(xy02_f32x2, 0);

    /* Scalar tail for the last 0..3 samples. */
    for (; i < N; i++) {
        xy01 = MAC16_16(xy01, x[i], y01[i]);
        xy02 = MAC16_16(xy02, x[i], y02[i]);
    }
    *xy1 = xy01;
    *xy2 = xy02;

#ifdef OPUS_CHECK_ASM
    /* Debug builds compare against the C simulation within its tolerances. */
    {
        opus_val32 xy1_c, xy2_c;
        float err[2];
        dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c, err);
        /*if (ABS32(xy1_c - *xy1) > err[0]) fprintf(stderr, "dual1 fail: %g %g %g\n", xy1_c, *xy1, err[0]);
        if (ABS32(xy2_c - *xy2) > err[1]) fprintf(stderr, "dual2 fail: %g %g %g\n", xy2_c, *xy2, err[1]);*/
        celt_assert(ABS32(xy1_c - *xy1) <= err[0]);
        celt_assert(ABS32(xy2_c - *xy2) <= err[1]);
    }
#endif
}

#endif /* FIXED_POINT */