tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

celt_neon_intr.c (13324B)


      1 /* Copyright (c) 2014-2015 Xiph.Org Foundation
      2   Copyright (c) 2024 Arm Limited
      3   Written by Viswanath Puttagunta */
      4 /**
      5   @file celt_neon_intr.c
      6   @brief ARM Neon Intrinsic optimizations for celt
      7 */
      8 
      9 /*
     10   Redistribution and use in source and binary forms, with or without
     11   modification, are permitted provided that the following conditions
     12   are met:
     13 
     14   - Redistributions of source code must retain the above copyright
     15   notice, this list of conditions and the following disclaimer.
     16 
     17   - Redistributions in binary form must reproduce the above copyright
     18   notice, this list of conditions and the following disclaimer in the
     19   documentation and/or other materials provided with the distribution.
     20 
     21   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     22   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     23   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     24   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
     25   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     26   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     27   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     28   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
     29   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
     30   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     31   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     32 */
     33 
     34 #ifdef HAVE_CONFIG_H
     35 #include "config.h"
     36 #endif
     37 
     38 #include <arm_neon.h>
     39 #include "../float_cast.h"
     40 #include "../mathops.h"
     41 #include "../pitch.h"
     42 #include <stddef.h>
     43 #if defined(OPUS_CHECK_ASM)
     44 #include <stdlib.h>
     45 #endif
     46 
     47 #if !defined(DISABLE_FLOAT_API) && defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
     48 
     49 void celt_float2int16_neon(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt)
     50 {
     51   int i = 0;
     52 
     53 #if defined(__ARM_NEON)
     54   const int BLOCK_SIZE = 16;
     55   const int blockedSize = cnt / BLOCK_SIZE * BLOCK_SIZE;
     56 
     57   for (; i < blockedSize; i += BLOCK_SIZE)
     58   {
     59      float32x4_t orig_a = vld1q_f32(&in[i +  0]);
     60      float32x4_t orig_b = vld1q_f32(&in[i +  4]);
     61      float32x4_t orig_c = vld1q_f32(&in[i +  8]);
     62      float32x4_t orig_d = vld1q_f32(&in[i + 12]);
     63 
     64      int16x4_t asShort_a = vqmovn_s32(vroundf(vmulq_n_f32(orig_a, CELT_SIG_SCALE)));
     65      int16x4_t asShort_b = vqmovn_s32(vroundf(vmulq_n_f32(orig_b, CELT_SIG_SCALE)));
     66      int16x4_t asShort_c = vqmovn_s32(vroundf(vmulq_n_f32(orig_c, CELT_SIG_SCALE)));
     67      int16x4_t asShort_d = vqmovn_s32(vroundf(vmulq_n_f32(orig_d, CELT_SIG_SCALE)));
     68 
     69      vst1_s16(&out[i +  0], asShort_a);
     70      vst1_s16(&out[i +  4], asShort_b);
     71      vst1_s16(&out[i +  8], asShort_c);
     72      vst1_s16(&out[i + 12], asShort_d);
     73 # if defined(OPUS_CHECK_ASM)
     74      short out_c[BLOCK_SIZE];
     75      int j;
     76      for(j = 0; j < BLOCK_SIZE; j++)
     77      {
     78         out_c[j] = FLOAT2INT16(in[i + j]);
     79         celt_assert(abs((out_c[j] - out[i + j])) <= 1);
     80      }
     81 # endif
     82   }
     83 #endif
     84 
     85   for (; i < cnt; i++)
     86   {
     87      out[i] = FLOAT2INT16(in[i]);
     88   }
     89 }
     90 
     91 int opus_limit2_checkwithin1_neon(float *samples, int cnt)
     92 {
     93   const float hardclipMin = -2.0f;
     94   const float hardclipMax = 2.0f;
     95 
     96   int i = 0;
     97   int exceeding1 = 0;
     98   int nextIndex = 0;
     99 
    100 #if defined(__ARM_NEON)
    101   const int BLOCK_SIZE = 16;
    102   const int blockedSize = cnt / BLOCK_SIZE * BLOCK_SIZE;
    103 
    104   float32x4_t min_all_0 = vdupq_n_f32(0.0f);
    105   float32x4_t min_all_1 = vdupq_n_f32(0.0f);
    106   float32x4_t max_all_0 = vdupq_n_f32(0.0f);
    107   float32x4_t max_all_1 = vdupq_n_f32(0.0f);
    108 
    109   float max, min;
    110 
    111   for (i = 0; i < blockedSize; i += BLOCK_SIZE)
    112   {
    113      const float32x4_t orig_a = vld1q_f32(&samples[i +  0]);
    114      const float32x4_t orig_b = vld1q_f32(&samples[i +  4]);
    115      const float32x4_t orig_c = vld1q_f32(&samples[i +  8]);
    116      const float32x4_t orig_d = vld1q_f32(&samples[i + 12]);
    117      max_all_0 = vmaxq_f32(max_all_0, vmaxq_f32(orig_a, orig_b));
    118      max_all_1 = vmaxq_f32(max_all_1, vmaxq_f32(orig_c, orig_d));
    119      min_all_0 = vminq_f32(min_all_0, vminq_f32(orig_a, orig_b));
    120      min_all_1 = vminq_f32(min_all_1, vminq_f32(orig_c, orig_d));
    121   }
    122 
    123   max = vmaxvf(vmaxq_f32(max_all_0, max_all_1));
    124   min = vminvf(vminq_f32(min_all_0, min_all_1));
    125 
    126   if (min < hardclipMin || max > hardclipMax)
    127   {
    128      const float32x4_t hardclipMinReg = vdupq_n_f32(hardclipMin);
    129      const float32x4_t hardclipMaxReg = vdupq_n_f32(hardclipMax);
    130      for (i = 0; i < blockedSize; i += BLOCK_SIZE)
    131      {
    132         const float32x4_t orig_a = vld1q_f32(&samples[i +  0]);
    133         const float32x4_t orig_b = vld1q_f32(&samples[i +  4]);
    134         const float32x4_t orig_c = vld1q_f32(&samples[i +  8]);
    135         const float32x4_t orig_d = vld1q_f32(&samples[i + 12]);
    136         const float32x4_t clipped_a = vminq_f32(hardclipMaxReg, vmaxq_f32(orig_a, hardclipMinReg));
    137         const float32x4_t clipped_b = vminq_f32(hardclipMaxReg, vmaxq_f32(orig_b, hardclipMinReg));
    138         const float32x4_t clipped_c = vminq_f32(hardclipMaxReg, vmaxq_f32(orig_c, hardclipMinReg));
    139         const float32x4_t clipped_d = vminq_f32(hardclipMaxReg, vmaxq_f32(orig_d, hardclipMinReg));
    140         vst1q_f32(&samples[i + 0], clipped_a);
    141         vst1q_f32(&samples[i + 4], clipped_b);
    142         vst1q_f32(&samples[i + 8], clipped_c);
    143         vst1q_f32(&samples[i + 12], clipped_d);
    144      }
    145   }
    146 
    147   nextIndex = blockedSize;
    148   exceeding1 |= max > 1.0f || min < -1.0f;
    149 
    150 #endif
    151 
    152   for (i = nextIndex; i < cnt; i++)
    153   {
    154      const float origVal = samples[i];
    155      float clippedVal = origVal;
    156      clippedVal = MAX16(hardclipMin, clippedVal);
    157      clippedVal = MIN16(hardclipMax, clippedVal);
    158      samples[i] = clippedVal;
    159 
    160      exceeding1 |= origVal > 1.0f || origVal < -1.0f;
    161   }
    162 
    163   return !exceeding1;
    164 }
    165 
    166 #endif
    167 
    168 
    169 #if defined(FIXED_POINT)
    170 #include <string.h>
    171 
/* Fixed-point NEON correlation kernel:
      sum[k] += x[j] * y[j + k]   for k = 0..3, j = 0..len-1.
   A sliding window of y is kept in registers: y0 always holds the four
   oldest pending y values, and the shifted vectors (y+1, y+2, y+3) are
   formed with VEXT rather than re-loading y at unaligned offsets.
   NOTE(review): requires len > 0 (the comment below says this is
   asserted by the C-path caller — confirm at the call sites). */
void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
{
   int j;
   /* Running accumulator: one 32-bit lane per correlation lag. */
   int32x4_t a = vld1q_s32(sum);
   /* Load y[0...3] */
   /* This requires len>0 to always be valid (which we assert in the C code). */
   int16x4_t y0 = vld1_s16(y);
   y += 4;

   /* This loop loads one y value more than we actually need.
      Therefore we have to stop as soon as there are 8 or fewer samples left
       (instead of 7), to avoid reading past the end of the array. */
   for (j = 0; j + 8 < len; j += 8)
   {
      /* Load x[0...7] */
      int16x8_t xx = vld1q_s16(x);
      int16x4_t x0 = vget_low_s16(xx);
      int16x4_t x4 = vget_high_s16(xx);
      /* Load y[4...11] */
      int16x8_t yy = vld1q_s16(y);
      int16x4_t y4 = vget_low_s16(yy);
      int16x4_t y8 = vget_high_s16(yy);
      /* Lag 0: multiply the unshifted y window by each x lane in turn. */
      int32x4_t a0 = vmlal_lane_s16(a, y0, x0, 0);
      int32x4_t a1 = vmlal_lane_s16(a0, y4, x4, 0);

      /* Shift the window by one sample and accumulate against x lane 1. */
      int16x4_t y1 = vext_s16(y0, y4, 1);
      int16x4_t y5 = vext_s16(y4, y8, 1);
      int32x4_t a2 = vmlal_lane_s16(a1, y1, x0, 1);
      int32x4_t a3 = vmlal_lane_s16(a2, y5, x4, 1);

      /* Shift by two; accumulate against x lane 2. */
      int16x4_t y2 = vext_s16(y0, y4, 2);
      int16x4_t y6 = vext_s16(y4, y8, 2);
      int32x4_t a4 = vmlal_lane_s16(a3, y2, x0, 2);
      int32x4_t a5 = vmlal_lane_s16(a4, y6, x4, 2);

      /* Shift by three; accumulate against x lane 3. */
      int16x4_t y3 = vext_s16(y0, y4, 3);
      int16x4_t y7 = vext_s16(y4, y8, 3);
      int32x4_t a6 = vmlal_lane_s16(a5, y3, x0, 3);
      int32x4_t a7 = vmlal_lane_s16(a6, y7, x4, 3);

      /* Carry the newest four y values into the next iteration's window. */
      y0 = y8;
      a = a7;
      x += 8;
      y += 8;
   }
   /* 4-sample step: same scheme with a single x vector. */
   if (j + 4 < len) {
      /* Load x[0...3] */
      int16x4_t x0 = vld1_s16(x);
      /* Load y[4...7] */
      int16x4_t y4 = vld1_s16(y);
      int32x4_t a0 = vmlal_lane_s16(a, y0, x0, 0);
      int16x4_t y1 = vext_s16(y0, y4, 1);
      int32x4_t a1 = vmlal_lane_s16(a0, y1, x0, 1);
      int16x4_t y2 = vext_s16(y0, y4, 2);
      int32x4_t a2 = vmlal_lane_s16(a1, y2, x0, 2);
      int16x4_t y3 = vext_s16(y0, y4, 3);
      int32x4_t a3 = vmlal_lane_s16(a2, y3, x0, 3);
      y0 = y4;
      a = a3;
      x += 4;
      y += 4;
      j += 4;
   }
   /* 2-sample step: x[0] and x[1] are each broadcast across a vector. */
   if (j + 2 < len) {
      /* Load x[0...1] */
      int16x4x2_t xx = vld2_dup_s16(x);
      int16x4_t x0 = xx.val[0];
      int16x4_t x1 = xx.val[1];
      /* Load y[4...5].
         We would like to use vld1_dup_s32(), but casting the pointer would
          break strict aliasing rules and potentially have alignment issues.
         Fortunately the compiler seems capable of translating this memcpy()
          and vdup_n_s32() into the equivalent vld1_dup_s32().*/
      int32_t yy;
      memcpy(&yy, y, sizeof(yy));
      int16x4_t y4 = vreinterpret_s16_s32(vdup_n_s32(yy));
      int32x4_t a0 = vmlal_s16(a, y0, x0);
      int16x4_t y1 = vext_s16(y0, y4, 1);
      /* Replace bottom copy of {y[5], y[4]} in y4 with {y[3], y[2]} from y0,
          using VSRI instead of VEXT, since it's a data-processing
          instruction. */
      y0 = vreinterpret_s16_s64(vsri_n_s64(vreinterpret_s64_s16(y4),
       vreinterpret_s64_s16(y0), 32));
      int32x4_t a1 = vmlal_s16(a0, y1, x1);
      a = a1;
      x += 2;
      y += 2;
      j += 2;
   }
   /* 1-sample step: broadcast the next x, shift one y into the window. */
   if (j + 1 < len) {
      /* Load next x. */
      int16x4_t x0 = vld1_dup_s16(x);
      int32x4_t a0 = vmlal_s16(a, y0, x0);
      /* Load last y. */
      int16x4_t y4 = vld1_dup_s16(y);
      y0 = vreinterpret_s16_s64(vsri_n_s64(vreinterpret_s64_s16(y4),
       vreinterpret_s64_s16(y0), 16));
      a = a0;
      x++;
   }
   /* Final sample: every exit path above leaves exactly one x pending,
      so this unconditional step completes the j = len-1 term. */
   /* Load last x. */
   int16x4_t x0 = vld1_dup_s16(x);
   int32x4_t a0 = vmlal_s16(a, y0, x0);
   vst1q_s32(sum, a0);
}
    277 
    278 #else
    279 
    280 #if defined(__ARM_FEATURE_FMA) && defined(__ARM_ARCH_ISA_A64)
    281 /* If we can, force the compiler to use an FMA instruction rather than break
    282 *    vmlaq_f32() into fmul/fadd. */
    283 #ifdef vmlaq_lane_f32
    284 #undef vmlaq_lane_f32
    285 #endif
    286 #define vmlaq_lane_f32(a,b,c,lane) vfmaq_lane_f32(a,b,c,lane)
    287 #endif
    288 
    289 
    290 /*
    291 * Function: xcorr_kernel_neon_float
    292 * ---------------------------------
    293 * Computes 4 correlation values and stores them in sum[4]
    294 */
/*
* Function: xcorr_kernel_neon_float
* ---------------------------------
* Computes 4 correlation values and stores them in sum[4]:
*    sum[k] = dot(x, y + k) over len samples, for k = 0..3.
* A window of y is held in YY[] and the lag-shifted variants are built
* with VEXT; each x lane multiplies the correspondingly shifted window.
*/
static void xcorr_kernel_neon_float(const float32_t *x, const float32_t *y,
     float32_t sum[4], int len) {
  float32x4_t YY[3];       /* sliding window of y: current + two look-ahead vectors */
  float32x4_t YEXT[3];     /* lag-shifted (by 1, 2, 3) views of the window */
  float32x4_t XX[2];
  float32x2_t XX_2;        /* single broadcast x sample for the scalar tail */
  float32x4_t SUMM;        /* four running lag accumulators */
  const float32_t *xi = x;
  const float32_t *yi = y;

  celt_assert(len>0);

  YY[0] = vld1q_f32(yi);
  SUMM = vdupq_n_f32(0);

  /* Consume 8 elements in x vector and 12 elements in y
   * vector. However, the 12'th element never really gets
   * touched in this loop. So, if len == 8, then we only
   * must access y[0] to y[10]. y[11] must not be accessed
   * hence make sure len > 8 and not len >= 8
   */
  while (len > 8) {
     yi += 4;
     YY[1] = vld1q_f32(yi);
     yi += 4;
     YY[2] = vld1q_f32(yi);

     XX[0] = vld1q_f32(xi);
     xi += 4;
     XX[1] = vld1q_f32(xi);
     xi += 4;

     /* First four x samples against lags 0..3 of YY[0..1]. */
     SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0);
     YEXT[0] = vextq_f32(YY[0], YY[1], 1);
     SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1);
     YEXT[1] = vextq_f32(YY[0], YY[1], 2);
     SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0);
     YEXT[2] = vextq_f32(YY[0], YY[1], 3);
     SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1);

     /* Next four x samples against lags 0..3 of YY[1..2]. */
     SUMM = vmlaq_lane_f32(SUMM, YY[1], vget_low_f32(XX[1]), 0);
     YEXT[0] = vextq_f32(YY[1], YY[2], 1);
     SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[1]), 1);
     YEXT[1] = vextq_f32(YY[1], YY[2], 2);
     SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[1]), 0);
     YEXT[2] = vextq_f32(YY[1], YY[2], 3);
     SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[1]), 1);

     /* Slide the window: the newest y vector becomes the oldest. */
     YY[0] = YY[2];
     len -= 8;
  }

  /* Consume 4 elements in x vector and 8 elements in y
   * vector. However, the 8'th element in y never really gets
   * touched in this loop. So, if len == 4, then we only
   * must access y[0] to y[6]. y[7] must not be accessed
   * hence make sure len>4 and not len>=4
   */
  if (len > 4) {
     yi += 4;
     YY[1] = vld1q_f32(yi);

     XX[0] = vld1q_f32(xi);
     xi += 4;

     SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0);
     YEXT[0] = vextq_f32(YY[0], YY[1], 1);
     SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1);
     YEXT[1] = vextq_f32(YY[0], YY[1], 2);
     SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0);
     YEXT[2] = vextq_f32(YY[0], YY[1], 3);
     SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1);

     YY[0] = YY[1];
     len -= 4;
  }

  /* Scalar tail: one x sample per iteration, reloading the (unaligned)
   * y window each time.  Note the pre-decrement: the very last x sample
   * is handled by the epilogue below, so only len-1 iterations run here. */
  while (--len > 0) {
     XX_2 = vld1_dup_f32(xi++);
     SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);
     YY[0]= vld1q_f32(++yi);
  }

  /* Final x sample (always pending after the loops above). */
  XX_2 = vld1_dup_f32(xi);
  SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);

  vst1q_f32(sum, SUMM);
}
    383 
    384 void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
    385                        opus_val32 *xcorr, int len, int max_pitch, int arch) {
    386   int i;
    387   (void)arch;
    388   celt_assert(max_pitch > 0);
    389   celt_sig_assert((((size_t)_x)&3)==0);
    390 
    391   for (i = 0; i < (max_pitch-3); i += 4) {
    392      xcorr_kernel_neon_float((const float32_t *)_x, (const float32_t *)_y+i,
    393            (float32_t *)xcorr+i, len);
    394   }
    395 
    396   /* In case max_pitch isn't a multiple of 4, do non-unrolled version. */
    397   for (; i < max_pitch; i++) {
    398      xcorr[i] = celt_inner_prod_neon(_x, _y+i, len);
    399   }
    400 }
    401 #endif