tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

downsample_fast_neon.c (9646B)


      1 /*
      2 *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
      3 *
      4 *  Use of this source code is governed by a BSD-style license
      5 *  that can be found in the LICENSE file in the root of the source
      6 *  tree. An additional intellectual property rights grant can be found
      7 *  in the file PATENTS.  All contributing project authors may
      8 *  be found in the AUTHORS file in the root of the source tree.
      9 */
     10 
     11 #include <arm_neon.h>
     12 
     13 #include "common_audio/signal_processing/include/signal_processing_library.h"
     14 #include "rtc_base/checks.h"
     15 
     16 // NEON intrinsics version of WebRtcSpl_DownsampleFast()
     17 // for ARM 32-bit/64-bit platforms.
        //
        // Writes data_out_length samples to data_out. For output k the input
        // position is i = delay + k * factor, and the result is
        //   (2048 + sum over j in [0, coefficients_length) of
        //            coefficients[j] * data_in[i - j]) >> 12,
        // saturated to int16_t.
        //
        // data_in is read at index i - j, which is negative whenever j > i;
        // presumably the caller guarantees valid samples before data_in in that
        // case (TODO confirm against callers).
        //
        // Returns -1 if data_out_length or coefficients_length is 0, or if
        // data_in_length is smaller than endpos (one past the input index used
        // at j == 0 for the last output); returns 0 otherwise.
        //
        // NOTE(review): factor is not validated. With factor == 0 the loop
        // increments below would not advance i; confirm callers always pass
        // factor > 0.
     18 int WebRtcSpl_DownsampleFastNeon(const int16_t* data_in,
     19                                 size_t data_in_length,
     20                                 int16_t* data_out,
     21                                 size_t data_out_length,
     22                                 const int16_t* __restrict coefficients,
     23                                 size_t coefficients_length,
     24                                 int factor,
     25                                 size_t delay) {
     26  // Signed indexes are used so that i - j, which indexes data_in,
     27  // can be computed when it is negative.
     28  int i = 0;
     29  int j = 0;
     30  int32_t out_s32 = 0;
         // One past the input index read at j == 0 for the last output. This is
         // computed before the data_out_length == 0 check below; in that case
         // data_out_length - 1 wraps as size_t and the function returns -1.
     31  int endpos = delay + factor * (data_out_length - 1) + 1;
         // Number of outputs left over after grouping the outputs by 8.
     32  size_t res = data_out_length & 0x7;
         // Loop bound for the 8-at-a-time part: endpos moved back by the input
         // span (factor * res) of the leftover outputs.
     33  int endpos1 = endpos - factor * res;
     34 
     35  // Return an error if any of the preconditions is not met.
     36  if (data_out_length == 0 || coefficients_length == 0 ||
     37      (int)data_in_length < endpos) {
     38    return -1;
     39  }
     40 
         // NOTE(review): when data_out_length < 8, res == data_out_length and
         // endpos1 works out to delay - factor + 1, which is negative if
         // factor > delay + 1; confirm the endpos1 check cannot fire for valid
         // calls.
     41  RTC_DCHECK_GE(endpos, 0);
     42  RTC_DCHECK_GE(endpos1, 0);
     43 
     44  // First part: produce 8 outputs per iteration, with 3 subcases
     45  // (factor == 2, factor == 4, and any other factor).
     46  switch (factor) {
     47    case 2: {
             // 8 outputs at factor 2 consume 16 input positions per iteration.
     48      for (i = delay; i < endpos1; i += 16) {
     49        // Rounding value: 0.5 in Q12.
               // out32x4_0 accumulates outputs 0-3, out32x4_1 outputs 4-7.
     50        int32x4_t out32x4_0 = vdupq_n_s32(2048);
     51        int32x4_t out32x4_1 = vdupq_n_s32(2048);
     52 
     53 #if defined(WEBRTC_ARCH_ARM64)
     54        // Process two taps per iteration.
     55        for (j = 0; j < (int)coefficients_length - 1; j += 2) {
                 // Load coefficients[j] and coefficients[j + 1] as one 32-bit
                 // value duplicated across the vector, then view it as 16-bit
                 // lanes. The lane indices used below assume lane 0 is
                 // coefficients[j] and lane 1 is coefficients[j + 1]
                 // (little-endian layout - TODO confirm).
                 // NOTE(review): the cast reads int16_t data through an int32_t
                 // pointer and drops const; check aliasing/alignment
                 // assumptions for the supported toolchains.
     56          int32x2_t coeff32 = vld1_dup_s32((int32_t*)&coefficients[j]);
     57          int16x4_t coeff16x4 = vreinterpret_s16_s32(coeff32);
                 // De-interleaving load of 16 samples starting at
                 // data_in[i - j - 1]: val[0] gets offsets 0, 2, ..., 14
                 // (paired with lane 1) and val[1] gets offsets 1, 3, ..., 15
                 // (paired with lane 0).
     58          int16x8x2_t in16x8x2 = vld2q_s16(&data_in[i - j - 1]);
     59 
     60          // Mul and accumulate low 64-bit data (outputs 0-3).
     61          int16x4_t in16x4_0 = vget_low_s16(in16x8x2.val[0]);
     62          int16x4_t in16x4_1 = vget_low_s16(in16x8x2.val[1]);
     63          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 1);
     64          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_1, coeff16x4, 0);
     65 
     66          // Mul and accumulate high 64-bit data (outputs 4-7).
     67          // TODO: vget_high_s16 has an extra cost on ARM64. It could be
     68          // replaced by vmlal_high_lane_s16, but the interface of
     69          // vmlal_high_lane_s16 has a bug in gcc 4.9.
     70          // This issue needs to be tracked in the future.
     71          int16x4_t in16x4_2 = vget_high_s16(in16x8x2.val[0]);
     72          int16x4_t in16x4_3 = vget_high_s16(in16x8x2.val[1]);
     73          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_2, coeff16x4, 1);
     74          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_3, coeff16x4, 0);
     75        }
     76 
               // Handle the last tap when coefficients_length is odd.
     77        for (; j < (int)coefficients_length; j++) {
     78          int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
                 // Only val[0] (offsets 0, 2, ..., 14 from data_in[i - j]) is
                 // used.
                 // NOTE(review): the load itself spans 16 samples, up to
                 // data_in[i - j + 15]; confirm that stays inside the caller's
                 // buffer.
     79          int16x8x2_t in16x8x2 = vld2q_s16(&data_in[i - j]);
     80 
     81          // Mul and accumulate low 64-bit data (outputs 0-3).
     82          int16x4_t in16x4_0 = vget_low_s16(in16x8x2.val[0]);
     83          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
     84 
     85          // Mul and accumulate high 64-bit data (outputs 4-7).
     86          // TODO: vget_high_s16 has an extra cost on ARM64. It could be
     87          // replaced by vmlal_high_lane_s16, but the interface of
     88          // vmlal_high_lane_s16 has a bug in gcc 4.9.
     89          // This issue needs to be tracked in the future.
     90          int16x4_t in16x4_1 = vget_high_s16(in16x8x2.val[0]);
     91          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
     92        }
     93 #else
     94        // On ARMv7, unrolling the loop 2 times results in a performance
     95        // regression.
     96        for (j = 0; j < (int)coefficients_length; j++) {
     97          int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
                 // Only val[0] (offsets 0, 2, ..., 14 from data_in[i - j]) is
                 // used.
                 // NOTE(review): the load itself spans 16 samples, up to
                 // data_in[i - j + 15]; confirm that stays inside the caller's
                 // buffer.
     98          int16x8x2_t in16x8x2 = vld2q_s16(&data_in[i - j]);
     99 
    100          // Mul and accumulate (low half: outputs 0-3, high half: 4-7).
    101          int16x4_t in16x4_0 = vget_low_s16(in16x8x2.val[0]);
    102          int16x4_t in16x4_1 = vget_high_s16(in16x8x2.val[0]);
    103          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
    104          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
    105        }
    106 #endif
    107 
    108        // Shift right by 12 with saturation to int16 and store 8 outputs.
    109        int16x4_t out16x4_0 = vqshrn_n_s32(out32x4_0, 12);
    110        int16x4_t out16x4_1 = vqshrn_n_s32(out32x4_1, 12);
    111        vst1q_s16(data_out, vcombine_s16(out16x4_0, out16x4_1));
    112        data_out += 8;
    113      }
    114      break;
    115    }
    116    case 4: {
             // 8 outputs at factor 4 consume 32 input positions per iteration.
    117      for (i = delay; i < endpos1; i += 32) {
    118        // Rounding value: 0.5 in Q12.
               // out32x4_0 accumulates outputs 0-3, out32x4_1 outputs 4-7.
    119        int32x4_t out32x4_0 = vdupq_n_s32(2048);
    120        int32x4_t out32x4_1 = vdupq_n_s32(2048);
    121 
    122        // Process four taps per iteration.
    123        for (j = 0; j < (int)coefficients_length - 3; j += 4) {
                 // Lanes 0-3 hold coefficients[j] .. coefficients[j + 3].
    124          int16x4_t coeff16x4 = vld1_s16(&coefficients[j]);
                 // De-interleaving load of 32 samples starting at
                 // data_in[i - j - 3]: val[n] gets offsets n, n + 4, ..., n + 28,
                 // so val[0] pairs with lane 3 and val[3] pairs with lane 0.
    125          int16x8x4_t in16x8x4 = vld4q_s16(&data_in[i - j - 3]);
    126 
    127          // Mul and accumulate low 64-bit data (outputs 0-3).
    128          int16x4_t in16x4_0 = vget_low_s16(in16x8x4.val[0]);
    129          int16x4_t in16x4_2 = vget_low_s16(in16x8x4.val[1]);
    130          int16x4_t in16x4_4 = vget_low_s16(in16x8x4.val[2]);
    131          int16x4_t in16x4_6 = vget_low_s16(in16x8x4.val[3]);
    132          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 3);
    133          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_2, coeff16x4, 2);
    134          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_4, coeff16x4, 1);
    135          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_6, coeff16x4, 0);
    136 
    137          // Mul and accumulate high 64-bit data (outputs 4-7).
    138          // TODO: vget_high_s16 has an extra cost on ARM64. It could be
    139          // replaced by vmlal_high_lane_s16, but the interface of
    140          // vmlal_high_lane_s16 has a bug in gcc 4.9.
    141          // This issue needs to be tracked in the future.
    142          int16x4_t in16x4_1 = vget_high_s16(in16x8x4.val[0]);
    143          int16x4_t in16x4_3 = vget_high_s16(in16x8x4.val[1]);
    144          int16x4_t in16x4_5 = vget_high_s16(in16x8x4.val[2]);
    145          int16x4_t in16x4_7 = vget_high_s16(in16x8x4.val[3]);
    146          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 3);
    147          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_3, coeff16x4, 2);
    148          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_5, coeff16x4, 1);
    149          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_7, coeff16x4, 0);
    150        }
    151 
               // Handle the up to three taps left over by the loop above.
    152        for (; j < (int)coefficients_length; j++) {
    153          int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
                 // Only val[0] (offsets 0, 4, ..., 28 from data_in[i - j]) is
                 // used.
                 // NOTE(review): the load itself spans 32 samples, up to
                 // data_in[i - j + 31]; confirm that stays inside the caller's
                 // buffer.
    154          int16x8x4_t in16x8x4 = vld4q_s16(&data_in[i - j]);
    155 
    156          // Mul and accumulate low 64-bit data (outputs 0-3).
    157          int16x4_t in16x4_0 = vget_low_s16(in16x8x4.val[0]);
    158          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
    159 
    160          // Mul and accumulate high 64-bit data (outputs 4-7).
    161          // TODO: vget_high_s16 has an extra cost on ARM64. It could be
    162          // replaced by vmlal_high_lane_s16, but the interface of
    163          // vmlal_high_lane_s16 has a bug in gcc 4.9.
    164          // This issue needs to be tracked in the future.
    165          int16x4_t in16x4_1 = vget_high_s16(in16x8x4.val[0]);
    166          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
    167        }
    168 
    169        // Shift right by 12 with saturation to int16 and store 8 outputs.
    170        int16x4_t out16x4_0 = vqshrn_n_s32(out32x4_0, 12);
    171        int16x4_t out16x4_1 = vqshrn_n_s32(out32x4_1, 12);
    172        vst1q_s16(data_out, vcombine_s16(out16x4_0, out16x4_1));
    173        data_out += 8;
    174      }
    175      break;
    176    }
    177    default: {
             // Generic factor: 8 outputs consume factor * 8 input positions.
    178      for (i = delay; i < endpos1; i += factor * 8) {
    179        // Rounding value: 0.5 in Q12.
               // out32x4_0 accumulates outputs 0-3, out32x4_1 outputs 4-7.
    180        int32x4_t out32x4_0 = vdupq_n_s32(2048);
    181        int32x4_t out32x4_1 = vdupq_n_s32(2048);
    182 
    183        for (j = 0; j < (int)coefficients_length; j++) {
    184          int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
                 // Gather the 8 input samples for this tap one lane at a time,
                 // `factor` samples apart: in16x4_0 holds the samples for
                 // outputs 0-3 and in16x4_1 those for outputs 4-7.
                 // vld1_dup_s16 fills every lane; lanes 1-3 are then
                 // overwritten.
    185          int16x4_t in16x4_0 = vld1_dup_s16(&data_in[i - j]);
    186          in16x4_0 = vld1_lane_s16(&data_in[i + factor - j], in16x4_0, 1);
    187          in16x4_0 = vld1_lane_s16(&data_in[i + factor * 2 - j], in16x4_0, 2);
    188          in16x4_0 = vld1_lane_s16(&data_in[i + factor * 3 - j], in16x4_0, 3);
    189          int16x4_t in16x4_1 = vld1_dup_s16(&data_in[i + factor * 4 - j]);
    190          in16x4_1 = vld1_lane_s16(&data_in[i + factor * 5 - j], in16x4_1, 1);
    191          in16x4_1 = vld1_lane_s16(&data_in[i + factor * 6 - j], in16x4_1, 2);
    192          in16x4_1 = vld1_lane_s16(&data_in[i + factor * 7 - j], in16x4_1, 3);
    193 
    194          // Mul and accumulate.
    195          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
    196          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
    197        }
    198 
    199        // Shift right by 12 with saturation to int16 and store 8 outputs.
    200        int16x4_t out16x4_0 = vqshrn_n_s32(out32x4_0, 12);
    201        int16x4_t out16x4_1 = vqshrn_n_s32(out32x4_1, 12);
    202        vst1q_s16(data_out, vcombine_s16(out16x4_0, out16x4_1));
    203        data_out += 8;
    204      }
    205      break;
    206    }
    207  }
    208 
    209  // Second part: do the remaining iterations (if any).
         // i continues from where the first part stopped (or from delay when
         // the first part ran no iterations), one output per iteration.
    210  for (; i < endpos; i += factor) {
    211    out_s32 = 2048;  // Rounding value: 0.5 in Q12.
    212 
           // WebRtc_MulAccumW16(a, b, c) is defined outside this file;
           // presumably it evaluates to a * b + c (TODO confirm).
    213    for (j = 0; j < (int)coefficients_length; j++) {
    214      out_s32 = WebRtc_MulAccumW16(coefficients[j], data_in[i - j], out_s32);
    215    }
    216 
    217    // Saturate and store the output.
    218    out_s32 >>= 12;
    219    *data_out++ = WebRtcSpl_SatW32ToW16(out_s32);
    220  }
    221 
    222  return 0;
    223 }