tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

min_max_operations_neon.c (9732B)


      1 /*
      2 *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
      3 *
      4 *  Use of this source code is governed by a BSD-style license
      5 *  that can be found in the LICENSE file in the root of the source
      6 *  tree. An additional intellectual property rights grant can be found
      7 *  in the file PATENTS.  All contributing project authors may
      8 *  be found in the AUTHORS file in the root of the source tree.
      9 */
     10 
     11 #include <arm_neon.h>
     12 #include <stdlib.h>
     13 
     14 #include "common_audio/signal_processing/include/signal_processing_library.h"
     15 #include "rtc_base/checks.h"
     16 
     17 // Maximum absolute value of word16 vector. C version for generic platforms.
     18 int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, size_t length) {
     19  int absolute = 0, maximum = 0;
     20 
     21  RTC_DCHECK_GT(length, 0);
     22 
     23  const int16_t* p_start = vector;
     24  size_t rest = length & 7;
     25  const int16_t* p_end = vector + length - rest;
     26 
     27  int16x8_t v;
     28  uint16x8_t max_qv;
     29  max_qv = vdupq_n_u16(0);
     30 
     31  while (p_start < p_end) {
     32    v = vld1q_s16(p_start);
     33    // Note vabs doesn't change the value of -32768.
     34    v = vabsq_s16(v);
     35    // Use u16 so we don't lose the value -32768.
     36    max_qv = vmaxq_u16(max_qv, vreinterpretq_u16_s16(v));
     37    p_start += 8;
     38  }
     39 
     40 #ifdef WEBRTC_ARCH_ARM64
     41  maximum = (int)vmaxvq_u16(max_qv);
     42 #else
     43  uint16x4_t max_dv;
     44  max_dv = vmax_u16(vget_low_u16(max_qv), vget_high_u16(max_qv));
     45  max_dv = vpmax_u16(max_dv, max_dv);
     46  max_dv = vpmax_u16(max_dv, max_dv);
     47 
     48  maximum = (int)vget_lane_u16(max_dv, 0);
     49 #endif
     50 
     51  p_end = vector + length;
     52  while (p_start < p_end) {
     53    absolute = abs((int)(*p_start));
     54 
     55    if (absolute > maximum) {
     56      maximum = absolute;
     57    }
     58    p_start++;
     59  }
     60 
     61  // Guard the case for abs(-32768).
     62  if (maximum > WEBRTC_SPL_WORD16_MAX) {
     63    maximum = WEBRTC_SPL_WORD16_MAX;
     64  }
     65 
     66  return (int16_t)maximum;
     67 }
     68 
     69 // Maximum absolute value of word32 vector. NEON intrinsics version for
     70 // ARM 32-bit/64-bit platforms.
     71 int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, size_t length) {
     72  // Use uint32_t for the local variables, to accommodate the return value
     73  // of abs(0x80000000), which is 0x80000000.
     74 
     75  uint32_t absolute = 0, maximum = 0;
     76  size_t i = 0;
     77  size_t residual = length & 0x7;
     78 
     79  RTC_DCHECK_GT(length, 0);
     80 
     81  const int32_t* p_start = vector;
     82  uint32x4_t max32x4_0 = vdupq_n_u32(0);
     83  uint32x4_t max32x4_1 = vdupq_n_u32(0);
     84 
     85  // First part, unroll the loop 8 times.
     86  for (i = 0; i < length - residual; i += 8) {
     87    int32x4_t in32x4_0 = vld1q_s32(p_start);
     88    p_start += 4;
     89    int32x4_t in32x4_1 = vld1q_s32(p_start);
     90    p_start += 4;
     91    in32x4_0 = vabsq_s32(in32x4_0);
     92    in32x4_1 = vabsq_s32(in32x4_1);
     93    // vabs doesn't change the value of 0x80000000.
     94    // Use u32 so we don't lose the value 0x80000000.
     95    max32x4_0 = vmaxq_u32(max32x4_0, vreinterpretq_u32_s32(in32x4_0));
     96    max32x4_1 = vmaxq_u32(max32x4_1, vreinterpretq_u32_s32(in32x4_1));
     97  }
     98 
     99  uint32x4_t max32x4 = vmaxq_u32(max32x4_0, max32x4_1);
    100 #if defined(WEBRTC_ARCH_ARM64)
    101  maximum = vmaxvq_u32(max32x4);
    102 #else
    103  uint32x2_t max32x2 = vmax_u32(vget_low_u32(max32x4), vget_high_u32(max32x4));
    104  max32x2 = vpmax_u32(max32x2, max32x2);
    105 
    106  maximum = vget_lane_u32(max32x2, 0);
    107 #endif
    108 
    109  // Second part, do the remaining iterations (if any).
    110  for (i = residual; i > 0; i--) {
    111    absolute = abs((int)(*p_start));
    112    if (absolute > maximum) {
    113      maximum = absolute;
    114    }
    115    p_start++;
    116  }
    117 
    118  // Guard against the case for 0x80000000.
    119  maximum = WEBRTC_SPL_MIN(maximum, WEBRTC_SPL_WORD32_MAX);
    120 
    121  return (int32_t)maximum;
    122 }
    123 
    124 // Maximum value of word16 vector. NEON intrinsics version for
    125 // ARM 32-bit/64-bit platforms.
    126 int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, size_t length) {
    127  int16_t maximum = WEBRTC_SPL_WORD16_MIN;
    128  size_t i = 0;
    129  size_t residual = length & 0x7;
    130 
    131  RTC_DCHECK_GT(length, 0);
    132 
    133  const int16_t* p_start = vector;
    134  int16x8_t max16x8 = vdupq_n_s16(WEBRTC_SPL_WORD16_MIN);
    135 
    136  // First part, unroll the loop 8 times.
    137  for (i = 0; i < length - residual; i += 8) {
    138    int16x8_t in16x8 = vld1q_s16(p_start);
    139    max16x8 = vmaxq_s16(max16x8, in16x8);
    140    p_start += 8;
    141  }
    142 
    143 #if defined(WEBRTC_ARCH_ARM64)
    144  maximum = vmaxvq_s16(max16x8);
    145 #else
    146  int16x4_t max16x4 = vmax_s16(vget_low_s16(max16x8), vget_high_s16(max16x8));
    147  max16x4 = vpmax_s16(max16x4, max16x4);
    148  max16x4 = vpmax_s16(max16x4, max16x4);
    149 
    150  maximum = vget_lane_s16(max16x4, 0);
    151 #endif
    152 
    153  // Second part, do the remaining iterations (if any).
    154  for (i = residual; i > 0; i--) {
    155    if (*p_start > maximum)
    156      maximum = *p_start;
    157    p_start++;
    158  }
    159  return maximum;
    160 }
    161 
    162 // Maximum value of word32 vector. NEON intrinsics version for
    163 // ARM 32-bit/64-bit platforms.
    164 int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, size_t length) {
    165  int32_t maximum = WEBRTC_SPL_WORD32_MIN;
    166  size_t i = 0;
    167  size_t residual = length & 0x7;
    168 
    169  RTC_DCHECK_GT(length, 0);
    170 
    171  const int32_t* p_start = vector;
    172  int32x4_t max32x4_0 = vdupq_n_s32(WEBRTC_SPL_WORD32_MIN);
    173  int32x4_t max32x4_1 = vdupq_n_s32(WEBRTC_SPL_WORD32_MIN);
    174 
    175  // First part, unroll the loop 8 times.
    176  for (i = 0; i < length - residual; i += 8) {
    177    int32x4_t in32x4_0 = vld1q_s32(p_start);
    178    p_start += 4;
    179    int32x4_t in32x4_1 = vld1q_s32(p_start);
    180    p_start += 4;
    181    max32x4_0 = vmaxq_s32(max32x4_0, in32x4_0);
    182    max32x4_1 = vmaxq_s32(max32x4_1, in32x4_1);
    183  }
    184 
    185  int32x4_t max32x4 = vmaxq_s32(max32x4_0, max32x4_1);
    186 #if defined(WEBRTC_ARCH_ARM64)
    187  maximum = vmaxvq_s32(max32x4);
    188 #else
    189  int32x2_t max32x2 = vmax_s32(vget_low_s32(max32x4), vget_high_s32(max32x4));
    190  max32x2 = vpmax_s32(max32x2, max32x2);
    191 
    192  maximum = vget_lane_s32(max32x2, 0);
    193 #endif
    194 
    195  // Second part, do the remaining iterations (if any).
    196  for (i = residual; i > 0; i--) {
    197    if (*p_start > maximum)
    198      maximum = *p_start;
    199    p_start++;
    200  }
    201  return maximum;
    202 }
    203 
    204 // Minimum value of word16 vector. NEON intrinsics version for
    205 // ARM 32-bit/64-bit platforms.
    206 int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, size_t length) {
    207  int16_t minimum = WEBRTC_SPL_WORD16_MAX;
    208  size_t i = 0;
    209  size_t residual = length & 0x7;
    210 
    211  RTC_DCHECK_GT(length, 0);
    212 
    213  const int16_t* p_start = vector;
    214  int16x8_t min16x8 = vdupq_n_s16(WEBRTC_SPL_WORD16_MAX);
    215 
    216  // First part, unroll the loop 8 times.
    217  for (i = 0; i < length - residual; i += 8) {
    218    int16x8_t in16x8 = vld1q_s16(p_start);
    219    min16x8 = vminq_s16(min16x8, in16x8);
    220    p_start += 8;
    221  }
    222 
    223 #if defined(WEBRTC_ARCH_ARM64)
    224  minimum = vminvq_s16(min16x8);
    225 #else
    226  int16x4_t min16x4 = vmin_s16(vget_low_s16(min16x8), vget_high_s16(min16x8));
    227  min16x4 = vpmin_s16(min16x4, min16x4);
    228  min16x4 = vpmin_s16(min16x4, min16x4);
    229 
    230  minimum = vget_lane_s16(min16x4, 0);
    231 #endif
    232 
    233  // Second part, do the remaining iterations (if any).
    234  for (i = residual; i > 0; i--) {
    235    if (*p_start < minimum)
    236      minimum = *p_start;
    237    p_start++;
    238  }
    239  return minimum;
    240 }
    241 
    242 // Minimum value of word32 vector. NEON intrinsics version for
    243 // ARM 32-bit/64-bit platforms.
    244 int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, size_t length) {
    245  int32_t minimum = WEBRTC_SPL_WORD32_MAX;
    246  size_t i = 0;
    247  size_t residual = length & 0x7;
    248 
    249  RTC_DCHECK_GT(length, 0);
    250 
    251  const int32_t* p_start = vector;
    252  int32x4_t min32x4_0 = vdupq_n_s32(WEBRTC_SPL_WORD32_MAX);
    253  int32x4_t min32x4_1 = vdupq_n_s32(WEBRTC_SPL_WORD32_MAX);
    254 
    255  // First part, unroll the loop 8 times.
    256  for (i = 0; i < length - residual; i += 8) {
    257    int32x4_t in32x4_0 = vld1q_s32(p_start);
    258    p_start += 4;
    259    int32x4_t in32x4_1 = vld1q_s32(p_start);
    260    p_start += 4;
    261    min32x4_0 = vminq_s32(min32x4_0, in32x4_0);
    262    min32x4_1 = vminq_s32(min32x4_1, in32x4_1);
    263  }
    264 
    265  int32x4_t min32x4 = vminq_s32(min32x4_0, min32x4_1);
    266 #if defined(WEBRTC_ARCH_ARM64)
    267  minimum = vminvq_s32(min32x4);
    268 #else
    269  int32x2_t min32x2 = vmin_s32(vget_low_s32(min32x4), vget_high_s32(min32x4));
    270  min32x2 = vpmin_s32(min32x2, min32x2);
    271 
    272  minimum = vget_lane_s32(min32x2, 0);
    273 #endif
    274 
    275  // Second part, do the remaining iterations (if any).
    276  for (i = residual; i > 0; i--) {
    277    if (*p_start < minimum)
    278      minimum = *p_start;
    279    p_start++;
    280  }
    281  return minimum;
    282 }
    283 
    284 // Finds both the minimum and maximum elements in an array of 16-bit integers.
    285 void WebRtcSpl_MinMaxW16Neon(const int16_t* vector,
    286                             size_t length,
    287                             int16_t* min_val,
    288                             int16_t* max_val) {
    289  int16_t minimum = WEBRTC_SPL_WORD16_MAX;
    290  int16_t maximum = WEBRTC_SPL_WORD16_MIN;
    291  size_t i = 0;
    292  size_t residual = length & 0x7;
    293 
    294  RTC_DCHECK_GT(length, 0);
    295 
    296  const int16_t* p_start = vector;
    297  int16x8_t min16x8 = vdupq_n_s16(WEBRTC_SPL_WORD16_MAX);
    298  int16x8_t max16x8 = vdupq_n_s16(WEBRTC_SPL_WORD16_MIN);
    299 
    300  // First part, unroll the loop 8 times.
    301  for (i = 0; i < length - residual; i += 8) {
    302    int16x8_t in16x8 = vld1q_s16(p_start);
    303    min16x8 = vminq_s16(min16x8, in16x8);
    304    max16x8 = vmaxq_s16(max16x8, in16x8);
    305    p_start += 8;
    306  }
    307 
    308 #if defined(WEBRTC_ARCH_ARM64)
    309  minimum = vminvq_s16(min16x8);
    310  maximum = vmaxvq_s16(max16x8);
    311 #else
    312  int16x4_t min16x4 = vmin_s16(vget_low_s16(min16x8), vget_high_s16(min16x8));
    313  min16x4 = vpmin_s16(min16x4, min16x4);
    314  min16x4 = vpmin_s16(min16x4, min16x4);
    315 
    316  minimum = vget_lane_s16(min16x4, 0);
    317 
    318  int16x4_t max16x4 = vmax_s16(vget_low_s16(max16x8), vget_high_s16(max16x8));
    319  max16x4 = vpmax_s16(max16x4, max16x4);
    320  max16x4 = vpmax_s16(max16x4, max16x4);
    321 
    322  maximum = vget_lane_s16(max16x4, 0);
    323 #endif
    324 
    325  // Second part, do the remaining iterations (if any).
    326  for (i = residual; i > 0; i--) {
    327    if (*p_start < minimum)
    328      minimum = *p_start;
    329    if (*p_start > maximum)
    330      maximum = *p_start;
    331    p_start++;
    332  }
    333  *min_val = minimum;
    334  *max_val = maximum;
    335 }