tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

sum_neon.h (10311B)


      1 /*
      2 * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #ifndef AOM_AOM_DSP_ARM_SUM_NEON_H_
     13 #define AOM_AOM_DSP_ARM_SUM_NEON_H_
     14 
     15 #include "config/aom_dsp_rtcd.h"
     16 #include "config/aom_config.h"
     17 
     18 #include "aom/aom_integer.h"
     19 #include "aom_ports/mem.h"
     20 
     21 static inline int horizontal_add_u8x8(const uint8x8_t a) {
     22 #if AOM_ARCH_AARCH64
     23  return vaddlv_u8(a);
     24 #else
     25  uint16x4_t b = vpaddl_u8(a);
     26  uint32x2_t c = vpaddl_u16(b);
     27  return vget_lane_u32(c, 0) + vget_lane_u32(c, 1);
     28 #endif
     29 }
     30 
     31 static inline int horizontal_add_s16x8(const int16x8_t a) {
     32 #if AOM_ARCH_AARCH64
     33  return vaddlvq_s16(a);
     34 #else
     35  const int32x4_t b = vpaddlq_s16(a);
     36  const int64x2_t c = vpaddlq_s32(b);
     37  const int32x2_t d = vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)),
     38                               vreinterpret_s32_s64(vget_high_s64(c)));
     39  return vget_lane_s32(d, 0);
     40 #endif
     41 }
     42 
     43 static inline int horizontal_add_s32x4(const int32x4_t a) {
     44 #if AOM_ARCH_AARCH64
     45  return vaddvq_s32(a);
     46 #else
     47  const int64x2_t b = vpaddlq_s32(a);
     48  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
     49                               vreinterpret_s32_s64(vget_high_s64(b)));
     50  return vget_lane_s32(c, 0);
     51 #endif
     52 }
     53 
     54 static inline int64_t horizontal_add_s64x2(const int64x2_t a) {
     55 #if AOM_ARCH_AARCH64
     56  return vaddvq_s64(a);
     57 #else
     58  return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1);
     59 #endif
     60 }
     61 
     62 static inline uint64_t horizontal_add_u64x2(const uint64x2_t a) {
     63 #if AOM_ARCH_AARCH64
     64  return vaddvq_u64(a);
     65 #else
     66  return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1);
     67 #endif
     68 }
     69 
     70 static inline uint64_t horizontal_long_add_u32x4(const uint32x4_t a) {
     71 #if AOM_ARCH_AARCH64
     72  return vaddlvq_u32(a);
     73 #else
     74  const uint64x2_t b = vpaddlq_u32(a);
     75  return vgetq_lane_u64(b, 0) + vgetq_lane_u64(b, 1);
     76 #endif
     77 }
     78 
     79 static inline int64_t horizontal_long_add_s32x4(const int32x4_t a) {
     80 #if AOM_ARCH_AARCH64
     81  return vaddlvq_s32(a);
     82 #else
     83  const int64x2_t b = vpaddlq_s32(a);
     84  return vgetq_lane_s64(b, 0) + vgetq_lane_s64(b, 1);
     85 #endif
     86 }
     87 
     88 static inline uint32_t horizontal_add_u32x4(const uint32x4_t a) {
     89 #if AOM_ARCH_AARCH64
     90  return vaddvq_u32(a);
     91 #else
     92  const uint64x2_t b = vpaddlq_u32(a);
     93  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
     94                                vreinterpret_u32_u64(vget_high_u64(b)));
     95  return vget_lane_u32(c, 0);
     96 #endif
     97 }
     98 
     99 static inline uint32x4_t horizontal_add_4d_u32x4(const uint32x4_t sum[4]) {
    100 #if AOM_ARCH_AARCH64
    101  uint32x4_t res01 = vpaddq_u32(sum[0], sum[1]);
    102  uint32x4_t res23 = vpaddq_u32(sum[2], sum[3]);
    103  return vpaddq_u32(res01, res23);
    104 #else
    105  uint32x4_t res = vdupq_n_u32(0);
    106  res = vsetq_lane_u32(horizontal_add_u32x4(sum[0]), res, 0);
    107  res = vsetq_lane_u32(horizontal_add_u32x4(sum[1]), res, 1);
    108  res = vsetq_lane_u32(horizontal_add_u32x4(sum[2]), res, 2);
    109  res = vsetq_lane_u32(horizontal_add_u32x4(sum[3]), res, 3);
    110  return res;
    111 #endif
    112 }
    113 
    114 static inline int32x4_t horizontal_add_4d_s32x4(const int32x4_t sum[4]) {
    115 #if AOM_ARCH_AARCH64
    116  int32x4_t res01 = vpaddq_s32(sum[0], sum[1]);
    117  int32x4_t res23 = vpaddq_s32(sum[2], sum[3]);
    118  return vpaddq_s32(res01, res23);
    119 #else
    120  int32x4_t res = vdupq_n_s32(0);
    121  res = vsetq_lane_s32(horizontal_add_s32x4(sum[0]), res, 0);
    122  res = vsetq_lane_s32(horizontal_add_s32x4(sum[1]), res, 1);
    123  res = vsetq_lane_s32(horizontal_add_s32x4(sum[2]), res, 2);
    124  res = vsetq_lane_s32(horizontal_add_s32x4(sum[3]), res, 3);
    125  return res;
    126 #endif
    127 }
    128 
    129 static inline uint32_t horizontal_long_add_u16x8(const uint16x8_t vec_lo,
    130                                                 const uint16x8_t vec_hi) {
    131 #if AOM_ARCH_AARCH64
    132  return vaddlvq_u16(vec_lo) + vaddlvq_u16(vec_hi);
    133 #else
    134  const uint32x4_t vec_l_lo =
    135      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
    136  const uint32x4_t vec_l_hi =
    137      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
    138  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
    139  const uint64x2_t b = vpaddlq_u32(a);
    140  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
    141                                vreinterpret_u32_u64(vget_high_u64(b)));
    142  return vget_lane_u32(c, 0);
    143 #endif
    144 }
    145 
    146 static inline uint32x4_t horizontal_long_add_4d_u16x8(
    147    const uint16x8_t sum_lo[4], const uint16x8_t sum_hi[4]) {
    148  const uint32x4_t a0 = vpaddlq_u16(sum_lo[0]);
    149  const uint32x4_t a1 = vpaddlq_u16(sum_lo[1]);
    150  const uint32x4_t a2 = vpaddlq_u16(sum_lo[2]);
    151  const uint32x4_t a3 = vpaddlq_u16(sum_lo[3]);
    152  const uint32x4_t b0 = vpadalq_u16(a0, sum_hi[0]);
    153  const uint32x4_t b1 = vpadalq_u16(a1, sum_hi[1]);
    154  const uint32x4_t b2 = vpadalq_u16(a2, sum_hi[2]);
    155  const uint32x4_t b3 = vpadalq_u16(a3, sum_hi[3]);
    156 #if AOM_ARCH_AARCH64
    157  const uint32x4_t c0 = vpaddq_u32(b0, b1);
    158  const uint32x4_t c1 = vpaddq_u32(b2, b3);
    159  return vpaddq_u32(c0, c1);
    160 #else
    161  const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0));
    162  const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1));
    163  const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2));
    164  const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3));
    165  const uint32x2_t d0 = vpadd_u32(c0, c1);
    166  const uint32x2_t d1 = vpadd_u32(c2, c3);
    167  return vcombine_u32(d0, d1);
    168 #endif
    169 }
    170 
    171 static inline uint32_t horizontal_add_u16x8(const uint16x8_t a) {
    172 #if AOM_ARCH_AARCH64
    173  return vaddlvq_u16(a);
    174 #else
    175  const uint32x4_t b = vpaddlq_u16(a);
    176  const uint64x2_t c = vpaddlq_u32(b);
    177  const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
    178                                vreinterpret_u32_u64(vget_high_u64(c)));
    179  return vget_lane_u32(d, 0);
    180 #endif
    181 }
    182 
    183 static inline uint32x4_t horizontal_add_4d_u16x8(const uint16x8_t sum[4]) {
    184 #if AOM_ARCH_AARCH64
    185  const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
    186  const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
    187  const uint16x8_t b0 = vpaddq_u16(a0, a1);
    188  return vpaddlq_u16(b0);
    189 #else
    190  const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
    191  const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
    192  const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
    193  const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3]));
    194  const uint16x4_t b0 = vpadd_u16(a0, a1);
    195  const uint16x4_t b1 = vpadd_u16(a2, a3);
    196  return vpaddlq_u16(vcombine_u16(b0, b1));
    197 #endif
    198 }
    199 
    200 static inline int32x4_t horizontal_add_4d_s16x8(const int16x8_t sum[4]) {
    201 #if AOM_ARCH_AARCH64
    202  const int16x8_t a0 = vpaddq_s16(sum[0], sum[1]);
    203  const int16x8_t a1 = vpaddq_s16(sum[2], sum[3]);
    204  const int16x8_t b0 = vpaddq_s16(a0, a1);
    205  return vpaddlq_s16(b0);
    206 #else
    207  const int16x4_t a0 = vadd_s16(vget_low_s16(sum[0]), vget_high_s16(sum[0]));
    208  const int16x4_t a1 = vadd_s16(vget_low_s16(sum[1]), vget_high_s16(sum[1]));
    209  const int16x4_t a2 = vadd_s16(vget_low_s16(sum[2]), vget_high_s16(sum[2]));
    210  const int16x4_t a3 = vadd_s16(vget_low_s16(sum[3]), vget_high_s16(sum[3]));
    211  const int16x4_t b0 = vpadd_s16(a0, a1);
    212  const int16x4_t b1 = vpadd_s16(a2, a3);
    213  return vpaddlq_s16(vcombine_s16(b0, b1));
    214 #endif
    215 }
    216 
    217 static inline uint32_t horizontal_add_u32x2(const uint32x2_t a) {
    218 #if AOM_ARCH_AARCH64
    219  return vaddv_u32(a);
    220 #else
    221  const uint64x1_t b = vpaddl_u32(a);
    222  return vget_lane_u32(vreinterpret_u32_u64(b), 0);
    223 #endif
    224 }
    225 
    226 static inline uint64_t horizontal_long_add_u32x2(const uint32x2_t a) {
    227 #if AOM_ARCH_AARCH64
    228  return vaddlv_u32(a);
    229 #else
    230  const uint64x1_t b = vpaddl_u32(a);
    231  return vget_lane_u64(b, 0);
    232 #endif
    233 }
    234 
    235 static inline uint32_t horizontal_add_u16x4(const uint16x4_t a) {
    236 #if AOM_ARCH_AARCH64
    237  return vaddlv_u16(a);
    238 #else
    239  const uint32x2_t b = vpaddl_u16(a);
    240  const uint64x1_t c = vpaddl_u32(b);
    241  return vget_lane_u32(vreinterpret_u32_u64(c), 0);
    242 #endif
    243 }
    244 
    245 static inline int32x4_t horizontal_add_2d_s32(int32x4_t a, int32x4_t b) {
    246 #if AOM_ARCH_AARCH64
    247  return vpaddq_s32(a, b);
    248 #else
    249  const int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
    250  const int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
    251  return vcombine_s32(a0, b0);
    252 #endif
    253 }
    254 
    255 static inline int32x2_t add_pairwise_s32x4(int32x4_t a) {
    256 #if AOM_ARCH_AARCH64
    257  return vget_low_s32(vpaddq_s32(a, a));
    258 #else
    259  return vpadd_s32(vget_low_s32(a), vget_high_s32(a));
    260 #endif
    261 }
    262 
    263 static inline uint64_t horizontal_long_add_u32x4_x2(const uint32x4_t a[2]) {
    264  return horizontal_long_add_u32x4(a[0]) + horizontal_long_add_u32x4(a[1]);
    265 }
    266 
    267 static inline uint64_t horizontal_long_add_u32x4_x4(const uint32x4_t a[4]) {
    268  uint64x2_t sum = vpaddlq_u32(a[0]);
    269  sum = vpadalq_u32(sum, a[1]);
    270  sum = vpadalq_u32(sum, a[2]);
    271  sum = vpadalq_u32(sum, a[3]);
    272 
    273  return horizontal_add_u64x2(sum);
    274 }
    275 
    276 static inline uint64_t horizontal_long_add_u32x4_x8(const uint32x4_t a[8]) {
    277  uint64x2_t sum[2];
    278  sum[0] = vpaddlq_u32(a[0]);
    279  sum[1] = vpaddlq_u32(a[1]);
    280  sum[0] = vpadalq_u32(sum[0], a[2]);
    281  sum[1] = vpadalq_u32(sum[1], a[3]);
    282  sum[0] = vpadalq_u32(sum[0], a[4]);
    283  sum[1] = vpadalq_u32(sum[1], a[5]);
    284  sum[0] = vpadalq_u32(sum[0], a[6]);
    285  sum[1] = vpadalq_u32(sum[1], a[7]);
    286 
    287  return horizontal_add_u64x2(vaddq_u64(sum[0], sum[1]));
    288 }
    289 
    290 static inline uint64_t horizontal_long_add_u32x4_x16(const uint32x4_t a[16]) {
    291  uint64x2_t sum[2];
    292  sum[0] = vpaddlq_u32(a[0]);
    293  sum[1] = vpaddlq_u32(a[1]);
    294  sum[0] = vpadalq_u32(sum[0], a[2]);
    295  sum[1] = vpadalq_u32(sum[1], a[3]);
    296  sum[0] = vpadalq_u32(sum[0], a[4]);
    297  sum[1] = vpadalq_u32(sum[1], a[5]);
    298  sum[0] = vpadalq_u32(sum[0], a[6]);
    299  sum[1] = vpadalq_u32(sum[1], a[7]);
    300  sum[0] = vpadalq_u32(sum[0], a[8]);
    301  sum[1] = vpadalq_u32(sum[1], a[9]);
    302  sum[0] = vpadalq_u32(sum[0], a[10]);
    303  sum[1] = vpadalq_u32(sum[1], a[11]);
    304  sum[0] = vpadalq_u32(sum[0], a[12]);
    305  sum[1] = vpadalq_u32(sum[1], a[13]);
    306  sum[0] = vpadalq_u32(sum[0], a[14]);
    307  sum[1] = vpadalq_u32(sum[1], a[15]);
    308 
    309  return horizontal_add_u64x2(vaddq_u64(sum[0], sum[1]));
    310 }
    311 
    312 #endif  // AOM_AOM_DSP_ARM_SUM_NEON_H_