/* sum_neon.h */
/*
 * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_ARM_SUM_NEON_H_
#define AOM_AOM_DSP_ARM_SUM_NEON_H_

#include "config/aom_dsp_rtcd.h"
#include "config/aom_config.h"

#include "aom/aom_integer.h"
#include "aom_ports/mem.h"

// Horizontal-sum ("reduction") helpers for NEON vectors.
//
// Each helper has two implementations selected at compile time:
// - AOM_ARCH_AARCH64: a single across-vector reduction intrinsic
//   (vaddv/vaddlv/vpaddq families), available only on AArch64.
// - Armv7 fallback: an equivalent sequence of pairwise adds
//   (vpaddl/vpadd) followed by lane extraction.
// The "long" variants widen before/while summing so the accumulation cannot
// overflow the element type; the plain variants return the element-width sum.

// Sum of all eight u8 lanes, widened to avoid overflow (max 8*255 fits int).
static inline int horizontal_add_u8x8(const uint8x8_t a) {
#if AOM_ARCH_AARCH64
  return vaddlv_u8(a);
#else
  uint16x4_t b = vpaddl_u8(a);
  uint32x2_t c = vpaddl_u16(b);
  return vget_lane_u32(c, 0) + vget_lane_u32(c, 1);
#endif
}

// Sum of all eight s16 lanes as a 32-bit int.
static inline int horizontal_add_s16x8(const int16x8_t a) {
#if AOM_ARCH_AARCH64
  return vaddlvq_s16(a);
#else
  const int32x4_t b = vpaddlq_s16(a);
  const int64x2_t c = vpaddlq_s32(b);
  // Adding the low 32-bit halves of the two 64-bit partial sums is safe
  // here: the widened sum of eight s16 values always fits in 32 bits.
  const int32x2_t d = vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)),
                               vreinterpret_s32_s64(vget_high_s64(c)));
  return vget_lane_s32(d, 0);
#endif
}

// Sum of all four s32 lanes; the result wraps at 32 bits like the
// single-instruction AArch64 vaddvq_s32 does.
static inline int horizontal_add_s32x4(const int32x4_t a) {
#if AOM_ARCH_AARCH64
  return vaddvq_s32(a);
#else
  const int64x2_t b = vpaddlq_s32(a);
  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
                               vreinterpret_s32_s64(vget_high_s64(b)));
  return vget_lane_s32(c, 0);
#endif
}

// Sum of the two s64 lanes.
static inline int64_t horizontal_add_s64x2(const int64x2_t a) {
#if AOM_ARCH_AARCH64
  return vaddvq_s64(a);
#else
  return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1);
#endif
}

// Sum of the two u64 lanes.
static inline uint64_t horizontal_add_u64x2(const uint64x2_t a) {
#if AOM_ARCH_AARCH64
  return vaddvq_u64(a);
#else
  return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1);
#endif
}

// Sum of all four u32 lanes, widened to u64 so it cannot overflow.
static inline uint64_t horizontal_long_add_u32x4(const uint32x4_t a) {
#if AOM_ARCH_AARCH64
  return vaddlvq_u32(a);
#else
  const uint64x2_t b = vpaddlq_u32(a);
  return vgetq_lane_u64(b, 0) + vgetq_lane_u64(b, 1);
#endif
}

// Sum of all four s32 lanes, widened to s64 so it cannot overflow.
static inline int64_t horizontal_long_add_s32x4(const int32x4_t a) {
#if AOM_ARCH_AARCH64
  return vaddlvq_s32(a);
#else
  const int64x2_t b = vpaddlq_s32(a);
  return vgetq_lane_s64(b, 0) + vgetq_lane_s64(b, 1);
#endif
}

// Sum of all four u32 lanes; result wraps at 32 bits.
static inline uint32_t horizontal_add_u32x4(const uint32x4_t a) {
#if AOM_ARCH_AARCH64
  return vaddvq_u32(a);
#else
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
#endif
}

// Reduce four u32x4 accumulators at once: lane i of the result is the
// horizontal sum of sum[i].
static inline uint32x4_t horizontal_add_4d_u32x4(const uint32x4_t sum[4]) {
#if AOM_ARCH_AARCH64
  uint32x4_t res01 = vpaddq_u32(sum[0], sum[1]);
  uint32x4_t res23 = vpaddq_u32(sum[2], sum[3]);
  return vpaddq_u32(res01, res23);
#else
  uint32x4_t res = vdupq_n_u32(0);
  res = vsetq_lane_u32(horizontal_add_u32x4(sum[0]), res, 0);
  res = vsetq_lane_u32(horizontal_add_u32x4(sum[1]), res, 1);
  res = vsetq_lane_u32(horizontal_add_u32x4(sum[2]), res, 2);
  res = vsetq_lane_u32(horizontal_add_u32x4(sum[3]), res, 3);
  return res;
#endif
}

// Signed counterpart of horizontal_add_4d_u32x4.
static inline int32x4_t horizontal_add_4d_s32x4(const int32x4_t sum[4]) {
#if AOM_ARCH_AARCH64
  int32x4_t res01 = vpaddq_s32(sum[0], sum[1]);
  int32x4_t res23 = vpaddq_s32(sum[2], sum[3]);
  return vpaddq_s32(res01, res23);
#else
  int32x4_t res = vdupq_n_s32(0);
  res = vsetq_lane_s32(horizontal_add_s32x4(sum[0]), res, 0);
  res = vsetq_lane_s32(horizontal_add_s32x4(sum[1]), res, 1);
  res = vsetq_lane_s32(horizontal_add_s32x4(sum[2]), res, 2);
  res = vsetq_lane_s32(horizontal_add_s32x4(sum[3]), res, 3);
  return res;
#endif
}

// Sum of all sixteen u16 lanes across two vectors, widened to u32 while
// summing so the accumulation cannot overflow 16 bits.
static inline uint32_t horizontal_long_add_u16x8(const uint16x8_t vec_lo,
                                                 const uint16x8_t vec_hi) {
#if AOM_ARCH_AARCH64
  return vaddlvq_u16(vec_lo) + vaddlvq_u16(vec_hi);
#else
  const uint32x4_t vec_l_lo =
      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
  const uint32x4_t vec_l_hi =
      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
#endif
}

// Reduce four (lo, hi) u16x8 accumulator pairs at once: lane i of the result
// is the u32 horizontal sum of sum_lo[i] and sum_hi[i]. The first widening
// pass (vpaddl/vpadal) is shared by both architecture paths.
static inline uint32x4_t horizontal_long_add_4d_u16x8(
    const uint16x8_t sum_lo[4], const uint16x8_t sum_hi[4]) {
  const uint32x4_t a0 = vpaddlq_u16(sum_lo[0]);
  const uint32x4_t a1 = vpaddlq_u16(sum_lo[1]);
  const uint32x4_t a2 = vpaddlq_u16(sum_lo[2]);
  const uint32x4_t a3 = vpaddlq_u16(sum_lo[3]);
  const uint32x4_t b0 = vpadalq_u16(a0, sum_hi[0]);
  const uint32x4_t b1 = vpadalq_u16(a1, sum_hi[1]);
  const uint32x4_t b2 = vpadalq_u16(a2, sum_hi[2]);
  const uint32x4_t b3 = vpadalq_u16(a3, sum_hi[3]);
#if AOM_ARCH_AARCH64
  const uint32x4_t c0 = vpaddq_u32(b0, b1);
  const uint32x4_t c1 = vpaddq_u32(b2, b3);
  return vpaddq_u32(c0, c1);
#else
  const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0));
  const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1));
  const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2));
  const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3));
  const uint32x2_t d0 = vpadd_u32(c0, c1);
  const uint32x2_t d1 = vpadd_u32(c2, c3);
  return vcombine_u32(d0, d1);
#endif
}

// Sum of all eight u16 lanes, widened to u32 so it cannot overflow.
static inline uint32_t horizontal_add_u16x8(const uint16x8_t a) {
#if AOM_ARCH_AARCH64
  return vaddlvq_u16(a);
#else
  const uint32x4_t b = vpaddlq_u16(a);
  const uint64x2_t c = vpaddlq_u32(b);
  const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
                                vreinterpret_u32_u64(vget_high_u64(c)));
  return vget_lane_u32(d, 0);
#endif
}

// Reduce four u16x8 accumulators at once: lane i of the result is the
// horizontal sum of sum[i], widened to u32 only on the final step.
// NOTE(review): the initial 16-bit pairwise adds can wrap if a vector's
// lane sums exceed 16 bits — callers appear expected to bound their
// accumulation so intermediate sums stay within u16; confirm at call sites.
static inline uint32x4_t horizontal_add_4d_u16x8(const uint16x8_t sum[4]) {
#if AOM_ARCH_AARCH64
  const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
  const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
  const uint16x8_t b0 = vpaddq_u16(a0, a1);
  return vpaddlq_u16(b0);
#else
  const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
  const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
  const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
  const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3]));
  const uint16x4_t b0 = vpadd_u16(a0, a1);
  const uint16x4_t b1 = vpadd_u16(a2, a3);
  return vpaddlq_u16(vcombine_u16(b0, b1));
#endif
}

// Signed counterpart of horizontal_add_4d_u16x8; the same caveat about
// 16-bit intermediate wrap applies.
static inline int32x4_t horizontal_add_4d_s16x8(const int16x8_t sum[4]) {
#if AOM_ARCH_AARCH64
  const int16x8_t a0 = vpaddq_s16(sum[0], sum[1]);
  const int16x8_t a1 = vpaddq_s16(sum[2], sum[3]);
  const int16x8_t b0 = vpaddq_s16(a0, a1);
  return vpaddlq_s16(b0);
#else
  const int16x4_t a0 = vadd_s16(vget_low_s16(sum[0]), vget_high_s16(sum[0]));
  const int16x4_t a1 = vadd_s16(vget_low_s16(sum[1]), vget_high_s16(sum[1]));
  const int16x4_t a2 = vadd_s16(vget_low_s16(sum[2]), vget_high_s16(sum[2]));
  const int16x4_t a3 = vadd_s16(vget_low_s16(sum[3]), vget_high_s16(sum[3]));
  const int16x4_t b0 = vpadd_s16(a0, a1);
  const int16x4_t b1 = vpadd_s16(a2, a3);
  return vpaddlq_s16(vcombine_s16(b0, b1));
#endif
}

// Sum of the two u32 lanes; result wraps at 32 bits.
static inline uint32_t horizontal_add_u32x2(const uint32x2_t a) {
#if AOM_ARCH_AARCH64
  return vaddv_u32(a);
#else
  const uint64x1_t b = vpaddl_u32(a);
  return vget_lane_u32(vreinterpret_u32_u64(b), 0);
#endif
}

// Sum of the two u32 lanes, widened to u64 so it cannot overflow.
static inline uint64_t horizontal_long_add_u32x2(const uint32x2_t a) {
#if AOM_ARCH_AARCH64
  return vaddlv_u32(a);
#else
  const uint64x1_t b = vpaddl_u32(a);
  return vget_lane_u64(b, 0);
#endif
}

// Sum of all four u16 lanes, widened to u32 so it cannot overflow.
static inline uint32_t horizontal_add_u16x4(const uint16x4_t a) {
#if AOM_ARCH_AARCH64
  return vaddlv_u16(a);
#else
  const uint32x2_t b = vpaddl_u16(a);
  const uint64x1_t c = vpaddl_u32(b);
  return vget_lane_u32(vreinterpret_u32_u64(c), 0);
#endif
}

// Pairwise add across two s32x4 vectors:
// result = { a0+a1, a2+a3, b0+b1, b2+b3 }.
static inline int32x4_t horizontal_add_2d_s32(int32x4_t a, int32x4_t b) {
#if AOM_ARCH_AARCH64
  return vpaddq_s32(a, b);
#else
  const int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
  const int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
  return vcombine_s32(a0, b0);
#endif
}

// Pairwise add within one s32x4 vector: result = { a0+a1, a2+a3 }.
static inline int32x2_t add_pairwise_s32x4(int32x4_t a) {
#if AOM_ARCH_AARCH64
  return vget_low_s32(vpaddq_s32(a, a));
#else
  return vpadd_s32(vget_low_s32(a), vget_high_s32(a));
#endif
}

// u64 sum of all lanes of two u32x4 accumulators.
static inline uint64_t horizontal_long_add_u32x4_x2(const uint32x4_t a[2]) {
  return horizontal_long_add_u32x4(a[0]) + horizontal_long_add_u32x4(a[1]);
}

// u64 sum of all lanes of four u32x4 accumulators, accumulated pairwise in
// a u64x2 so no intermediate can overflow.
static inline uint64_t horizontal_long_add_u32x4_x4(const uint32x4_t a[4]) {
  uint64x2_t sum = vpaddlq_u32(a[0]);
  sum = vpadalq_u32(sum, a[1]);
  sum = vpadalq_u32(sum, a[2]);
  sum = vpadalq_u32(sum, a[3]);

  return horizontal_add_u64x2(sum);
}

// u64 sum of all lanes of eight u32x4 accumulators. Two interleaved u64x2
// accumulators break the vpadal dependency chain.
static inline uint64_t horizontal_long_add_u32x4_x8(const uint32x4_t a[8]) {
  uint64x2_t sum[2];
  sum[0] = vpaddlq_u32(a[0]);
  sum[1] = vpaddlq_u32(a[1]);
  sum[0] = vpadalq_u32(sum[0], a[2]);
  sum[1] = vpadalq_u32(sum[1], a[3]);
  sum[0] = vpadalq_u32(sum[0], a[4]);
  sum[1] = vpadalq_u32(sum[1], a[5]);
  sum[0] = vpadalq_u32(sum[0], a[6]);
  sum[1] = vpadalq_u32(sum[1], a[7]);

  return horizontal_add_u64x2(vaddq_u64(sum[0], sum[1]));
}

// u64 sum of all lanes of sixteen u32x4 accumulators; same interleaved
// two-accumulator scheme as the x8 variant.
static inline uint64_t horizontal_long_add_u32x4_x16(const uint32x4_t a[16]) {
  uint64x2_t sum[2];
  sum[0] = vpaddlq_u32(a[0]);
  sum[1] = vpaddlq_u32(a[1]);
  sum[0] = vpadalq_u32(sum[0], a[2]);
  sum[1] = vpadalq_u32(sum[1], a[3]);
  sum[0] = vpadalq_u32(sum[0], a[4]);
  sum[1] = vpadalq_u32(sum[1], a[5]);
  sum[0] = vpadalq_u32(sum[0], a[6]);
  sum[1] = vpadalq_u32(sum[1], a[7]);
  sum[0] = vpadalq_u32(sum[0], a[8]);
  sum[1] = vpadalq_u32(sum[1], a[9]);
  sum[0] = vpadalq_u32(sum[0], a[10]);
  sum[1] = vpadalq_u32(sum[1], a[11]);
  sum[0] = vpadalq_u32(sum[0], a[12]);
  sum[1] = vpadalq_u32(sum[1], a[13]);
  sum[0] = vpadalq_u32(sum[0], a[14]);
  sum[1] = vpadalq_u32(sum[1], a[15]);

  return horizontal_add_u64x2(vaddq_u64(sum[0], sum[1]));
}

#endif  // AOM_AOM_DSP_ARM_SUM_NEON_H_