pitch_neon_intr.c (10415B)
1 /*********************************************************************** 2 Copyright (c) 2017 Google Inc. 3 Redistribution and use in source and binary forms, with or without 4 modification, are permitted provided that the following conditions 5 are met: 6 - Redistributions of source code must retain the above copyright notice, 7 this list of conditions and the following disclaimer. 8 - Redistributions in binary form must reproduce the above copyright 9 notice, this list of conditions and the following disclaimer in the 10 documentation and/or other materials provided with the distribution. 11 - Neither the name of Internet Society, IETF or IETF Trust, nor the 12 names of specific contributors, may be used to endorse or promote 13 products derived from this software without specific prior written 14 permission. 15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 POSSIBILITY OF SUCH DAMAGE. 
***********************************************************************/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <arm_neon.h>
#include "pitch.h"

#ifdef FIXED_POINT

/* NEON fixed-point inner product: returns the 32-bit dot product of the
 * 16-bit vectors x and y of length N.  The vld1q_s16() loads show that
 * opus_val16 is a 16-bit integer in FIXED_POINT builds.
 */
opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
{
    int i;
    opus_val32 xy;
    int16x8_t x_s16x8, y_s16x8;
    int32x4_t xy_s32x4 = vdupq_n_s32(0); /* four 32-bit lane accumulators */
    int64x2_t xy_s64x2;
    int64x1_t xy_s64x1;

    /* Main loop: 8 products per iteration, widened 16x16->32 and
     * accumulated into the four lanes (low half first, then high half). */
    for (i = 0; i < N - 7; i += 8) {
        x_s16x8  = vld1q_s16(&x[i]);
        y_s16x8  = vld1q_s16(&y[i]);
        xy_s32x4 = vmlal_s16(xy_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y_s16x8));
        xy_s32x4 = vmlal_s16(xy_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y_s16x8));
    }

    /* One 4-wide step if at least 4 samples remain. */
    if (N - i >= 4) {
        const int16x4_t x_s16x4 = vld1_s16(&x[i]);
        const int16x4_t y_s16x4 = vld1_s16(&y[i]);
        xy_s32x4 = vmlal_s16(xy_s32x4, x_s16x4, y_s16x4);
        i += 4;
    }

    /* Horizontal reduction: pairwise-widen the four 32-bit lanes to two
     * 64-bit lanes, add them, and keep the low 32 bits of the total. */
    xy_s64x2 = vpaddlq_s32(xy_s32x4);
    xy_s64x1 = vadd_s64(vget_low_s64(xy_s64x2), vget_high_s64(xy_s64x2));
    xy = vget_lane_s32(vreinterpret_s32_s64(xy_s64x1), 0);

    /* Scalar tail for the last 0..3 samples. */
    for (; i < N; i++) {
        xy = MAC16_16(xy, x[i], y[i]);
    }

#ifdef OPUS_CHECK_ASM
    /* Debug builds require bit-exact agreement with the plain C version. */
    celt_assert(celt_inner_prod_c(x, y, N) == xy);
#endif

    return xy;
}

/* NEON fixed-point dual inner product: computes, in one pass over x, the
 * two dot products <x, y01> and <x, y02>, returned through *xy1 and *xy2.
 * Same accumulation scheme as celt_inner_prod_neon(), run twice per load
 * of x so x is fetched only once.
 */
void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
        int N, opus_val32 *xy1, opus_val32 *xy2)
{
    int i;
    opus_val32 xy01, xy02;
    int16x8_t x_s16x8, y01_s16x8, y02_s16x8;
    int32x4_t xy01_s32x4 = vdupq_n_s32(0); /* lane accumulators for <x, y01> */
    int32x4_t xy02_s32x4 = vdupq_n_s32(0); /* lane accumulators for <x, y02> */
    int64x2_t xy01_s64x2, xy02_s64x2;
    int64x1_t xy01_s64x1, xy02_s64x1;

    /* Main loop: 8 samples per iteration, both products per x load. */
    for (i = 0; i < N - 7; i += 8) {
        x_s16x8   = vld1q_s16(&x[i]);
        y01_s16x8 = vld1q_s16(&y01[i]);
        y02_s16x8 = vld1q_s16(&y02[i]);
        xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y01_s16x8));
        xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y02_s16x8));
        xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y01_s16x8));
        xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y02_s16x8));
    }

    /* One 4-wide step if at least 4 samples remain. */
    if (N - i >= 4) {
        const int16x4_t x_s16x4   = vld1_s16(&x[i]);
        const int16x4_t y01_s16x4 = vld1_s16(&y01[i]);
        const int16x4_t y02_s16x4 = vld1_s16(&y02[i]);
        xy01_s32x4 = vmlal_s16(xy01_s32x4, x_s16x4, y01_s16x4);
        xy02_s32x4 = vmlal_s16(xy02_s32x4, x_s16x4, y02_s16x4);
        i += 4;
    }

    /* Horizontal reduction of each accumulator, as in celt_inner_prod_neon(). */
    xy01_s64x2 = vpaddlq_s32(xy01_s32x4);
    xy02_s64x2 = vpaddlq_s32(xy02_s32x4);
    xy01_s64x1 = vadd_s64(vget_low_s64(xy01_s64x2), vget_high_s64(xy01_s64x2));
    xy02_s64x1 = vadd_s64(vget_low_s64(xy02_s64x2), vget_high_s64(xy02_s64x2));
    xy01 = vget_lane_s32(vreinterpret_s32_s64(xy01_s64x1), 0);
    xy02 = vget_lane_s32(vreinterpret_s32_s64(xy02_s64x1), 0);

    /* Scalar tail for the last 0..3 samples. */
    for (; i < N; i++) {
        xy01 = MAC16_16(xy01, x[i], y01[i]);
        xy02 = MAC16_16(xy02, x[i], y02[i]);
    }
    *xy1 = xy01;
    *xy2 = xy02;

#ifdef OPUS_CHECK_ASM
    /* Debug builds require bit-exact agreement with the plain C version. */
    {
        opus_val32 xy1_c, xy2_c;
        dual_inner_prod_c(x, y01, y02, N, &xy1_c, &xy2_c);
        celt_assert(xy1_c == *xy1);
        celt_assert(xy2_c == *xy2);
    }
#endif
}

#else /* !FIXED_POINT */

/* ========================================================================== */

#ifdef __ARM_FEATURE_FMA
/* If we can, force the compiler to use an FMA instruction rather than break
   vmlaq_f32() into fmul/fadd. */
#define vmlaq_f32(a,b,c) vfmaq_f32(a,b,c)
#endif


#ifdef OPUS_CHECK_ASM

/* This part of code simulates floating-point NEON operations. */

/* celt_inner_prod_neon_float_c_simulation() simulates the floating-point */
/* operations of celt_inner_prod_neon(), and both functions should have bit */
/* exact output.
*/
/* Mirrors the NEON accumulation order of the float celt_inner_prod_neon():
 * four independent lane-wise partial sums combined pairwise at the end.
 * Also accumulates into *err an absolute tolerance for comparing the two
 * results, derived from the running magnitudes of the partial sums.
 */
static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, float *err, int N)
{
    int i;
    *err = 0;
    opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0;
    for (i = 0; i < N - 3; i += 4) {
        xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]);
        xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]);
        xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]);
        xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]);
        /* Track the running magnitudes to bound accumulated rounding error. */
        *err += ABS32(xy0)+ABS32(xy1)+ABS32(xy2)+ABS32(xy3);
    }
    /* Pairwise combine, matching the NEON vadd_f32/vpadd_f32 reduction. */
    xy0 += xy2;
    xy1 += xy3;
    xy = xy0 + xy1;
    *err += ABS32(xy1)+ABS32(xy0)+ABS32(xy);
    /* Scalar tail for the last 0..3 samples. */
    for (; i < N; i++) {
        xy = MAC16_16(xy, x[i], y[i]);
        *err += ABS32(xy);
    }
    /* Scale into an absolute tolerance; the N*1e-37 term presumably keeps
     * the bound non-zero for all-zero inputs — NOTE(review): confirm. */
    *err = *err*2e-7 + N*1e-37;
    return xy;
}

/* dual_inner_prod_neon_float_c_simulation() simulates the floating-point */
/* operations of dual_inner_prod_neon(), and both functions should have bit */
/* exact output. err[0]/err[1] receive the tolerances for the two products. */
static void dual_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
        int N, opus_val32 *xy1, opus_val32 *xy2, float *err)
{
    *xy1 = celt_inner_prod_neon_float_c_simulation(x, y01, &err[0], N);
    *xy2 = celt_inner_prod_neon_float_c_simulation(x, y02, &err[1], N);
}

#endif /* OPUS_CHECK_ASM */

/* ========================================================================== */

/* NEON float inner product: returns the dot product of the float vectors
 * x and y of length N (opus_val16 is float in this branch, per the
 * vld1q_f32() loads).
 */
opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
{
    int i;
    opus_val32 xy;
    float32x4_t xy_f32x4 = vdupq_n_f32(0); /* four lane accumulators */
    float32x2_t xy_f32x2;

    /* Main loop: 8 samples per iteration via two 4-wide multiply-adds
     * into the same accumulator (FMA when __ARM_FEATURE_FMA is set). */
    for (i = 0; i < N - 7; i += 8) {
        float32x4_t x_f32x4, y_f32x4;
        x_f32x4  = vld1q_f32(&x[i]);
        y_f32x4  = vld1q_f32(&y[i]);
        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
        x_f32x4  = vld1q_f32(&x[i + 4]);
        y_f32x4  = vld1q_f32(&y[i + 4]);
        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
    }

    /* One 4-wide step if at least 4 samples remain. */
    if (N - i >= 4) {
        const float32x4_t x_f32x4 = vld1q_f32(&x[i]);
        const float32x4_t y_f32x4 = vld1q_f32(&y[i]);
        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
        i += 4;
    }

    /* Horizontal reduction: halves added, then a pairwise add. */
    xy_f32x2 = vadd_f32(vget_low_f32(xy_f32x4), vget_high_f32(xy_f32x4));
    xy_f32x2 = vpadd_f32(xy_f32x2, xy_f32x2);
    xy = vget_lane_f32(xy_f32x2, 0);

    /* Scalar tail for the last 0..3 samples. */
    for (; i < N; i++) {
        xy = MAC16_16(xy, x[i], y[i]);
    }

#ifdef OPUS_CHECK_ASM
    /* Debug builds compare against the C simulation within its tolerance. */
    {
        float err, res;
        res = celt_inner_prod_neon_float_c_simulation(x, y, &err, N);
        /*if (ABS32(res - xy) > err) fprintf(stderr, "%g %g %g\n", res, xy, err);*/
        celt_assert(ABS32(res - xy) <= err);
    }
#endif

    return xy;
}

/* NEON float dual inner product: computes <x, y01> and <x, y02> in one
 * pass over x, returned through *xy1 and *xy2.  Same scheme as the float
 * celt_inner_prod_neon(), run twice per load of x.
 */
void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
        int N, opus_val32 *xy1, opus_val32 *xy2)
{
    int i;
    opus_val32 xy01, xy02;
    float32x4_t xy01_f32x4 = vdupq_n_f32(0); /* lane accumulators for <x, y01> */
    float32x4_t xy02_f32x4 = vdupq_n_f32(0); /* lane accumulators for <x, y02> */
    float32x2_t xy01_f32x2, xy02_f32x2;

    /* Main loop: 8 samples per iteration, both products per x load. */
    for (i = 0; i < N - 7; i += 8) {
        float32x4_t x_f32x4, y01_f32x4, y02_f32x4;
        x_f32x4    = vld1q_f32(&x[i]);
        y01_f32x4  = vld1q_f32(&y01[i]);
        y02_f32x4  = vld1q_f32(&y02[i]);
        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
        x_f32x4    = vld1q_f32(&x[i + 4]);
        y01_f32x4  = vld1q_f32(&y01[i + 4]);
        y02_f32x4  = vld1q_f32(&y02[i + 4]);
        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
    }

    /* One 4-wide step if at least 4 samples remain. */
    if (N - i >= 4) {
        const float32x4_t x_f32x4   = vld1q_f32(&x[i]);
        const float32x4_t y01_f32x4 = vld1q_f32(&y01[i]);
        const float32x4_t y02_f32x4 = vld1q_f32(&y02[i]);
        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
        i += 4;
    }

    /* Horizontal reduction of each accumulator. */
    xy01_f32x2 = vadd_f32(vget_low_f32(xy01_f32x4), vget_high_f32(xy01_f32x4));
    xy02_f32x2 = vadd_f32(vget_low_f32(xy02_f32x4), vget_high_f32(xy02_f32x4));
    xy01_f32x2 = vpadd_f32(xy01_f32x2, xy01_f32x2);
    xy02_f32x2 = vpadd_f32(xy02_f32x2, xy02_f32x2);
    xy01 = vget_lane_f32(xy01_f32x2, 0);
    xy02 = vget_lane_f32(xy02_f32x2, 0);

    /* Scalar tail for the last 0..3 samples. */
    for (; i < N; i++) {
        xy01 = MAC16_16(xy01, x[i], y01[i]);
        xy02 = MAC16_16(xy02, x[i], y02[i]);
    }
    *xy1 = xy01;
    *xy2 = xy02;

#ifdef OPUS_CHECK_ASM
    /* Debug builds compare against the C simulation within its tolerances. */
    {
        opus_val32 xy1_c, xy2_c;
        float err[2];
        dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c, err);
        /*if (ABS32(xy1_c - *xy1) > err[0]) fprintf(stderr, "dual1 fail: %g %g %g\n", xy1_c, *xy1, err[0]);
        if (ABS32(xy2_c - *xy2) > err[1]) fprintf(stderr, "dual2 fail: %g %g %g\n", xy2_c, *xy2, err[1]);*/
        celt_assert(ABS32(xy1_c - *xy1) <= err[0]);
        celt_assert(ABS32(xy2_c - *xy2) <= err[1]);
    }
#endif
}

#endif /* FIXED_POINT */