celt_neon_intr.c (13324B)
1 /* Copyright (c) 2014-2015 Xiph.Org Foundation 2 Copyright (c) 2024 Arm Limited 3 Written by Viswanath Puttagunta */ 4 /** 5 @file celt_neon_intr.c 6 @brief ARM Neon Intrinsic optimizations for celt 7 */ 8 9 /* 10 Redistribution and use in source and binary forms, with or without 11 modification, are permitted provided that the following conditions 12 are met: 13 14 - Redistributions of source code must retain the above copyright 15 notice, this list of conditions and the following disclaimer. 16 17 - Redistributions in binary form must reproduce the above copyright 18 notice, this list of conditions and the following disclaimer in the 19 documentation and/or other materials provided with the distribution. 20 21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 25 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 26 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 27 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 28 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 29 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 30 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 31 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #ifdef HAVE_CONFIG_H 35 #include "config.h" 36 #endif 37 38 #include <arm_neon.h> 39 #include "../float_cast.h" 40 #include "../mathops.h" 41 #include "../pitch.h" 42 #include <stddef.h> 43 #if defined(OPUS_CHECK_ASM) 44 #include <stdlib.h> 45 #endif 46 47 #if !defined(DISABLE_FLOAT_API) && defined(OPUS_ARM_MAY_HAVE_NEON_INTR) 48 49 void celt_float2int16_neon(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt) 50 { 51 int i = 0; 52 53 #if defined(__ARM_NEON) 54 const int BLOCK_SIZE = 16; 55 const int blockedSize = cnt / BLOCK_SIZE * BLOCK_SIZE; 56 57 for (; i < blockedSize; i += BLOCK_SIZE) 58 { 59 float32x4_t orig_a = vld1q_f32(&in[i + 0]); 60 float32x4_t orig_b = vld1q_f32(&in[i + 4]); 61 float32x4_t orig_c = vld1q_f32(&in[i + 8]); 62 float32x4_t orig_d = vld1q_f32(&in[i + 12]); 63 64 int16x4_t asShort_a = vqmovn_s32(vroundf(vmulq_n_f32(orig_a, CELT_SIG_SCALE))); 65 int16x4_t asShort_b = vqmovn_s32(vroundf(vmulq_n_f32(orig_b, CELT_SIG_SCALE))); 66 int16x4_t asShort_c = vqmovn_s32(vroundf(vmulq_n_f32(orig_c, CELT_SIG_SCALE))); 67 int16x4_t asShort_d = vqmovn_s32(vroundf(vmulq_n_f32(orig_d, CELT_SIG_SCALE))); 68 69 vst1_s16(&out[i + 0], asShort_a); 70 vst1_s16(&out[i + 4], asShort_b); 71 vst1_s16(&out[i + 8], asShort_c); 72 vst1_s16(&out[i + 12], asShort_d); 73 # if defined(OPUS_CHECK_ASM) 74 short out_c[BLOCK_SIZE]; 75 int j; 76 for(j = 0; j < BLOCK_SIZE; j++) 77 { 78 out_c[j] = FLOAT2INT16(in[i + j]); 79 celt_assert(abs((out_c[j] - out[i + j])) <= 1); 80 } 81 # endif 82 } 83 #endif 84 85 for (; i < cnt; i++) 86 { 87 out[i] = FLOAT2INT16(in[i]); 88 } 89 } 90 91 int opus_limit2_checkwithin1_neon(float *samples, int cnt) 92 { 93 const float hardclipMin = -2.0f; 94 const float hardclipMax = 2.0f; 95 96 int i = 0; 97 int exceeding1 = 0; 98 int nextIndex = 0; 99 100 #if defined(__ARM_NEON) 101 const int BLOCK_SIZE = 16; 102 const int blockedSize = cnt / BLOCK_SIZE * BLOCK_SIZE; 103 104 float32x4_t min_all_0 = vdupq_n_f32(0.0f); 105 float32x4_t min_all_1 = vdupq_n_f32(0.0f); 106 float32x4_t max_all_0 = vdupq_n_f32(0.0f); 107 float32x4_t max_all_1 = vdupq_n_f32(0.0f); 108 109 float max, min; 110 111 for (i = 0; i < blockedSize; i += BLOCK_SIZE) 112 { 113 const float32x4_t orig_a = vld1q_f32(&samples[i + 0]); 114 const float32x4_t orig_b = vld1q_f32(&samples[i + 4]); 115 const float32x4_t orig_c = vld1q_f32(&samples[i + 8]); 116 const float32x4_t orig_d = vld1q_f32(&samples[i + 12]); 117 max_all_0 = vmaxq_f32(max_all_0, vmaxq_f32(orig_a, orig_b)); 118 max_all_1 = vmaxq_f32(max_all_1, vmaxq_f32(orig_c, orig_d)); 119 min_all_0 = vminq_f32(min_all_0, vminq_f32(orig_a, orig_b)); 120 min_all_1 = vminq_f32(min_all_1, vminq_f32(orig_c, orig_d)); 121 } 122 123 max = vmaxvf(vmaxq_f32(max_all_0, max_all_1)); 124 min = vminvf(vminq_f32(min_all_0, min_all_1)); 125 126 if (min < hardclipMin || max > hardclipMax) 127 { 128 const float32x4_t hardclipMinReg = vdupq_n_f32(hardclipMin); 129 const float32x4_t hardclipMaxReg = vdupq_n_f32(hardclipMax); 130 for (i = 0; i < blockedSize; i += BLOCK_SIZE) 131 { 132 const float32x4_t orig_a = vld1q_f32(&samples[i + 0]); 133 const float32x4_t orig_b = vld1q_f32(&samples[i + 4]); 134 const float32x4_t orig_c = vld1q_f32(&samples[i + 8]); 135 const float32x4_t orig_d = vld1q_f32(&samples[i + 12]); 136 const float32x4_t clipped_a = vminq_f32(hardclipMaxReg, vmaxq_f32(orig_a, hardclipMinReg)); 137 const float32x4_t clipped_b = vminq_f32(hardclipMaxReg, vmaxq_f32(orig_b, hardclipMinReg)); 138 const float32x4_t clipped_c = vminq_f32(hardclipMaxReg, vmaxq_f32(orig_c, hardclipMinReg)); 139 const float32x4_t clipped_d = vminq_f32(hardclipMaxReg, vmaxq_f32(orig_d, hardclipMinReg)); 140 vst1q_f32(&samples[i + 0], clipped_a); 141 vst1q_f32(&samples[i + 4], clipped_b); 142 vst1q_f32(&samples[i + 8], clipped_c); 143 vst1q_f32(&samples[i + 12], clipped_d); 144 } 145 } 146 147 nextIndex = blockedSize; 148 exceeding1 |= max > 1.0f || min < -1.0f; 149 150 #endif 151 152 for (i = nextIndex; i < cnt; i++) 153 { 154 const float origVal = samples[i]; 155 float clippedVal = origVal; 156 clippedVal = MAX16(hardclipMin, clippedVal); 157 clippedVal = MIN16(hardclipMax, clippedVal); 158 samples[i] = clippedVal; 159 160 exceeding1 |= origVal > 1.0f || origVal < -1.0f; 161 } 162 163 return !exceeding1; 164 } 165 166 #endif 167 168 169 #if defined(FIXED_POINT) 170 #include <string.h> 171 172 void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len) 173 { 174 int j; 175 int32x4_t a = vld1q_s32(sum); 176 /* Load y[0...3] */ 177 /* This requires len>0 to always be valid (which we assert in the C code). */ 178 int16x4_t y0 = vld1_s16(y); 179 y += 4; 180 181 /* This loop loads one y value more than we actually need. 182 Therefore we have to stop as soon as there are 8 or fewer samples left 183 (instead of 7), to avoid reading past the end of the array. */ 184 for (j = 0; j + 8 < len; j += 8) 185 { 186 /* Load x[0...7] */ 187 int16x8_t xx = vld1q_s16(x); 188 int16x4_t x0 = vget_low_s16(xx); 189 int16x4_t x4 = vget_high_s16(xx); 190 /* Load y[4...11] */ 191 int16x8_t yy = vld1q_s16(y); 192 int16x4_t y4 = vget_low_s16(yy); 193 int16x4_t y8 = vget_high_s16(yy); 194 int32x4_t a0 = vmlal_lane_s16(a, y0, x0, 0); 195 int32x4_t a1 = vmlal_lane_s16(a0, y4, x4, 0); 196 197 int16x4_t y1 = vext_s16(y0, y4, 1); 198 int16x4_t y5 = vext_s16(y4, y8, 1); 199 int32x4_t a2 = vmlal_lane_s16(a1, y1, x0, 1); 200 int32x4_t a3 = vmlal_lane_s16(a2, y5, x4, 1); 201 202 int16x4_t y2 = vext_s16(y0, y4, 2); 203 int16x4_t y6 = vext_s16(y4, y8, 2); 204 int32x4_t a4 = vmlal_lane_s16(a3, y2, x0, 2); 205 int32x4_t a5 = vmlal_lane_s16(a4, y6, x4, 2); 206 207 int16x4_t y3 = vext_s16(y0, y4, 3); 208 int16x4_t y7 = vext_s16(y4, y8, 3); 209 int32x4_t a6 = vmlal_lane_s16(a5, y3, x0, 3); 210 int32x4_t a7 = vmlal_lane_s16(a6, y7, x4, 3); 211 212 y0 = y8; 213 a = a7; 214 x += 8; 215 y += 8; 216 } 217 if (j + 4 < len) { 218 /* Load x[0...3] */ 219 int16x4_t x0 = vld1_s16(x); 220 /* Load y[4...7] */ 221 int16x4_t y4 = vld1_s16(y); 222 int32x4_t a0 = vmlal_lane_s16(a, y0, x0, 0); 223 int16x4_t y1 = vext_s16(y0, y4, 1); 224 int32x4_t a1 = vmlal_lane_s16(a0, y1, x0, 1); 225 int16x4_t y2 = vext_s16(y0, y4, 2); 226 int32x4_t a2 = vmlal_lane_s16(a1, y2, x0, 2); 227 int16x4_t y3 = vext_s16(y0, y4, 3); 228 int32x4_t a3 = vmlal_lane_s16(a2, y3, x0, 3); 229 y0 = y4; 230 a = a3; 231 x += 4; 232 y += 4; 233 j += 4; 234 } 235 if (j + 2 < len) { 236 /* Load x[0...1] */ 237 int16x4x2_t xx = vld2_dup_s16(x); 238 int16x4_t x0 = xx.val[0]; 239 int16x4_t x1 = xx.val[1]; 240 /* Load y[4...5]. 241 We would like to use vld1_dup_s32(), but casting the pointer would 242 break strict aliasing rules and potentially have alignment issues. 243 Fortunately the compiler seems capable of translating this memcpy() 244 and vdup_n_s32() into the equivalent vld1_dup_s32().*/ 245 int32_t yy; 246 memcpy(&yy, y, sizeof(yy)); 247 int16x4_t y4 = vreinterpret_s16_s32(vdup_n_s32(yy)); 248 int32x4_t a0 = vmlal_s16(a, y0, x0); 249 int16x4_t y1 = vext_s16(y0, y4, 1); 250 /* Replace bottom copy of {y[5], y[4]} in y4 with {y[3], y[2]} from y0, 251 using VSRI instead of VEXT, since it's a data-processing 252 instruction. */ 253 y0 = vreinterpret_s16_s64(vsri_n_s64(vreinterpret_s64_s16(y4), 254 vreinterpret_s64_s16(y0), 32)); 255 int32x4_t a1 = vmlal_s16(a0, y1, x1); 256 a = a1; 257 x += 2; 258 y += 2; 259 j += 2; 260 } 261 if (j + 1 < len) { 262 /* Load next x. */ 263 int16x4_t x0 = vld1_dup_s16(x); 264 int32x4_t a0 = vmlal_s16(a, y0, x0); 265 /* Load last y. */ 266 int16x4_t y4 = vld1_dup_s16(y); 267 y0 = vreinterpret_s16_s64(vsri_n_s64(vreinterpret_s64_s16(y4), 268 vreinterpret_s64_s16(y0), 16)); 269 a = a0; 270 x++; 271 } 272 /* Load last x. */ 273 int16x4_t x0 = vld1_dup_s16(x); 274 int32x4_t a0 = vmlal_s16(a, y0, x0); 275 vst1q_s32(sum, a0); 276 } 277 278 #else 279 280 #if defined(__ARM_FEATURE_FMA) && defined(__ARM_ARCH_ISA_A64) 281 /* If we can, force the compiler to use an FMA instruction rather than break 282 * vmlaq_f32() into fmul/fadd. */ 283 #ifdef vmlaq_lane_f32 284 #undef vmlaq_lane_f32 285 #endif 286 #define vmlaq_lane_f32(a,b,c,lane) vfmaq_lane_f32(a,b,c,lane) 287 #endif 288 289 290 /* 291 * Function: xcorr_kernel_neon_float 292 * --------------------------------- 293 * Computes 4 correlation values and stores them in sum[4] 294 */ 295 static void xcorr_kernel_neon_float(const float32_t *x, const float32_t *y, 296 float32_t sum[4], int len) { 297 float32x4_t YY[3]; 298 float32x4_t YEXT[3]; 299 float32x4_t XX[2]; 300 float32x2_t XX_2; 301 float32x4_t SUMM; 302 const float32_t *xi = x; 303 const float32_t *yi = y; 304 305 celt_assert(len>0); 306 307 YY[0] = vld1q_f32(yi); 308 SUMM = vdupq_n_f32(0); 309 310 /* Consume 8 elements in x vector and 12 elements in y 311 * vector. However, the 12'th element never really gets 312 * touched in this loop. So, if len == 8, then we only 313 * must access y[0] to y[10]. y[11] must not be accessed 314 * hence make sure len > 8 and not len >= 8 315 */ 316 while (len > 8) { 317 yi += 4; 318 YY[1] = vld1q_f32(yi); 319 yi += 4; 320 YY[2] = vld1q_f32(yi); 321 322 XX[0] = vld1q_f32(xi); 323 xi += 4; 324 XX[1] = vld1q_f32(xi); 325 xi += 4; 326 327 SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0); 328 YEXT[0] = vextq_f32(YY[0], YY[1], 1); 329 SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1); 330 YEXT[1] = vextq_f32(YY[0], YY[1], 2); 331 SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0); 332 YEXT[2] = vextq_f32(YY[0], YY[1], 3); 333 SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1); 334 335 SUMM = vmlaq_lane_f32(SUMM, YY[1], vget_low_f32(XX[1]), 0); 336 YEXT[0] = vextq_f32(YY[1], YY[2], 1); 337 SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[1]), 1); 338 YEXT[1] = vextq_f32(YY[1], YY[2], 2); 339 SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[1]), 0); 340 YEXT[2] = vextq_f32(YY[1], YY[2], 3); 341 SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[1]), 1); 342 343 YY[0] = YY[2]; 344 len -= 8; 345 } 346 347 /* Consume 4 elements in x vector and 8 elements in y 348 * vector. However, the 8'th element in y never really gets 349 * touched in this loop. So, if len == 4, then we only 350 * must access y[0] to y[6]. y[7] must not be accessed 351 * hence make sure len>4 and not len>=4 352 */ 353 if (len > 4) { 354 yi += 4; 355 YY[1] = vld1q_f32(yi); 356 357 XX[0] = vld1q_f32(xi); 358 xi += 4; 359 360 SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0); 361 YEXT[0] = vextq_f32(YY[0], YY[1], 1); 362 SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1); 363 YEXT[1] = vextq_f32(YY[0], YY[1], 2); 364 SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0); 365 YEXT[2] = vextq_f32(YY[0], YY[1], 3); 366 SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1); 367 368 YY[0] = YY[1]; 369 len -= 4; 370 } 371 372 while (--len > 0) { 373 XX_2 = vld1_dup_f32(xi++); 374 SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0); 375 YY[0]= vld1q_f32(++yi); 376 } 377 378 XX_2 = vld1_dup_f32(xi); 379 SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0); 380 381 vst1q_f32(sum, SUMM); 382 } 383 384 void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y, 385 opus_val32 *xcorr, int len, int max_pitch, int arch) { 386 int i; 387 (void)arch; 388 celt_assert(max_pitch > 0); 389 celt_sig_assert((((size_t)_x)&3)==0); 390 391 for (i = 0; i < (max_pitch-3); i += 4) { 392 xcorr_kernel_neon_float((const float32_t *)_x, (const float32_t *)_y+i, 393 (float32_t *)xcorr+i, len); 394 } 395 396 /* In case max_pitch isn't a multiple of 4, do non-unrolled version. */ 397 for (; i < max_pitch; i++) { 398 xcorr[i] = celt_inner_prod_neon(_x, _y+i, len); 399 } 400 } 401 #endif