NSQ_del_dec_neon_intr.c (65908B)
/***********************************************************************
Copyright (c) 2017 Google Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of Internet Society, IETF or IETF Trust, nor the
names of specific contributors, may be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
***********************************************************************/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <arm_neon.h>
#ifdef OPUS_CHECK_ASM
# include <string.h>
#endif
#include "main.h"
#include "stack_alloc.h"
#include "os_support.h"

/* NEON intrinsics optimization now can only parallelize up to 4 delay decision states.    */
/* If there are more states, C function is called, and this optimization must be expanded. */
#define NEON_MAX_DEL_DEC_STATES 4
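
/* Layout note: the scalar implementation in NSQ_del_dec.c keeps an array of per-state structs,
   one per delayed-decision state.  Here every per-state field is stored transposed, indexed as
   [ time or tap index ][ NEON_MAX_DEL_DEC_STATES ], so the values of all parallel states for a
   given index are contiguous in memory and a single vld1q_s32()/vst1q_s32() moves all of them
   at once.  A sketch of the access pattern this enables (lanes 0..3 holding states 0..3):

       int32x4_t v = vld1q_s32( psDelDec->sLPC_Q14[ i ] );
*/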

typedef struct {
    opus_int32 sLPC_Q14[ MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH ][ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 RandState[ DECISION_DELAY ][ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 Q_Q10[ DECISION_DELAY ][ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 Xq_Q14[ DECISION_DELAY ][ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 Pred_Q15[ DECISION_DELAY ][ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 Shape_Q14[ DECISION_DELAY ][ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ][ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 LF_AR_Q14[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 Diff_Q14[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 Seed[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 SeedInit[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 RD_Q10[ NEON_MAX_DEL_DEC_STATES ];
} NSQ_del_decs_struct;

typedef struct {
    opus_int32 Q_Q10[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 RD_Q10[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 xq_Q14[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 LF_AR_Q14[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 Diff_Q14[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 sLTP_shp_Q14[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 LPC_exc_Q14[ NEON_MAX_DEL_DEC_STATES ];
} NSQ_samples_struct;

static OPUS_INLINE void silk_nsq_del_dec_scale_states_neon(
    const silk_encoder_state *psEncC,               /* I    Encoder State                   */
    silk_nsq_state *NSQ,                            /* I/O  NSQ state                       */
    NSQ_del_decs_struct psDelDec[],                 /* I/O  Delayed decision states         */
    const opus_int16 x16[],                         /* I    Input                           */
    opus_int32 x_sc_Q10[],                          /* O    Input scaled with 1/Gain in Q10 */
    const opus_int16 sLTP[],                        /* I    Re-whitened LTP state in Q0     */
    opus_int32 sLTP_Q15[],                          /* O    LTP state matching scaled input */
    opus_int subfr,                                 /* I    Subframe number                 */
    const opus_int LTP_scale_Q14,                   /* I    LTP state scaling               */
    const opus_int32 Gains_Q16[ MAX_NB_SUBFR ],     /* I                                    */
    const opus_int pitchL[ MAX_NB_SUBFR ],          /* I    Pitch lag                       */
    const opus_int signal_type,                     /* I    Signal type                     */
    const opus_int decisionDelay                    /* I    Decision delay                  */
);

/******************************************/
/* Noise shape quantizer for one subframe */
/******************************************/
static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon(
    silk_nsq_state *NSQ,                            /* I/O  NSQ state                       */
    NSQ_del_decs_struct psDelDec[],                 /* I/O  Delayed decision states         */
    opus_int signalType,                            /* I    Signal type                     */
    const opus_int32 x_Q10[],                       /* I                                    */
    opus_int8 pulses[],                             /* O                                    */
    opus_int16 xq[],                                /* O                                    */
    opus_int32 sLTP_Q15[],                          /* I/O  LTP filter state                */
    opus_int32 delayedGain_Q10[],                   /* I/O  Gain delay buffer               */
    const opus_int16 a_Q12[],                       /* I    Short term prediction coefs     */
    const opus_int16 b_Q14[],                       /* I    Long term prediction coefs      */
    const opus_int16 AR_shp_Q13[],                  /* I    Noise shaping coefs             */
    opus_int lag,                                   /* I    Pitch lag                       */
    opus_int32 HarmShapeFIRPacked_Q14,              /* I                                    */
    opus_int Tilt_Q14,                              /* I    Spectral tilt                   */
    opus_int32 LF_shp_Q14,                          /* I                                    */
    opus_int32 Gain_Q16,                            /* I                                    */
    opus_int Lambda_Q10,                            /* I                                    */
    opus_int offset_Q10,                            /* I                                    */
    opus_int length,                                /* I    Input length                    */
    opus_int subfr,                                 /* I    Subframe number                 */
    opus_int shapingLPCOrder,                       /* I    Shaping LPC filter order        */
    opus_int predictLPCOrder,                       /* I    Prediction filter order         */
    opus_int warping_Q16,                           /* I                                    */
    opus_int nStatesDelayedDecision,                /* I    Number of states in decision tree   */
    opus_int *smpl_buf_idx,                         /* I/O  Index to newest samples in buffers  */
    opus_int decisionDelay                          /* I                                    */
);

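/* copy_winner_state_kernel() below vectorizes, for eight samples of the winning state,
   pulses[] = silk_RSHIFT_ROUND( Q_Q10, 10 ) and
   pxq[]    = silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( Xq_Q14, gain ), shift ) ).
   The 32x32 "wide" multiply is built from two halves prepared by the caller (a sketch,
   exact up to the usual fixed-point truncation details):

       gain_lo = ( gain & 0x0000FFFF ) << 15;   gain_hi = gain >> 16;
       ( x * gain ) >> 16  ~=  vqdmulhq_lane_s32( x, gain_lo, 0 )  +  x * gain_hi

   since vqdmulhq_lane_s32( x, gain_lo, 0 ) = ( 2 * x * gain_lo ) >> 32 = ( x * ( gain & 0xFFFF ) ) >> 16.
   The rounding right shift by 'shift' is then done with vrshlq_s32() and a negative shift
   count, and vqmovn_s32() supplies the saturation to 16 bits. */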
static OPUS_INLINE void copy_winner_state_kernel(
    const NSQ_del_decs_struct *psDelDec,
    const opus_int offset,
    const opus_int last_smple_idx,
    const opus_int Winner_ind,
    const int32x2_t gain_lo_s32x2,
    const int32x2_t gain_hi_s32x2,
    const int32x4_t shift_s32x4,
    int32x4_t t0_s32x4,
    int32x4_t t1_s32x4,
    opus_int8 *const pulses,
    opus_int16 *pxq,
    silk_nsq_state *NSQ
)
{
    int16x8_t t_s16x8;
    int32x4_t o0_s32x4, o1_s32x4;

    t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 0 ][ Winner_ind ], t0_s32x4, 0 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 1 ][ Winner_ind ], t0_s32x4, 1 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 2 ][ Winner_ind ], t0_s32x4, 2 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 3 ][ Winner_ind ], t0_s32x4, 3 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 4 ][ Winner_ind ], t1_s32x4, 0 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 5 ][ Winner_ind ], t1_s32x4, 1 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 6 ][ Winner_ind ], t1_s32x4, 2 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 7 ][ Winner_ind ], t1_s32x4, 3 );
    t_s16x8 = vcombine_s16( vrshrn_n_s32( t0_s32x4, 10 ), vrshrn_n_s32( t1_s32x4, 10 ) );
    vst1_s8( &pulses[ offset ], vmovn_s16( t_s16x8 ) );

    t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 0 ][ Winner_ind ], t0_s32x4, 0 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 1 ][ Winner_ind ], t0_s32x4, 1 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 2 ][ Winner_ind ], t0_s32x4, 2 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 3 ][ Winner_ind ], t0_s32x4, 3 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 4 ][ Winner_ind ], t1_s32x4, 0 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 5 ][ Winner_ind ], t1_s32x4, 1 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 6 ][ Winner_ind ], t1_s32x4, 2 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 7 ][ Winner_ind ], t1_s32x4, 3 );
    o0_s32x4 = vqdmulhq_lane_s32( t0_s32x4, gain_lo_s32x2, 0 );
    o1_s32x4 = vqdmulhq_lane_s32( t1_s32x4, gain_lo_s32x2, 0 );
    o0_s32x4 = vmlaq_lane_s32( o0_s32x4, t0_s32x4, gain_hi_s32x2, 0 );
    o1_s32x4 = vmlaq_lane_s32( o1_s32x4, t1_s32x4, gain_hi_s32x2, 0 );
    o0_s32x4 = vrshlq_s32( o0_s32x4, shift_s32x4 );
    o1_s32x4 = vrshlq_s32( o1_s32x4, shift_s32x4 );
    vst1_s16( &pxq[ offset + 0 ], vqmovn_s32( o0_s32x4 ) );
    vst1_s16( &pxq[ offset + 4 ], vqmovn_s32( o1_s32x4 ) );

    t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 0 ][ Winner_ind ], t0_s32x4, 0 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 1 ][ Winner_ind ], t0_s32x4, 1 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 2 ][ Winner_ind ], t0_s32x4, 2 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 3 ][ Winner_ind ], t0_s32x4, 3 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 4 ][ Winner_ind ], t1_s32x4, 0 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 5 ][ Winner_ind ], t1_s32x4, 1 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 6 ][ Winner_ind ], t1_s32x4, 2 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 7 ][ Winner_ind ], t1_s32x4, 3 );
    vst1q_s32( &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx + offset + 0 ], t0_s32x4 );
    vst1q_s32( &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx + offset + 4 ], t1_s32x4 );
}
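
/* copy_winner_state() walks the DECISION_DELAY-deep circular history of the winning state from
   newest to oldest.  last_smple_idx is a modulo-DECISION_DELAY index computed without a division:
   the sum smpl_buf_idx + decisionDelay - 1 + DECISION_DELAY is always below 3 * DECISION_DELAY,
   so at most two conditional subtractions are needed.  For example (a sketch, with the usual
   DECISION_DELAY of 40): smpl_buf_idx = 5 and decisionDelay = 40 give 5 + 40 - 1 + 40 = 84,
   which wraps to 44 and then to 4, i.e. ( 5 + 40 - 1 ) % 40. */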

static OPUS_INLINE void copy_winner_state(
    const NSQ_del_decs_struct *psDelDec,
    const opus_int decisionDelay,
    const opus_int smpl_buf_idx,
    const opus_int Winner_ind,
    const opus_int32 gain,
    const opus_int32 shift,
    opus_int8 *const pulses,
    opus_int16 *pxq,
    silk_nsq_state *NSQ
)
{
    opus_int i, last_smple_idx;
    const int32x2_t gain_lo_s32x2 = vdup_n_s32( silk_LSHIFT32( gain & 0x0000FFFF, 15 ) );
    const int32x2_t gain_hi_s32x2 = vdup_n_s32( gain >> 16 );
    const int32x4_t shift_s32x4 = vdupq_n_s32( -shift );
    int32x4_t t0_s32x4, t1_s32x4;

    t0_s32x4 = t1_s32x4 = vdupq_n_s32( 0 ); /* initialization */
    last_smple_idx = smpl_buf_idx + decisionDelay - 1 + DECISION_DELAY;
    if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;
    if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;

    for( i = 0; ( i < ( decisionDelay - 7 ) ) && ( last_smple_idx >= 7 ); i += 8, last_smple_idx -= 8 ) {
        copy_winner_state_kernel( psDelDec, i - decisionDelay, last_smple_idx, Winner_ind, gain_lo_s32x2, gain_hi_s32x2, shift_s32x4, t0_s32x4, t1_s32x4, pulses, pxq, NSQ );
    }
    for( ; ( i < decisionDelay ) && ( last_smple_idx >= 0 ); i++, last_smple_idx-- ) {
        pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDelDec->Q_Q10[ last_smple_idx ][ Winner_ind ], 10 );
        pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psDelDec->Xq_Q14[ last_smple_idx ][ Winner_ind ], gain ), shift ) );
        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDelDec->Shape_Q14[ last_smple_idx ][ Winner_ind ];
    }

    last_smple_idx += DECISION_DELAY;
    for( ; i < ( decisionDelay - 7 ); i++, last_smple_idx-- ) {
        copy_winner_state_kernel( psDelDec, i - decisionDelay, last_smple_idx, Winner_ind, gain_lo_s32x2, gain_hi_s32x2, shift_s32x4, t0_s32x4, t1_s32x4, pulses, pxq, NSQ );
    }
    for( ; i < decisionDelay; i++, last_smple_idx-- ) {
        pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDelDec->Q_Q10[ last_smple_idx ][ Winner_ind ], 10 );
        pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psDelDec->Xq_Q14[ last_smple_idx ][ Winner_ind ], gain ), shift ) );
        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDelDec->Shape_Q14[ last_smple_idx ][ Winner_ind ];
    }
}
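
/* Note on the gain/shift pairs passed to copy_winner_state(): the output sample is
   silk_SMULWW( Xq_Q14, gain ) >> shift, so the Q formats have to cancel down to Q0.
   The two call sites below do this with different scalings (a sketch of the arithmetic):

       Gains_Q16[ 1 ] with shift 14:  Q14 x Q16 >> 16 = Q14,  >> 14 -> Q0
       Gain_Q10       with shift  8:  Q14 x Q10 >> 16 = Q8,   >>  8 -> Q0

   both followed by saturation to 16-bit PCM. */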

void silk_NSQ_del_dec_neon(
    const silk_encoder_state *psEncC,                           /* I    Encoder State               */
    silk_nsq_state *NSQ,                                        /* I/O  NSQ state                   */
    SideInfoIndices *psIndices,                                 /* I/O  Quantization Indices        */
    const opus_int16 x16[],                                     /* I    Input                       */
    opus_int8 pulses[],                                         /* O    Quantized pulse signal      */
    const opus_int16 *PredCoef_Q12,                             /* I    Short term prediction coefs */
    const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],   /* I    Long term prediction coefs  */
    const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs         */
    const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ],           /* I    Long term shaping coefs     */
    const opus_int Tilt_Q14[ MAX_NB_SUBFR ],                    /* I    Spectral tilt               */
    const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ],                /* I    Low frequency shaping coefs */
    const opus_int32 Gains_Q16[ MAX_NB_SUBFR ],                 /* I    Quantization step sizes     */
    const opus_int pitchL[ MAX_NB_SUBFR ],                      /* I    Pitch lags                  */
    const opus_int Lambda_Q10,                                  /* I    Rate/distortion tradeoff    */
    const opus_int LTP_scale_Q14                                /* I    LTP state scaling           */
)
{
#ifdef OPUS_CHECK_ASM
    silk_nsq_state NSQ_c;
    SideInfoIndices psIndices_c;
    opus_int8 pulses_c[ MAX_FRAME_LENGTH ];
    const opus_int8 *const pulses_a = pulses;

    ( void )pulses_a;
    silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) );
    silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) );
    silk_memcpy( pulses_c, pulses, sizeof( pulses_c ) );
    silk_NSQ_del_dec_c( psEncC, &NSQ_c, &psIndices_c, x16, pulses_c, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16,
        pitchL, Lambda_Q10, LTP_scale_Q14 );
#endif

    /* The optimization parallelizes the different delay decision states. */
    if(( psEncC->nStatesDelayedDecision > NEON_MAX_DEL_DEC_STATES ) || ( psEncC->nStatesDelayedDecision <= 2 )) {
        /* NEON intrinsics optimization now can only parallelize up to 4 delay decision states.    */
        /* If there are more states, C function is called, and this optimization must be expanded. */
        /* When the number of delay decision states is less than 3, there are penalties using this */
        /* optimization, and C function is called.                                                 */
        /* When the number of delay decision states is 2, it's better to specialize another        */
        /* structure NSQ_del_dec2_struct and optimize with shorter NEON registers. (Low priority)  */
        silk_NSQ_del_dec_c( psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14,
            Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14 );
    } else {
        opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
        opus_int smpl_buf_idx, decisionDelay;
        const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13;
        opus_int16 *pxq;
        VARDECL( opus_int32, sLTP_Q15 );
        VARDECL( opus_int16, sLTP );
        opus_int32 HarmShapeFIRPacked_Q14;
        opus_int offset_Q10;
        opus_int32 RDmin_Q10, Gain_Q10;
        VARDECL( opus_int32, x_sc_Q10 );
        VARDECL( opus_int32, delayedGain_Q10 );
        VARDECL( NSQ_del_decs_struct, psDelDec );
        int32x4_t t_s32x4;
        SAVE_STACK;

        /* Set unvoiced lag to the previous one, overwrite later for voiced */
        lag = NSQ->lagPrev;

        silk_assert( NSQ->prev_gain_Q16 != 0 );

        /* Initialize delayed decision states */
        ALLOC( psDelDec, 1, NSQ_del_decs_struct );
        OPUS_CLEAR(psDelDec, 1);
        /* Only RandState and RD_Q10 need to be initialized to 0. */
        silk_memset( psDelDec->RandState, 0, sizeof( psDelDec->RandState ) );
        vst1q_s32( psDelDec->RD_Q10, vdupq_n_s32( 0 ) );

        for( k = 0; k < psEncC->nStatesDelayedDecision; k++ ) {
            psDelDec->SeedInit[ k ] = psDelDec->Seed[ k ] = ( k + psIndices->Seed ) & 3;
        }
        vst1q_s32( psDelDec->LF_AR_Q14, vld1q_dup_s32( &NSQ->sLF_AR_shp_Q14 ) );
        vst1q_s32( psDelDec->Diff_Q14, vld1q_dup_s32( &NSQ->sDiff_shp_Q14 ) );
        vst1q_s32( psDelDec->Shape_Q14[ 0 ], vld1q_dup_s32( &NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ] ) );
        for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
            vst1q_s32( psDelDec->sLPC_Q14[ i ], vld1q_dup_s32( &NSQ->sLPC_Q14[ i ] ) );
        }
        for( i = 0; i < (opus_int)( sizeof( NSQ->sAR2_Q14 ) / sizeof( NSQ->sAR2_Q14[ 0 ] ) ); i++ ) {
            vst1q_s32( psDelDec->sAR2_Q14[ i ], vld1q_dup_s32( &NSQ->sAR2_Q14[ i ] ) );
        }

        offset_Q10 = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ];
        smpl_buf_idx = 0; /* index of oldest samples */

        decisionDelay = silk_min_int( DECISION_DELAY, psEncC->subfr_length );

        /* For voiced frames limit the decision delay to lower than the pitch lag */
        if( psIndices->signalType == TYPE_VOICED ) {
            opus_int pitch_min = pitchL[ 0 ];
            for( k = 1; k < psEncC->nb_subfr; k++ ) {
                pitch_min = silk_min_int( pitch_min, pitchL[ k ] );
            }
            decisionDelay = silk_min_int( decisionDelay, pitch_min - LTP_ORDER / 2 - 1 );
        } else {
            if( lag > 0 ) {
                decisionDelay = silk_min_int( decisionDelay, lag - LTP_ORDER / 2 - 1 );
            }
        }
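
        /* Worked example of the cap above (a sketch, using the usual SILK constants
           DECISION_DELAY == 40 and LTP_ORDER == 5): a minimum pitch lag of 30 limits
           decisionDelay to 30 - 5 / 2 - 1 = 27, while lags of 43 and above leave it at 40.
           Keeping the delay below the shortest pitch lag means the LTP taps, which read
           about one lag back in sLTP_Q15 / sLTP_shp_Q14, never see samples that have not
           yet been committed from the winner state. */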

        if( psIndices->NLSFInterpCoef_Q2 == 4 ) {
            LSF_interpolation_flag = 0;
        } else {
            LSF_interpolation_flag = 1;
        }

        ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
        ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
        ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
        ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 );
        /* Set up pointers to start of sub frame */
        pxq = &NSQ->xq[ psEncC->ltp_mem_length ];
        NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
        NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
        subfr = 0;
        for( k = 0; k < psEncC->nb_subfr; k++ ) {
            A_Q12 = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ];
            B_Q14 = &LTPCoef_Q14[ k * LTP_ORDER ];
            AR_shp_Q13 = &AR_Q13[ k * MAX_SHAPE_LPC_ORDER ];

            /* Noise shape parameters */
            silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
            HarmShapeFIRPacked_Q14 = silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
            HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );

            NSQ->rewhite_flag = 0;
            if( psIndices->signalType == TYPE_VOICED ) {
                /* Voiced */
                lag = pitchL[ k ];

                /* Re-whitening */
                if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) {
                    if( k == 2 ) {
                        /* RESET DELAYED DECISIONS */
                        /* Find winner */
                        int32x4_t RD_Q10_s32x4;
                        RDmin_Q10 = psDelDec->RD_Q10[ 0 ];
                        Winner_ind = 0;
                        for( i = 1; i < psEncC->nStatesDelayedDecision; i++ ) {
                            if( psDelDec->RD_Q10[ i ] < RDmin_Q10 ) {
                                RDmin_Q10 = psDelDec->RD_Q10[ i ];
                                Winner_ind = i;
                            }
                        }
                        psDelDec->RD_Q10[ Winner_ind ] -= ( silk_int32_MAX >> 4 );
                        RD_Q10_s32x4 = vld1q_s32( psDelDec->RD_Q10 );
                        RD_Q10_s32x4 = vaddq_s32( RD_Q10_s32x4, vdupq_n_s32( silk_int32_MAX >> 4 ) );
                        vst1q_s32( psDelDec->RD_Q10, RD_Q10_s32x4 );

                        /* Copy final part of signals from winner
state to output and long-term filter states */ 369 copy_winner_state( psDelDec, decisionDelay, smpl_buf_idx, Winner_ind, Gains_Q16[ 1 ], 14, pulses, pxq, NSQ ); 370 371 subfr = 0; 372 } 373 374 /* Rewhiten with new A coefs */ 375 start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2; 376 silk_assert( start_idx > 0 ); 377 378 silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ], 379 A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch ); 380 381 NSQ->sLTP_buf_idx = psEncC->ltp_mem_length; 382 NSQ->rewhite_flag = 1; 383 } 384 } 385 386 silk_nsq_del_dec_scale_states_neon( psEncC, NSQ, psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k, 387 LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay ); 388 389 silk_noise_shape_quantizer_del_dec_neon( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, 390 delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], 391 Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder, 392 psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay ); 393 394 x16 += psEncC->subfr_length; 395 pulses += psEncC->subfr_length; 396 pxq += psEncC->subfr_length; 397 } 398 399 /* Find winner */ 400 RDmin_Q10 = psDelDec->RD_Q10[ 0 ]; 401 Winner_ind = 0; 402 for( k = 1; k < psEncC->nStatesDelayedDecision; k++ ) { 403 if( psDelDec->RD_Q10[ k ] < RDmin_Q10 ) { 404 RDmin_Q10 = psDelDec->RD_Q10[ k ]; 405 Winner_ind = k; 406 } 407 } 408 409 /* Copy final part of signals from winner state to output and long-term filter states */ 410 psIndices->Seed = psDelDec->SeedInit[ Winner_ind ]; 411 Gain_Q10 = silk_RSHIFT32( Gains_Q16[ psEncC->nb_subfr - 1 ], 6 ); 412 copy_winner_state( psDelDec, decisionDelay, smpl_buf_idx, Winner_ind, Gain_Q10, 8, pulses, pxq, NSQ ); 413 414 t_s32x4 = vdupq_n_s32( 0 ); /* initialization */ 415 for( i = 0; i < ( NSQ_LPC_BUF_LENGTH - 3 ); i += 4 ) { 416 t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 0 ][ Winner_ind ], t_s32x4, 0 ); 417 t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 1 ][ Winner_ind ], t_s32x4, 1 ); 418 t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 2 ][ Winner_ind ], t_s32x4, 2 ); 419 t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 3 ][ Winner_ind ], t_s32x4, 3 ); 420 vst1q_s32( &NSQ->sLPC_Q14[ i ], t_s32x4 ); 421 } 422 423 for( ; i < NSQ_LPC_BUF_LENGTH; i++ ) { 424 NSQ->sLPC_Q14[ i ] = psDelDec->sLPC_Q14[ i ][ Winner_ind ]; 425 } 426 427 for( i = 0; i < (opus_int)( sizeof( NSQ->sAR2_Q14 ) / sizeof( NSQ->sAR2_Q14[ 0 ] ) - 3 ); i += 4 ) { 428 t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 0 ][ Winner_ind ], t_s32x4, 0 ); 429 t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 1 ][ Winner_ind ], t_s32x4, 1 ); 430 t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 2 ][ Winner_ind ], t_s32x4, 2 ); 431 t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 3 ][ Winner_ind ], t_s32x4, 3 ); 432 vst1q_s32( &NSQ->sAR2_Q14[ i ], t_s32x4 ); 433 } 434 435 for( ; i < (opus_int)( sizeof( NSQ->sAR2_Q14 ) / sizeof( NSQ->sAR2_Q14[ 0 ] ) ); i++ ) { 436 NSQ->sAR2_Q14[ i ] = psDelDec->sAR2_Q14[ i ][ Winner_ind ]; 437 } 438 439 /* Update states */ 440 NSQ->sLF_AR_shp_Q14 = psDelDec->LF_AR_Q14[ Winner_ind ]; 441 NSQ->sDiff_shp_Q14 = psDelDec->Diff_Q14[ Winner_ind ]; 442 NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ]; 443 444 /* Save quantized speech signal */ 445 silk_memmove( NSQ->xq, &NSQ->xq[ 
psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) ); 446 silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) ); 447 RESTORE_STACK; 448 } 449 450 #ifdef OPUS_CHECK_ASM 451 silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) ); 452 silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) ); 453 silk_assert( !memcmp( pulses_c, pulses_a, sizeof( pulses_c ) ) ); 454 #endif 455 } 456 457 /******************************************/ 458 /* Noise shape quantizer for one subframe */ 459 /******************************************/ 460 /* Note: Function silk_short_prediction_create_arch_coef_neon() defined in NSQ_neon.h is actually a hacking C function. */ 461 /* Therefore here we append "_local" to the NEON function name to avoid confusion. */ 462 static OPUS_INLINE void silk_short_prediction_create_arch_coef_neon_local(opus_int32 *out, const opus_int16 *in, opus_int order) 463 { 464 int16x8_t t_s16x8; 465 int32x4_t t0_s32x4, t1_s32x4, t2_s32x4, t3_s32x4; 466 silk_assert( order == 10 || order == 16 ); 467 468 t_s16x8 = vld1q_s16( in + 0 ); /* 7 6 5 4 3 2 1 0 */ 469 t_s16x8 = vrev64q_s16( t_s16x8 ); /* 4 5 6 7 0 1 2 3 */ 470 t2_s32x4 = vshll_n_s16( vget_high_s16( t_s16x8 ), 15 ); /* 4 5 6 7 */ 471 t3_s32x4 = vshll_n_s16( vget_low_s16( t_s16x8 ), 15 ); /* 0 1 2 3 */ 472 473 if( order == 16 ) { 474 t_s16x8 = vld1q_s16( in + 8 ); /* F E D C B A 9 8 */ 475 t_s16x8 = vrev64q_s16( t_s16x8 ); /* C D E F 8 9 A B */ 476 t0_s32x4 = vshll_n_s16( vget_high_s16( t_s16x8 ), 15 ); /* C D E F */ 477 t1_s32x4 = vshll_n_s16( vget_low_s16( t_s16x8 ), 15 ); /* 8 9 A B */ 478 } else { 479 int16x4_t t_s16x4; 480 481 t0_s32x4 = vdupq_n_s32( 0 ); /* zero zero zero zero */ 482 t_s16x4 = vld1_s16( in + 6 ); /* 9 8 7 6 */ 483 t_s16x4 = vrev64_s16( t_s16x4 ); /* 6 7 8 9 */ 484 t1_s32x4 = vshll_n_s16( t_s16x4, 15 ); 485 t1_s32x4 = vcombine_s32( vget_low_s32(t0_s32x4), vget_low_s32( t1_s32x4 ) ); /* 8 9 zero zero */ 486 } 487 vst1q_s32( out + 0, t0_s32x4 ); 488 vst1q_s32( out + 4, t1_s32x4 ); 489 vst1q_s32( out + 8, t2_s32x4 ); 490 vst1q_s32( out + 12, t3_s32x4 ); 491 } 492 493 static OPUS_INLINE int32x4_t silk_SMLAWB_lane0_neon( 494 const int32x4_t out_s32x4, 495 const int32x4_t in_s32x4, 496 const int32x2_t coef_s32x2 497 ) 498 { 499 return vaddq_s32( out_s32x4, vqdmulhq_lane_s32( in_s32x4, coef_s32x2, 0 ) ); 500 } 501 502 static OPUS_INLINE int32x4_t silk_SMLAWB_lane1_neon( 503 const int32x4_t out_s32x4, 504 const int32x4_t in_s32x4, 505 const int32x2_t coef_s32x2 506 ) 507 { 508 return vaddq_s32( out_s32x4, vqdmulhq_lane_s32( in_s32x4, coef_s32x2, 1 ) ); 509 } 510 511 /* Note: This function has different return value than silk_noise_shape_quantizer_short_prediction_neon(). */ 512 /* Therefore here we append "_local" to the function name to avoid confusion. 
*/ 513 static OPUS_INLINE int32x4_t silk_noise_shape_quantizer_short_prediction_neon_local(const opus_int32 *buf32, const opus_int32 *a_Q12_arch, opus_int order) 514 { 515 const int32x4_t a_Q12_arch0_s32x4 = vld1q_s32( a_Q12_arch + 0 ); 516 const int32x4_t a_Q12_arch1_s32x4 = vld1q_s32( a_Q12_arch + 4 ); 517 const int32x4_t a_Q12_arch2_s32x4 = vld1q_s32( a_Q12_arch + 8 ); 518 const int32x4_t a_Q12_arch3_s32x4 = vld1q_s32( a_Q12_arch + 12 ); 519 int32x4_t LPC_pred_Q14_s32x4; 520 521 silk_assert( order == 10 || order == 16 ); 522 /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */ 523 LPC_pred_Q14_s32x4 = vdupq_n_s32( silk_RSHIFT( order, 1 ) ); 524 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 0 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch0_s32x4 ) ); 525 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 1 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch0_s32x4 ) ); 526 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 2 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch0_s32x4 ) ); 527 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 3 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch0_s32x4 ) ); 528 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 4 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch1_s32x4 ) ); 529 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 5 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch1_s32x4 ) ); 530 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 6 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch1_s32x4 ) ); 531 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 7 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch1_s32x4 ) ); 532 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 8 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch2_s32x4 ) ); 533 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 9 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch2_s32x4 ) ); 534 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 10 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch2_s32x4 ) ); 535 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 11 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch2_s32x4 ) ); 536 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 12 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch3_s32x4 ) ); 537 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 13 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch3_s32x4 ) ); 538 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 14 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch3_s32x4 ) ); 539 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 15 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch3_s32x4 ) ); 540 541 return LPC_pred_Q14_s32x4; 542 } 543 544 static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon( 545 silk_nsq_state *NSQ, /* I/O NSQ state */ 546 NSQ_del_decs_struct psDelDec[], /* I/O Delayed decision states */ 547 opus_int signalType, /* I Signal type */ 548 const opus_int32 x_Q10[], /* I */ 549 opus_int8 pulses[], /* O */ 550 opus_int16 xq[], /* O 
*/ 551 opus_int32 sLTP_Q15[], /* I/O LTP filter state */ 552 opus_int32 delayedGain_Q10[], /* I/O Gain delay buffer */ 553 const opus_int16 a_Q12[], /* I Short term prediction coefs */ 554 const opus_int16 b_Q14[], /* I Long term prediction coefs */ 555 const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */ 556 opus_int lag, /* I Pitch lag */ 557 opus_int32 HarmShapeFIRPacked_Q14, /* I */ 558 opus_int Tilt_Q14, /* I Spectral tilt */ 559 opus_int32 LF_shp_Q14, /* I */ 560 opus_int32 Gain_Q16, /* I */ 561 opus_int Lambda_Q10, /* I */ 562 opus_int offset_Q10, /* I */ 563 opus_int length, /* I Input length */ 564 opus_int subfr, /* I Subframe number */ 565 opus_int shapingLPCOrder, /* I Shaping LPC filter order */ 566 opus_int predictLPCOrder, /* I Prediction filter order */ 567 opus_int warping_Q16, /* I */ 568 opus_int nStatesDelayedDecision, /* I Number of states in decision tree */ 569 opus_int *smpl_buf_idx, /* I/O Index to newest samples in buffers */ 570 opus_int decisionDelay /* I */ 571 ) 572 { 573 opus_int i, j, k, Winner_ind, RDmin_ind, RDmax_ind, last_smple_idx; 574 opus_int32 Winner_rand_state; 575 opus_int32 LTP_pred_Q14, n_LTP_Q14; 576 opus_int32 RDmin_Q10, RDmax_Q10; 577 opus_int32 Gain_Q10; 578 opus_int32 *pred_lag_ptr, *shp_lag_ptr; 579 opus_int32 a_Q12_arch[MAX_LPC_ORDER]; 580 const int32x2_t warping_Q16_s32x2 = vdup_n_s32( silk_LSHIFT32( warping_Q16, 16 ) >> 1 ); 581 const opus_int32 LF_shp_Q29 = silk_LSHIFT32( LF_shp_Q14, 16 ) >> 1; 582 opus_int32 AR_shp_Q28[ MAX_SHAPE_LPC_ORDER ]; 583 const uint32x4_t rand_multiplier_u32x4 = vdupq_n_u32( RAND_MULTIPLIER ); 584 const uint32x4_t rand_increment_u32x4 = vdupq_n_u32( RAND_INCREMENT ); 585 586 VARDECL( NSQ_samples_struct, psSampleState ); 587 SAVE_STACK; 588 589 silk_assert( nStatesDelayedDecision > 0 ); 590 silk_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */ 591 ALLOC( psSampleState, 2, NSQ_samples_struct ); 592 OPUS_CLEAR(psSampleState, 2); 593 594 shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ]; 595 pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ]; 596 Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 ); 597 598 for( i = 0; i < ( MAX_SHAPE_LPC_ORDER - 7 ); i += 8 ) { 599 const int16x8_t t_s16x8 = vld1q_s16( AR_shp_Q13 + i ); 600 vst1q_s32( AR_shp_Q28 + i + 0, vshll_n_s16( vget_low_s16( t_s16x8 ), 15 ) ); 601 vst1q_s32( AR_shp_Q28 + i + 4, vshll_n_s16( vget_high_s16( t_s16x8 ), 15 ) ); 602 } 603 604 for( ; i < MAX_SHAPE_LPC_ORDER; i++ ) { 605 AR_shp_Q28[i] = silk_LSHIFT32( AR_shp_Q13[i], 15 ); 606 } 607 608 silk_short_prediction_create_arch_coef_neon_local( a_Q12_arch, a_Q12, predictLPCOrder ); 609 610 for( i = 0; i < length; i++ ) { 611 int32x4_t Seed_s32x4, LPC_pred_Q14_s32x4; 612 int32x4_t sign_s32x4, tmp1_s32x4, tmp2_s32x4; 613 int32x4_t n_AR_Q14_s32x4, n_LF_Q14_s32x4; 614 int32x2_t AR_shp_Q28_s32x2; 615 int16x4_t r_Q10_s16x4, rr_Q10_s16x4; 616 617 /* Perform common calculations used in all states */ 618 619 /* Long-term prediction */ 620 if( signalType == TYPE_VOICED ) { 621 /* Unrolled loop */ 622 /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */ 623 LTP_pred_Q14 = 2; 624 LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ 0 ], b_Q14[ 0 ] ); 625 LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -1 ], b_Q14[ 1 ] ); 626 LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -2 ], b_Q14[ 2 ] ); 627 LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -3 ], b_Q14[ 3 ] ); 628 LTP_pred_Q14 = silk_SMLAWB( 
LTP_pred_Q14, pred_lag_ptr[ -4 ], b_Q14[ 4 ] ); 629 LTP_pred_Q14 = silk_LSHIFT( LTP_pred_Q14, 1 ); /* Q13 -> Q14 */ 630 pred_lag_ptr++; 631 } else { 632 LTP_pred_Q14 = 0; 633 } 634 635 /* Long-term shaping */ 636 if( lag > 0 ) { 637 /* Symmetric, packed FIR coefficients */ 638 n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 ); 639 n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 ); 640 n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 ); /* Q12 -> Q14 */ 641 shp_lag_ptr++; 642 } else { 643 n_LTP_Q14 = 0; 644 } 645 646 /* Generate dither */ 647 Seed_s32x4 = vld1q_s32( psDelDec->Seed ); 648 Seed_s32x4 = vreinterpretq_s32_u32( vmlaq_u32( rand_increment_u32x4, vreinterpretq_u32_s32( Seed_s32x4 ), rand_multiplier_u32x4 ) ); 649 vst1q_s32( psDelDec->Seed, Seed_s32x4 ); 650 651 /* Short-term prediction */ 652 LPC_pred_Q14_s32x4 = silk_noise_shape_quantizer_short_prediction_neon_local(psDelDec->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 16 + i ], a_Q12_arch, predictLPCOrder); 653 LPC_pred_Q14_s32x4 = vshlq_n_s32( LPC_pred_Q14_s32x4, 4 ); /* Q10 -> Q14 */ 654 655 /* Noise shape feedback */ 656 /* Output of lowpass section */ 657 tmp2_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->Diff_Q14 ), vld1q_s32( psDelDec->sAR2_Q14[ 0 ] ), warping_Q16_s32x2 ); 658 /* Output of allpass section */ 659 tmp1_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ 1 ] ), tmp2_s32x4 ); 660 tmp1_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->sAR2_Q14[ 0 ] ), tmp1_s32x4, warping_Q16_s32x2 ); 661 vst1q_s32( psDelDec->sAR2_Q14[ 0 ], tmp2_s32x4 ); 662 AR_shp_Q28_s32x2 = vld1_s32( AR_shp_Q28 ); 663 n_AR_Q14_s32x4 = vaddq_s32( vdupq_n_s32( silk_RSHIFT( shapingLPCOrder, 1 ) ), vqdmulhq_lane_s32( tmp2_s32x4, AR_shp_Q28_s32x2, 0 ) ); 664 665 /* Loop over allpass sections */ 666 for( j = 2; j < shapingLPCOrder; j += 2 ) { 667 /* Output of allpass section */ 668 tmp2_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ j + 0 ] ), tmp1_s32x4 ); 669 tmp2_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->sAR2_Q14[ j - 1 ] ), tmp2_s32x4, warping_Q16_s32x2 ); 670 vst1q_s32( psDelDec->sAR2_Q14[ j - 1 ], tmp1_s32x4 ); 671 n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp1_s32x4, AR_shp_Q28_s32x2, 1 ) ); 672 /* Output of allpass section */ 673 tmp1_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ j + 1 ] ), tmp2_s32x4 ); 674 tmp1_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->sAR2_Q14[ j + 0 ] ), tmp1_s32x4, warping_Q16_s32x2 ); 675 vst1q_s32( psDelDec->sAR2_Q14[ j + 0 ], tmp2_s32x4 ); 676 AR_shp_Q28_s32x2 = vld1_s32( &AR_shp_Q28[ j ] ); 677 n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp2_s32x4, AR_shp_Q28_s32x2, 0 ) ); 678 } 679 vst1q_s32( psDelDec->sAR2_Q14[ shapingLPCOrder - 1 ], tmp1_s32x4 ); 680 n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp1_s32x4, AR_shp_Q28_s32x2, 1 ) ); 681 n_AR_Q14_s32x4 = vshlq_n_s32( n_AR_Q14_s32x4, 1 ); /* Q11 -> Q12 */ 682 n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_n_s32( vld1q_s32( psDelDec->LF_AR_Q14 ), silk_LSHIFT32( Tilt_Q14, 16 ) >> 1 ) ); /* Q12 */ 683 n_AR_Q14_s32x4 = vshlq_n_s32( n_AR_Q14_s32x4, 2 ); /* Q12 -> Q14 */ 684 n_LF_Q14_s32x4 = vqdmulhq_n_s32( vld1q_s32( psDelDec->Shape_Q14[ *smpl_buf_idx ] ), LF_shp_Q29 ); /* Q12 */ 685 n_LF_Q14_s32x4 = vaddq_s32( n_LF_Q14_s32x4, vqdmulhq_n_s32( vld1q_s32( psDelDec->LF_AR_Q14 ), silk_LSHIFT32( LF_shp_Q14 >> 16 , 15 ) ) ); /* Q12 */ 686 n_LF_Q14_s32x4 = vshlq_n_s32( n_LF_Q14_s32x4, 2 ); /* Q12 -> Q14 
*/ 687 688 /* Input minus prediction plus noise feedback */ 689 /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP */ 690 tmp1_s32x4 = vaddq_s32( n_AR_Q14_s32x4, n_LF_Q14_s32x4 ); /* Q14 */ 691 tmp2_s32x4 = vaddq_s32( vdupq_n_s32( n_LTP_Q14 ), LPC_pred_Q14_s32x4 ); /* Q13 */ 692 tmp1_s32x4 = vsubq_s32( tmp2_s32x4, tmp1_s32x4 ); /* Q13 */ 693 tmp1_s32x4 = vrshrq_n_s32( tmp1_s32x4, 4 ); /* Q10 */ 694 tmp1_s32x4 = vsubq_s32( vdupq_n_s32( x_Q10[ i ] ), tmp1_s32x4 ); /* residual error Q10 */ 695 696 /* Flip sign depending on dither */ 697 sign_s32x4 = vreinterpretq_s32_u32( vcltq_s32( Seed_s32x4, vdupq_n_s32( 0 ) ) ); 698 tmp1_s32x4 = veorq_s32( tmp1_s32x4, sign_s32x4 ); 699 tmp1_s32x4 = vsubq_s32( tmp1_s32x4, sign_s32x4 ); 700 tmp1_s32x4 = vmaxq_s32( tmp1_s32x4, vdupq_n_s32( -( 31 << 10 ) ) ); 701 tmp1_s32x4 = vminq_s32( tmp1_s32x4, vdupq_n_s32( 30 << 10 ) ); 702 r_Q10_s16x4 = vmovn_s32( tmp1_s32x4 ); 703 704 /* Find two quantization level candidates and measure their rate-distortion */ 705 { 706 int16x4_t q1_Q10_s16x4 = vsub_s16( r_Q10_s16x4, vdup_n_s16( offset_Q10 ) ); 707 int16x4_t q1_Q0_s16x4 = vshr_n_s16( q1_Q10_s16x4, 10 ); 708 int16x4_t q2_Q10_s16x4; 709 int32x4_t rd1_Q10_s32x4, rd2_Q10_s32x4; 710 uint32x4_t t_u32x4; 711 712 if( Lambda_Q10 > 2048 ) { 713 /* For aggressive RDO, the bias becomes more than one pulse. */ 714 const int rdo_offset = Lambda_Q10/2 - 512; 715 const uint16x4_t greaterThanRdo = vcgt_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) ); 716 const uint16x4_t lessThanMinusRdo = vclt_s16( q1_Q10_s16x4, vdup_n_s16( -rdo_offset ) ); 717 int16x4_t signed_offset = vbsl_s16( greaterThanRdo, vdup_n_s16( -rdo_offset ), vdup_n_s16( 0 ) ); 718 signed_offset = vbsl_s16( lessThanMinusRdo, vdup_n_s16( rdo_offset ), signed_offset ); 719 /* If Lambda_Q10 > 32767, then q1_Q0, q1_Q10 and q2_Q10 must change to 32-bit. 
*/ 720 silk_assert( Lambda_Q10 <= 32767 ); 721 722 q1_Q0_s16x4 = vreinterpret_s16_u16( vclt_s16( q1_Q10_s16x4, vdup_n_s16( 0 ) ) ); 723 q1_Q0_s16x4 = vbsl_s16(vorr_u16(greaterThanRdo, lessThanMinusRdo), vadd_s16( q1_Q10_s16x4 , signed_offset), q1_Q0_s16x4); 724 q1_Q0_s16x4 = vshr_n_s16( q1_Q0_s16x4, 10 ); 725 } 726 { 727 const uint16x4_t equal0_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( 0 ) ); 728 const uint16x4_t equalMinus1_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) ); 729 const uint16x4_t lessThanMinus1_u16x4 = vclt_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) ); 730 int16x4_t tmp1_s16x4, tmp2_s16x4, tmp_summand_s16x4; 731 732 q1_Q10_s16x4 = vshl_n_s16( q1_Q0_s16x4, 10 ); 733 tmp_summand_s16x4 = vand_s16( vreinterpret_s16_u16(vcge_s16(q1_Q0_s16x4, vdup_n_s16(0))), vdup_n_s16( offset_Q10 - QUANT_LEVEL_ADJUST_Q10 ) ); 734 tmp1_s16x4 = vadd_s16( q1_Q10_s16x4, tmp_summand_s16x4 ); 735 tmp_summand_s16x4 = vbsl_s16( lessThanMinus1_u16x4, vdup_n_s16( offset_Q10 + QUANT_LEVEL_ADJUST_Q10 ), vdup_n_s16(0) ); 736 q1_Q10_s16x4 = vadd_s16( q1_Q10_s16x4, tmp_summand_s16x4); 737 q1_Q10_s16x4 = vbsl_s16( lessThanMinus1_u16x4, q1_Q10_s16x4, tmp1_s16x4 ); 738 q1_Q10_s16x4 = vbsl_s16( equal0_u16x4, vdup_n_s16( offset_Q10 ), q1_Q10_s16x4 ); 739 q1_Q10_s16x4 = vbsl_s16( equalMinus1_u16x4, vdup_n_s16( offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 ) ), q1_Q10_s16x4 ); 740 q2_Q10_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( 1024 ) ); 741 q2_Q10_s16x4 = vbsl_s16( equal0_u16x4, vdup_n_s16( offset_Q10 + 1024 - QUANT_LEVEL_ADJUST_Q10 ), q2_Q10_s16x4 ); 742 q2_Q10_s16x4 = vbsl_s16( equalMinus1_u16x4, vdup_n_s16( offset_Q10 ), q2_Q10_s16x4 ); 743 tmp1_s16x4 = q1_Q10_s16x4; 744 tmp2_s16x4 = q2_Q10_s16x4; 745 tmp1_s16x4 = vbsl_s16( vorr_u16( equalMinus1_u16x4, lessThanMinus1_u16x4 ), vneg_s16( tmp1_s16x4 ), tmp1_s16x4 ); 746 tmp2_s16x4 = vbsl_s16( lessThanMinus1_u16x4, vneg_s16( tmp2_s16x4 ), tmp2_s16x4 ); 747 rd1_Q10_s32x4 = vmull_s16( tmp1_s16x4, vdup_n_s16( Lambda_Q10 ) ); 748 rd2_Q10_s32x4 = vmull_s16( tmp2_s16x4, vdup_n_s16( Lambda_Q10 ) ); 749 } 750 751 rr_Q10_s16x4 = vsub_s16( r_Q10_s16x4, q1_Q10_s16x4 ); 752 rd1_Q10_s32x4 = vmlal_s16( rd1_Q10_s32x4, rr_Q10_s16x4, rr_Q10_s16x4 ); 753 rd1_Q10_s32x4 = vshrq_n_s32( rd1_Q10_s32x4, 10 ); 754 755 rr_Q10_s16x4 = vsub_s16( r_Q10_s16x4, q2_Q10_s16x4 ); 756 rd2_Q10_s32x4 = vmlal_s16( rd2_Q10_s32x4, rr_Q10_s16x4, rr_Q10_s16x4 ); 757 rd2_Q10_s32x4 = vshrq_n_s32( rd2_Q10_s32x4, 10 ); 758 759 tmp2_s32x4 = vld1q_s32( psDelDec->RD_Q10 ); 760 tmp1_s32x4 = vaddq_s32( tmp2_s32x4, vminq_s32( rd1_Q10_s32x4, rd2_Q10_s32x4 ) ); 761 tmp2_s32x4 = vaddq_s32( tmp2_s32x4, vmaxq_s32( rd1_Q10_s32x4, rd2_Q10_s32x4 ) ); 762 vst1q_s32( psSampleState[ 0 ].RD_Q10, tmp1_s32x4 ); 763 vst1q_s32( psSampleState[ 1 ].RD_Q10, tmp2_s32x4 ); 764 t_u32x4 = vcltq_s32( rd1_Q10_s32x4, rd2_Q10_s32x4 ); 765 tmp1_s32x4 = vbslq_s32( t_u32x4, vmovl_s16( q1_Q10_s16x4 ), vmovl_s16( q2_Q10_s16x4 ) ); 766 tmp2_s32x4 = vbslq_s32( t_u32x4, vmovl_s16( q2_Q10_s16x4 ), vmovl_s16( q1_Q10_s16x4 ) ); 767 vst1q_s32( psSampleState[ 0 ].Q_Q10, tmp1_s32x4 ); 768 vst1q_s32( psSampleState[ 1 ].Q_Q10, tmp2_s32x4 ); 769 } 770 771 { 772 /* Update states for best quantization */ 773 int32x4_t exc_Q14_s32x4, LPC_exc_Q14_s32x4, xq_Q14_s32x4, sLF_AR_shp_Q14_s32x4; 774 775 /* Quantized excitation */ 776 exc_Q14_s32x4 = vshlq_n_s32( tmp1_s32x4, 4 ); 777 exc_Q14_s32x4 = veorq_s32( exc_Q14_s32x4, sign_s32x4 ); 778 exc_Q14_s32x4 = vsubq_s32( exc_Q14_s32x4, sign_s32x4 ); 779 780 /* Add predictions */ 781 LPC_exc_Q14_s32x4 = vaddq_s32( exc_Q14_s32x4, 
vdupq_n_s32( LTP_pred_Q14 ) ); 782 xq_Q14_s32x4 = vaddq_s32( LPC_exc_Q14_s32x4, LPC_pred_Q14_s32x4 ); 783 784 /* Update states */ 785 tmp1_s32x4 = vsubq_s32( xq_Q14_s32x4, vshlq_n_s32( vdupq_n_s32( x_Q10[ i ] ), 4 ) ); 786 vst1q_s32( psSampleState[ 0 ].Diff_Q14, tmp1_s32x4 ); 787 sLF_AR_shp_Q14_s32x4 = vsubq_s32( tmp1_s32x4, n_AR_Q14_s32x4 ); 788 vst1q_s32( psSampleState[ 0 ].sLTP_shp_Q14, vsubq_s32( sLF_AR_shp_Q14_s32x4, n_LF_Q14_s32x4 ) ); 789 vst1q_s32( psSampleState[ 0 ].LF_AR_Q14, sLF_AR_shp_Q14_s32x4 ); 790 vst1q_s32( psSampleState[ 0 ].LPC_exc_Q14, LPC_exc_Q14_s32x4 ); 791 vst1q_s32( psSampleState[ 0 ].xq_Q14, xq_Q14_s32x4 ); 792 793 /* Quantized excitation */ 794 exc_Q14_s32x4 = vshlq_n_s32( tmp2_s32x4, 4 ); 795 exc_Q14_s32x4 = veorq_s32( exc_Q14_s32x4, sign_s32x4 ); 796 exc_Q14_s32x4 = vsubq_s32( exc_Q14_s32x4, sign_s32x4 ); 797 798 /* Add predictions */ 799 LPC_exc_Q14_s32x4 = vaddq_s32( exc_Q14_s32x4, vdupq_n_s32( LTP_pred_Q14 ) ); 800 xq_Q14_s32x4 = vaddq_s32( LPC_exc_Q14_s32x4, LPC_pred_Q14_s32x4 ); 801 802 /* Update states */ 803 tmp1_s32x4 = vsubq_s32( xq_Q14_s32x4, vshlq_n_s32( vdupq_n_s32( x_Q10[ i ] ), 4 ) ); 804 vst1q_s32( psSampleState[ 1 ].Diff_Q14, tmp1_s32x4 ); 805 sLF_AR_shp_Q14_s32x4 = vsubq_s32( tmp1_s32x4, n_AR_Q14_s32x4 ); 806 vst1q_s32( psSampleState[ 1 ].sLTP_shp_Q14, vsubq_s32( sLF_AR_shp_Q14_s32x4, n_LF_Q14_s32x4 ) ); 807 vst1q_s32( psSampleState[ 1 ].LF_AR_Q14, sLF_AR_shp_Q14_s32x4 ); 808 vst1q_s32( psSampleState[ 1 ].LPC_exc_Q14, LPC_exc_Q14_s32x4 ); 809 vst1q_s32( psSampleState[ 1 ].xq_Q14, xq_Q14_s32x4 ); 810 } 811 812 *smpl_buf_idx = *smpl_buf_idx ? ( *smpl_buf_idx - 1 ) : ( DECISION_DELAY - 1); 813 last_smple_idx = *smpl_buf_idx + decisionDelay + DECISION_DELAY; 814 if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY; 815 if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY; 816 817 /* Find winner */ 818 RDmin_Q10 = psSampleState[ 0 ].RD_Q10[ 0 ]; 819 Winner_ind = 0; 820 for( k = 1; k < nStatesDelayedDecision; k++ ) { 821 if( psSampleState[ 0 ].RD_Q10[ k ] < RDmin_Q10 ) { 822 RDmin_Q10 = psSampleState[ 0 ].RD_Q10[ k ]; 823 Winner_ind = k; 824 } 825 } 826 827 /* clear unused part of RD_Q10 to avoid overflows */ 828 if( nStatesDelayedDecision < NEON_MAX_DEL_DEC_STATES ) 829 { 830 OPUS_CLEAR(psSampleState[0].RD_Q10 + nStatesDelayedDecision, NEON_MAX_DEL_DEC_STATES - nStatesDelayedDecision); 831 OPUS_CLEAR(psSampleState[1].RD_Q10 + nStatesDelayedDecision, NEON_MAX_DEL_DEC_STATES - nStatesDelayedDecision); 832 } 833 834 /* Increase RD values of expired states */ 835 { 836 uint32x4_t t_u32x4; 837 Winner_rand_state = psDelDec->RandState[ last_smple_idx ][ Winner_ind ]; 838 t_u32x4 = vceqq_s32( vld1q_s32( psDelDec->RandState[ last_smple_idx ] ), vdupq_n_s32( Winner_rand_state ) ); 839 t_u32x4 = vmvnq_u32( t_u32x4 ); 840 t_u32x4 = vshrq_n_u32( t_u32x4, 5 ); 841 tmp1_s32x4 = vld1q_s32( psSampleState[ 0 ].RD_Q10 ); 842 tmp2_s32x4 = vld1q_s32( psSampleState[ 1 ].RD_Q10 ); 843 tmp1_s32x4 = vaddq_s32( tmp1_s32x4, vreinterpretq_s32_u32( t_u32x4 ) ); 844 tmp2_s32x4 = vaddq_s32( tmp2_s32x4, vreinterpretq_s32_u32( t_u32x4 ) ); 845 vst1q_s32( psSampleState[ 0 ].RD_Q10, tmp1_s32x4 ); 846 vst1q_s32( psSampleState[ 1 ].RD_Q10, tmp2_s32x4 ); 847 848 /* Find worst in first set and best in second set */ 849 RDmax_Q10 = psSampleState[ 0 ].RD_Q10[ 0 ]; 850 RDmin_Q10 = psSampleState[ 1 ].RD_Q10[ 0 ]; 851 RDmax_ind = 0; 852 RDmin_ind = 0; 853 for( k = 1; k < nStatesDelayedDecision; k++ ) { 854 /* find worst in first set */ 855 if( 
psSampleState[ 0 ].RD_Q10[ k ] > RDmax_Q10 ) { 856 RDmax_Q10 = psSampleState[ 0 ].RD_Q10[ k ]; 857 RDmax_ind = k; 858 } 859 /* find best in second set */ 860 if( psSampleState[ 1 ].RD_Q10[ k ] < RDmin_Q10 ) { 861 RDmin_Q10 = psSampleState[ 1 ].RD_Q10[ k ]; 862 RDmin_ind = k; 863 } 864 } 865 } 866 867 /* Replace a state if best from second set outperforms worst in first set */ 868 if( RDmin_Q10 < RDmax_Q10 ) { 869 opus_int32 (*ptr)[NEON_MAX_DEL_DEC_STATES] = psDelDec->RandState; 870 const int numOthers = (int)( ( sizeof( NSQ_del_decs_struct ) - sizeof( ( (NSQ_del_decs_struct *)0 )->sLPC_Q14 ) ) 871 / ( NEON_MAX_DEL_DEC_STATES * sizeof( opus_int32 ) ) ); 872 /* Only ( predictLPCOrder - 1 ) of sLPC_Q14 buffer need to be updated, though the first several */ 873 /* useless sLPC_Q14[] will be different comparing with C when predictLPCOrder < NSQ_LPC_BUF_LENGTH. */ 874 /* Here just update constant ( NSQ_LPC_BUF_LENGTH - 1 ) for simplicity. */ 875 for( j = i + 1; j < i + NSQ_LPC_BUF_LENGTH; j++ ) { 876 psDelDec->sLPC_Q14[ j ][ RDmax_ind ] = psDelDec->sLPC_Q14[ j ][ RDmin_ind ]; 877 } 878 for( j = 0; j < numOthers; j++ ) { 879 ptr[ j ][ RDmax_ind ] = ptr[ j ][ RDmin_ind ]; 880 } 881 882 psSampleState[ 0 ].Q_Q10[ RDmax_ind ] = psSampleState[ 1 ].Q_Q10[ RDmin_ind ]; 883 psSampleState[ 0 ].RD_Q10[ RDmax_ind ] = psSampleState[ 1 ].RD_Q10[ RDmin_ind ]; 884 psSampleState[ 0 ].xq_Q14[ RDmax_ind ] = psSampleState[ 1 ].xq_Q14[ RDmin_ind ]; 885 psSampleState[ 0 ].LF_AR_Q14[ RDmax_ind ] = psSampleState[ 1 ].LF_AR_Q14[ RDmin_ind ]; 886 psSampleState[ 0 ].Diff_Q14[ RDmax_ind ] = psSampleState[ 1 ].Diff_Q14[ RDmin_ind ]; 887 psSampleState[ 0 ].sLTP_shp_Q14[ RDmax_ind ] = psSampleState[ 1 ].sLTP_shp_Q14[ RDmin_ind ]; 888 psSampleState[ 0 ].LPC_exc_Q14[ RDmax_ind ] = psSampleState[ 1 ].LPC_exc_Q14[ RDmin_ind ]; 889 } 890 891 /* Write samples from winner to output and long-term filter states */ 892 if( subfr > 0 || i >= decisionDelay ) { 893 pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDelDec->Q_Q10[ last_smple_idx ][ Winner_ind ], 10 ); 894 xq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( 895 silk_SMULWW( psDelDec->Xq_Q14[ last_smple_idx ][ Winner_ind ], delayedGain_Q10[ last_smple_idx ] ), 8 ) ); 896 NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay ] = psDelDec->Shape_Q14[ last_smple_idx ][ Winner_ind ]; 897 sLTP_Q15[ NSQ->sLTP_buf_idx - decisionDelay ] = psDelDec->Pred_Q15[ last_smple_idx ][ Winner_ind ]; 898 } 899 NSQ->sLTP_shp_buf_idx++; 900 NSQ->sLTP_buf_idx++; 901 902 /* Update states */ 903 vst1q_s32( psDelDec->LF_AR_Q14, vld1q_s32( psSampleState[ 0 ].LF_AR_Q14 ) ); 904 vst1q_s32( psDelDec->Diff_Q14, vld1q_s32( psSampleState[ 0 ].Diff_Q14 ) ); 905 vst1q_s32( psDelDec->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ], vld1q_s32( psSampleState[ 0 ].xq_Q14 ) ); 906 vst1q_s32( psDelDec->Xq_Q14[ *smpl_buf_idx ], vld1q_s32( psSampleState[ 0 ].xq_Q14 ) ); 907 tmp1_s32x4 = vld1q_s32( psSampleState[ 0 ].Q_Q10 ); 908 vst1q_s32( psDelDec->Q_Q10[ *smpl_buf_idx ], tmp1_s32x4 ); 909 vst1q_s32( psDelDec->Pred_Q15[ *smpl_buf_idx ], vshlq_n_s32( vld1q_s32( psSampleState[ 0 ].LPC_exc_Q14 ), 1 ) ); 910 vst1q_s32( psDelDec->Shape_Q14[ *smpl_buf_idx ], vld1q_s32( psSampleState[ 0 ].sLTP_shp_Q14 ) ); 911 tmp1_s32x4 = vrshrq_n_s32( tmp1_s32x4, 10 ); 912 tmp1_s32x4 = vreinterpretq_s32_u32( vaddq_u32( vreinterpretq_u32_s32( 913 vld1q_s32( psDelDec->Seed ) ), vreinterpretq_u32_s32( tmp1_s32x4 ) ) ); 914 vst1q_s32( psDelDec->Seed, tmp1_s32x4 ); 915 vst1q_s32( psDelDec->RandState[ *smpl_buf_idx ], 
tmp1_s32x4 ); 916 vst1q_s32( psDelDec->RD_Q10, vld1q_s32( psSampleState[ 0 ].RD_Q10 ) ); 917 delayedGain_Q10[ *smpl_buf_idx ] = Gain_Q10; 918 } 919 /* Update LPC states */ 920 silk_memcpy( psDelDec->sLPC_Q14[ 0 ], psDelDec->sLPC_Q14[ length ], NEON_MAX_DEL_DEC_STATES * NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) ); 921 922 RESTORE_STACK; 923 } 924 925 static OPUS_INLINE void silk_SMULWB_8_neon( 926 const opus_int16 *a, 927 const int32x2_t b, 928 opus_int32 *o 929 ) 930 { 931 const int16x8_t a_s16x8 = vld1q_s16( a ); 932 int32x4_t o0_s32x4, o1_s32x4; 933 934 o0_s32x4 = vshll_n_s16( vget_low_s16( a_s16x8 ), 15 ); 935 o1_s32x4 = vshll_n_s16( vget_high_s16( a_s16x8 ), 15 ); 936 o0_s32x4 = vqdmulhq_lane_s32( o0_s32x4, b, 0 ); 937 o1_s32x4 = vqdmulhq_lane_s32( o1_s32x4, b, 0 ); 938 vst1q_s32( o, o0_s32x4 ); 939 vst1q_s32( o + 4, o1_s32x4 ); 940 } 941 942 /* Only works when ( b >= -65536 ) && ( b < 65536 ). */ 943 static OPUS_INLINE void silk_SMULWW_small_b_4_neon( 944 opus_int32 *a, 945 const int32x2_t b_s32x2) 946 { 947 int32x4_t o_s32x4; 948 949 o_s32x4 = vld1q_s32( a ); 950 o_s32x4 = vqdmulhq_lane_s32( o_s32x4, b_s32x2, 0 ); 951 vst1q_s32( a, o_s32x4 ); 952 } 953 954 /* Only works when ( b >= -65536 ) && ( b < 65536 ). */ 955 static OPUS_INLINE void silk_SMULWW_small_b_8_neon( 956 opus_int32 *a, 957 const int32x2_t b_s32x2 958 ) 959 { 960 int32x4_t o0_s32x4, o1_s32x4; 961 962 o0_s32x4 = vld1q_s32( a ); 963 o1_s32x4 = vld1q_s32( a + 4 ); 964 o0_s32x4 = vqdmulhq_lane_s32( o0_s32x4, b_s32x2, 0 ); 965 o1_s32x4 = vqdmulhq_lane_s32( o1_s32x4, b_s32x2, 0 ); 966 vst1q_s32( a, o0_s32x4 ); 967 vst1q_s32( a + 4, o1_s32x4 ); 968 } 969 970 static OPUS_INLINE void silk_SMULWW_4_neon( 971 opus_int32 *a, 972 const int32x2_t b_s32x2) 973 { 974 int32x4_t a_s32x4, o_s32x4; 975 976 a_s32x4 = vld1q_s32( a ); 977 o_s32x4 = vqdmulhq_lane_s32( a_s32x4, b_s32x2, 0 ); 978 o_s32x4 = vmlaq_lane_s32( o_s32x4, a_s32x4, b_s32x2, 1 ); 979 vst1q_s32( a, o_s32x4 ); 980 } 981 982 static OPUS_INLINE void silk_SMULWW_8_neon( 983 opus_int32 *a, 984 const int32x2_t b_s32x2 985 ) 986 { 987 int32x4_t a0_s32x4, a1_s32x4, o0_s32x4, o1_s32x4; 988 989 a0_s32x4 = vld1q_s32( a ); 990 a1_s32x4 = vld1q_s32( a + 4 ); 991 o0_s32x4 = vqdmulhq_lane_s32( a0_s32x4, b_s32x2, 0 ); 992 o1_s32x4 = vqdmulhq_lane_s32( a1_s32x4, b_s32x2, 0 ); 993 o0_s32x4 = vmlaq_lane_s32( o0_s32x4, a0_s32x4, b_s32x2, 1 ); 994 o1_s32x4 = vmlaq_lane_s32( o1_s32x4, a1_s32x4, b_s32x2, 1 ); 995 vst1q_s32( a, o0_s32x4 ); 996 vst1q_s32( a + 4, o1_s32x4 ); 997 } 998 999 static OPUS_INLINE void silk_SMULWW_loop_neon( 1000 const opus_int16 *a, 1001 const opus_int32 b, 1002 opus_int32 *o, 1003 const opus_int loop_num 1004 ) 1005 { 1006 opus_int i; 1007 int32x2_t b_s32x2; 1008 1009 b_s32x2 = vdup_n_s32( b ); 1010 for( i = 0; i < loop_num - 7; i += 8 ) { 1011 silk_SMULWB_8_neon( a + i, b_s32x2, o + i ); 1012 } 1013 for( ; i < loop_num; i++ ) { 1014 o[ i ] = silk_SMULWW( a[ i ], b ); 1015 } 1016 } 1017 1018 static OPUS_INLINE void silk_nsq_del_dec_scale_states_neon( 1019 const silk_encoder_state *psEncC, /* I Encoder State */ 1020 silk_nsq_state *NSQ, /* I/O NSQ state */ 1021 NSQ_del_decs_struct psDelDec[], /* I/O Delayed decision states */ 1022 const opus_int16 x16[], /* I Input */ 1023 opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */ 1024 const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */ 1025 opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ 1026 opus_int subfr, /* I Subframe number */ 1027 const opus_int LTP_scale_Q14, /* I LTP state scaling */ 
1028 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */ 1029 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */ 1030 const opus_int signal_type, /* I Signal type */ 1031 const opus_int decisionDelay /* I Decision delay */ 1032 ) 1033 { 1034 opus_int i, lag; 1035 opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26; 1036 1037 lag = pitchL[ subfr ]; 1038 inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 ); 1039 silk_assert( inv_gain_Q31 != 0 ); 1040 1041 /* Scale input */ 1042 inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 ); 1043 silk_SMULWW_loop_neon( x16, inv_gain_Q26, x_sc_Q10, psEncC->subfr_length ); 1044 1045 /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */ 1046 if( NSQ->rewhite_flag ) { 1047 if( subfr == 0 ) { 1048 /* Do LTP downscaling */ 1049 inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 ); 1050 } 1051 silk_SMULWW_loop_neon( sLTP + NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2, inv_gain_Q31, sLTP_Q15 + NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2, lag + LTP_ORDER / 2 ); 1052 } 1053 1054 /* Adjust for changing gain */ 1055 if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) { 1056 int32x2_t gain_adj_Q16_s32x2; 1057 gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 ); 1058 1059 /* Scale long-term shaping state */ 1060 if( ( gain_adj_Q16 >= -65536 ) && ( gain_adj_Q16 < 65536 ) ) { 1061 gain_adj_Q16_s32x2 = vdup_n_s32( silk_LSHIFT32( gain_adj_Q16, 15 ) ); 1062 for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 7; i += 8 ) { 1063 silk_SMULWW_small_b_8_neon( NSQ->sLTP_shp_Q14 + i, gain_adj_Q16_s32x2 ); 1064 } 1065 for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) { 1066 NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] ); 1067 } 1068 1069 /* Scale long-term prediction state */ 1070 if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) { 1071 for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay - 7; i += 8 ) { 1072 silk_SMULWW_small_b_8_neon( sLTP_Q15 + i, gain_adj_Q16_s32x2 ); 1073 } 1074 for( ; i < NSQ->sLTP_buf_idx - decisionDelay; i++ ) { 1075 sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] ); 1076 } 1077 } 1078 1079 /* Scale scalar states */ 1080 silk_SMULWW_small_b_4_neon( psDelDec->LF_AR_Q14, gain_adj_Q16_s32x2 ); 1081 silk_SMULWW_small_b_4_neon( psDelDec->Diff_Q14, gain_adj_Q16_s32x2 ); 1082 1083 /* Scale short-term prediction and shaping states */ 1084 for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) { 1085 silk_SMULWW_small_b_4_neon( psDelDec->sLPC_Q14[ i ], gain_adj_Q16_s32x2 ); 1086 } 1087 1088 for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) { 1089 silk_SMULWW_small_b_4_neon( psDelDec->sAR2_Q14[ i ], gain_adj_Q16_s32x2 ); 1090 } 1091 1092 for( i = 0; i < DECISION_DELAY; i++ ) { 1093 silk_SMULWW_small_b_4_neon( psDelDec->Pred_Q15[ i ], gain_adj_Q16_s32x2 ); 1094 silk_SMULWW_small_b_4_neon( psDelDec->Shape_Q14[ i ], gain_adj_Q16_s32x2 ); 1095 } 1096 } else { 1097 gain_adj_Q16_s32x2 = vdup_n_s32( silk_LSHIFT32( gain_adj_Q16 & 0x0000FFFF, 15 ) ); 1098 gain_adj_Q16_s32x2 = vset_lane_s32( gain_adj_Q16 >> 16, gain_adj_Q16_s32x2, 1 ); 1099 for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 7; i += 8 ) { 1100 silk_SMULWW_8_neon( NSQ->sLTP_shp_Q14 + i, gain_adj_Q16_s32x2 ); 1101 } 1102 for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) { 1103 NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] ); 1104 } 1105 1106 /* Scale long-term prediction state */ 1107 if( 
signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) { 1108 for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay - 7; i += 8 ) { 1109 silk_SMULWW_8_neon( sLTP_Q15 + i, gain_adj_Q16_s32x2 ); 1110 } 1111 for( ; i < NSQ->sLTP_buf_idx - decisionDelay; i++ ) { 1112 sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] ); 1113 } 1114 } 1115 1116 /* Scale scalar states */ 1117 silk_SMULWW_4_neon( psDelDec->LF_AR_Q14, gain_adj_Q16_s32x2 ); 1118 silk_SMULWW_4_neon( psDelDec->Diff_Q14, gain_adj_Q16_s32x2 ); 1119 1120 /* Scale short-term prediction and shaping states */ 1121 for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) { 1122 silk_SMULWW_4_neon( psDelDec->sLPC_Q14[ i ], gain_adj_Q16_s32x2 ); 1123 } 1124 1125 for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) { 1126 silk_SMULWW_4_neon( psDelDec->sAR2_Q14[ i ], gain_adj_Q16_s32x2 ); 1127 } 1128 1129 for( i = 0; i < DECISION_DELAY; i++ ) { 1130 silk_SMULWW_4_neon( psDelDec->Pred_Q15[ i ], gain_adj_Q16_s32x2 ); 1131 silk_SMULWW_4_neon( psDelDec->Shape_Q14[ i ], gain_adj_Q16_s32x2 ); 1132 } 1133 } 1134 1135 /* Save inverse gain */ 1136 NSQ->prev_gain_Q16 = Gains_Q16[ subfr ]; 1137 } 1138 }
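
/* Note on the gain_adj_Q16 paths in silk_nsq_del_dec_scale_states_neon() above: when
   -65536 <= gain_adj_Q16 < 65536, the silk_SMULWW_small_b_*_neon() helpers are used, which
   rely on the single-multiply identity (a sketch):

       vqdmulhq_lane_s32( a, gain_adj_Q16 << 15, 0 )  ==  ( a * gain_adj_Q16 ) >> 16

   which is only valid because gain_adj_Q16 << 15 still fits in 32 bits.  Outside that range
   the general silk_SMULWW_4_neon() / silk_SMULWW_8_neon() helpers add the vmlaq_lane_s32()
   term for the high 16 bits, mirroring the gain_lo/gain_hi split used in copy_winner_state(). */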