vad_core.h (4108B)
/*
 *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

/*
 * This header file includes the descriptions of the core VAD calls.
 */

#ifndef COMMON_AUDIO_VAD_VAD_CORE_H_
#define COMMON_AUDIO_VAD_VAD_CORE_H_

#include <stddef.h>
#include <stdint.h>

#include "common_audio/signal_processing/include/signal_processing_library.h"

// Declares a named integer compile-time constant in a form that works in both
// languages: `constexpr int x` when compiled as C++, an anonymous `enum { x }`
// when compiled as C. The argument is the whole `name = value` expression.
// The constants declared with it are used as array bounds further down.
// TODO(https://bugs.webrtc.org/14476): When converted to C++, remove the macro.
#if defined(__cplusplus)
#define CONSTEXPR_INT(x) constexpr int x
#else
#define CONSTEXPR_INT(x) enum { x }
#endif

CONSTEXPR_INT(kNumChannels = 6);  // Number of frequency bands (named channels).
CONSTEXPR_INT(
    kNumGaussians = 2);  // Number of Gaussians per channel in the GMM.
// One entry per (channel, Gaussian) pair; sizes the mean/std tables below.
CONSTEXPR_INT(kTableSize = kNumChannels * kNumGaussians);
CONSTEXPR_INT(
    kMinEnergy = 10);  // Minimum energy required to trigger audio signal.

// State of one core VAD instance. Every function declared in this header
// takes a pointer to this struct.
//
// NOTE(review): the individual fields are not described in this header. Notes
// marked "presumably" are inferred from names only; confirm against
// vad_core.c before relying on them.
typedef struct VadInstT_ {
  int vad;
  int32_t downsampling_filter_states[4];
  // Type is not declared in this file; presumably provided by
  // signal_processing_library.h.
  WebRtcSpl_State48khzTo8khz state_48_to_8;
  // Four tables of kTableSize entries each. Presumably the GMM means and
  // standard deviations for the noise and speech models.
  int16_t noise_means[kTableSize];
  int16_t speech_means[kTableSize];
  int16_t noise_stds[kTableSize];
  int16_t speech_stds[kTableSize];
  // TODO(bjornv): Change to `frame_count`.
  int32_t frame_counter;
  int16_t over_hang;  // Over Hang
  int16_t num_of_speech;
  // The next two arrays hold 16 entries per channel; what the 16 stands for
  // is not visible in this header.
  // TODO(bjornv): Change to `age_vector`.
  int16_t index_vector[16 * kNumChannels];
  int16_t low_value_vector[16 * kNumChannels];
  // TODO(bjornv): Change to `median`.
  int16_t mean_value[kNumChannels];
  int16_t upper_state[5];
  int16_t lower_state[5];
  int16_t hp_filter_state[4];
  // Three entries each. NOTE(review): presumably per-mode settings written by
  // WebRtcVad_set_mode_core(); what the three slots index is not shown here.
  int16_t over_hang_max_1[3];
  int16_t over_hang_max_2[3];
  int16_t individual[3];
  int16_t total[3];

  // NOTE(review): presumably marks the instance as initialized; confirm where
  // it is written and checked.
  int init_flag;
} VadInstT;

// Initializes the core VAD component. The default aggressiveness mode is
// controlled by `kDefaultMode` in vad_core.c.
//
// - self [i/o] : Instance that should be initialized
//
// returns      : 0 (OK), -1 (null pointer in or if the default mode can't be
//                set)
int WebRtcVad_InitCore(VadInstT* self);

/****************************************************************************
 * WebRtcVad_set_mode_core(...)
 *
 * This function changes the VAD settings
 *
 * Input:
 *      - self      : VAD instance
 *      - mode      : Aggressiveness degree
 *                    0 (High quality) - 3 (Highly aggressive)
 *
 * Output:
 *      - self      : Changed instance
 *
 * Return value     :  0 - Ok
 *                    -1 - Error
 */

int WebRtcVad_set_mode_core(VadInstT* self, int mode);

/****************************************************************************
 * WebRtcVad_CalcVad48khz(...)
 * WebRtcVad_CalcVad32khz(...)
 * WebRtcVad_CalcVad16khz(...)
 * WebRtcVad_CalcVad8khz(...)
 *
 * Calculate probability for active speech and make VAD decision.
 * The four functions share one signature; presumably each handles the input
 * sampling rate given in its name (confirm in vad_core.c).
 *
 * Input:
 *      - inst          : VAD instance (presumably one already set up with
 *                        WebRtcVad_InitCore() - confirm)
 *      - speech_frame  : Input speech frame
 *      - frame_length  : Number of input samples
 *
 * Output:
 *      - inst          : Updated filter states etc.
 *
 * Return value         : VAD decision
 *                        0 - No active speech
 *                        1-6 - Active speech
 */
int WebRtcVad_CalcVad48khz(VadInstT* inst,
                           const int16_t* speech_frame,
                           size_t frame_length);
int WebRtcVad_CalcVad32khz(VadInstT* inst,
                           const int16_t* speech_frame,
                           size_t frame_length);
int WebRtcVad_CalcVad16khz(VadInstT* inst,
                           const int16_t* speech_frame,
                           size_t frame_length);
int WebRtcVad_CalcVad8khz(VadInstT* inst,
                          const int16_t* speech_frame,
                          size_t frame_length);

#endif  // COMMON_AUDIO_VAD_VAD_CORE_H_