audio_encoder_cng.cc (11199B)
1 /* 2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "modules/audio_coding/codecs/cng/audio_encoder_cng.h" 12 13 #include <cstddef> 14 #include <cstdint> 15 #include <memory> 16 #include <optional> 17 #include <utility> 18 #include <vector> 19 20 #include "api/array_view.h" 21 #include "api/audio_codecs/audio_encoder.h" 22 #include "api/units/time_delta.h" 23 #include "common_audio/vad/include/vad.h" 24 #include "modules/audio_coding/codecs/cng/webrtc_cng.h" 25 #include "rtc_base/buffer.h" 26 #include "rtc_base/checks.h" 27 28 namespace webrtc { 29 30 namespace { 31 32 const int kMaxFrameSizeMs = 60; 33 34 class AudioEncoderCng final : public AudioEncoder { 35 public: 36 explicit AudioEncoderCng(AudioEncoderCngConfig&& config); 37 ~AudioEncoderCng() override; 38 39 // Not copyable or moveable. 40 AudioEncoderCng(const AudioEncoderCng&) = delete; 41 AudioEncoderCng(AudioEncoderCng&&) = delete; 42 AudioEncoderCng& operator=(const AudioEncoderCng&) = delete; 43 AudioEncoderCng& operator=(AudioEncoderCng&&) = delete; 44 45 int SampleRateHz() const override; 46 size_t NumChannels() const override; 47 int RtpTimestampRateHz() const override; 48 size_t Num10MsFramesInNextPacket() const override; 49 size_t Max10MsFramesInAPacket() const override; 50 int GetTargetBitrate() const override; 51 EncodedInfo EncodeImpl(uint32_t rtp_timestamp, 52 ArrayView<const int16_t> audio, 53 Buffer* encoded) override; 54 void Reset() override; 55 bool SetFec(bool enable) override; 56 bool SetDtx(bool enable) override; 57 bool SetApplication(Application application) override; 58 void SetMaxPlaybackRate(int frequency_hz) override; 59 ArrayView<std::unique_ptr<AudioEncoder>> ReclaimContainedEncoders() override; 60 void OnReceivedUplinkPacketLossFraction( 61 float uplink_packet_loss_fraction) override; 62 void OnReceivedUplinkBandwidth(int target_audio_bitrate_bps, 63 std::optional<int64_t> bwe_period_ms) override; 64 std::optional<std::pair<TimeDelta, TimeDelta>> GetFrameLengthRange() 65 const override; 66 67 private: 68 EncodedInfo EncodePassive(size_t frames_to_encode, Buffer* encoded); 69 EncodedInfo EncodeActive(size_t frames_to_encode, Buffer* encoded); 70 size_t SamplesPer10msFrame() const; 71 72 std::unique_ptr<AudioEncoder> speech_encoder_; 73 const int cng_payload_type_; 74 const int num_cng_coefficients_; 75 const int sid_frame_interval_ms_; 76 std::vector<int16_t> speech_buffer_; 77 std::vector<uint32_t> rtp_timestamps_; 78 bool last_frame_active_; 79 std::unique_ptr<Vad> vad_; 80 std::unique_ptr<ComfortNoiseEncoder> cng_encoder_; 81 }; 82 83 AudioEncoderCng::AudioEncoderCng(AudioEncoderCngConfig&& config) 84 : speech_encoder_((static_cast<void>([&] { 85 RTC_CHECK(config.IsOk()) << "Invalid configuration."; 86 }()), 87 std::move(config.speech_encoder))), 88 cng_payload_type_(config.payload_type), 89 num_cng_coefficients_(config.num_cng_coefficients), 90 sid_frame_interval_ms_(config.sid_frame_interval_ms), 91 last_frame_active_(true), 92 vad_(config.vad ? std::unique_ptr<Vad>(config.vad) 93 : CreateVad(config.vad_mode)), 94 cng_encoder_(new ComfortNoiseEncoder(SampleRateHz(), 95 sid_frame_interval_ms_, 96 num_cng_coefficients_)) { 97 speech_encoder_->Reset(); 98 } 99 100 AudioEncoderCng::~AudioEncoderCng() = default; 101 102 int AudioEncoderCng::SampleRateHz() const { 103 return speech_encoder_->SampleRateHz(); 104 } 105 106 size_t AudioEncoderCng::NumChannels() const { 107 return 1; 108 } 109 110 int AudioEncoderCng::RtpTimestampRateHz() const { 111 return speech_encoder_->RtpTimestampRateHz(); 112 } 113 114 size_t AudioEncoderCng::Num10MsFramesInNextPacket() const { 115 return speech_encoder_->Num10MsFramesInNextPacket(); 116 } 117 118 size_t AudioEncoderCng::Max10MsFramesInAPacket() const { 119 return speech_encoder_->Max10MsFramesInAPacket(); 120 } 121 122 int AudioEncoderCng::GetTargetBitrate() const { 123 return speech_encoder_->GetTargetBitrate(); 124 } 125 126 AudioEncoder::EncodedInfo AudioEncoderCng::EncodeImpl( 127 uint32_t rtp_timestamp, 128 ArrayView<const int16_t> audio, 129 Buffer* encoded) { 130 const size_t samples_per_10ms_frame = SamplesPer10msFrame(); 131 RTC_CHECK_EQ(speech_buffer_.size(), 132 rtp_timestamps_.size() * samples_per_10ms_frame); 133 rtp_timestamps_.push_back(rtp_timestamp); 134 RTC_DCHECK_EQ(samples_per_10ms_frame, audio.size()); 135 speech_buffer_.insert(speech_buffer_.end(), audio.cbegin(), audio.cend()); 136 const size_t frames_to_encode = speech_encoder_->Num10MsFramesInNextPacket(); 137 if (rtp_timestamps_.size() < frames_to_encode) { 138 return EncodedInfo(); 139 } 140 RTC_CHECK_LE(frames_to_encode * 10, kMaxFrameSizeMs) 141 << "Frame size cannot be larger than " << kMaxFrameSizeMs 142 << " ms when using VAD/CNG."; 143 144 // Group several 10 ms blocks per VAD call. Call VAD once or twice using the 145 // following split sizes: 146 // 10 ms = 10 + 0 ms; 20 ms = 20 + 0 ms; 30 ms = 30 + 0 ms; 147 // 40 ms = 20 + 20 ms; 50 ms = 30 + 20 ms; 60 ms = 30 + 30 ms. 148 size_t blocks_in_first_vad_call = 149 (frames_to_encode > 3 ? 3 : frames_to_encode); 150 if (frames_to_encode == 4) 151 blocks_in_first_vad_call = 2; 152 RTC_CHECK_GE(frames_to_encode, blocks_in_first_vad_call); 153 const size_t blocks_in_second_vad_call = 154 frames_to_encode - blocks_in_first_vad_call; 155 156 // Check if all of the buffer is passive speech. Start with checking the first 157 // block. 158 Vad::Activity activity = vad_->VoiceActivity( 159 &speech_buffer_[0], samples_per_10ms_frame * blocks_in_first_vad_call, 160 SampleRateHz()); 161 if (activity == Vad::kPassive && blocks_in_second_vad_call > 0) { 162 // Only check the second block if the first was passive. 163 activity = vad_->VoiceActivity( 164 &speech_buffer_[samples_per_10ms_frame * blocks_in_first_vad_call], 165 samples_per_10ms_frame * blocks_in_second_vad_call, SampleRateHz()); 166 } 167 168 EncodedInfo info; 169 switch (activity) { 170 case Vad::kPassive: { 171 info = EncodePassive(frames_to_encode, encoded); 172 last_frame_active_ = false; 173 break; 174 } 175 case Vad::kActive: { 176 info = EncodeActive(frames_to_encode, encoded); 177 last_frame_active_ = true; 178 break; 179 } 180 default: { 181 RTC_CHECK_NOTREACHED(); 182 } 183 } 184 185 speech_buffer_.erase( 186 speech_buffer_.begin(), 187 speech_buffer_.begin() + frames_to_encode * samples_per_10ms_frame); 188 rtp_timestamps_.erase(rtp_timestamps_.begin(), 189 rtp_timestamps_.begin() + frames_to_encode); 190 return info; 191 } 192 193 void AudioEncoderCng::Reset() { 194 speech_encoder_->Reset(); 195 speech_buffer_.clear(); 196 rtp_timestamps_.clear(); 197 last_frame_active_ = true; 198 vad_->Reset(); 199 cng_encoder_.reset(new ComfortNoiseEncoder( 200 SampleRateHz(), sid_frame_interval_ms_, num_cng_coefficients_)); 201 } 202 203 bool AudioEncoderCng::SetFec(bool enable) { 204 return speech_encoder_->SetFec(enable); 205 } 206 207 bool AudioEncoderCng::SetDtx(bool enable) { 208 return speech_encoder_->SetDtx(enable); 209 } 210 211 bool AudioEncoderCng::SetApplication(Application application) { 212 return speech_encoder_->SetApplication(application); 213 } 214 215 void AudioEncoderCng::SetMaxPlaybackRate(int frequency_hz) { 216 speech_encoder_->SetMaxPlaybackRate(frequency_hz); 217 } 218 219 ArrayView<std::unique_ptr<AudioEncoder>> 220 AudioEncoderCng::ReclaimContainedEncoders() { 221 return ArrayView<std::unique_ptr<AudioEncoder>>(&speech_encoder_, 1); 222 } 223 224 void AudioEncoderCng::OnReceivedUplinkPacketLossFraction( 225 float uplink_packet_loss_fraction) { 226 speech_encoder_->OnReceivedUplinkPacketLossFraction( 227 uplink_packet_loss_fraction); 228 } 229 230 void AudioEncoderCng::OnReceivedUplinkBandwidth( 231 int target_audio_bitrate_bps, 232 std::optional<int64_t> bwe_period_ms) { 233 speech_encoder_->OnReceivedUplinkBandwidth(target_audio_bitrate_bps, 234 bwe_period_ms); 235 } 236 237 std::optional<std::pair<TimeDelta, TimeDelta>> 238 AudioEncoderCng::GetFrameLengthRange() const { 239 return speech_encoder_->GetFrameLengthRange(); 240 } 241 242 AudioEncoder::EncodedInfo AudioEncoderCng::EncodePassive( 243 size_t frames_to_encode, 244 Buffer* encoded) { 245 bool force_sid = last_frame_active_; 246 bool output_produced = false; 247 const size_t samples_per_10ms_frame = SamplesPer10msFrame(); 248 AudioEncoder::EncodedInfo info; 249 250 for (size_t i = 0; i < frames_to_encode; ++i) { 251 // It's important not to pass &info.encoded_bytes directly to 252 // WebRtcCng_Encode(), since later loop iterations may return zero in 253 // that value, in which case we don't want to overwrite any value from 254 // an earlier iteration. 255 size_t encoded_bytes_tmp = cng_encoder_->Encode( 256 ArrayView<const int16_t>(&speech_buffer_[i * samples_per_10ms_frame], 257 samples_per_10ms_frame), 258 force_sid, encoded); 259 260 if (encoded_bytes_tmp > 0) { 261 RTC_CHECK(!output_produced); 262 info.encoded_bytes = encoded_bytes_tmp; 263 output_produced = true; 264 force_sid = false; 265 } 266 } 267 268 info.encoded_timestamp = rtp_timestamps_.front(); 269 info.payload_type = cng_payload_type_; 270 info.send_even_if_empty = true; 271 info.speech = false; 272 return info; 273 } 274 275 AudioEncoder::EncodedInfo AudioEncoderCng::EncodeActive(size_t frames_to_encode, 276 Buffer* encoded) { 277 const size_t samples_per_10ms_frame = SamplesPer10msFrame(); 278 AudioEncoder::EncodedInfo info; 279 for (size_t i = 0; i < frames_to_encode; ++i) { 280 info = speech_encoder_->Encode( 281 rtp_timestamps_.front(), 282 ArrayView<const int16_t>(&speech_buffer_[i * samples_per_10ms_frame], 283 samples_per_10ms_frame), 284 encoded); 285 if (i + 1 == frames_to_encode) { 286 RTC_CHECK_GT(info.encoded_bytes, 0) << "Encoder didn't deliver data."; 287 } else { 288 RTC_CHECK_EQ(info.encoded_bytes, 0) 289 << "Encoder delivered data too early."; 290 } 291 } 292 return info; 293 } 294 295 size_t AudioEncoderCng::SamplesPer10msFrame() const { 296 return CheckedDivExact(10 * SampleRateHz(), 1000); 297 } 298 299 } // namespace 300 301 AudioEncoderCngConfig::AudioEncoderCngConfig() = default; 302 AudioEncoderCngConfig::AudioEncoderCngConfig(AudioEncoderCngConfig&&) = default; 303 AudioEncoderCngConfig::~AudioEncoderCngConfig() = default; 304 305 bool AudioEncoderCngConfig::IsOk() const { 306 if (num_channels != 1) 307 return false; 308 if (!speech_encoder) 309 return false; 310 if (num_channels != speech_encoder->NumChannels()) 311 return false; 312 if (sid_frame_interval_ms < 313 static_cast<int>(speech_encoder->Max10MsFramesInAPacket() * 10)) 314 return false; 315 if (num_cng_coefficients > WEBRTC_CNG_MAX_LPC_ORDER || 316 num_cng_coefficients <= 0) 317 return false; 318 return true; 319 } 320 321 std::unique_ptr<AudioEncoder> CreateComfortNoiseEncoder( 322 AudioEncoderCngConfig&& config) { 323 return std::make_unique<AudioEncoderCng>(std::move(config)); 324 } 325 326 } // namespace webrtc