audio_coding_module.cc (20628B)
1 /* 2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "modules/audio_coding/include/audio_coding_module.h" 12 13 #include <array> 14 #include <cstddef> 15 #include <cstdint> 16 #include <memory> 17 #include <optional> 18 #include <string> 19 #include <vector> 20 21 #include "absl/strings/string_view.h" 22 #include "api/array_view.h" 23 #include "api/audio/audio_view.h" 24 #include "api/audio_codecs/audio_encoder.h" 25 #include "api/function_view.h" 26 #include "common_audio/resampler/include/push_resampler.h" 27 #include "modules/audio_coding/acm2/acm_remixing.h" 28 #include "modules/audio_coding/include/audio_coding_module_typedefs.h" 29 #include "modules/include/module_common_types_public.h" 30 #include "rtc_base/buffer.h" 31 #include "rtc_base/checks.h" 32 #include "rtc_base/logging.h" 33 #include "rtc_base/numerics/safe_conversions.h" 34 #include "rtc_base/synchronization/mutex.h" 35 #include "rtc_base/thread_annotations.h" 36 #include "system_wrappers/include/metrics.h" 37 38 namespace webrtc { 39 40 namespace { 41 42 // Initial size for the buffer in InputBuffer. This matches 6 channels of 10 ms 43 // 48 kHz data. 44 constexpr size_t kInitialInputDataBufferSize = 6 * 480; 45 46 constexpr int32_t kMaxInputSampleRateHz = 192000; 47 48 class AudioCodingModuleImpl final : public AudioCodingModule { 49 public: 50 explicit AudioCodingModuleImpl(); 51 ~AudioCodingModuleImpl() override; 52 53 void Reset() override; 54 55 ///////////////////////////////////////// 56 // Sender 57 // 58 59 void ModifyEncoder( 60 FunctionView<void(std::unique_ptr<AudioEncoder>*)> modifier) override; 61 62 // Register a transport callback which will be 63 // called to deliver the encoded buffers. 64 int RegisterTransportCallback(AudioPacketizationCallback* transport) override; 65 66 // Add 10 ms of raw (PCM) audio data to the encoder. 67 int Add10MsData(const AudioFrame& audio_frame) override; 68 69 ///////////////////////////////////////// 70 // (FEC) Forward Error Correction (codec internal) 71 // 72 73 // Set target packet loss rate 74 int SetPacketLossRate(int loss_rate) override; 75 76 ///////////////////////////////////////// 77 // Statistics 78 // 79 80 ANAStats GetANAStats() const override; 81 82 int GetTargetBitrate() const override; 83 84 private: 85 struct InputData { 86 InputData() : buffer(kInitialInputDataBufferSize) {} 87 uint32_t input_timestamp; 88 const int16_t* audio; 89 size_t length_per_channel; 90 size_t audio_channel; 91 // If a re-mix is required (up or down), this buffer will store a re-mixed 92 // version of the input. 93 std::vector<int16_t> buffer; 94 }; 95 96 InputData input_data_ RTC_GUARDED_BY(acm_mutex_); 97 98 // This member class writes values to the named UMA histogram, but only if 99 // the value has changed since the last time (and always for the first call). 100 class ChangeLogger { 101 public: 102 explicit ChangeLogger(absl::string_view histogram_name) 103 : histogram_name_(histogram_name) {} 104 // Logs the new value if it is different from the last logged value, or if 105 // this is the first call. 106 void MaybeLog(int value); 107 108 private: 109 int last_value_ = 0; 110 int first_time_ = true; 111 const std::string histogram_name_; 112 }; 113 114 int Add10MsDataInternal(const AudioFrame& audio_frame, InputData* input_data) 115 RTC_EXCLUSIVE_LOCKS_REQUIRED(acm_mutex_); 116 117 // TODO(bugs.webrtc.org/10739): change `absolute_capture_timestamp_ms` to 118 // int64_t when it always receives a valid value. 119 int Encode(const InputData& input_data, 120 std::optional<int64_t> absolute_capture_timestamp_ms) 121 RTC_EXCLUSIVE_LOCKS_REQUIRED(acm_mutex_); 122 123 bool HaveValidEncoder(absl::string_view caller_name) const 124 RTC_EXCLUSIVE_LOCKS_REQUIRED(acm_mutex_); 125 126 // Updates or checks `expected_in_ts_` and `expected_codec_ts` based on the 127 // timestamps in `in_frame`. If no audio frame has been received, the fields 128 // are just set. For subsequent frames, the expected timestamps are checked 129 // for consistency. 130 void SetInputTimestamps(const AudioFrame& in_frame) 131 RTC_EXCLUSIVE_LOCKS_REQUIRED(acm_mutex_); 132 133 // Preprocessing of input audio, including resampling and down-mixing if 134 // required, before pushing audio into encoder's buffer. 135 // 136 // in_frame: input audio-frame 137 // ptr_out: pointer to output audio_frame. If no preprocessing is required 138 // `ptr_out` will be pointing to `in_frame`, otherwise pointing to 139 // `preprocess_frame_`. 140 // 141 // Return value: 142 // -1: if encountering an error. 143 // 0: otherwise. 144 int PreprocessToAddData(const AudioFrame& in_frame, 145 const AudioFrame** ptr_out) 146 RTC_EXCLUSIVE_LOCKS_REQUIRED(acm_mutex_); 147 148 // Called from `PreprocessToAddData` when no resampling or downmixing is 149 // required. Returns a pointer to an output audio_frame. If timestamps are as 150 // expected the return value will point to `in_frame`, otherwise the data will 151 // have been copied into `preprocess_frame_` and the returned pointer points 152 // to `preprocess_frame_`. 153 const AudioFrame* AddDataNoPreProcess(const AudioFrame& in_frame) 154 RTC_EXCLUSIVE_LOCKS_REQUIRED(acm_mutex_); 155 156 // Change required states after starting to receive the codec corresponding 157 // to `index`. 158 int UpdateUponReceivingCodec(int index); 159 160 mutable Mutex acm_mutex_; 161 Buffer encode_buffer_ RTC_GUARDED_BY(acm_mutex_); 162 uint32_t expected_codec_ts_ RTC_GUARDED_BY(acm_mutex_); 163 uint32_t expected_in_ts_ RTC_GUARDED_BY(acm_mutex_); 164 PushResampler<int16_t> resampler_ RTC_GUARDED_BY(acm_mutex_); 165 ChangeLogger bitrate_logger_ RTC_GUARDED_BY(acm_mutex_); 166 167 // Current encoder stack, provided by a call to RegisterEncoder. 168 std::unique_ptr<AudioEncoder> encoder_stack_ RTC_GUARDED_BY(acm_mutex_); 169 170 // This is to keep track of CN instances where we can send DTMFs. 171 uint8_t previous_pltype_ RTC_GUARDED_BY(acm_mutex_); 172 173 AudioFrame preprocess_frame_ RTC_GUARDED_BY(acm_mutex_); 174 bool first_10ms_data_ RTC_GUARDED_BY(acm_mutex_); 175 176 bool first_frame_ RTC_GUARDED_BY(acm_mutex_); 177 uint32_t last_timestamp_ RTC_GUARDED_BY(acm_mutex_); 178 uint32_t last_rtp_timestamp_ RTC_GUARDED_BY(acm_mutex_); 179 std::optional<int64_t> absolute_capture_timestamp_ms_ 180 RTC_GUARDED_BY(acm_mutex_); 181 182 Mutex callback_mutex_; 183 AudioPacketizationCallback* packetization_callback_ 184 RTC_GUARDED_BY(callback_mutex_); 185 186 int codec_histogram_bins_log_[static_cast<size_t>( 187 AudioEncoder::CodecType::kMaxLoggedAudioCodecTypes)]; 188 int number_of_consecutive_empty_packets_; 189 190 mutable Mutex stats_mutex_; 191 ANAStats ana_stats_ RTC_GUARDED_BY(stats_mutex_); 192 int target_bitrate_ RTC_GUARDED_BY(stats_mutex_) = -1; 193 }; 194 195 // Adds a codec usage sample to the histogram. 196 void UpdateCodecTypeHistogram(size_t codec_type) { 197 RTC_HISTOGRAM_ENUMERATION( 198 "WebRTC.Audio.Encoder.CodecType", static_cast<int>(codec_type), 199 static_cast<int>(AudioEncoder::CodecType::kMaxLoggedAudioCodecTypes)); 200 } 201 202 void AudioCodingModuleImpl::ChangeLogger::MaybeLog(int value) { 203 if (value != last_value_ || first_time_) { 204 first_time_ = false; 205 last_value_ = value; 206 RTC_HISTOGRAM_COUNTS_SPARSE_100(histogram_name_, value); 207 } 208 } 209 210 AudioCodingModuleImpl::AudioCodingModuleImpl() 211 : expected_codec_ts_(0xD87F3F9F), 212 expected_in_ts_(0xD87F3F9F), 213 bitrate_logger_("WebRTC.Audio.TargetBitrateInKbps"), 214 encoder_stack_(nullptr), 215 previous_pltype_(255), 216 first_10ms_data_(false), 217 first_frame_(true), 218 packetization_callback_(nullptr), 219 codec_histogram_bins_log_(), 220 number_of_consecutive_empty_packets_(0) { 221 RTC_LOG(LS_INFO) << "Created"; 222 } 223 224 AudioCodingModuleImpl::~AudioCodingModuleImpl() = default; 225 226 int32_t AudioCodingModuleImpl::Encode( 227 const InputData& input_data, 228 std::optional<int64_t> absolute_capture_timestamp_ms) { 229 // TODO(bugs.webrtc.org/10739): add dcheck that 230 // `audio_frame.absolute_capture_timestamp_ms()` always has a value. 231 AudioEncoder::EncodedInfo encoded_info; 232 uint8_t previous_pltype; 233 234 // Check if there is an encoder before. 235 if (!HaveValidEncoder("Process")) 236 return -1; 237 238 if (!first_frame_) { 239 RTC_DCHECK(IsNewerTimestamp(input_data.input_timestamp, last_timestamp_)) 240 << "Time should not move backwards"; 241 } 242 243 // Scale the timestamp to the codec's RTP timestamp rate. 244 uint32_t rtp_timestamp = 245 first_frame_ 246 ? input_data.input_timestamp 247 : last_rtp_timestamp_ + 248 dchecked_cast<uint32_t>(CheckedDivExact( 249 int64_t{input_data.input_timestamp - last_timestamp_} * 250 encoder_stack_->RtpTimestampRateHz(), 251 int64_t{encoder_stack_->SampleRateHz()})); 252 253 last_timestamp_ = input_data.input_timestamp; 254 last_rtp_timestamp_ = rtp_timestamp; 255 first_frame_ = false; 256 257 if (!absolute_capture_timestamp_ms_.has_value()) { 258 absolute_capture_timestamp_ms_ = absolute_capture_timestamp_ms; 259 } 260 261 // Clear the buffer before reuse - encoded data will get appended. 262 encode_buffer_.Clear(); 263 encoded_info = encoder_stack_->Encode( 264 rtp_timestamp, 265 ArrayView<const int16_t>( 266 input_data.audio, 267 input_data.audio_channel * input_data.length_per_channel), 268 &encode_buffer_); 269 270 bitrate_logger_.MaybeLog(encoder_stack_->GetTargetBitrate() / 1000); 271 if (encode_buffer_.empty() && !encoded_info.send_even_if_empty) { 272 // Not enough data. 273 return 0; 274 } 275 previous_pltype = previous_pltype_; // Read it while we have the critsect. 276 277 // Log codec type to histogram once every 500 packets. 278 if (encoded_info.encoded_bytes == 0) { 279 ++number_of_consecutive_empty_packets_; 280 } else { 281 size_t codec_type = static_cast<size_t>(encoded_info.encoder_type); 282 codec_histogram_bins_log_[codec_type] += 283 number_of_consecutive_empty_packets_ + 1; 284 number_of_consecutive_empty_packets_ = 0; 285 if (codec_histogram_bins_log_[codec_type] >= 500) { 286 codec_histogram_bins_log_[codec_type] -= 500; 287 UpdateCodecTypeHistogram(codec_type); 288 } 289 } 290 291 AudioFrameType frame_type; 292 if (encode_buffer_.empty() && encoded_info.send_even_if_empty) { 293 frame_type = AudioFrameType::kEmptyFrame; 294 encoded_info.payload_type = previous_pltype; 295 } else { 296 RTC_DCHECK_GT(encode_buffer_.size(), 0); 297 frame_type = encoded_info.speech ? AudioFrameType::kAudioFrameSpeech 298 : AudioFrameType::kAudioFrameCN; 299 } 300 301 { 302 MutexLock lock(&callback_mutex_); 303 if (packetization_callback_) { 304 packetization_callback_->SendData( 305 frame_type, encoded_info.payload_type, encoded_info.encoded_timestamp, 306 encode_buffer_.data(), encode_buffer_.size(), 307 absolute_capture_timestamp_ms_.value_or(-1)); 308 } 309 } 310 absolute_capture_timestamp_ms_.reset(); 311 previous_pltype_ = encoded_info.payload_type; 312 { 313 MutexLock lock(&stats_mutex_); 314 ana_stats_ = encoder_stack_->GetANAStats(); 315 target_bitrate_ = encoder_stack_->GetTargetBitrate(); 316 } 317 return static_cast<int32_t>(encode_buffer_.size()); 318 } 319 320 ///////////////////////////////////////// 321 // Sender 322 // 323 324 void AudioCodingModuleImpl::Reset() { 325 MutexLock lock(&acm_mutex_); 326 absolute_capture_timestamp_ms_.reset(); 327 if (HaveValidEncoder("Reset")) { 328 encoder_stack_->Reset(); 329 } 330 } 331 332 void AudioCodingModuleImpl::ModifyEncoder( 333 FunctionView<void(std::unique_ptr<AudioEncoder>*)> modifier) { 334 MutexLock lock(&acm_mutex_); 335 modifier(&encoder_stack_); 336 } 337 338 // Register a transport callback which will be called to deliver 339 // the encoded buffers. 340 int AudioCodingModuleImpl::RegisterTransportCallback( 341 AudioPacketizationCallback* transport) { 342 MutexLock lock(&callback_mutex_); 343 packetization_callback_ = transport; 344 return 0; 345 } 346 347 // Add 10MS of raw (PCM) audio data to the encoder. 348 int AudioCodingModuleImpl::Add10MsData(const AudioFrame& audio_frame) { 349 MutexLock lock(&acm_mutex_); 350 int r = Add10MsDataInternal(audio_frame, &input_data_); 351 // TODO(bugs.webrtc.org/10739): add dcheck that 352 // `audio_frame.absolute_capture_timestamp_ms()` always has a value. 353 return r < 0 354 ? r 355 : Encode(input_data_, audio_frame.absolute_capture_timestamp_ms()); 356 } 357 358 int AudioCodingModuleImpl::Add10MsDataInternal(const AudioFrame& audio_frame, 359 InputData* input_data) { 360 if (audio_frame.samples_per_channel_ == 0) { 361 RTC_DCHECK_NOTREACHED(); 362 RTC_LOG(LS_ERROR) << "Cannot Add 10 ms audio, payload length is zero"; 363 return -1; 364 } 365 366 if (audio_frame.sample_rate_hz_ > kMaxInputSampleRateHz) { 367 RTC_DCHECK_NOTREACHED(); 368 RTC_LOG(LS_ERROR) << "Cannot Add 10 ms audio, input frequency not valid"; 369 return -1; 370 } 371 372 // If the length and frequency matches. We currently just support raw PCM. 373 if (static_cast<size_t>(audio_frame.sample_rate_hz_ / 100) != 374 audio_frame.samples_per_channel_) { 375 RTC_LOG(LS_ERROR) 376 << "Cannot Add 10 ms audio, input frequency and length doesn't match"; 377 return -1; 378 } 379 380 if (audio_frame.num_channels_ != 1 && audio_frame.num_channels_ != 2 && 381 audio_frame.num_channels_ != 4 && audio_frame.num_channels_ != 6 && 382 audio_frame.num_channels_ != 8) { 383 RTC_LOG(LS_ERROR) << "Cannot Add 10 ms audio, invalid number of channels."; 384 return -1; 385 } 386 387 // Do we have a codec registered? 388 if (!HaveValidEncoder("Add10MsData")) { 389 return -1; 390 } 391 392 const AudioFrame* ptr_frame; 393 // Perform a resampling, also down-mix if it is required and can be 394 // performed before resampling (a down mix prior to resampling will take 395 // place if both primary and secondary encoders are mono and input is in 396 // stereo). 397 if (PreprocessToAddData(audio_frame, &ptr_frame) < 0) { 398 return -1; 399 } 400 401 // Check whether we need an up-mix or down-mix? 402 const size_t current_num_channels = encoder_stack_->NumChannels(); 403 const bool same_num_channels = 404 ptr_frame->num_channels_ == current_num_channels; 405 406 // TODO(yujo): Skip encode of muted frames. 407 input_data->input_timestamp = ptr_frame->timestamp_; 408 input_data->length_per_channel = ptr_frame->samples_per_channel_; 409 input_data->audio_channel = current_num_channels; 410 411 if (!same_num_channels) { 412 // Remixes the input frame to the output data and in the process resize the 413 // output data if needed. 414 ReMixFrame(*ptr_frame, current_num_channels, &input_data->buffer); 415 416 // For pushing data to primary, point the `ptr_audio` to correct buffer. 417 input_data->audio = input_data->buffer.data(); 418 RTC_DCHECK_GE(input_data->buffer.size(), 419 input_data->length_per_channel * input_data->audio_channel); 420 } else { 421 // When adding data to encoders this pointer is pointing to an audio buffer 422 // with correct number of channels. 423 input_data->audio = ptr_frame->data(); 424 } 425 426 return 0; 427 } 428 429 void AudioCodingModuleImpl::SetInputTimestamps(const AudioFrame& in_frame) { 430 if (!first_10ms_data_) { 431 expected_in_ts_ = in_frame.timestamp_; 432 expected_codec_ts_ = in_frame.timestamp_; 433 first_10ms_data_ = true; 434 } else if (in_frame.timestamp_ != expected_in_ts_) { 435 RTC_LOG(LS_WARNING) << "Unexpected input timestamp: " << in_frame.timestamp_ 436 << ", expected: " << expected_in_ts_; 437 expected_codec_ts_ += 438 (in_frame.timestamp_ - expected_in_ts_) * 439 static_cast<uint32_t>( 440 static_cast<double>(encoder_stack_->SampleRateHz()) / 441 static_cast<double>(in_frame.sample_rate_hz_)); 442 expected_in_ts_ = in_frame.timestamp_; 443 } 444 } 445 446 // Perform a resampling and down-mix if required. We down-mix only if 447 // encoder is mono and input is stereo. In case of dual-streaming, both 448 // encoders has to be mono for down-mix to take place. 449 // |*ptr_out| will point to the pre-processed audio-frame. If no pre-processing 450 // is required, |*ptr_out| points to `in_frame`. 451 // TODO(yujo): Make this more efficient for muted frames. 452 int AudioCodingModuleImpl::PreprocessToAddData(const AudioFrame& in_frame, 453 const AudioFrame** ptr_out) { 454 SetInputTimestamps(in_frame); 455 456 const bool resample = 457 in_frame.sample_rate_hz_ != encoder_stack_->SampleRateHz(); 458 459 // This variable is true if primary codec and secondary codec (if exists) 460 // are both mono and input is stereo. 461 // TODO(henrik.lundin): This condition should probably be 462 // in_frame.num_channels_ > encoder_stack_->NumChannels() 463 const bool down_mix = 464 in_frame.num_channels_ == 2 && encoder_stack_->NumChannels() == 1; 465 466 if (!down_mix && !resample) { 467 // No preprocessing is required. 468 *ptr_out = AddDataNoPreProcess(in_frame); 469 return 0; 470 } 471 472 // Some pre-processing will be required, so we'll use the internal buffer. 473 *ptr_out = &preprocess_frame_; 474 preprocess_frame_.timestamp_ = expected_codec_ts_; 475 preprocess_frame_.samples_per_channel_ = in_frame.samples_per_channel_; 476 477 // Temporary buffer in case both downmixing and resampling is required. 478 std::array<int16_t, AudioFrame::kMaxDataSizeSamples> audio; 479 // When resampling is needed, this view will represent the buffer to resample. 480 InterleavedView<const int16_t> resample_src_audio; 481 482 if (down_mix) { 483 RTC_DCHECK_GE(audio.size(), in_frame.samples_per_channel()); 484 preprocess_frame_.num_channels_ = 1; // We always downmix to mono. 485 // If a resampling is also required, the output of a down-mix is written 486 // into a local buffer, otherwise, it will be written to the output frame. 487 auto downmixed = 488 resample 489 ? InterleavedView<int16_t>(audio.data(), 490 in_frame.samples_per_channel(), 1) 491 : preprocess_frame_.mutable_data(in_frame.samples_per_channel(), 1); 492 DownMixFrame(in_frame, downmixed.AsMono()); 493 if (resample) { 494 // Set the input for the resampler to the down-mixed signal. 495 resample_src_audio = downmixed; 496 } 497 } else { 498 preprocess_frame_.num_channels_ = in_frame.num_channels_; 499 if (resample) { 500 // Set the input of the resampler to the original data. 501 resample_src_audio = in_frame.data_view(); 502 } 503 } 504 505 RTC_DCHECK(resample_src_audio.empty() || resample); 506 preprocess_frame_.SetSampleRateAndChannelSize(encoder_stack_->SampleRateHz()); 507 508 if (resample) { 509 resampler_.Resample( 510 resample_src_audio, 511 preprocess_frame_.mutable_data(preprocess_frame_.samples_per_channel(), 512 preprocess_frame_.num_channels())); 513 } 514 515 expected_codec_ts_ += 516 static_cast<uint32_t>(preprocess_frame_.samples_per_channel_); 517 expected_in_ts_ += static_cast<uint32_t>(in_frame.samples_per_channel_); 518 519 return 0; 520 } 521 522 const AudioFrame* AudioCodingModuleImpl::AddDataNoPreProcess( 523 const AudioFrame& in_frame) { 524 const AudioFrame* ret = nullptr; 525 // No preprocessing is required. 526 if (expected_in_ts_ == expected_codec_ts_) { 527 // Timestamps as expected, we can use the input frame as-is. 528 ret = &in_frame; 529 } else { 530 // Otherwise we'll need to alter the timestamp. Since in_frame is const, 531 // we'll have to make a copy of it. 532 preprocess_frame_.CopyFrom(in_frame); 533 preprocess_frame_.timestamp_ = expected_codec_ts_; 534 ret = &preprocess_frame_; 535 } 536 537 expected_in_ts_ += static_cast<uint32_t>(in_frame.samples_per_channel_); 538 expected_codec_ts_ += static_cast<uint32_t>(in_frame.samples_per_channel_); 539 540 return ret; 541 } 542 543 ///////////////////////////////////////// 544 // (FEC) Forward Error Correction (codec internal) 545 // 546 547 int AudioCodingModuleImpl::SetPacketLossRate(int loss_rate) { 548 MutexLock lock(&acm_mutex_); 549 if (HaveValidEncoder("SetPacketLossRate")) { 550 encoder_stack_->OnReceivedUplinkPacketLossFraction(loss_rate / 100.0); 551 } 552 return 0; 553 } 554 555 ///////////////////////////////////////// 556 // Statistics 557 // 558 559 bool AudioCodingModuleImpl::HaveValidEncoder( 560 absl::string_view caller_name) const { 561 if (!encoder_stack_) { 562 RTC_LOG(LS_ERROR) << caller_name << " failed: No send codec is registered."; 563 return false; 564 } 565 return true; 566 } 567 568 ANAStats AudioCodingModuleImpl::GetANAStats() const { 569 MutexLock lock(&stats_mutex_); 570 return ana_stats_; 571 } 572 573 int AudioCodingModuleImpl::GetTargetBitrate() const { 574 MutexLock lock(&stats_mutex_); 575 return target_bitrate_; 576 } 577 578 } // namespace 579 580 std::unique_ptr<AudioCodingModule> AudioCodingModule::Create() { 581 return std::make_unique<AudioCodingModuleImpl>(); 582 } 583 584 } // namespace webrtc