multiend_call.cc (7382B)
1 /* 2 * Copyright (c) 2017 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "modules/audio_processing/test/conversational_speech/multiend_call.h" 12 13 #include <algorithm> 14 #include <cstddef> 15 #include <iterator> 16 #include <map> 17 #include <memory> 18 #include <string> 19 #include <tuple> 20 #include <utility> 21 #include <vector> 22 23 #include "absl/strings/string_view.h" 24 #include "api/array_view.h" 25 #include "modules/audio_processing/test/conversational_speech/timing.h" 26 #include "modules/audio_processing/test/conversational_speech/wavreader_abstract_factory.h" 27 #include "modules/audio_processing/test/conversational_speech/wavreader_interface.h" 28 #include "rtc_base/checks.h" 29 #include "rtc_base/logging.h" 30 #include "test/testsupport/file_utils.h" 31 32 namespace webrtc { 33 namespace test { 34 namespace conversational_speech { 35 36 MultiEndCall::MultiEndCall( 37 ArrayView<const Turn> timing, 38 absl::string_view audiotracks_path, 39 std::unique_ptr<WavReaderAbstractFactory> wavreader_abstract_factory) 40 : timing_(timing), 41 audiotracks_path_(audiotracks_path), 42 wavreader_abstract_factory_(std::move(wavreader_abstract_factory)), 43 valid_(false) { 44 FindSpeakerNames(); 45 if (CreateAudioTrackReaders()) 46 valid_ = CheckTiming(); 47 } 48 49 MultiEndCall::~MultiEndCall() = default; 50 51 void MultiEndCall::FindSpeakerNames() { 52 RTC_DCHECK(speaker_names_.empty()); 53 for (const Turn& turn : timing_) { 54 speaker_names_.emplace(turn.speaker_name); 55 } 56 } 57 58 bool MultiEndCall::CreateAudioTrackReaders() { 59 RTC_DCHECK(audiotrack_readers_.empty()); 60 sample_rate_hz_ = 0; // Sample rate will be set when reading the first track. 61 for (const Turn& turn : timing_) { 62 auto it = audiotrack_readers_.find(turn.audiotrack_file_name); 63 if (it != audiotrack_readers_.end()) 64 continue; 65 66 const std::string audiotrack_file_path = 67 test::JoinFilename(audiotracks_path_, turn.audiotrack_file_name); 68 69 // Map the audiotrack file name to a new instance of WavReaderInterface. 70 std::unique_ptr<WavReaderInterface> wavreader = 71 wavreader_abstract_factory_->Create( 72 test::JoinFilename(audiotracks_path_, turn.audiotrack_file_name)); 73 74 if (sample_rate_hz_ == 0) { 75 sample_rate_hz_ = wavreader->SampleRate(); 76 } else if (sample_rate_hz_ != wavreader->SampleRate()) { 77 RTC_LOG(LS_ERROR) 78 << "All the audio tracks should have the same sample rate."; 79 return false; 80 } 81 82 if (wavreader->NumChannels() != 1) { 83 RTC_LOG(LS_ERROR) << "Only mono audio tracks supported."; 84 return false; 85 } 86 87 audiotrack_readers_.emplace(turn.audiotrack_file_name, 88 std::move(wavreader)); 89 } 90 91 return true; 92 } 93 94 bool MultiEndCall::CheckTiming() { 95 struct Interval { 96 size_t begin; 97 size_t end; 98 }; 99 size_t number_of_turns = timing_.size(); 100 auto millisecond_to_samples = [](int ms, int sr) -> int { 101 // Truncation may happen if the sampling rate is not an integer multiple 102 // of 1000 (e.g., 44100). 103 return ms * sr / 1000; 104 }; 105 auto in_interval = [](size_t value, const Interval& interval) { 106 return interval.begin <= value && value < interval.end; 107 }; 108 total_duration_samples_ = 0; 109 speaking_turns_.clear(); 110 111 // Begin and end timestamps for the last two turns (unit: number of samples). 112 Interval second_last_turn = {.begin = 0, .end = 0}; 113 Interval last_turn = {.begin = 0, .end = 0}; 114 115 // Initialize map to store speaking turn indices of each speaker (used to 116 // detect self cross-talk). 117 std::map<std::string, std::vector<size_t>> speaking_turn_indices; 118 for (const std::string& speaker_name : speaker_names_) { 119 speaking_turn_indices.emplace(std::piecewise_construct, 120 std::forward_as_tuple(speaker_name), 121 std::forward_as_tuple()); 122 } 123 124 // Parse turns. 125 for (size_t turn_index = 0; turn_index < number_of_turns; ++turn_index) { 126 const Turn& turn = timing_[turn_index]; 127 auto it = audiotrack_readers_.find(turn.audiotrack_file_name); 128 RTC_CHECK(it != audiotrack_readers_.end()) 129 << "Audio track reader not created"; 130 131 // Begin and end timestamps for the current turn. 132 int offset_samples = 133 millisecond_to_samples(turn.offset, it->second->SampleRate()); 134 std::size_t begin_timestamp = last_turn.end + offset_samples; 135 std::size_t end_timestamp = begin_timestamp + it->second->NumSamples(); 136 RTC_LOG(LS_INFO) << "turn #" << turn_index << " " << begin_timestamp << "-" 137 << end_timestamp << " ms"; 138 139 // The order is invalid if the offset is negative and its absolute value is 140 // larger then the duration of the previous turn. 141 if (offset_samples < 0 && 142 -offset_samples > static_cast<int>(last_turn.end - last_turn.begin)) { 143 RTC_LOG(LS_ERROR) << "invalid order"; 144 return false; 145 } 146 147 // Cross-talk with 3 or more speakers occurs when the beginning of the 148 // current interval falls in the last two turns. 149 if (turn_index > 1 && in_interval(begin_timestamp, last_turn) && 150 in_interval(begin_timestamp, second_last_turn)) { 151 RTC_LOG(LS_ERROR) << "cross-talk with 3+ speakers"; 152 return false; 153 } 154 155 // Append turn. 156 speaking_turns_.emplace_back(turn.speaker_name, turn.audiotrack_file_name, 157 begin_timestamp, end_timestamp, turn.gain); 158 159 // Save speaking turn index for self cross-talk detection. 160 RTC_DCHECK_EQ(speaking_turns_.size(), turn_index + 1); 161 speaking_turn_indices[turn.speaker_name].push_back(turn_index); 162 163 // Update total duration of the consversational speech. 164 if (total_duration_samples_ < end_timestamp) 165 total_duration_samples_ = end_timestamp; 166 167 // Update and continue with next turn. 168 second_last_turn = last_turn; 169 last_turn.begin = begin_timestamp; 170 last_turn.end = end_timestamp; 171 } 172 173 // Detect self cross-talk. 174 for (const std::string& speaker_name : speaker_names_) { 175 RTC_LOG(LS_INFO) << "checking self cross-talk for <" << speaker_name << ">"; 176 177 // Copy all turns for this speaker to new vector. 178 std::vector<SpeakingTurn> speaking_turns_for_name; 179 std::copy_if(speaking_turns_.begin(), speaking_turns_.end(), 180 std::back_inserter(speaking_turns_for_name), 181 [&speaker_name](const SpeakingTurn& st) { 182 return st.speaker_name == speaker_name; 183 }); 184 185 // Check for overlap between adjacent elements. 186 // This is a sufficient condition for self cross-talk since the intervals 187 // are sorted by begin timestamp. 188 auto overlap = std::adjacent_find( 189 speaking_turns_for_name.begin(), speaking_turns_for_name.end(), 190 [](const SpeakingTurn& a, const SpeakingTurn& b) { 191 return a.end > b.begin; 192 }); 193 194 if (overlap != speaking_turns_for_name.end()) { 195 RTC_LOG(LS_ERROR) << "Self cross-talk detected"; 196 return false; 197 } 198 } 199 200 return true; 201 } 202 203 } // namespace conversational_speech 204 } // namespace test 205 } // namespace webrtc