OpusTrackEncoder.cpp (16399B)
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-*/ 2 /* This Source Code Form is subject to the terms of the Mozilla Public 3 * License, v. 2.0. If a copy of the MPL was not distributed with this file, 4 * You can obtain one at http://mozilla.org/MPL/2.0/. */ 5 #include "OpusTrackEncoder.h" 6 7 #include <opus/opus.h> 8 9 #include "VideoUtils.h" 10 #include "mozilla/CheckedInt.h" 11 #include "mozilla/ProfilerLabels.h" 12 #include "nsString.h" 13 14 #define LOG(args, ...) 15 16 namespace mozilla { 17 18 // The Opus format supports up to 8 channels, and supports multitrack audio up 19 // to 255 channels, but the current implementation supports only mono and 20 // stereo, and downmixes any more than that. 21 constexpr int MAX_SUPPORTED_AUDIO_CHANNELS = 8; 22 23 // http://www.opus-codec.org/docs/html_api-1.0.2/group__opus__encoder.html 24 // In section "opus_encoder_init", channels must be 1 or 2 of input signal. 25 constexpr int MAX_CHANNELS = 2; 26 27 // A maximum data bytes for Opus to encode. 28 constexpr int MAX_DATA_BYTES = 4096; 29 30 // http://tools.ietf.org/html/draft-ietf-codec-oggopus-00#section-4 31 // Second paragraph, " The granule position of an audio data page is in units 32 // of PCM audio samples at a fixed rate of 48 kHz." 33 constexpr int kOpusSamplingRate = 48000; 34 35 // The duration of an Opus frame, and it must be 2.5, 5, 10, 20, 40 or 60 ms. 36 constexpr int kFrameDurationMs = 20; 37 38 // The supported sampling rate of input signal (Hz), 39 // must be one of the following. Will resampled to 48kHz otherwise. 40 constexpr int kOpusSupportedInputSamplingRates[] = {8000, 12000, 16000, 24000, 41 48000}; 42 43 namespace { 44 45 // An endian-neutral serialization of integers. Serializing T in little endian 46 // format to aOutput, where T is a 16 bits or 32 bits integer. 47 template <typename T> 48 static void SerializeToBuffer(T aValue, nsTArray<uint8_t>* aOutput) { 49 for (uint32_t i = 0; i < sizeof(T); i++) { 50 aOutput->AppendElement((uint8_t)(0x000000ff & (aValue >> (i * 8)))); 51 } 52 } 53 54 static inline void SerializeToBuffer(const nsCString& aComment, 55 nsTArray<uint8_t>* aOutput) { 56 // Format of serializing a string to buffer is, the length of string (32 bits, 57 // little endian), and the string. 58 SerializeToBuffer((uint32_t)(aComment.Length()), aOutput); 59 aOutput->AppendElements(aComment.get(), aComment.Length()); 60 } 61 62 static void SerializeOpusIdHeader(uint8_t aChannelCount, uint16_t aPreskip, 63 uint32_t aInputSampleRate, 64 nsTArray<uint8_t>* aOutput) { 65 // The magic signature, null terminator has to be stripped off from strings. 66 constexpr uint8_t magic[] = "OpusHead"; 67 aOutput->AppendElements(magic, sizeof(magic) - 1); 68 69 // The version must always be 1 (8 bits, unsigned). 70 aOutput->AppendElement(1); 71 72 // Number of output channels (8 bits, unsigned). 73 aOutput->AppendElement(aChannelCount); 74 75 // Number of samples (at 48 kHz) to discard from the decoder output when 76 // starting playback (16 bits, unsigned, little endian). 77 SerializeToBuffer(aPreskip, aOutput); 78 79 // The sampling rate of input source (32 bits, unsigned, little endian). 80 SerializeToBuffer(aInputSampleRate, aOutput); 81 82 // Output gain, an encoder should set this field to zero (16 bits, signed, 83 // little endian). 84 SerializeToBuffer((int16_t)0, aOutput); 85 86 // Channel mapping family. Family 0 allows only 1 or 2 channels (8 bits, 87 // unsigned). 88 aOutput->AppendElement(0); 89 } 90 91 static void SerializeOpusCommentHeader(const nsCString& aVendor, 92 const nsTArray<nsCString>& aComments, 93 nsTArray<uint8_t>* aOutput) { 94 // The magic signature, null terminator has to be stripped off. 95 constexpr uint8_t magic[] = "OpusTags"; 96 aOutput->AppendElements(magic, sizeof(magic) - 1); 97 98 // The vendor; Should append in the following order: 99 // vendor string length (32 bits, unsigned, little endian) 100 // vendor string. 101 SerializeToBuffer(aVendor, aOutput); 102 103 // Add comments; Should append in the following order: 104 // comment list length (32 bits, unsigned, little endian) 105 // comment #0 string length (32 bits, unsigned, little endian) 106 // comment #0 string 107 // comment #1 string length (32 bits, unsigned, little endian) 108 // comment #1 string ... 109 SerializeToBuffer((uint32_t)aComments.Length(), aOutput); 110 for (uint32_t i = 0; i < aComments.Length(); ++i) { 111 SerializeToBuffer(aComments[i], aOutput); 112 } 113 } 114 115 bool IsSampleRateSupported(TrackRate aSampleRate) { 116 // According to www.opus-codec.org, creating an opus encoder requires the 117 // sampling rate of source signal be one of 8000, 12000, 16000, 24000, or 118 // 48000. If this constraint is not satisfied, we resample the input to 48kHz. 119 AutoTArray<int, 5> supportedSamplingRates; 120 supportedSamplingRates.AppendElements( 121 kOpusSupportedInputSamplingRates, 122 std::size(kOpusSupportedInputSamplingRates)); 123 return supportedSamplingRates.Contains(aSampleRate); 124 } 125 126 } // Anonymous namespace. 127 128 OpusTrackEncoder::OpusTrackEncoder(TrackRate aTrackRate, 129 MediaQueue<EncodedFrame>& aEncodedDataQueue) 130 : AudioTrackEncoder(aTrackRate, aEncodedDataQueue), 131 mOutputSampleRate(IsSampleRateSupported(aTrackRate) ? aTrackRate 132 : kOpusSamplingRate), 133 mEncoder(nullptr), 134 mLookahead(0), 135 mLookaheadWritten(0), 136 mResampler(nullptr), 137 mNumOutputFrames(0) {} 138 139 OpusTrackEncoder::~OpusTrackEncoder() { 140 if (mEncoder) { 141 opus_encoder_destroy(mEncoder); 142 } 143 if (mResampler) { 144 speex_resampler_destroy(mResampler); 145 mResampler = nullptr; 146 } 147 } 148 149 nsresult OpusTrackEncoder::Init(int aChannels) { 150 NS_ENSURE_TRUE((aChannels <= MAX_SUPPORTED_AUDIO_CHANNELS) && (aChannels > 0), 151 NS_ERROR_FAILURE); 152 153 // This version of encoder API only support 1 or 2 channels, 154 // So set the mChannels less or equal 2 and 155 // let InterleaveTrackData downmix pcm data. 156 mChannels = aChannels > MAX_CHANNELS ? MAX_CHANNELS : aChannels; 157 158 // Reject non-audio sample rates. 159 NS_ENSURE_TRUE(mTrackRate >= 8000, NS_ERROR_INVALID_ARG); 160 NS_ENSURE_TRUE(mTrackRate <= 192000, NS_ERROR_INVALID_ARG); 161 162 if (NeedsResampler()) { 163 int error; 164 mResampler = speex_resampler_init(mChannels, mTrackRate, kOpusSamplingRate, 165 SPEEX_RESAMPLER_QUALITY_DEFAULT, &error); 166 167 if (error != RESAMPLER_ERR_SUCCESS) { 168 return NS_ERROR_FAILURE; 169 } 170 } 171 172 int error = 0; 173 mEncoder = opus_encoder_create(mOutputSampleRate, mChannels, 174 OPUS_APPLICATION_AUDIO, &error); 175 176 if (error != OPUS_OK) { 177 return NS_ERROR_FAILURE; 178 } 179 180 if (mAudioBitrate) { 181 int bps = static_cast<int>( 182 std::min<uint32_t>(mAudioBitrate, std::numeric_limits<int>::max())); 183 error = opus_encoder_ctl(mEncoder, OPUS_SET_BITRATE(bps)); 184 if (error != OPUS_OK) { 185 return NS_ERROR_FAILURE; 186 } 187 } 188 189 // In the case of Opus we need to calculate the codec delay based on the 190 // pre-skip. For more information see: 191 // https://tools.ietf.org/html/rfc7845#section-4.2 192 error = opus_encoder_ctl(mEncoder, OPUS_GET_LOOKAHEAD(&mLookahead)); 193 if (error != OPUS_OK) { 194 mLookahead = 0; 195 return NS_ERROR_FAILURE; 196 } 197 198 SetInitialized(); 199 200 return NS_OK; 201 } 202 203 int OpusTrackEncoder::GetLookahead() const { 204 return mLookahead * kOpusSamplingRate / mOutputSampleRate; 205 } 206 207 int OpusTrackEncoder::NumInputFramesPerPacket() const { 208 return mTrackRate * kFrameDurationMs / 1000; 209 } 210 211 int OpusTrackEncoder::NumOutputFramesPerPacket() const { 212 return mOutputSampleRate * kFrameDurationMs / 1000; 213 } 214 215 bool OpusTrackEncoder::NeedsResampler() const { 216 // A resampler is needed when mTrackRate is not supported by the opus encoder. 217 // This is equivalent to !IsSampleRateSupported(mTrackRate) but less cycles. 218 return mTrackRate != mOutputSampleRate && 219 mOutputSampleRate == kOpusSamplingRate; 220 } 221 222 already_AddRefed<TrackMetadataBase> OpusTrackEncoder::GetMetadata() { 223 AUTO_PROFILER_LABEL("OpusTrackEncoder::GetMetadata", OTHER); 224 225 MOZ_ASSERT(mInitialized); 226 227 if (!mInitialized) { 228 return nullptr; 229 } 230 231 RefPtr<OpusMetadata> meta = new OpusMetadata(); 232 meta->mChannels = mChannels; 233 meta->mSamplingFrequency = mTrackRate; 234 235 // Ogg and Webm timestamps are always sampled at 48k for Opus. 236 SerializeOpusIdHeader(mChannels, 237 mLookahead * (kOpusSamplingRate / mOutputSampleRate), 238 mTrackRate, &meta->mIdHeader); 239 240 nsCString vendor; 241 vendor.AppendASCII(opus_get_version_string()); 242 243 nsTArray<nsCString> comments; 244 comments.AppendElement( 245 nsLiteralCString("ENCODER=Mozilla" MOZ_APP_UA_VERSION)); 246 247 SerializeOpusCommentHeader(vendor, comments, &meta->mCommentHeader); 248 249 return meta.forget(); 250 } 251 252 nsresult OpusTrackEncoder::Encode(AudioSegment* aSegment) { 253 AUTO_PROFILER_LABEL("OpusTrackEncoder::Encode", OTHER); 254 255 MOZ_ASSERT(aSegment); 256 MOZ_ASSERT(mInitialized || mCanceled); 257 258 if (mCanceled || IsEncodingComplete()) { 259 return NS_ERROR_FAILURE; 260 } 261 262 if (!mInitialized) { 263 // calculation below depends on the truth that mInitialized is true. 264 return NS_ERROR_FAILURE; 265 } 266 267 int result = 0; 268 // Loop until we run out of packets of input data 269 while (result >= 0 && !IsEncodingComplete()) { 270 // re-sampled frames left last time which didn't fit into an Opus packet 271 // duration. 272 const int framesLeft = mResampledLeftover.Length() / mChannels; 273 MOZ_ASSERT(NumOutputFramesPerPacket() >= framesLeft); 274 // Fetch input frames such that there will be n frames where (n + 275 // framesLeft) >= NumOutputFramesPerPacket() after re-sampling. 276 const int framesToFetch = NumInputFramesPerPacket() - 277 (framesLeft * mTrackRate / kOpusSamplingRate) + 278 (NeedsResampler() ? 1 : 0); 279 280 if (!mEndOfStream && aSegment->GetDuration() < framesToFetch) { 281 // Not enough raw data 282 return NS_OK; 283 } 284 285 // Start encoding data. 286 AutoTArray<AudioDataValue, 9600> pcm; 287 pcm.SetLength(NumOutputFramesPerPacket() * mChannels); 288 289 int frameCopied = 0; 290 291 for (AudioSegment::ChunkIterator iter(*aSegment); 292 !iter.IsEnded() && frameCopied < framesToFetch; iter.Next()) { 293 AudioChunk chunk = *iter; 294 295 // Chunk to the required frame size. 296 TrackTime frameToCopy = 297 std::min(chunk.GetDuration(), 298 static_cast<TrackTime>(framesToFetch - frameCopied)); 299 300 // Possible greatest value of framesToFetch = 3844: see 301 // https://bugzilla.mozilla.org/show_bug.cgi?id=1349421#c8. frameToCopy 302 // should not be able to exceed this value. 303 MOZ_ASSERT(frameToCopy <= 3844, "frameToCopy exceeded expected range"); 304 305 if (!chunk.IsNull()) { 306 // Append the interleaved data to the end of pcm buffer. 307 AudioTrackEncoder::InterleaveTrackData( 308 chunk, frameToCopy, mChannels, 309 pcm.Elements() + frameCopied * mChannels); 310 } else { 311 CheckedInt<int> memsetLength = 312 CheckedInt<int>(frameToCopy) * mChannels * sizeof(AudioDataValue); 313 if (!memsetLength.isValid()) { 314 // This should never happen, but we use a defensive check because 315 // we really don't want a bad memset 316 MOZ_ASSERT_UNREACHABLE("memsetLength invalid!"); 317 return NS_ERROR_FAILURE; 318 } 319 memset(pcm.Elements() + frameCopied * mChannels, 0, 320 memsetLength.value()); 321 } 322 323 frameCopied += frameToCopy; 324 } 325 326 // Possible greatest value of framesToFetch = 3844: see 327 // https://bugzilla.mozilla.org/show_bug.cgi?id=1349421#c8. frameCopied 328 // should not be able to exceed this value. 329 MOZ_ASSERT(frameCopied <= 3844, "frameCopied exceeded expected range"); 330 331 int framesInPCM = frameCopied; 332 if (mResampler) { 333 AutoTArray<AudioDataValue, 9600> resamplingDest; 334 uint32_t inframes = frameCopied; 335 uint32_t outframes = inframes * kOpusSamplingRate / mTrackRate + 1; 336 337 // We want to consume all the input data, so we slightly oversize the 338 // resampled data buffer so we can fit the output data in. We cannot 339 // really predict the output frame count at each call. 340 resamplingDest.SetLength(outframes * mChannels); 341 342 float* in = reinterpret_cast<float*>(pcm.Elements()); 343 float* out = reinterpret_cast<float*>(resamplingDest.Elements()); 344 speex_resampler_process_interleaved_float(mResampler, in, &inframes, out, 345 &outframes); 346 347 MOZ_ASSERT(pcm.Length() >= mResampledLeftover.Length()); 348 PodCopy(pcm.Elements(), mResampledLeftover.Elements(), 349 mResampledLeftover.Length()); 350 351 uint32_t outframesToCopy = std::min( 352 outframes, 353 static_cast<uint32_t>(NumOutputFramesPerPacket() - framesLeft)); 354 355 MOZ_ASSERT(pcm.Length() - mResampledLeftover.Length() >= 356 outframesToCopy * mChannels); 357 PodCopy(pcm.Elements() + mResampledLeftover.Length(), 358 resamplingDest.Elements(), outframesToCopy * mChannels); 359 int frameLeftover = outframes - outframesToCopy; 360 mResampledLeftover.SetLength(frameLeftover * mChannels); 361 PodCopy(mResampledLeftover.Elements(), 362 resamplingDest.Elements() + outframesToCopy * mChannels, 363 mResampledLeftover.Length()); 364 // This is always at 48000Hz. 365 framesInPCM = framesLeft + outframesToCopy; 366 } 367 368 // Remove the raw data which has been pulled to pcm buffer. 369 // The value of frameCopied should be equal to (or smaller than, if eos) 370 // NumOutputFramesPerPacket(). 371 aSegment->RemoveLeading(frameCopied); 372 373 // Has reached the end of input stream and all queued data has pulled for 374 // encoding. 375 bool isFinalPacket = false; 376 if (aSegment->GetDuration() == 0 && mEndOfStream && 377 framesInPCM < NumOutputFramesPerPacket()) { 378 // Pad |mLookahead| samples to the end of the track to prevent loss of 379 // original data. 380 const int toWrite = std::min(mLookahead - mLookaheadWritten, 381 NumOutputFramesPerPacket() - framesInPCM); 382 PodZero(pcm.Elements() + framesInPCM * mChannels, toWrite * mChannels); 383 mLookaheadWritten += toWrite; 384 framesInPCM += toWrite; 385 if (mLookaheadWritten == mLookahead) { 386 isFinalPacket = true; 387 } 388 } 389 390 MOZ_ASSERT_IF(!isFinalPacket, framesInPCM == NumOutputFramesPerPacket()); 391 392 // Append null data to pcm buffer if the leftover data is not enough for 393 // opus encoder. 394 if (framesInPCM < NumOutputFramesPerPacket() && isFinalPacket) { 395 PodZero(pcm.Elements() + framesInPCM * mChannels, 396 (NumOutputFramesPerPacket() - framesInPCM) * mChannels); 397 } 398 auto frameData = MakeRefPtr<EncodedFrame::FrameData>(); 399 // Encode the data with Opus Encoder. 400 frameData->SetLength(MAX_DATA_BYTES); 401 // result is returned as opus error code if it is negative. 402 result = 0; 403 const float* pcmBuf = static_cast<float*>(pcm.Elements()); 404 result = opus_encode_float(mEncoder, pcmBuf, NumOutputFramesPerPacket(), 405 frameData->Elements(), MAX_DATA_BYTES); 406 frameData->SetLength(result >= 0 ? result : 0); 407 408 if (result < 0) { 409 LOG("[Opus] Fail to encode data! Result: %s.", opus_strerror(result)); 410 } 411 if (isFinalPacket) { 412 if (mResampler) { 413 speex_resampler_destroy(mResampler); 414 mResampler = nullptr; 415 } 416 mResampledLeftover.SetLength(0); 417 } 418 419 // timestamp should be the time of the first sample 420 mEncodedDataQueue.Push(MakeAndAddRef<EncodedFrame>( 421 media::TimeUnit(mNumOutputFrames + mLookahead, mOutputSampleRate), 422 static_cast<uint64_t>(framesInPCM) * kOpusSamplingRate / 423 mOutputSampleRate, 424 kOpusSamplingRate, EncodedFrame::OPUS_AUDIO_FRAME, 425 std::move(frameData))); 426 427 mNumOutputFrames += NumOutputFramesPerPacket(); 428 LOG("[Opus] mOutputTimeStamp %.3f.", 429 media::TimeUnit(mNumOutputFrames, mOutputSampleRate).ToSeconds()); 430 431 if (isFinalPacket) { 432 LOG("[Opus] Done encoding."); 433 mEncodedDataQueue.Finish(); 434 } 435 } 436 437 return result >= 0 ? NS_OK : NS_ERROR_FAILURE; 438 } 439 440 } // namespace mozilla 441 442 #undef LOG