FFmpegAudioEncoder.cpp (19482B)
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* vim:set ts=2 sw=2 sts=2 et cindent: */ 3 /* This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 #include "FFmpegAudioEncoder.h" 8 9 #include "AudioSegment.h" 10 #include "FFmpegLog.h" 11 #include "FFmpegRuntimeLinker.h" 12 #include "FFmpegUtils.h" 13 #include "MediaData.h" 14 15 namespace mozilla { 16 17 FFmpegAudioEncoder<LIBAV_VER>::FFmpegAudioEncoder( 18 const FFmpegLibWrapper* aLib, AVCodecID aCodecID, 19 const RefPtr<TaskQueue>& aTaskQueue, const EncoderConfig& aConfig) 20 : FFmpegDataEncoder(aLib, aCodecID, aTaskQueue, aConfig) {} 21 22 RefPtr<MediaDataEncoder::InitPromise> FFmpegAudioEncoder<LIBAV_VER>::Init() { 23 FFMPEGA_LOG("Init"); 24 return InvokeAsync(mTaskQueue, __func__, [self = RefPtr(this)]() { 25 MediaResult r = self->InitEncoder(); 26 if (NS_FAILED(r.Code())) { 27 FFMPEGV_LOG("%s", r.Description().get()); 28 return InitPromise::CreateAndReject(r, __func__); 29 } 30 return InitPromise::CreateAndResolve(true, __func__); 31 }); 32 } 33 34 nsCString FFmpegAudioEncoder<LIBAV_VER>::GetDescriptionName() const { 35 #ifdef USING_MOZFFVPX 36 return "ffvpx audio encoder"_ns; 37 #else 38 const char* lib = 39 # if defined(MOZ_FFMPEG) 40 FFmpegRuntimeLinker::LinkStatusLibraryName(); 41 # else 42 "no library: ffmpeg disabled during build"; 43 # endif 44 return nsPrintfCString("ffmpeg audio encoder (%s)", lib); 45 #endif 46 } 47 48 void FFmpegAudioEncoder<LIBAV_VER>::ResamplerDestroy::operator()( 49 SpeexResamplerState* aResampler) { 50 speex_resampler_destroy(aResampler); 51 } 52 53 MediaResult FFmpegAudioEncoder<LIBAV_VER>::InitEncoder() { 54 MOZ_ASSERT(mTaskQueue->IsOnCurrentThread()); 55 56 FFMPEG_LOG("FFmpegAudioEncoder::InitEncoder"); 57 58 // Initialize the common members of the encoder instance 59 auto r = AllocateCodecContext(/* aHardware */ false); 60 if (r.isErr()) { 61 return r.unwrapErr(); 62 } 63 mCodecContext = r.unwrap(); 64 const AVCodec* codec = mCodecContext->codec; 65 mCodecName = codec->name; 66 67 #if LIBAVCODEC_VERSION_MAJOR >= 60 68 mCodecContext->flags |= AV_CODEC_FLAG_FRAME_DURATION; 69 #endif 70 71 // Find a compatible input rate for the codec, update the encoder config, and 72 // note the rate at which this instance was configured. 73 mInputSampleRate = AssertedCast<int>(mConfig.mSampleRate); 74 if (codec->supported_samplerates) { 75 // Ensure the sample-rate list is sorted, iterate and either find that the 76 // sample rate is supported, or pick the same rate just above the audio 77 // input sample-rate (as to not lose information). If the audio is higher 78 // than the highest supported sample-rate, down-sample to the highest 79 // sample-rate supported by the codec. This is the case when encoding high 80 // samplerate audio to opus. 81 AutoTArray<int, 16> supportedSampleRates; 82 IterateZeroTerminated(codec->supported_samplerates, 83 [&supportedSampleRates](int aRate) mutable { 84 supportedSampleRates.AppendElement(aRate); 85 }); 86 supportedSampleRates.Sort(); 87 88 for (const auto& rate : supportedSampleRates) { 89 if (mInputSampleRate == rate) { 90 mConfig.mSampleRate = rate; 91 break; 92 } 93 if (mInputSampleRate < rate) { 94 // This rate is the smallest supported rate above the content's rate. 95 mConfig.mSampleRate = rate; 96 break; 97 } 98 if (mInputSampleRate > rate) { 99 mConfig.mSampleRate = rate; 100 } 101 } 102 } 103 104 if (mConfig.mSampleRate != AssertedCast<uint32_t>(mInputSampleRate)) { 105 // Need to resample to targetRate 106 int err; 107 SpeexResamplerState* resampler = speex_resampler_init( 108 mConfig.mNumberOfChannels, mInputSampleRate, mConfig.mSampleRate, 109 SPEEX_RESAMPLER_QUALITY_DEFAULT, &err); 110 if (!err) { 111 mResampler.reset(resampler); 112 } else { 113 FFMPEG_LOG( 114 "Error creating resampler in FFmpegAudioEncoder %dHz -> %dHz (%dch)", 115 mInputSampleRate, mConfig.mSampleRate, mConfig.mNumberOfChannels); 116 } 117 } 118 119 // And now the audio-specific part 120 mCodecContext->sample_rate = AssertedCast<int>(mConfig.mSampleRate); 121 122 #if LIBAVCODEC_VERSION_MAJOR >= 60 123 // Gecko's ordering intentionnally matches ffmepg's ordering 124 mLib->av_channel_layout_default(&mCodecContext->ch_layout, 125 AssertedCast<int>(mConfig.mNumberOfChannels)); 126 #else 127 mCodecContext->channels = AssertedCast<int>(mConfig.mNumberOfChannels); 128 #endif 129 130 switch (mConfig.mCodec) { 131 case CodecType::Opus: 132 // When using libopus, ffmpeg supports interleaved float and s16 input. 133 mCodecContext->sample_fmt = AV_SAMPLE_FMT_FLT; 134 break; 135 case CodecType::Vorbis: 136 // When using libvorbis, ffmpeg only supports planar f32 input. 137 mCodecContext->sample_fmt = AV_SAMPLE_FMT_FLTP; 138 break; 139 default: 140 MOZ_ASSERT_UNREACHABLE("Not supported"); 141 } 142 143 if (mConfig.mCodec == CodecType::Opus) { 144 // Default is VBR 145 if (mConfig.mBitrateMode == BitrateMode::Constant) { 146 mLib->av_opt_set(mCodecContext->priv_data, "vbr", "off", 0); 147 } 148 if (mConfig.mCodecSpecific.is<OpusSpecific>()) { 149 const OpusSpecific& specific = mConfig.mCodecSpecific.as<OpusSpecific>(); 150 // This attribute maps directly to complexity 151 mCodecContext->compression_level = specific.mComplexity; 152 FFMPEG_LOG("Opus complexity set to %d", specific.mComplexity); 153 float frameDurationMs = 154 AssertedCast<float>(specific.mFrameDuration) / 1000.f; 155 if (mLib->av_opt_set_double(mCodecContext->priv_data, "frame_duration", 156 frameDurationMs, 0)) { 157 return MediaResult( 158 NS_ERROR_FAILURE, 159 "Error setting the frame duration on Opus encoder"_ns); 160 } 161 FFMPEG_LOG("Opus frame duration set to %0.2f", frameDurationMs); 162 if (specific.mPacketLossPerc) { 163 if (mLib->av_opt_set_int( 164 mCodecContext->priv_data, "packet_loss", 165 AssertedCast<int64_t>(specific.mPacketLossPerc), 0)) { 166 return MediaResult( 167 NS_ERROR_FAILURE, 168 RESULT_DETAIL( 169 "Error setting the packet loss percentage to %" PRIu64 170 " on Opus encoder", 171 specific.mPacketLossPerc)); 172 } 173 FFMPEG_LOGV("Packet loss set to %d%% in Opus encoder", 174 AssertedCast<int>(specific.mPacketLossPerc)); 175 } 176 if (specific.mUseInBandFEC) { 177 if (mLib->av_opt_set(mCodecContext->priv_data, "fec", "on", 0)) { 178 return MediaResult( 179 NS_ERROR_FAILURE, 180 RESULT_DETAIL("Error %s FEC on Opus encoder", 181 specific.mUseInBandFEC ? "enabling" : "disabling")); 182 } 183 FFMPEG_LOGV("In-band FEC enabled for Opus encoder."); 184 } 185 if (specific.mUseDTX) { 186 if (mLib->av_opt_set(mCodecContext->priv_data, "dtx", "on", 0)) { 187 return MediaResult( 188 NS_ERROR_FAILURE, 189 RESULT_DETAIL("Error %s DTX on Opus encoder", 190 specific.mUseDTX ? "enabling" : "disabling")); 191 } 192 // DTX packets are a TOC byte, and possibly one byte of length, packets 193 // 3 bytes and larger are to be returned. 194 mDtxThreshold = 3; 195 } 196 // TODO: format 197 // https://bugzilla.mozilla.org/show_bug.cgi?id=1876066 198 } else { 199 MOZ_ASSERT(mConfig.mCodecSpecific.is<void_t>()); 200 } 201 } 202 // Override the time base: always the sample-rate the encoder is running at 203 mCodecContext->time_base = 204 AVRational{.num = 1, .den = mCodecContext->sample_rate}; 205 206 #if LIBAVCODEC_VERSION_MAJOR >= 60 207 mCodecContext->flags |= AV_CODEC_FLAG_FRAME_DURATION; 208 #endif 209 210 SetContextBitrate(); 211 212 AVDictionary* options = nullptr; 213 if (int ret = OpenCodecContext(mCodecContext->codec, &options); ret < 0) { 214 return MediaResult( 215 NS_ERROR_DOM_MEDIA_FATAL_ERR, 216 RESULT_DETAIL("failed to open %s avcodec: %s", mCodecName.get(), 217 MakeErrorString(mLib, ret).get())); 218 } 219 mLib->av_dict_free(&options); 220 221 FFMPEGA_LOG( 222 "%s has been initialized with sample-format: %d, bitrate: %" PRIi64 223 ", sample-rate: %d, channels: %d, time_base: %d/%d", 224 mCodecName.get(), static_cast<int>(mCodecContext->sample_fmt), 225 static_cast<int64_t>(mCodecContext->bit_rate), mCodecContext->sample_rate, 226 mConfig.mNumberOfChannels, mCodecContext->time_base.num, 227 mCodecContext->time_base.den); 228 229 return NS_OK; 230 } 231 232 // avcodec_send_frame and avcodec_receive_packet were introduced in version 58. 233 #if LIBAVCODEC_VERSION_MAJOR >= 58 234 235 Result<MediaDataEncoder::EncodedData, MediaResult> 236 FFmpegAudioEncoder<LIBAV_VER>::EncodeOnePacket(Span<float> aSamples, 237 media::TimeUnit aPts) { 238 MOZ_ASSERT(mTaskQueue->IsOnCurrentThread()); 239 MOZ_ASSERT(aSamples.Length() % mConfig.mNumberOfChannels == 0); 240 241 // Allocate AVFrame. 242 if (!PrepareFrame()) { 243 return Err( 244 MediaResult(NS_ERROR_OUT_OF_MEMORY, "failed to allocate frame"_ns)); 245 } 246 247 uint32_t frameCount = aSamples.Length() / mConfig.mNumberOfChannels; 248 249 // This method assumes that the audio has been packetized appropriately -- 250 // packets smaller than the packet size are allowed when draining. 251 MOZ_ASSERT(AssertedCast<int>(frameCount) <= mCodecContext->frame_size); 252 253 ChannelCount(mFrame) = AssertedCast<int>(mConfig.mNumberOfChannels); 254 255 # if LIBAVCODEC_VERSION_MAJOR >= 60 256 int rv = mLib->av_channel_layout_copy(&mFrame->ch_layout, 257 &mCodecContext->ch_layout); 258 if (rv < 0) { 259 return Err(MediaResult(NS_ERROR_DOM_MEDIA_FATAL_ERR, 260 RESULT_DETAIL("channel layout copy error: %s", 261 MakeErrorString(mLib, rv).get()))); 262 } 263 # endif 264 265 mFrame->sample_rate = AssertedCast<int>(mConfig.mSampleRate); 266 // Not a mistake, nb_samples is per channel in ffmpeg 267 mFrame->nb_samples = AssertedCast<int>(frameCount); 268 // Audio is converted below if needed 269 mFrame->format = mCodecContext->sample_fmt; 270 // Set presentation timestamp and duration of the AVFrame. 271 # if LIBAVCODEC_VERSION_MAJOR >= 59 272 mFrame->time_base = 273 AVRational{.num = 1, .den = static_cast<int>(mConfig.mSampleRate)}; 274 # endif 275 mFrame->pts = aPts.ToTicksAtRate(mConfig.mSampleRate); 276 # if LIBAVCODEC_VERSION_MAJOR >= 60 277 mFrame->duration = frameCount; 278 # else 279 mFrame->pkt_duration = frameCount; 280 # endif 281 282 if (int ret = mLib->av_frame_get_buffer(mFrame, 16); ret < 0) { 283 return Err(MediaResult(NS_ERROR_OUT_OF_MEMORY, 284 RESULT_DETAIL("failed to allocate frame data: %s", 285 MakeErrorString(mLib, ret).get()))); 286 } 287 288 // Make sure AVFrame is writable. 289 if (int ret = mLib->av_frame_make_writable(mFrame); ret < 0) { 290 return Err(MediaResult(NS_ERROR_DOM_MEDIA_FATAL_ERR, 291 RESULT_DETAIL("failed to make frame writable: %s", 292 MakeErrorString(mLib, ret).get()))); 293 } 294 295 // The input is always in f32 interleaved for now 296 if (mCodecContext->sample_fmt == AV_SAMPLE_FMT_FLT) { 297 PodCopy(reinterpret_cast<float*>(mFrame->data[0]), aSamples.data(), 298 aSamples.Length()); 299 } else { 300 MOZ_ASSERT(mCodecContext->sample_fmt == AV_SAMPLE_FMT_FLTP); 301 for (uint32_t i = 0; i < mConfig.mNumberOfChannels; i++) { 302 DeinterleaveAndConvertBuffer(aSamples.data(), mFrame->nb_samples, 303 mConfig.mNumberOfChannels, mFrame->data); 304 } 305 } 306 307 // Now send the AVFrame to ffmpeg for encoding, same code for audio and video. 308 return FFmpegDataEncoder<LIBAV_VER>::EncodeWithModernAPIs(); 309 } 310 311 Result<MediaDataEncoder::EncodedData, MediaResult> FFmpegAudioEncoder< 312 LIBAV_VER>::EncodeInputWithModernAPIs(RefPtr<const MediaData> aSample) { 313 MOZ_ASSERT(mTaskQueue->IsOnCurrentThread()); 314 MOZ_ASSERT(mCodecContext); 315 MOZ_ASSERT(aSample); 316 317 RefPtr<const AudioData> sample(aSample->As<AudioData>()); 318 319 FFMPEG_LOG("Encoding %" PRIu32 " frames of audio at pts: %s", 320 sample->Frames(), sample->mTime.ToString().get()); 321 322 if ((!mResampler && sample->mRate != mConfig.mSampleRate) || 323 (mResampler && 324 sample->mRate != AssertedCast<uint32_t>(mInputSampleRate)) || 325 sample->mChannels != mConfig.mNumberOfChannels) { 326 return Err(MediaResult(NS_ERROR_DOM_ENCODING_NOT_SUPPORTED_ERR, 327 "Rate or sample-rate at the input of the encoder " 328 "different from what has been configured " 329 "initially"_ns)); 330 } 331 332 // ffmpeg expects exactly sized input audio packets most of the time. 333 // Packetization is performed if needed, and audio packets of the correct size 334 // are fed to ffmpeg, with timestamps extrapolated the timestamp found on 335 // the input MediaData. 336 337 if (!mPacketizer) { 338 media::TimeUnit basePts = media::TimeUnit::Zero(mConfig.mSampleRate); 339 basePts += sample->mTime; 340 mPacketizer.emplace(mCodecContext->frame_size, sample->mChannels, 341 basePts.ToTicksAtRate(mConfig.mSampleRate), 342 mConfig.mSampleRate); 343 } 344 345 if (!mFirstPacketPts.IsValid()) { 346 mFirstPacketPts = sample->mTime; 347 } 348 349 Span<float> audio = sample->Data(); 350 351 if (mResampler) { 352 // Ensure that all input frames are consumed each time by oversizing the 353 // output buffer. 354 int bufferLengthGuess = std::ceil(2. * static_cast<float>(audio.size()) * 355 mConfig.mSampleRate / mInputSampleRate); 356 mTempBuffer.SetLength(bufferLengthGuess); 357 uint32_t inputFrames = audio.size() / mConfig.mNumberOfChannels; 358 uint32_t inputFramesProcessed = inputFrames; 359 uint32_t outputFrames = bufferLengthGuess / mConfig.mNumberOfChannels; 360 DebugOnly<int> rv = speex_resampler_process_interleaved_float( 361 mResampler.get(), audio.data(), &inputFramesProcessed, 362 mTempBuffer.Elements(), &outputFrames); 363 audio = Span<float>(mTempBuffer.Elements(), 364 outputFrames * mConfig.mNumberOfChannels); 365 MOZ_ASSERT(inputFrames == inputFramesProcessed, 366 "increate the buffer to consume all input each time"); 367 MOZ_ASSERT(rv == RESAMPLER_ERR_SUCCESS); 368 } 369 370 EncodedData output; 371 MediaResult rv = NS_OK; 372 373 mPacketizer->Input(audio.data(), audio.Length() / mConfig.mNumberOfChannels); 374 375 // Dequeue and encode each packet 376 while (mPacketizer->PacketsAvailable() && rv.Code() == NS_OK) { 377 mTempBuffer.SetLength(mCodecContext->frame_size * 378 mConfig.mNumberOfChannels); 379 media::TimeUnit pts = mPacketizer->Output(mTempBuffer.Elements()); 380 auto audio = Span(mTempBuffer.Elements(), mTempBuffer.Length()); 381 FFMPEG_LOG("Encoding %" PRIu32 " frames, pts: %s", 382 mPacketizer->PacketSize(), pts.ToString().get()); 383 auto encodeResult = EncodeOnePacket(audio, pts); 384 if (encodeResult.isOk()) { 385 output.AppendElements(std::move(encodeResult.unwrap())); 386 } else { 387 return encodeResult; 388 } 389 pts += media::TimeUnit(mPacketizer->PacketSize(), mConfig.mSampleRate); 390 } 391 return std::move(output); 392 } 393 394 Result<MediaDataEncoder::EncodedData, MediaResult> 395 FFmpegAudioEncoder<LIBAV_VER>::DrainWithModernAPIs() { 396 MOZ_ASSERT(mTaskQueue->IsOnCurrentThread()); 397 398 // If there's no packetizer, or it's empty, we can proceed immediately. 399 if (!mPacketizer || mPacketizer->FramesAvailable() == 0) { 400 return FFmpegDataEncoder<LIBAV_VER>::DrainWithModernAPIs(); 401 } 402 EncodedData output; 403 MediaResult rv = NS_OK; 404 // Dequeue and encode each packet 405 mTempBuffer.SetLength(mCodecContext->frame_size * 406 mPacketizer->ChannelCount()); 407 uint32_t written; 408 media::TimeUnit pts = mPacketizer->Drain(mTempBuffer.Elements(), written); 409 auto audio = 410 Span(mTempBuffer.Elements(), written * mPacketizer->ChannelCount()); 411 auto encodeResult = EncodeOnePacket(audio, pts); 412 if (encodeResult.isOk()) { 413 auto array = encodeResult.unwrap(); 414 output.AppendElements(std::move(array)); 415 } else { 416 return encodeResult; 417 } 418 // Now, drain the encoder 419 auto drainResult = FFmpegDataEncoder<LIBAV_VER>::DrainWithModernAPIs(); 420 if (drainResult.isOk()) { 421 auto array = drainResult.unwrap(); 422 output.AppendElements(std::move(array)); 423 } else { 424 return drainResult; 425 } 426 return std::move(output); 427 } 428 #endif // if LIBAVCODEC_VERSION_MAJOR >= 58 429 430 Result<RefPtr<MediaRawData>, MediaResult> 431 FFmpegAudioEncoder<LIBAV_VER>::ToMediaRawData(AVPacket* aPacket) { 432 MOZ_ASSERT(mTaskQueue->IsOnCurrentThread()); 433 MOZ_ASSERT(aPacket); 434 435 if (aPacket->size < mDtxThreshold) { 436 FFMPEG_LOG( 437 "DTX enabled and packet is %d bytes (threshold %d), not returning.", 438 aPacket->size, mDtxThreshold); 439 return RefPtr<MediaRawData>(nullptr); 440 } 441 442 auto creationResult = CreateMediaRawData(aPacket); 443 if (creationResult.isErr()) { 444 return Err(creationResult.unwrapErr()); 445 } 446 447 RefPtr<MediaRawData> data = creationResult.unwrap(); 448 449 data->mKeyframe = (aPacket->flags & AV_PKT_FLAG_KEY) != 0; 450 451 if (auto extradataResult = GetExtraData(aPacket); extradataResult.isOk()) { 452 data->mExtraData = extradataResult.unwrap(); 453 } 454 455 data->mTime = media::TimeUnit(aPacket->pts, mConfig.mSampleRate); 456 data->mTimecode = data->mTime; 457 data->mDuration = 458 media::TimeUnit(mCodecContext->frame_size, mConfig.mSampleRate); 459 460 // Handle encoder delay 461 // Tracked in https://github.com/w3c/webcodecs/issues/626 because not quite 462 // specced yet. 463 if (mFirstPacketPts > data->mTime) { 464 data->mOriginalPresentationWindow = 465 Some(media::TimeInterval{data->mTime, data->GetEndTime()}); 466 // Duration is likely to be ajusted when the above spec issue is fixed. For 467 // now, leave it as-is 468 // data->mDuration -= (mFirstPacketPts - data->mTime); 469 // if (data->mDuration.IsNegative()) { 470 // data->mDuration = media::TimeUnit::Zero(); 471 // } 472 data->mTime = mFirstPacketPts; 473 } 474 475 if (mPacketsDelivered++ == 0) { 476 // Attach the config (including any channel / samplerate modification to fit 477 // the encoder requirements), if needed. 478 data->mConfig = MakeUnique<EncoderConfig>(mConfig); 479 } 480 481 if (data->mExtraData) { 482 FFMPEGA_LOG( 483 "FFmpegAudioEncoder out: [%s,%s] (%zu bytes, extradata %zu bytes)", 484 data->mTime.ToString().get(), data->mDuration.ToString().get(), 485 data->Size(), data->mExtraData->Length()); 486 } else { 487 FFMPEGA_LOG("FFmpegAudioEncoder out: [%s,%s] (%zu bytes)", 488 data->mTime.ToString().get(), data->mDuration.ToString().get(), 489 data->Size()); 490 } 491 492 return data; 493 } 494 495 Result<already_AddRefed<MediaByteBuffer>, MediaResult> 496 FFmpegAudioEncoder<LIBAV_VER>::GetExtraData(AVPacket* /* aPacket */) { 497 MOZ_ASSERT(mTaskQueue->IsOnCurrentThread()); 498 499 if (!mCodecContext->extradata_size) { 500 return Err(MediaResult(NS_ERROR_NOT_AVAILABLE, "no extradata"_ns)); 501 } 502 // Create extra data -- they are on the context. 503 auto extraData = MakeRefPtr<MediaByteBuffer>(); 504 extraData->SetLength(mCodecContext->extradata_size); 505 MOZ_ASSERT(extraData); 506 PodCopy(extraData->Elements(), mCodecContext->extradata, 507 mCodecContext->extradata_size); 508 return extraData.forget(); 509 } 510 511 } // namespace mozilla