tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

OpusTrackEncoder.cpp (16399B)


      1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-*/
      2 /* This Source Code Form is subject to the terms of the Mozilla Public
      3 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
      4 * You can obtain one at http://mozilla.org/MPL/2.0/. */
      5 #include "OpusTrackEncoder.h"
      6 
      7 #include <opus/opus.h>
      8 
      9 #include "VideoUtils.h"
     10 #include "mozilla/CheckedInt.h"
     11 #include "mozilla/ProfilerLabels.h"
     12 #include "nsString.h"
     13 
     14 #define LOG(args, ...)
     15 
     16 namespace mozilla {
     17 
     18 // The Opus format supports up to 8 channels, and supports multitrack audio up
     19 // to 255 channels, but the current implementation supports only mono and
     20 // stereo, and downmixes any more than that.
     21 constexpr int MAX_SUPPORTED_AUDIO_CHANNELS = 8;
     22 
     23 // http://www.opus-codec.org/docs/html_api-1.0.2/group__opus__encoder.html
     24 // In section "opus_encoder_init", channels must be 1 or 2 of input signal.
     25 constexpr int MAX_CHANNELS = 2;
     26 
     27 // A maximum data bytes for Opus to encode.
     28 constexpr int MAX_DATA_BYTES = 4096;
     29 
     30 // http://tools.ietf.org/html/draft-ietf-codec-oggopus-00#section-4
     31 // Second paragraph, " The granule position of an audio data page is in units
     32 // of PCM audio samples at a fixed rate of 48 kHz."
     33 constexpr int kOpusSamplingRate = 48000;
     34 
     35 // The duration of an Opus frame, and it must be 2.5, 5, 10, 20, 40 or 60 ms.
     36 constexpr int kFrameDurationMs = 20;
     37 
     38 // The supported sampling rate of input signal (Hz),
     39 // must be one of the following. Will resampled to 48kHz otherwise.
     40 constexpr int kOpusSupportedInputSamplingRates[] = {8000, 12000, 16000, 24000,
     41                                                    48000};
     42 
     43 namespace {
     44 
     45 // An endian-neutral serialization of integers. Serializing T in little endian
     46 // format to aOutput, where T is a 16 bits or 32 bits integer.
     47 template <typename T>
     48 static void SerializeToBuffer(T aValue, nsTArray<uint8_t>* aOutput) {
     49  for (uint32_t i = 0; i < sizeof(T); i++) {
     50    aOutput->AppendElement((uint8_t)(0x000000ff & (aValue >> (i * 8))));
     51  }
     52 }
     53 
     54 static inline void SerializeToBuffer(const nsCString& aComment,
     55                                     nsTArray<uint8_t>* aOutput) {
     56  // Format of serializing a string to buffer is, the length of string (32 bits,
     57  // little endian), and the string.
     58  SerializeToBuffer((uint32_t)(aComment.Length()), aOutput);
     59  aOutput->AppendElements(aComment.get(), aComment.Length());
     60 }
     61 
     62 static void SerializeOpusIdHeader(uint8_t aChannelCount, uint16_t aPreskip,
     63                                  uint32_t aInputSampleRate,
     64                                  nsTArray<uint8_t>* aOutput) {
     65  // The magic signature, null terminator has to be stripped off from strings.
     66  constexpr uint8_t magic[] = "OpusHead";
     67  aOutput->AppendElements(magic, sizeof(magic) - 1);
     68 
     69  // The version must always be 1 (8 bits, unsigned).
     70  aOutput->AppendElement(1);
     71 
     72  // Number of output channels (8 bits, unsigned).
     73  aOutput->AppendElement(aChannelCount);
     74 
     75  // Number of samples (at 48 kHz) to discard from the decoder output when
     76  // starting playback (16 bits, unsigned, little endian).
     77  SerializeToBuffer(aPreskip, aOutput);
     78 
     79  // The sampling rate of input source (32 bits, unsigned, little endian).
     80  SerializeToBuffer(aInputSampleRate, aOutput);
     81 
     82  // Output gain, an encoder should set this field to zero (16 bits, signed,
     83  // little endian).
     84  SerializeToBuffer((int16_t)0, aOutput);
     85 
     86  // Channel mapping family. Family 0 allows only 1 or 2 channels (8 bits,
     87  // unsigned).
     88  aOutput->AppendElement(0);
     89 }
     90 
     91 static void SerializeOpusCommentHeader(const nsCString& aVendor,
     92                                       const nsTArray<nsCString>& aComments,
     93                                       nsTArray<uint8_t>* aOutput) {
     94  // The magic signature, null terminator has to be stripped off.
     95  constexpr uint8_t magic[] = "OpusTags";
     96  aOutput->AppendElements(magic, sizeof(magic) - 1);
     97 
     98  // The vendor; Should append in the following order:
     99  // vendor string length (32 bits, unsigned, little endian)
    100  // vendor string.
    101  SerializeToBuffer(aVendor, aOutput);
    102 
    103  // Add comments; Should append in the following order:
    104  // comment list length (32 bits, unsigned, little endian)
    105  // comment #0 string length (32 bits, unsigned, little endian)
    106  // comment #0 string
    107  // comment #1 string length (32 bits, unsigned, little endian)
    108  // comment #1 string ...
    109  SerializeToBuffer((uint32_t)aComments.Length(), aOutput);
    110  for (uint32_t i = 0; i < aComments.Length(); ++i) {
    111    SerializeToBuffer(aComments[i], aOutput);
    112  }
    113 }
    114 
    115 bool IsSampleRateSupported(TrackRate aSampleRate) {
    116  // According to www.opus-codec.org, creating an opus encoder requires the
    117  // sampling rate of source signal be one of 8000, 12000, 16000, 24000, or
    118  // 48000. If this constraint is not satisfied, we resample the input to 48kHz.
    119  AutoTArray<int, 5> supportedSamplingRates;
    120  supportedSamplingRates.AppendElements(
    121      kOpusSupportedInputSamplingRates,
    122      std::size(kOpusSupportedInputSamplingRates));
    123  return supportedSamplingRates.Contains(aSampleRate);
    124 }
    125 
    126 }  // Anonymous namespace.
    127 
    128 OpusTrackEncoder::OpusTrackEncoder(TrackRate aTrackRate,
    129                                   MediaQueue<EncodedFrame>& aEncodedDataQueue)
    130    : AudioTrackEncoder(aTrackRate, aEncodedDataQueue),
    131      mOutputSampleRate(IsSampleRateSupported(aTrackRate) ? aTrackRate
    132                                                          : kOpusSamplingRate),
    133      mEncoder(nullptr),
    134      mLookahead(0),
    135      mLookaheadWritten(0),
    136      mResampler(nullptr),
    137      mNumOutputFrames(0) {}
    138 
    139 OpusTrackEncoder::~OpusTrackEncoder() {
    140  if (mEncoder) {
    141    opus_encoder_destroy(mEncoder);
    142  }
    143  if (mResampler) {
    144    speex_resampler_destroy(mResampler);
    145    mResampler = nullptr;
    146  }
    147 }
    148 
    149 nsresult OpusTrackEncoder::Init(int aChannels) {
    150  NS_ENSURE_TRUE((aChannels <= MAX_SUPPORTED_AUDIO_CHANNELS) && (aChannels > 0),
    151                 NS_ERROR_FAILURE);
    152 
    153  // This version of encoder API only support 1 or 2 channels,
    154  // So set the mChannels less or equal 2 and
    155  // let InterleaveTrackData downmix pcm data.
    156  mChannels = aChannels > MAX_CHANNELS ? MAX_CHANNELS : aChannels;
    157 
    158  // Reject non-audio sample rates.
    159  NS_ENSURE_TRUE(mTrackRate >= 8000, NS_ERROR_INVALID_ARG);
    160  NS_ENSURE_TRUE(mTrackRate <= 192000, NS_ERROR_INVALID_ARG);
    161 
    162  if (NeedsResampler()) {
    163    int error;
    164    mResampler = speex_resampler_init(mChannels, mTrackRate, kOpusSamplingRate,
    165                                      SPEEX_RESAMPLER_QUALITY_DEFAULT, &error);
    166 
    167    if (error != RESAMPLER_ERR_SUCCESS) {
    168      return NS_ERROR_FAILURE;
    169    }
    170  }
    171 
    172  int error = 0;
    173  mEncoder = opus_encoder_create(mOutputSampleRate, mChannels,
    174                                 OPUS_APPLICATION_AUDIO, &error);
    175 
    176  if (error != OPUS_OK) {
    177    return NS_ERROR_FAILURE;
    178  }
    179 
    180  if (mAudioBitrate) {
    181    int bps = static_cast<int>(
    182        std::min<uint32_t>(mAudioBitrate, std::numeric_limits<int>::max()));
    183    error = opus_encoder_ctl(mEncoder, OPUS_SET_BITRATE(bps));
    184    if (error != OPUS_OK) {
    185      return NS_ERROR_FAILURE;
    186    }
    187  }
    188 
    189  // In the case of Opus we need to calculate the codec delay based on the
    190  // pre-skip. For more information see:
    191  // https://tools.ietf.org/html/rfc7845#section-4.2
    192  error = opus_encoder_ctl(mEncoder, OPUS_GET_LOOKAHEAD(&mLookahead));
    193  if (error != OPUS_OK) {
    194    mLookahead = 0;
    195    return NS_ERROR_FAILURE;
    196  }
    197 
    198  SetInitialized();
    199 
    200  return NS_OK;
    201 }
    202 
    203 int OpusTrackEncoder::GetLookahead() const {
    204  return mLookahead * kOpusSamplingRate / mOutputSampleRate;
    205 }
    206 
    207 int OpusTrackEncoder::NumInputFramesPerPacket() const {
    208  return mTrackRate * kFrameDurationMs / 1000;
    209 }
    210 
    211 int OpusTrackEncoder::NumOutputFramesPerPacket() const {
    212  return mOutputSampleRate * kFrameDurationMs / 1000;
    213 }
    214 
    215 bool OpusTrackEncoder::NeedsResampler() const {
    216  // A resampler is needed when mTrackRate is not supported by the opus encoder.
    217  // This is equivalent to !IsSampleRateSupported(mTrackRate) but less cycles.
    218  return mTrackRate != mOutputSampleRate &&
    219         mOutputSampleRate == kOpusSamplingRate;
    220 }
    221 
    222 already_AddRefed<TrackMetadataBase> OpusTrackEncoder::GetMetadata() {
    223  AUTO_PROFILER_LABEL("OpusTrackEncoder::GetMetadata", OTHER);
    224 
    225  MOZ_ASSERT(mInitialized);
    226 
    227  if (!mInitialized) {
    228    return nullptr;
    229  }
    230 
    231  RefPtr<OpusMetadata> meta = new OpusMetadata();
    232  meta->mChannels = mChannels;
    233  meta->mSamplingFrequency = mTrackRate;
    234 
    235  // Ogg and Webm timestamps are always sampled at 48k for Opus.
    236  SerializeOpusIdHeader(mChannels,
    237                        mLookahead * (kOpusSamplingRate / mOutputSampleRate),
    238                        mTrackRate, &meta->mIdHeader);
    239 
    240  nsCString vendor;
    241  vendor.AppendASCII(opus_get_version_string());
    242 
    243  nsTArray<nsCString> comments;
    244  comments.AppendElement(
    245      nsLiteralCString("ENCODER=Mozilla" MOZ_APP_UA_VERSION));
    246 
    247  SerializeOpusCommentHeader(vendor, comments, &meta->mCommentHeader);
    248 
    249  return meta.forget();
    250 }
    251 
    252 nsresult OpusTrackEncoder::Encode(AudioSegment* aSegment) {
    253  AUTO_PROFILER_LABEL("OpusTrackEncoder::Encode", OTHER);
    254 
    255  MOZ_ASSERT(aSegment);
    256  MOZ_ASSERT(mInitialized || mCanceled);
    257 
    258  if (mCanceled || IsEncodingComplete()) {
    259    return NS_ERROR_FAILURE;
    260  }
    261 
    262  if (!mInitialized) {
    263    // calculation below depends on the truth that mInitialized is true.
    264    return NS_ERROR_FAILURE;
    265  }
    266 
    267  int result = 0;
    268  // Loop until we run out of packets of input data
    269  while (result >= 0 && !IsEncodingComplete()) {
    270    // re-sampled frames left last time which didn't fit into an Opus packet
    271    // duration.
    272    const int framesLeft = mResampledLeftover.Length() / mChannels;
    273    MOZ_ASSERT(NumOutputFramesPerPacket() >= framesLeft);
    274    // Fetch input frames such that there will be n frames where (n +
    275    // framesLeft) >= NumOutputFramesPerPacket() after re-sampling.
    276    const int framesToFetch = NumInputFramesPerPacket() -
    277                              (framesLeft * mTrackRate / kOpusSamplingRate) +
    278                              (NeedsResampler() ? 1 : 0);
    279 
    280    if (!mEndOfStream && aSegment->GetDuration() < framesToFetch) {
    281      // Not enough raw data
    282      return NS_OK;
    283    }
    284 
    285    // Start encoding data.
    286    AutoTArray<AudioDataValue, 9600> pcm;
    287    pcm.SetLength(NumOutputFramesPerPacket() * mChannels);
    288 
    289    int frameCopied = 0;
    290 
    291    for (AudioSegment::ChunkIterator iter(*aSegment);
    292         !iter.IsEnded() && frameCopied < framesToFetch; iter.Next()) {
    293      AudioChunk chunk = *iter;
    294 
    295      // Chunk to the required frame size.
    296      TrackTime frameToCopy =
    297          std::min(chunk.GetDuration(),
    298                   static_cast<TrackTime>(framesToFetch - frameCopied));
    299 
    300      // Possible greatest value of framesToFetch = 3844: see
    301      // https://bugzilla.mozilla.org/show_bug.cgi?id=1349421#c8. frameToCopy
    302      // should not be able to exceed this value.
    303      MOZ_ASSERT(frameToCopy <= 3844, "frameToCopy exceeded expected range");
    304 
    305      if (!chunk.IsNull()) {
    306        // Append the interleaved data to the end of pcm buffer.
    307        AudioTrackEncoder::InterleaveTrackData(
    308            chunk, frameToCopy, mChannels,
    309            pcm.Elements() + frameCopied * mChannels);
    310      } else {
    311        CheckedInt<int> memsetLength =
    312            CheckedInt<int>(frameToCopy) * mChannels * sizeof(AudioDataValue);
    313        if (!memsetLength.isValid()) {
    314          // This should never happen, but we use a defensive check because
    315          // we really don't want a bad memset
    316          MOZ_ASSERT_UNREACHABLE("memsetLength invalid!");
    317          return NS_ERROR_FAILURE;
    318        }
    319        memset(pcm.Elements() + frameCopied * mChannels, 0,
    320               memsetLength.value());
    321      }
    322 
    323      frameCopied += frameToCopy;
    324    }
    325 
    326    // Possible greatest value of framesToFetch = 3844: see
    327    // https://bugzilla.mozilla.org/show_bug.cgi?id=1349421#c8. frameCopied
    328    // should not be able to exceed this value.
    329    MOZ_ASSERT(frameCopied <= 3844, "frameCopied exceeded expected range");
    330 
    331    int framesInPCM = frameCopied;
    332    if (mResampler) {
    333      AutoTArray<AudioDataValue, 9600> resamplingDest;
    334      uint32_t inframes = frameCopied;
    335      uint32_t outframes = inframes * kOpusSamplingRate / mTrackRate + 1;
    336 
    337      // We want to consume all the input data, so we slightly oversize the
    338      // resampled data buffer so we can fit the output data in. We cannot
    339      // really predict the output frame count at each call.
    340      resamplingDest.SetLength(outframes * mChannels);
    341 
    342      float* in = reinterpret_cast<float*>(pcm.Elements());
    343      float* out = reinterpret_cast<float*>(resamplingDest.Elements());
    344      speex_resampler_process_interleaved_float(mResampler, in, &inframes, out,
    345                                                &outframes);
    346 
    347      MOZ_ASSERT(pcm.Length() >= mResampledLeftover.Length());
    348      PodCopy(pcm.Elements(), mResampledLeftover.Elements(),
    349              mResampledLeftover.Length());
    350 
    351      uint32_t outframesToCopy = std::min(
    352          outframes,
    353          static_cast<uint32_t>(NumOutputFramesPerPacket() - framesLeft));
    354 
    355      MOZ_ASSERT(pcm.Length() - mResampledLeftover.Length() >=
    356                 outframesToCopy * mChannels);
    357      PodCopy(pcm.Elements() + mResampledLeftover.Length(),
    358              resamplingDest.Elements(), outframesToCopy * mChannels);
    359      int frameLeftover = outframes - outframesToCopy;
    360      mResampledLeftover.SetLength(frameLeftover * mChannels);
    361      PodCopy(mResampledLeftover.Elements(),
    362              resamplingDest.Elements() + outframesToCopy * mChannels,
    363              mResampledLeftover.Length());
    364      // This is always at 48000Hz.
    365      framesInPCM = framesLeft + outframesToCopy;
    366    }
    367 
    368    // Remove the raw data which has been pulled to pcm buffer.
    369    // The value of frameCopied should be equal to (or smaller than, if eos)
    370    // NumOutputFramesPerPacket().
    371    aSegment->RemoveLeading(frameCopied);
    372 
    373    // Has reached the end of input stream and all queued data has pulled for
    374    // encoding.
    375    bool isFinalPacket = false;
    376    if (aSegment->GetDuration() == 0 && mEndOfStream &&
    377        framesInPCM < NumOutputFramesPerPacket()) {
    378      // Pad |mLookahead| samples to the end of the track to prevent loss of
    379      // original data.
    380      const int toWrite = std::min(mLookahead - mLookaheadWritten,
    381                                   NumOutputFramesPerPacket() - framesInPCM);
    382      PodZero(pcm.Elements() + framesInPCM * mChannels, toWrite * mChannels);
    383      mLookaheadWritten += toWrite;
    384      framesInPCM += toWrite;
    385      if (mLookaheadWritten == mLookahead) {
    386        isFinalPacket = true;
    387      }
    388    }
    389 
    390    MOZ_ASSERT_IF(!isFinalPacket, framesInPCM == NumOutputFramesPerPacket());
    391 
    392    // Append null data to pcm buffer if the leftover data is not enough for
    393    // opus encoder.
    394    if (framesInPCM < NumOutputFramesPerPacket() && isFinalPacket) {
    395      PodZero(pcm.Elements() + framesInPCM * mChannels,
    396              (NumOutputFramesPerPacket() - framesInPCM) * mChannels);
    397    }
    398    auto frameData = MakeRefPtr<EncodedFrame::FrameData>();
    399    // Encode the data with Opus Encoder.
    400    frameData->SetLength(MAX_DATA_BYTES);
    401    // result is returned as opus error code if it is negative.
    402    result = 0;
    403    const float* pcmBuf = static_cast<float*>(pcm.Elements());
    404    result = opus_encode_float(mEncoder, pcmBuf, NumOutputFramesPerPacket(),
    405                               frameData->Elements(), MAX_DATA_BYTES);
    406    frameData->SetLength(result >= 0 ? result : 0);
    407 
    408    if (result < 0) {
    409      LOG("[Opus] Fail to encode data! Result: %s.", opus_strerror(result));
    410    }
    411    if (isFinalPacket) {
    412      if (mResampler) {
    413        speex_resampler_destroy(mResampler);
    414        mResampler = nullptr;
    415      }
    416      mResampledLeftover.SetLength(0);
    417    }
    418 
    419    // timestamp should be the time of the first sample
    420    mEncodedDataQueue.Push(MakeAndAddRef<EncodedFrame>(
    421        media::TimeUnit(mNumOutputFrames + mLookahead, mOutputSampleRate),
    422        static_cast<uint64_t>(framesInPCM) * kOpusSamplingRate /
    423            mOutputSampleRate,
    424        kOpusSamplingRate, EncodedFrame::OPUS_AUDIO_FRAME,
    425        std::move(frameData)));
    426 
    427    mNumOutputFrames += NumOutputFramesPerPacket();
    428    LOG("[Opus] mOutputTimeStamp %.3f.",
    429        media::TimeUnit(mNumOutputFrames, mOutputSampleRate).ToSeconds());
    430 
    431    if (isFinalPacket) {
    432      LOG("[Opus] Done encoding.");
    433      mEncodedDataQueue.Finish();
    434    }
    435  }
    436 
    437  return result >= 0 ? NS_OK : NS_ERROR_FAILURE;
    438 }
    439 
    440 }  // namespace mozilla
    441 
    442 #undef LOG