audio_decoder.h (8169B)
1 /* 2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #ifndef API_AUDIO_CODECS_AUDIO_DECODER_H_ 12 #define API_AUDIO_CODECS_AUDIO_DECODER_H_ 13 14 #include <stddef.h> 15 #include <stdint.h> 16 17 #include <memory> 18 #include <optional> 19 #include <vector> 20 21 #include "api/array_view.h" 22 #include "api/audio/audio_view.h" 23 #include "rtc_base/buffer.h" 24 25 namespace webrtc { 26 27 class AudioDecoder { 28 public: 29 enum SpeechType { 30 kSpeech = 1, 31 kComfortNoise = 2, 32 }; 33 34 // Used by PacketDuration below. Save the value -1 for errors. 35 enum { kNotImplemented = -2 }; 36 37 AudioDecoder() = default; 38 virtual ~AudioDecoder() = default; 39 40 AudioDecoder(const AudioDecoder&) = delete; 41 AudioDecoder& operator=(const AudioDecoder&) = delete; 42 43 class EncodedAudioFrame { 44 public: 45 struct DecodeResult { 46 size_t num_decoded_samples; 47 SpeechType speech_type; 48 }; 49 50 virtual ~EncodedAudioFrame() = default; 51 52 // Returns the duration in samples-per-channel of this audio frame. 53 // If no duration can be ascertained, returns zero. 54 virtual size_t Duration() const = 0; 55 56 // Returns true if this packet contains DTX. 57 virtual bool IsDtxPacket() const; 58 59 // Decodes this frame of audio and writes the result in `decoded`. 60 // `decoded` must be large enough to store as many samples as indicated by a 61 // call to Duration() . On success, returns an std::optional containing the 62 // total number of samples across all channels, as well as whether the 63 // decoder produced comfort noise or speech. On failure, returns an empty 64 // std::optional. Decode may be called at most once per frame object. 65 virtual std::optional<DecodeResult> Decode( 66 ArrayView<int16_t> decoded) const = 0; 67 }; 68 69 struct ParseResult { 70 ParseResult(); 71 ParseResult(uint32_t timestamp, 72 int priority, 73 std::unique_ptr<EncodedAudioFrame> frame); 74 ParseResult(ParseResult&& b); 75 ~ParseResult(); 76 77 ParseResult& operator=(ParseResult&& b); 78 79 // The timestamp of the frame is in samples per channel. 80 uint32_t timestamp; 81 // The relative priority of the frame compared to other frames of the same 82 // payload and the same timeframe. A higher value means a lower priority. 83 // The highest priority is zero - negative values are not allowed. 84 int priority; 85 std::unique_ptr<EncodedAudioFrame> frame; 86 }; 87 88 // Let the decoder parse this payload and prepare zero or more decodable 89 // frames. Each frame must be between 10 ms and 120 ms long. The caller must 90 // ensure that the AudioDecoder object outlives any frame objects returned by 91 // this call. The decoder is free to swap or move the data from the `payload` 92 // buffer. `timestamp` is the input timestamp, in samples, corresponding to 93 // the start of the payload. 94 virtual std::vector<ParseResult> ParsePayload(Buffer&& payload, 95 uint32_t timestamp); 96 97 // TODO(bugs.webrtc.org/10098): The Decode and DecodeRedundant methods are 98 // obsolete; callers should call ParsePayload instead. For now, subclasses 99 // must still implement DecodeInternal. 100 101 // Decodes `encode_len` bytes from `encoded` and writes the result in 102 // `decoded`. The maximum bytes allowed to be written into `decoded` is 103 // `max_decoded_bytes`. Returns the total number of samples across all 104 // channels. If the decoder produced comfort noise, `speech_type` 105 // is set to kComfortNoise, otherwise it is kSpeech. The desired output 106 // sample rate is provided in `sample_rate_hz`, which must be valid for the 107 // codec at hand. 108 int Decode(const uint8_t* encoded, 109 size_t encoded_len, 110 int sample_rate_hz, 111 size_t max_decoded_bytes, 112 int16_t* decoded, 113 SpeechType* speech_type); 114 115 // Same as Decode(), but interfaces to the decoders redundant decode function. 116 // The default implementation simply calls the regular Decode() method. 117 int DecodeRedundant(const uint8_t* encoded, 118 size_t encoded_len, 119 int sample_rate_hz, 120 size_t max_decoded_bytes, 121 int16_t* decoded, 122 SpeechType* speech_type); 123 124 // Indicates if the decoder implements the DecodePlc method. 125 virtual bool HasDecodePlc() const; 126 127 // Calls the packet-loss concealment of the decoder to update the state after 128 // one or several lost packets. The caller has to make sure that the 129 // memory allocated in `decoded` should accommodate `num_frames` frames. 130 virtual size_t DecodePlc(size_t num_frames, int16_t* decoded); 131 132 // Asks the decoder to generate packet-loss concealment and append it to the 133 // end of `concealment_audio`. The concealment audio should be in 134 // channel-interleaved format, with as many channels as the last decoded 135 // packet produced. The implementation must produce at least 136 // requested_samples_per_channel, or nothing at all. This is a signal to the 137 // caller to conceal the loss with other means. If the implementation provides 138 // concealment samples, it is also responsible for "stitching" it together 139 // with the decoded audio on either side of the concealment. 140 // Note: The default implementation of GeneratePlc will be deleted soon. All 141 // implementations must provide their own, which can be a simple as a no-op. 142 // TODO(bugs.webrtc.org/9676): Remove default implementation. 143 virtual void GeneratePlc(size_t requested_samples_per_channel, 144 BufferT<int16_t>* concealment_audio); 145 146 // Resets the decoder state (empty buffers etc.). 147 virtual void Reset() = 0; 148 149 // Returns the last error code from the decoder. 150 virtual int ErrorCode(); 151 152 // Returns the duration in samples-per-channel of the payload in `encoded` 153 // which is `encoded_len` bytes long. Returns kNotImplemented if no duration 154 // estimate is available, or -1 in case of an error. 155 virtual int PacketDuration(const uint8_t* encoded, size_t encoded_len) const; 156 157 // Returns the duration in samples-per-channel of the redandant payload in 158 // `encoded` which is `encoded_len` bytes long. Returns kNotImplemented if no 159 // duration estimate is available, or -1 in case of an error. 160 virtual int PacketDurationRedundant(const uint8_t* encoded, 161 size_t encoded_len) const; 162 163 // Detects whether a packet has forward error correction. The packet is 164 // comprised of the samples in `encoded` which is `encoded_len` bytes long. 165 // Returns true if the packet has FEC and false otherwise. 166 virtual bool PacketHasFec(const uint8_t* encoded, size_t encoded_len) const; 167 168 // Returns the actual sample rate of the decoder's output. This value may not 169 // change during the lifetime of the decoder. 170 virtual int SampleRateHz() const = 0; 171 172 // The number of channels in the decoder's output. This value may not change 173 // during the lifetime of the decoder. 174 virtual size_t Channels() const = 0; 175 176 // The maximum number of audio channels supported by WebRTC decoders. 177 static constexpr int kMaxNumberOfChannels = kMaxNumberOfAudioChannels; 178 179 protected: 180 static SpeechType ConvertSpeechType(int16_t type); 181 182 virtual int DecodeInternal(const uint8_t* encoded, 183 size_t encoded_len, 184 int sample_rate_hz, 185 int16_t* decoded, 186 SpeechType* speech_type) = 0; 187 188 virtual int DecodeRedundantInternal(const uint8_t* encoded, 189 size_t encoded_len, 190 int sample_rate_hz, 191 int16_t* decoded, 192 SpeechType* speech_type); 193 }; 194 195 } // namespace webrtc 196 #endif // API_AUDIO_CODECS_AUDIO_DECODER_H_