rtp_video_frame_assembler.cc (14613B)
1 /* 2 * Copyright (c) 2021 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "api/video/rtp_video_frame_assembler.h" 12 13 #include <cstdint> 14 #include <memory> 15 #include <optional> 16 #include <utility> 17 #include <vector> 18 19 #include "absl/container/inlined_vector.h" 20 #include "api/array_view.h" 21 #include "api/rtp_packet_infos.h" 22 #include "api/scoped_refptr.h" 23 #include "api/transport/rtp/dependency_descriptor.h" 24 #include "api/video/encoded_image.h" 25 #include "api/video/video_frame_type.h" 26 #include "api/video/video_timing.h" 27 #include "modules/rtp_rtcp/source/frame_object.h" 28 #include "modules/rtp_rtcp/source/rtp_dependency_descriptor_extension.h" 29 #include "modules/rtp_rtcp/source/rtp_generic_frame_descriptor.h" 30 #include "modules/rtp_rtcp/source/rtp_generic_frame_descriptor_extension.h" 31 #include "modules/rtp_rtcp/source/rtp_packet_received.h" 32 #include "modules/rtp_rtcp/source/rtp_video_header.h" 33 #include "modules/rtp_rtcp/source/video_rtp_depacketizer.h" 34 #include "modules/rtp_rtcp/source/video_rtp_depacketizer_av1.h" 35 #include "modules/rtp_rtcp/source/video_rtp_depacketizer_generic.h" 36 #include "modules/rtp_rtcp/source/video_rtp_depacketizer_h264.h" 37 #include "modules/rtp_rtcp/source/video_rtp_depacketizer_raw.h" 38 #include "modules/rtp_rtcp/source/video_rtp_depacketizer_vp8.h" 39 #include "modules/rtp_rtcp/source/video_rtp_depacketizer_vp9.h" 40 #include "modules/video_coding/packet_buffer.h" 41 #include "modules/video_coding/rtp_frame_reference_finder.h" 42 #include "rtc_base/checks.h" 43 #include "rtc_base/logging.h" 44 #include "rtc_base/numerics/sequence_number_unwrapper.h" 45 46 #ifdef RTC_ENABLE_H265 47 #include "modules/rtp_rtcp/source/video_rtp_depacketizer_h265.h" 48 #endif 49 50 namespace webrtc { 51 namespace { 52 std::unique_ptr<VideoRtpDepacketizer> CreateDepacketizer( 53 RtpVideoFrameAssembler::PayloadFormat payload_format) { 54 switch (payload_format) { 55 case RtpVideoFrameAssembler::kRaw: 56 return std::make_unique<VideoRtpDepacketizerRaw>(); 57 case RtpVideoFrameAssembler::kH264: 58 return std::make_unique<VideoRtpDepacketizerH264>(); 59 case RtpVideoFrameAssembler::kVp8: 60 return std::make_unique<VideoRtpDepacketizerVp8>(); 61 case RtpVideoFrameAssembler::kVp9: 62 return std::make_unique<VideoRtpDepacketizerVp9>(); 63 case RtpVideoFrameAssembler::kAv1: 64 return std::make_unique<VideoRtpDepacketizerAv1>(); 65 case RtpVideoFrameAssembler::kGeneric: 66 return std::make_unique<VideoRtpDepacketizerGeneric>(); 67 case RtpVideoFrameAssembler::kH265: 68 #ifdef RTC_ENABLE_H265 69 return std::make_unique<VideoRtpDepacketizerH265>(); 70 #else 71 return nullptr; 72 #endif 73 } 74 RTC_DCHECK_NOTREACHED(); 75 return nullptr; 76 } 77 } // namespace 78 79 class RtpVideoFrameAssembler::Impl { 80 public: 81 explicit Impl(std::unique_ptr<VideoRtpDepacketizer> depacketizer); 82 ~Impl() = default; 83 84 FrameVector InsertPacket(const RtpPacketReceived& packet); 85 86 private: 87 using RtpFrameVector = 88 absl::InlinedVector<std::unique_ptr<RtpFrameObject>, 3>; 89 90 RtpFrameVector AssembleFrames( 91 video_coding::PacketBuffer::InsertResult insert_result); 92 FrameVector FindReferences(RtpFrameVector frames); 93 FrameVector UpdateWithPadding(uint16_t seq_num); 94 bool ParseDependenciesDescriptorExtension(const RtpPacketReceived& rtp_packet, 95 RTPVideoHeader& video_header); 96 bool ParseGenericDescriptorExtension(const RtpPacketReceived& rtp_packet, 97 RTPVideoHeader& video_header); 98 void ClearOldData(uint16_t incoming_seq_num); 99 100 std::unique_ptr<FrameDependencyStructure> video_structure_; 101 SeqNumUnwrapper<uint16_t> rtp_sequence_number_unwrapper_; 102 SeqNumUnwrapper<uint16_t> frame_id_unwrapper_; 103 std::optional<int64_t> video_structure_frame_id_; 104 std::unique_ptr<VideoRtpDepacketizer> depacketizer_; 105 video_coding::PacketBuffer packet_buffer_; 106 RtpFrameReferenceFinder reference_finder_; 107 }; 108 109 RtpVideoFrameAssembler::Impl::Impl( 110 std::unique_ptr<VideoRtpDepacketizer> depacketizer) 111 : depacketizer_(std::move(depacketizer)), 112 packet_buffer_(/*start_buffer_size=*/2048, /*max_buffer_size=*/2048) {} 113 114 RtpVideoFrameAssembler::FrameVector RtpVideoFrameAssembler::Impl::InsertPacket( 115 const RtpPacketReceived& rtp_packet) { 116 if (rtp_packet.payload_size() == 0) { 117 ClearOldData(rtp_packet.SequenceNumber()); 118 return UpdateWithPadding(rtp_packet.SequenceNumber()); 119 } 120 121 std::optional<VideoRtpDepacketizer::ParsedRtpPayload> parsed_payload = 122 depacketizer_->Parse(rtp_packet.PayloadBuffer()); 123 124 if (parsed_payload == std::nullopt) { 125 return {}; 126 } 127 128 if (rtp_packet.HasExtension<RtpDependencyDescriptorExtension>()) { 129 if (!ParseDependenciesDescriptorExtension(rtp_packet, 130 parsed_payload->video_header)) { 131 return {}; 132 } 133 } else if (rtp_packet.HasExtension<RtpGenericFrameDescriptorExtension00>()) { 134 if (!ParseGenericDescriptorExtension(rtp_packet, 135 parsed_payload->video_header)) { 136 return {}; 137 } 138 } 139 140 parsed_payload->video_header.is_last_packet_in_frame |= rtp_packet.Marker(); 141 142 auto packet = std::make_unique<video_coding::PacketBuffer::Packet>( 143 rtp_packet, 144 rtp_sequence_number_unwrapper_.Unwrap(rtp_packet.SequenceNumber()), 145 parsed_payload->video_header); 146 packet->video_payload = std::move(parsed_payload->video_payload); 147 148 ClearOldData(rtp_packet.SequenceNumber()); 149 return FindReferences( 150 AssembleFrames(packet_buffer_.InsertPacket(std::move(packet)))); 151 } 152 153 void RtpVideoFrameAssembler::Impl::ClearOldData(uint16_t incoming_seq_num) { 154 constexpr uint16_t kOldSeqNumThreshold = 2000; 155 uint16_t old_seq_num = incoming_seq_num - kOldSeqNumThreshold; 156 packet_buffer_.ClearTo(old_seq_num); 157 reference_finder_.ClearTo(old_seq_num); 158 } 159 160 RtpVideoFrameAssembler::Impl::RtpFrameVector 161 RtpVideoFrameAssembler::Impl::AssembleFrames( 162 video_coding::PacketBuffer::InsertResult insert_result) { 163 video_coding::PacketBuffer::Packet* first_packet = nullptr; 164 std::vector<ArrayView<const uint8_t>> payloads; 165 RtpFrameVector result; 166 167 for (auto& packet : insert_result.packets) { 168 if (packet->is_first_packet_in_frame()) { 169 first_packet = packet.get(); 170 payloads.clear(); 171 } 172 payloads.emplace_back(packet->video_payload); 173 174 if (packet->is_last_packet_in_frame()) { 175 scoped_refptr<EncodedImageBuffer> bitstream = 176 depacketizer_->AssembleFrame(payloads); 177 178 if (!bitstream) { 179 continue; 180 } 181 182 const video_coding::PacketBuffer::Packet& last_packet = *packet; 183 result.push_back(std::make_unique<RtpFrameObject>( 184 first_packet->seq_num(), // 185 last_packet.seq_num(), // 186 last_packet.marker_bit, // 187 /*times_nacked=*/0, // 188 /*first_packet_received_time=*/0, // 189 /*last_packet_received_time=*/0, // 190 first_packet->timestamp, // 191 /*ntp_time_ms=*/0, // 192 /*timing=*/VideoSendTiming(), // 193 first_packet->payload_type, // 194 first_packet->codec(), // 195 last_packet.video_header.rotation, // 196 last_packet.video_header.content_type, // 197 first_packet->video_header, // 198 last_packet.video_header.color_space, // 199 last_packet.video_header.frame_instrumentation_data, // 200 /*packet_infos=*/RtpPacketInfos(), // 201 std::move(bitstream))); 202 } 203 } 204 205 return result; 206 } 207 208 RtpVideoFrameAssembler::FrameVector 209 RtpVideoFrameAssembler::Impl::FindReferences(RtpFrameVector frames) { 210 FrameVector res; 211 for (auto& frame : frames) { 212 auto complete_frames = reference_finder_.ManageFrame(std::move(frame)); 213 for (std::unique_ptr<RtpFrameObject>& complete_frame : complete_frames) { 214 uint16_t rtp_seq_num_start = complete_frame->first_seq_num(); 215 uint16_t rtp_seq_num_end = complete_frame->last_seq_num(); 216 res.emplace_back(rtp_seq_num_start, rtp_seq_num_end, 217 std::move(complete_frame)); 218 } 219 } 220 return res; 221 } 222 223 RtpVideoFrameAssembler::FrameVector 224 RtpVideoFrameAssembler::Impl::UpdateWithPadding(uint16_t seq_num) { 225 auto res = 226 FindReferences(AssembleFrames(packet_buffer_.InsertPadding(seq_num))); 227 auto ref_finder_update = reference_finder_.PaddingReceived(seq_num); 228 229 for (std::unique_ptr<RtpFrameObject>& complete_frame : ref_finder_update) { 230 uint16_t rtp_seq_num_start = complete_frame->first_seq_num(); 231 uint16_t rtp_seq_num_end = complete_frame->last_seq_num(); 232 res.emplace_back(rtp_seq_num_start, rtp_seq_num_end, 233 std::move(complete_frame)); 234 } 235 236 return res; 237 } 238 239 bool RtpVideoFrameAssembler::Impl::ParseDependenciesDescriptorExtension( 240 const RtpPacketReceived& rtp_packet, 241 RTPVideoHeader& video_header) { 242 DependencyDescriptor dependency_descriptor; 243 244 if (!rtp_packet.GetExtension<RtpDependencyDescriptorExtension>( 245 video_structure_.get(), &dependency_descriptor)) { 246 // Descriptor is either malformed, or the template referenced is not in 247 // the `video_structure_` currently being held. 248 // TODO(bugs.webrtc.org/10342): Improve packet reordering behavior. 249 RTC_LOG(LS_WARNING) << "ssrc: " << rtp_packet.Ssrc() 250 << " Failed to parse dependency descriptor."; 251 return false; 252 } 253 254 if (dependency_descriptor.attached_structure != nullptr && 255 !dependency_descriptor.first_packet_in_frame) { 256 RTC_LOG(LS_WARNING) << "ssrc: " << rtp_packet.Ssrc() 257 << "Invalid dependency descriptor: structure " 258 "attached to non first packet of a frame."; 259 return false; 260 } 261 262 video_header.is_first_packet_in_frame = 263 dependency_descriptor.first_packet_in_frame; 264 video_header.is_last_packet_in_frame = 265 dependency_descriptor.last_packet_in_frame; 266 267 int64_t frame_id = 268 frame_id_unwrapper_.Unwrap(dependency_descriptor.frame_number); 269 auto& generic_descriptor_info = video_header.generic.emplace(); 270 generic_descriptor_info.frame_id = frame_id; 271 generic_descriptor_info.spatial_index = 272 dependency_descriptor.frame_dependencies.spatial_id; 273 generic_descriptor_info.temporal_index = 274 dependency_descriptor.frame_dependencies.temporal_id; 275 276 for (int fdiff : dependency_descriptor.frame_dependencies.frame_diffs) { 277 generic_descriptor_info.dependencies.push_back(frame_id - fdiff); 278 } 279 for (int cdiff : dependency_descriptor.frame_dependencies.chain_diffs) { 280 generic_descriptor_info.chain_diffs.push_back(frame_id - cdiff); 281 } 282 generic_descriptor_info.decode_target_indications = 283 dependency_descriptor.frame_dependencies.decode_target_indications; 284 if (dependency_descriptor.resolution) { 285 video_header.width = dependency_descriptor.resolution->Width(); 286 video_header.height = dependency_descriptor.resolution->Height(); 287 } 288 if (dependency_descriptor.active_decode_targets_bitmask.has_value()) { 289 generic_descriptor_info.active_decode_targets = 290 *dependency_descriptor.active_decode_targets_bitmask; 291 } 292 293 // FrameDependencyStructure is sent in the dependency descriptor of the first 294 // packet of a key frame and is required to parse all subsequent packets until 295 // the next key frame. 296 if (dependency_descriptor.attached_structure) { 297 RTC_DCHECK(dependency_descriptor.first_packet_in_frame); 298 if (video_structure_frame_id_ > frame_id) { 299 RTC_LOG(LS_WARNING) 300 << "Arrived key frame with id " << frame_id << " and structure id " 301 << dependency_descriptor.attached_structure->structure_id 302 << " is older than the latest received key frame with id " 303 << *video_structure_frame_id_ << " and structure id " 304 << video_structure_->structure_id; 305 return false; 306 } 307 video_structure_ = std::move(dependency_descriptor.attached_structure); 308 video_structure_frame_id_ = frame_id; 309 video_header.frame_type = VideoFrameType::kVideoFrameKey; 310 } else { 311 video_header.frame_type = VideoFrameType::kVideoFrameDelta; 312 } 313 return true; 314 } 315 316 bool RtpVideoFrameAssembler::Impl::ParseGenericDescriptorExtension( 317 const RtpPacketReceived& rtp_packet, 318 RTPVideoHeader& video_header) { 319 RtpGenericFrameDescriptor generic_frame_descriptor; 320 if (!rtp_packet.GetExtension<RtpGenericFrameDescriptorExtension00>( 321 &generic_frame_descriptor)) { 322 return false; 323 } 324 325 video_header.is_first_packet_in_frame = 326 generic_frame_descriptor.FirstPacketInSubFrame(); 327 video_header.is_last_packet_in_frame = 328 generic_frame_descriptor.LastPacketInSubFrame(); 329 330 if (generic_frame_descriptor.FirstPacketInSubFrame()) { 331 video_header.frame_type = 332 generic_frame_descriptor.FrameDependenciesDiffs().empty() 333 ? VideoFrameType::kVideoFrameKey 334 : VideoFrameType::kVideoFrameDelta; 335 336 auto& generic_descriptor_info = video_header.generic.emplace(); 337 int64_t frame_id = 338 frame_id_unwrapper_.Unwrap(generic_frame_descriptor.FrameId()); 339 generic_descriptor_info.frame_id = frame_id; 340 generic_descriptor_info.spatial_index = 341 generic_frame_descriptor.SpatialLayer(); 342 generic_descriptor_info.temporal_index = 343 generic_frame_descriptor.TemporalLayer(); 344 for (uint16_t fdiff : generic_frame_descriptor.FrameDependenciesDiffs()) { 345 generic_descriptor_info.dependencies.push_back(frame_id - fdiff); 346 } 347 } 348 video_header.width = generic_frame_descriptor.Width(); 349 video_header.height = generic_frame_descriptor.Height(); 350 return true; 351 } 352 353 RtpVideoFrameAssembler::RtpVideoFrameAssembler(PayloadFormat payload_format) 354 : impl_(std::make_unique<Impl>(CreateDepacketizer(payload_format))) {} 355 356 RtpVideoFrameAssembler::~RtpVideoFrameAssembler() = default; 357 358 RtpVideoFrameAssembler::FrameVector RtpVideoFrameAssembler::InsertPacket( 359 const RtpPacketReceived& packet) { 360 return impl_->InsertPacket(packet); 361 } 362 363 } // namespace webrtc