rtp_vp9_ref_finder.cc (13420B)
/*
 *  Copyright (c) 2020 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "modules/video_coding/rtp_vp9_ref_finder.h"

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <utility>

#include "api/video/encoded_frame.h"
#include "api/video/video_codec_constants.h"
#include "api/video/video_frame_type.h"
#include "modules/rtp_rtcp/source/frame_object.h"
#include "modules/video_coding/codecs/interface/common_constants.h"
#include "modules/video_coding/codecs/vp9/include/vp9_globals.h"
#include "modules/video_coding/rtp_frame_reference_finder.h"
#include "rtc_base/checks.h"
#include "rtc_base/logging.h"
#include "rtc_base/numerics/mod_ops.h"
#include "rtc_base/numerics/sequence_number_util.h"

namespace webrtc {
// Entry point for a newly assembled VP9 frame. Copies spatial/temporal
// indices and the (wrapped) picture id from the RTP VP9 header onto the
// frame, then decides whether the frame can be handed off to the caller,
// must be stashed until the information it depends on arrives (non-flexible
// mode only), or must be dropped. On hand-off, previously stashed frames
// are retried since this frame may have unblocked them.
RtpFrameReferenceFinder::ReturnVector RtpVp9RefFinder::ManageFrame(
    std::unique_ptr<RtpFrameObject> frame) {
  const RTPVideoHeaderVP9& codec_header =
      std::get<RTPVideoHeaderVP9>(frame->GetRtpVideoHeader().video_type_header);

  if (codec_header.temporal_idx != kNoTemporalIdx)
    frame->SetTemporalIndex(codec_header.temporal_idx);
  frame->SetSpatialIndex(codec_header.spatial_idx);
  // Picture id is kept modulo kFrameIdLength; unwrapping to a full 64-bit id
  // happens later in FlattenFrameIdAndRefs.
  frame->SetId(codec_header.picture_id & (kFrameIdLength - 1));

  FrameDecision decision;
  if (codec_header.temporal_idx >= kMaxTemporalLayers ||
      codec_header.spatial_idx >= kMaxSpatialLayers) {
    // Out-of-range layer indices: the header is malformed or unsupported.
    decision = kDrop;
  } else if (codec_header.flexible_mode) {
    decision = ManageFrameFlexible(frame.get(), codec_header);
  } else {
    if (codec_header.tl0_pic_idx == kNoTl0PicIdx) {
      RTC_LOG(LS_WARNING) << "TL0PICIDX is expected to be present in "
                             "non-flexible mode.";
      decision = kDrop;
    } else {
      // TL0PICIDX is an 8-bit rolling counter; unwrap it so that map lookups
      // in gof_info_ work across wrap-around.
      int64_t unwrapped_tl0 =
          tl0_unwrapper_.Unwrap(codec_header.tl0_pic_idx & 0xFF);
      decision = ManageFrameGof(frame.get(), codec_header, unwrapped_tl0);

      if (decision == kStash) {
        // Bound the stash; evict the oldest entry (back of the deque)
        // before pushing the newest to the front.
        if (stashed_frames_.size() > kMaxStashedFrames) {
          stashed_frames_.pop_back();
        }

        stashed_frames_.push_front(
            {.unwrapped_tl0 = unwrapped_tl0, .frame = std::move(frame)});
      }
    }
  }

  RtpFrameReferenceFinder::ReturnVector res;
  switch (decision) {
    case kStash:
      // Frame ownership was moved into stashed_frames_ above; nothing to
      // return yet.
      return res;
    case kHandOff:
      res.push_back(std::move(frame));
      // This frame may be the one a stashed frame was waiting for.
      RetryStashedFrames(res);
      return res;
    case kDrop:
      return res;
  }

  return res;
}

// Flexible mode: the header carries explicit picture-id diffs for every
// reference, so references can be resolved immediately (no GOF state and
// no stashing needed). Drops the frame if it declares more references
// than an EncodedFrame can hold.
RtpVp9RefFinder::FrameDecision RtpVp9RefFinder::ManageFrameFlexible(
    RtpFrameObject* frame,
    const RTPVideoHeaderVP9& codec_header) {
  if (codec_header.num_ref_pics > EncodedFrame::kMaxFrameReferences) {
    return kDrop;
  }

  frame->num_references = codec_header.num_ref_pics;
  for (size_t i = 0; i < frame->num_references; ++i) {
    // References are given as backward diffs from this frame's picture id,
    // modulo kFrameIdLength.
    frame->references[i] =
        Subtract<kFrameIdLength>(frame->Id(), codec_header.pid_diff[i]);
  }

  FlattenFrameIdAndRefs(frame, codec_header.inter_layer_predicted);
  return kHandOff;
}

// Non-flexible mode: references are derived from the group-of-frames (GOF)
// scalability structure that was signalled on an earlier base-layer frame.
// Returns kStash when the GOF info for `unwrapped_tl0` has not arrived yet
// or a required earlier frame is still missing, kDrop on malformed data,
// and kHandOff once the frame's references have been populated.
RtpVp9RefFinder::FrameDecision RtpVp9RefFinder::ManageFrameGof(
    RtpFrameObject* frame,
    const RTPVideoHeaderVP9& codec_header,
    int64_t unwrapped_tl0) {
  GofInfo* info;
  if (codec_header.ss_data_available) {
    if (codec_header.temporal_idx != 0) {
      RTC_LOG(LS_WARNING) << "Received scalability structure on a non base "
                             "layer frame. Scalability structure ignored.";
    } else {
      // Validate the signalled GOF before storing it.
      if (codec_header.gof.num_frames_in_gof > kMaxVp9FramesInGof) {
        return kDrop;
      }

      for (size_t i = 0; i < codec_header.gof.num_frames_in_gof; ++i) {
        if (codec_header.gof.num_ref_pics[i] > kMaxVp9RefPics) {
          return kDrop;
        }
      }

      GofInfoVP9 gof = codec_header.gof;
      if (gof.num_frames_in_gof == 0) {
        RTC_LOG(LS_WARNING) << "Number of frames in GOF is zero. Assume "
                               "that stream has only one temporal layer.";
        gof.SetGofInfoVP9(kTemporalStructureMode1);
      }

      // Store the structure in a fixed-size ring buffer of kMaxGofSaved
      // slots and anchor it at this frame's picture id.
      current_ss_idx_ = Add<kMaxGofSaved>(current_ss_idx_, 1);
      scalability_structures_[current_ss_idx_] = gof;
      scalability_structures_[current_ss_idx_].pid_start = frame->Id();
      gof_info_.emplace(
          unwrapped_tl0,
          GofInfo(&scalability_structures_[current_ss_idx_], frame->Id()));
    }

    const auto gof_info_it = gof_info_.find(unwrapped_tl0);
    if (gof_info_it == gof_info_.end())
      return kStash;

    info = &gof_info_it->second;

    if (frame->frame_type() == VideoFrameType::kVideoFrameKey) {
      // Keyframes reference nothing; hand off right away.
      frame->num_references = 0;
      FrameReceivedVp9(frame->Id(), info);
      FlattenFrameIdAndRefs(frame, codec_header.inter_layer_predicted);
      return kHandOff;
    }
  } else {
    if (frame->frame_type() == VideoFrameType::kVideoFrameKey) {
      RTC_LOG(LS_WARNING) << "Received keyframe without scalability structure";
      return kDrop;
    }

    // tl0_idx is incremented on temporal_idx=0 frames of the lowest spatial
    // layer (which spatial_idx is not necessarily zero). Upper spatial layer
    // frames with inter-layer prediction use GOF info of their base spatial
    // layer frames.
    const bool use_prev_gof =
        codec_header.temporal_idx == 0 && !codec_header.inter_layer_predicted;
    auto gof_info_it =
        gof_info_.find(use_prev_gof ? unwrapped_tl0 - 1 : unwrapped_tl0);

    // Gof info for this frame is not available yet, stash this frame.
    if (gof_info_it == gof_info_.end())
      return kStash;

    if (codec_header.temporal_idx == 0) {
      // Start a new gof_info_ entry for this TL0 period, reusing the same
      // underlying GOF structure but re-anchored at this frame.
      gof_info_it = gof_info_
                        .emplace(unwrapped_tl0,
                                 GofInfo(gof_info_it->second.gof, frame->Id()))
                        .first;
    }

    info = &gof_info_it->second;
  }

  // Clean up info for base layers that are too old.
  int64_t old_tl0_pic_idx = unwrapped_tl0 - kMaxGofSaved;
  auto clean_gof_info_to = gof_info_.lower_bound(old_tl0_pic_idx);
  gof_info_.erase(gof_info_.begin(), clean_gof_info_to);

  FrameReceivedVp9(frame->Id(), info);

  // Make sure we don't miss any frame that could potentially have the
  // up switch flag set.
  if (MissingRequiredFrameVp9(frame->Id(), *info))
    return kStash;

  if (codec_header.temporal_up_switch)
    up_switch_.emplace(frame->Id(), codec_header.temporal_idx);

  // Clean out old info about up switch frames.
  uint16_t old_picture_id = Subtract<kFrameIdLength>(frame->Id(), 50);
  auto up_switch_erase_to = up_switch_.lower_bound(old_picture_id);
  up_switch_.erase(up_switch_.begin(), up_switch_erase_to);

  if (codec_header.inter_pic_predicted) {
    // Position of this frame within the (repeating) GOF, measured from the
    // picture id the structure was anchored at.
    size_t diff = ForwardDiff<uint16_t, kFrameIdLength>(info->gof->pid_start,
                                                        frame->Id());
    size_t gof_idx = diff % info->gof->num_frames_in_gof;

    if (info->gof->num_ref_pics[gof_idx] > EncodedFrame::kMaxFrameReferences) {
      return kDrop;
    }

    // Populate references according to the scalability structure.
    frame->num_references = info->gof->num_ref_pics[gof_idx];
    for (size_t i = 0; i < frame->num_references; ++i) {
      frame->references[i] = Subtract<kFrameIdLength>(
          frame->Id(), info->gof->pid_diff[gof_idx][i]);

      // If this is a reference to a frame earlier than the last up switch
      // point, then ignore this reference.
      if (UpSwitchInIntervalVp9(frame->Id(), codec_header.temporal_idx,
                                frame->references[i])) {
        --frame->num_references;
      }
    }
  } else {
    frame->num_references = 0;
  }

  FlattenFrameIdAndRefs(frame, codec_header.inter_layer_predicted);
  return kHandOff;
}

// Returns true if, for any reference of `picture_id` per the GOF, a frame in
// a lower temporal layer within the interval (ref_pid, picture_id) is still
// missing — in which case the caller should stash the frame rather than risk
// skipping a frame that might carry an up-switch flag.
bool RtpVp9RefFinder::MissingRequiredFrameVp9(uint16_t picture_id,
                                              const GofInfo& info) {
  size_t diff =
      ForwardDiff<uint16_t, kFrameIdLength>(info.gof->pid_start, picture_id);
  size_t gof_idx = diff % info.gof->num_frames_in_gof;
  size_t temporal_idx = info.gof->temporal_idx[gof_idx];

  if (temporal_idx >= kMaxTemporalLayers) {
    RTC_LOG(LS_WARNING) << "At most " << kMaxTemporalLayers
                        << " temporal "
                           "layers are supported.";
    // Treat a malformed GOF entry as "required frame missing" so the frame
    // is not handed off with bogus references.
    return true;
  }

  // For every reference this frame has, check if there is a frame missing in
  // the interval (`ref_pid`, `picture_id`) in any of the lower temporal
  // layers. If so, we are missing a required frame.
  uint8_t num_references = info.gof->num_ref_pics[gof_idx];
  for (size_t i = 0; i < num_references; ++i) {
    uint16_t ref_pid =
        Subtract<kFrameIdLength>(picture_id, info.gof->pid_diff[gof_idx][i]);
    for (size_t l = 0; l < temporal_idx; ++l) {
      auto missing_frame_it = missing_frames_for_layer_[l].lower_bound(ref_pid);
      if (missing_frame_it != missing_frames_for_layer_[l].end() &&
          AheadOf<uint16_t, kFrameIdLength>(picture_id, *missing_frame_it)) {
        return true;
      }
    }
  }
  return false;
}

// Updates per-temporal-layer missing-frame bookkeeping for `picture_id`:
// any gap between the last seen picture id and this one is recorded as
// missing frames (attributed to their temporal layer via the GOF), while a
// frame that fills an earlier gap is removed from the missing set.
void RtpVp9RefFinder::FrameReceivedVp9(uint16_t picture_id, GofInfo* info) {
  int last_picture_id = info->last_picture_id;
  size_t gof_size = std::min(info->gof->num_frames_in_gof, kMaxVp9FramesInGof);

  // If there is a gap, find which temporal layer the missing frames
  // belong to and add the frame as missing for that temporal layer.
  // Otherwise, remove this frame from the set of missing frames.
  if (AheadOf<uint16_t, kFrameIdLength>(picture_id, last_picture_id)) {
    size_t diff = ForwardDiff<uint16_t, kFrameIdLength>(info->gof->pid_start,
                                                        last_picture_id);
    size_t gof_idx = diff % gof_size;

    last_picture_id = Add<kFrameIdLength>(last_picture_id, 1);
    while (last_picture_id != picture_id) {
      gof_idx = (gof_idx + 1) % gof_size;
      RTC_CHECK(gof_idx < kMaxVp9FramesInGof);

      size_t temporal_idx = info->gof->temporal_idx[gof_idx];
      if (temporal_idx >= kMaxTemporalLayers) {
        RTC_LOG(LS_WARNING) << "At most " << kMaxTemporalLayers
                            << " temporal "
                               "layers are supported.";
        return;
      }

      missing_frames_for_layer_[temporal_idx].insert(last_picture_id);
      last_picture_id = Add<kFrameIdLength>(last_picture_id, 1);
    }

    info->last_picture_id = last_picture_id;
  } else {
    // Frame is at or behind the last seen id: it may fill a recorded gap.
    size_t diff =
        ForwardDiff<uint16_t, kFrameIdLength>(info->gof->pid_start, picture_id);
    size_t gof_idx = diff % gof_size;
    RTC_CHECK(gof_idx < kMaxVp9FramesInGof);

    size_t temporal_idx = info->gof->temporal_idx[gof_idx];
    if (temporal_idx >= kMaxTemporalLayers) {
      RTC_LOG(LS_WARNING) << "At most " << kMaxTemporalLayers
                          << " temporal "
                             "layers are supported.";
      return;
    }

    missing_frames_for_layer_[temporal_idx].erase(picture_id);
  }
}

// Returns true if a frame with an up-switch flag on a temporal layer lower
// than `temporal_idx` lies strictly between `pid_ref` and `picture_id`
// (modulo wrap-around). Such a reference should be dropped, since decoding
// can resume from the up-switch point.
bool RtpVp9RefFinder::UpSwitchInIntervalVp9(uint16_t picture_id,
                                            uint8_t temporal_idx,
                                            uint16_t pid_ref) {
  for (auto up_switch_it = up_switch_.upper_bound(pid_ref);
       up_switch_it != up_switch_.end() &&
       AheadOf<uint16_t, kFrameIdLength>(picture_id, up_switch_it->first);
       ++up_switch_it) {
    if (up_switch_it->second < temporal_idx)
      return true;
  }

  return false;
}

// Re-runs ManageFrameGof on every stashed frame, appending completed frames
// to `res` and erasing them (or dropped frames) from the stash. Loops until
// a full pass completes no frame, since each hand-off can unblock others.
void RtpVp9RefFinder::RetryStashedFrames(
    RtpFrameReferenceFinder::ReturnVector& res) {
  bool complete_frame = false;
  do {
    complete_frame = false;
    for (auto it = stashed_frames_.begin(); it != stashed_frames_.end();) {
      const RTPVideoHeaderVP9& codec_header = std::get<RTPVideoHeaderVP9>(
          it->frame->GetRtpVideoHeader().video_type_header);
      // Flexible-mode frames are never stashed (see ManageFrame).
      RTC_DCHECK(!codec_header.flexible_mode);
      FrameDecision decision =
          ManageFrameGof(it->frame.get(), codec_header, it->unwrapped_tl0);

      switch (decision) {
        case kStash:
          ++it;
          break;
        case kHandOff:
          complete_frame = true;
          res.push_back(std::move(it->frame));
          [[fallthrough]];
        case kDrop:
          it = stashed_frames_.erase(it);
      }
    }
  } while (complete_frame);
}

// Converts the frame's wrapped picture id (and each reference) into a
// globally unique, monotonically increasing id that also encodes the spatial
// layer: unwrapped_id * kMaxSpatialLayers + spatial_idx. When inter-layer
// prediction is used, the frame in the spatial layer below (id - 1) is
// appended as an extra reference, capacity permitting.
void RtpVp9RefFinder::FlattenFrameIdAndRefs(RtpFrameObject* frame,
                                            bool inter_layer_predicted) {
  for (size_t i = 0; i < frame->num_references; ++i) {
    frame->references[i] =
        unwrapper_.Unwrap(frame->references[i]) * kMaxSpatialLayers +
        *frame->SpatialIndex();
  }
  frame->SetId(unwrapper_.Unwrap(frame->Id()) * kMaxSpatialLayers +
               *frame->SpatialIndex());

  if (inter_layer_predicted &&
      frame->num_references + 1 <= EncodedFrame::kMaxFrameReferences) {
    frame->references[frame->num_references] = frame->Id() - 1;
    ++frame->num_references;
  }
}

// Drops every stashed frame whose first RTP sequence number is older than
// `seq_num` (modulo wrap-around); such frames can no longer be completed.
void RtpVp9RefFinder::ClearTo(uint16_t seq_num) {
  auto it = stashed_frames_.begin();
  while (it != stashed_frames_.end()) {
    if (AheadOf<uint16_t>(seq_num, it->frame->first_seq_num())) {
      it = stashed_frames_.erase(it);
    } else {
      ++it;
    }
  }
}

}  // namespace webrtc