ethread.c (132349B)
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <stdbool.h>

#include "aom_util/aom_pthread.h"

#include "av1/common/warped_motion.h"
#include "av1/common/thread_common.h"

#include "av1/encoder/allintra_vis.h"
#include "av1/encoder/bitstream.h"
#include "av1/encoder/enc_enums.h"
#include "av1/encoder/encodeframe.h"
#include "av1/encoder/encodeframe_utils.h"
#include "av1/encoder/encoder.h"
#include "av1/encoder/encoder_alloc.h"
#include "av1/encoder/ethread.h"
#if !CONFIG_REALTIME_ONLY
#include "av1/encoder/firstpass.h"
#endif
#include "av1/encoder/global_motion.h"
#include "av1/encoder/global_motion_facade.h"
#include "av1/encoder/intra_mode_search_utils.h"
#include "av1/encoder/picklpf.h"
#include "av1/encoder/rdopt.h"
#include "aom_dsp/aom_dsp_common.h"
#include "av1/encoder/temporal_filter.h"
#include "av1/encoder/tpl_model.h"

static inline void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
  td->rd_counts.compound_ref_used_flag |=
      td_t->rd_counts.compound_ref_used_flag;
  td->rd_counts.skip_mode_used_flag |= td_t->rd_counts.skip_mode_used_flag;

  for (int i = 0; i < TX_SIZES_ALL; i++) {
    for (int j = 0; j < TX_TYPES; j++)
      td->rd_counts.tx_type_used[i][j] += td_t->rd_counts.tx_type_used[i][j];
  }

  for (int i = 0; i < BLOCK_SIZES_ALL; i++) {
    for (int j = 0; j < 2; j++) {
      td->rd_counts.obmc_used[i][j] += td_t->rd_counts.obmc_used[i][j];
    }
  }

  for (int i = 0; i < 2; i++) {
    td->rd_counts.warped_used[i] += td_t->rd_counts.warped_used[i];
  }

  td->rd_counts.seg_tmp_pred_cost[0] += td_t->rd_counts.seg_tmp_pred_cost[0];
  td->rd_counts.seg_tmp_pred_cost[1] += td_t->rd_counts.seg_tmp_pred_cost[1];

  td->rd_counts.newmv_or_intra_blocks += td_t->rd_counts.newmv_or_intra_blocks;
}

static inline void update_delta_lf_for_row_mt(AV1_COMP *cpi) {
  AV1_COMMON *cm = &cpi->common;
  MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
  const int mib_size = cm->seq_params->mib_size;
  const int frame_lf_count =
      av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
  for (int row = 0; row < cm->tiles.rows; row++) {
    for (int col = 0; col < cm->tiles.cols; col++) {
      TileDataEnc *tile_data = &cpi->tile_data[row * cm->tiles.cols + col];
      const TileInfo *const tile_info = &tile_data->tile_info;
      for (int mi_row = tile_info->mi_row_start;
           mi_row < tile_info->mi_row_end; mi_row += mib_size) {
        if (mi_row == tile_info->mi_row_start)
          av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
        for (int mi_col = tile_info->mi_col_start;
             mi_col < tile_info->mi_col_end; mi_col += mib_size) {
          const int idx_str = cm->mi_params.mi_stride * mi_row + mi_col;
          MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + idx_str;
          MB_MODE_INFO *mbmi = mi[0];
          if (mbmi->skip_txfm == 1 &&
              (mbmi->bsize == cm->seq_params->sb_size)) {
            for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id)
              mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id];
            mbmi->delta_lf_from_base = xd->delta_lf_from_base;
          } else {
            if (cm->delta_q_info.delta_lf_multi) {
              for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id)
                xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id];
            } else {
              xd->delta_lf_from_base = mbmi->delta_lf_from_base;
            }
          }
        }
      }
    }
  }
}

void av1_row_mt_sync_read_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r,
                                int c) {
  (void)row_mt_sync;
  (void)r;
  (void)c;
}

void av1_row_mt_sync_write_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r,
                                 int c, int cols) {
  (void)row_mt_sync;
  (void)r;
  (void)c;
  (void)cols;
}

void av1_row_mt_sync_read(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c) {
#if CONFIG_MULTITHREAD
  const int nsync = row_mt_sync->sync_range;

  if (r) {
    pthread_mutex_t *const mutex = &row_mt_sync->mutex_[r - 1];
    pthread_mutex_lock(mutex);

    while (c > row_mt_sync->num_finished_cols[r - 1] - nsync -
                   row_mt_sync->intrabc_extra_top_right_sb_delay) {
      pthread_cond_wait(&row_mt_sync->cond_[r - 1], mutex);
    }
    pthread_mutex_unlock(mutex);
  }
#else
  (void)row_mt_sync;
  (void)r;
  (void)c;
#endif  // CONFIG_MULTITHREAD
}

void av1_row_mt_sync_write(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c,
                           int cols) {
#if CONFIG_MULTITHREAD
  const int nsync = row_mt_sync->sync_range;
  int cur;
  // Only signal when there are enough encoded blocks for the next row to run.
  int sig = 1;

  if (c < cols - 1) {
    cur = c;
    if (c % nsync) sig = 0;
  } else {
    cur = cols + nsync + row_mt_sync->intrabc_extra_top_right_sb_delay;
  }

  if (sig) {
    pthread_mutex_lock(&row_mt_sync->mutex_[r]);

    // When a thread encounters an error, num_finished_cols[r] is set to the
    // maximum column number. In this case, the AOMMAX operation here ensures
    // that num_finished_cols[r] is not overwritten with a smaller value, thus
    // preventing the infinite waiting of threads in the relevant sync_read()
    // function.
    row_mt_sync->num_finished_cols[r] =
        AOMMAX(row_mt_sync->num_finished_cols[r], cur);

    pthread_cond_signal(&row_mt_sync->cond_[r]);
    pthread_mutex_unlock(&row_mt_sync->mutex_[r]);
  }
#else
  (void)row_mt_sync;
  (void)r;
  (void)c;
  (void)cols;
#endif  // CONFIG_MULTITHREAD
}
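
// Worked example of the handshake above (values are illustrative): with
// sync_range == 1 and intrabc_extra_top_right_sb_delay == 0, a thread in
// av1_row_mt_sync_read() at (row r, col c) blocks until
// num_finished_cols[r - 1] >= c + 1, i.e. until the top and top-right
// superblocks of row r - 1 have been published by av1_row_mt_sync_write().
// A nonzero intraBC delay simply pushes that threshold further to the right
// before row r is allowed to proceed.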

// Allocate memory for row synchronization.
static void row_mt_sync_mem_alloc(AV1EncRowMultiThreadSync *row_mt_sync,
                                  AV1_COMMON *cm, int rows) {
#if CONFIG_MULTITHREAD
  int i;

  CHECK_MEM_ERROR(cm, row_mt_sync->mutex_,
                  aom_malloc(sizeof(*row_mt_sync->mutex_) * rows));
  if (row_mt_sync->mutex_) {
    for (i = 0; i < rows; ++i) {
      pthread_mutex_init(&row_mt_sync->mutex_[i], NULL);
    }
  }

  CHECK_MEM_ERROR(cm, row_mt_sync->cond_,
                  aom_malloc(sizeof(*row_mt_sync->cond_) * rows));
  if (row_mt_sync->cond_) {
    for (i = 0; i < rows; ++i) {
      pthread_cond_init(&row_mt_sync->cond_[i], NULL);
    }
  }
#endif  // CONFIG_MULTITHREAD

  CHECK_MEM_ERROR(cm, row_mt_sync->num_finished_cols,
                  aom_malloc(sizeof(*row_mt_sync->num_finished_cols) * rows));

  row_mt_sync->rows = rows;
  // Set up nsync.
  row_mt_sync->sync_range = 1;
}

// Deallocate row based multi-threading synchronization related mutex and data.
void av1_row_mt_sync_mem_dealloc(AV1EncRowMultiThreadSync *row_mt_sync) {
  if (row_mt_sync != NULL) {
#if CONFIG_MULTITHREAD
    int i;

    if (row_mt_sync->mutex_ != NULL) {
      for (i = 0; i < row_mt_sync->rows; ++i) {
        pthread_mutex_destroy(&row_mt_sync->mutex_[i]);
      }
      aom_free(row_mt_sync->mutex_);
    }
    if (row_mt_sync->cond_ != NULL) {
      for (i = 0; i < row_mt_sync->rows; ++i) {
        pthread_cond_destroy(&row_mt_sync->cond_[i]);
      }
      aom_free(row_mt_sync->cond_);
    }
#endif  // CONFIG_MULTITHREAD
    aom_free(row_mt_sync->num_finished_cols);

    // Clear the structure, as the source of this call may be a dynamic change
    // in tiles, in which case this call will be followed by an _alloc() that
    // may fail.
    av1_zero(*row_mt_sync);
  }
}

static inline int get_sb_rows_in_frame(AV1_COMMON *cm) {
  return CEIL_POWER_OF_TWO(cm->mi_params.mi_rows,
                           cm->seq_params->mib_size_log2);
}

static void row_mt_mem_alloc(AV1_COMP *cpi, int max_rows, int max_cols,
                             int alloc_row_ctx) {
  struct AV1Common *cm = &cpi->common;
  AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
  const int tile_cols = cm->tiles.cols;
  const int tile_rows = cm->tiles.rows;
  int tile_col, tile_row;

  av1_row_mt_mem_dealloc(cpi);

  // Allocate memory for row based multi-threading.
  for (tile_row = 0; tile_row < tile_rows; tile_row++) {
    for (tile_col = 0; tile_col < tile_cols; tile_col++) {
      int tile_index = tile_row * tile_cols + tile_col;
      TileDataEnc *const this_tile = &cpi->tile_data[tile_index];

      row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, max_rows);

      if (alloc_row_ctx) {
        assert(max_cols > 0);
        const int num_row_ctx = AOMMAX(1, (max_cols - 1));
        CHECK_MEM_ERROR(cm, this_tile->row_ctx,
                        (FRAME_CONTEXT *)aom_memalign(
                            16, num_row_ctx * sizeof(*this_tile->row_ctx)));
      }
    }
  }
  const int sb_rows = get_sb_rows_in_frame(cm);
  CHECK_MEM_ERROR(
      cm, enc_row_mt->num_tile_cols_done,
      aom_malloc(sizeof(*enc_row_mt->num_tile_cols_done) * sb_rows));

  enc_row_mt->allocated_rows = max_rows;
  enc_row_mt->allocated_cols = max_cols - 1;
  enc_row_mt->allocated_sb_rows = sb_rows;
}

void av1_row_mt_mem_dealloc(AV1_COMP *cpi) {
  AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
  const int tile_cols = enc_row_mt->allocated_tile_cols;
  const int tile_rows = enc_row_mt->allocated_tile_rows;
  int tile_col, tile_row;

  // Free row based multi-threading sync memory.
  for (tile_row = 0; tile_row < tile_rows; tile_row++) {
    for (tile_col = 0; tile_col < tile_cols; tile_col++) {
      int tile_index = tile_row * tile_cols + tile_col;
      TileDataEnc *const this_tile = &cpi->tile_data[tile_index];

      av1_row_mt_sync_mem_dealloc(&this_tile->row_mt_sync);

      if (cpi->oxcf.algo_cfg.cdf_update_mode) {
        aom_free(this_tile->row_ctx);
        this_tile->row_ctx = NULL;
      }
    }
  }
  aom_free(enc_row_mt->num_tile_cols_done);
  enc_row_mt->num_tile_cols_done = NULL;
  enc_row_mt->allocated_rows = 0;
  enc_row_mt->allocated_cols = 0;
  enc_row_mt->allocated_sb_rows = 0;
}

static inline void assign_tile_to_thread(int *thread_id_to_tile_id,
                                         int num_tiles, int num_workers) {
  int tile_id = 0;
  int i;

  for (i = 0; i < num_workers; i++) {
    thread_id_to_tile_id[i] = tile_id++;
    if (tile_id == num_tiles) tile_id = 0;
  }
}

static inline int get_next_job(TileDataEnc *const tile_data,
                               int *current_mi_row, int mib_size) {
  AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync;
  const int mi_row_end = tile_data->tile_info.mi_row_end;

  if (row_mt_sync->next_mi_row < mi_row_end) {
    *current_mi_row = row_mt_sync->next_mi_row;
    row_mt_sync->num_threads_working++;
    row_mt_sync->next_mi_row += mib_size;
    return 1;
  }
  return 0;
}
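
// The top-right dependency caps the useful concurrency within one tile: a
// thread on row r must trail row r - 1 by roughly two columns, so a tile
// with C superblock columns and R superblock rows can keep at most
// AOMMIN((C + 1) >> 1, R) threads busy. For example (illustrative numbers),
// a tile 8 SBs wide and 6 SBs tall saturates at AOMMIN((8 + 1) >> 1, 6) = 4
// threads. The function below applies this limit when picking a tile for an
// idle thread.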
static inline void switch_tile_and_get_next_job(
    AV1_COMMON *const cm, TileDataEnc *const tile_data, int *cur_tile_id,
    int *current_mi_row, int *end_of_frame, int is_firstpass,
    const BLOCK_SIZE fp_block_size) {
  const int tile_cols = cm->tiles.cols;
  const int tile_rows = cm->tiles.rows;

  int tile_id = -1;  // Stores the tile ID with minimum proc done
  int max_mis_to_encode = 0;
  int min_num_threads_working = INT_MAX;

  for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
    for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
      int tile_index = tile_row * tile_cols + tile_col;
      TileDataEnc *const this_tile = &tile_data[tile_index];
      AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;

#if CONFIG_REALTIME_ONLY
      int num_b_rows_in_tile =
          av1_get_sb_rows_in_tile(cm, &this_tile->tile_info);
      int num_b_cols_in_tile =
          av1_get_sb_cols_in_tile(cm, &this_tile->tile_info);
#else
      int num_b_rows_in_tile =
          is_firstpass
              ? av1_get_unit_rows_in_tile(&this_tile->tile_info, fp_block_size)
              : av1_get_sb_rows_in_tile(cm, &this_tile->tile_info);
      int num_b_cols_in_tile =
          is_firstpass
              ? av1_get_unit_cols_in_tile(&this_tile->tile_info, fp_block_size)
              : av1_get_sb_cols_in_tile(cm, &this_tile->tile_info);
#endif
      int theoretical_limit_on_threads =
          AOMMIN((num_b_cols_in_tile + 1) >> 1, num_b_rows_in_tile);
      int num_threads_working = row_mt_sync->num_threads_working;

      if (num_threads_working < theoretical_limit_on_threads) {
        int num_mis_to_encode =
            this_tile->tile_info.mi_row_end - row_mt_sync->next_mi_row;

        // The tile to be processed by this thread is selected based on the
        // availability of jobs:
        // 1) If jobs are available, the tile is chosen based on the minimum
        // number of threads working on it. If two or more tiles have the
        // same number of threads working on them, the tile with the maximum
        // number of jobs available is chosen.
        // 2) If no jobs are available, then end_of_frame is reached.
        if (num_mis_to_encode > 0) {
          if (num_threads_working < min_num_threads_working) {
            min_num_threads_working = num_threads_working;
            max_mis_to_encode = 0;
          }
          if (num_threads_working == min_num_threads_working &&
              num_mis_to_encode > max_mis_to_encode) {
            tile_id = tile_index;
            max_mis_to_encode = num_mis_to_encode;
          }
        }
      }
    }
  }
  if (tile_id == -1) {
    *end_of_frame = 1;
  } else {
    // Update the current tile id to the tile id that will be processed next,
    // which will be the least processed tile.
    *cur_tile_id = tile_id;
    const int unit_height = mi_size_high[fp_block_size];
    get_next_job(&tile_data[tile_id], current_mi_row,
                 is_firstpass ? unit_height : cm->seq_params->mib_size);
  }
}

#if !CONFIG_REALTIME_ONLY
static void set_firstpass_encode_done(AV1_COMP *cpi) {
  AV1_COMMON *const cm = &cpi->common;
  AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
  const int tile_cols = cm->tiles.cols;
  const int tile_rows = cm->tiles.rows;
  const BLOCK_SIZE fp_block_size = cpi->fp_block_size;
  const int unit_height = mi_size_high[fp_block_size];

  // In the case of multithreaded firstpass encode, due to the top-right
  // dependency, the worker on a firstpass row waits for the completion of
  // the firstpass processing of the top and top-right fp_blocks. Hence, in
  // case a thread (main/worker) encounters an error, mark the firstpass
  // processing of every row in the frame as complete in order to avoid
  // dependent workers waiting indefinitely.
  for (int tile_row = 0; tile_row < tile_rows; ++tile_row) {
    for (int tile_col = 0; tile_col < tile_cols; ++tile_col) {
      TileDataEnc *const tile_data =
          &cpi->tile_data[tile_row * tile_cols + tile_col];
      TileInfo *tile = &tile_data->tile_info;
      AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync;
      const int unit_cols_in_tile =
          av1_get_unit_cols_in_tile(tile, fp_block_size);
      for (int mi_row = tile->mi_row_start, unit_row_in_tile = 0;
           mi_row < tile->mi_row_end;
           mi_row += unit_height, unit_row_in_tile++) {
        enc_row_mt->sync_write_ptr(row_mt_sync, unit_row_in_tile,
                                   unit_cols_in_tile - 1, unit_cols_in_tile);
      }
    }
  }
}
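
// Per-worker firstpass flow, in summary: under the enc_row_mt mutex, fetch
// the next firstpass row of the current tile (or switch to the least-loaded
// tile); outside the lock, process the row via av1_first_pass_row(), which
// relies on the sync_read/sync_write handshake for the top and top-right
// dependency; repeat until end_of_frame or firstpass_mt_exit is seen.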
static int fp_enc_row_mt_worker_hook(void *arg1, void *unused) {
  EncWorkerData *const thread_data = (EncWorkerData *)arg1;
  AV1_COMP *const cpi = thread_data->cpi;
  int thread_id = thread_data->thread_id;
  AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
#if CONFIG_MULTITHREAD
  pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_;
#endif
  (void)unused;
  struct aom_internal_error_info *const error_info = &thread_data->error_info;
  MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
  xd->error_info = error_info;

  // The jmp_buf is valid only for the duration of the function that calls
  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
  // before it returns.
  if (setjmp(error_info->jmp)) {
    error_info->setjmp = 0;
#if CONFIG_MULTITHREAD
    pthread_mutex_lock(enc_row_mt_mutex_);
    enc_row_mt->firstpass_mt_exit = true;
    pthread_mutex_unlock(enc_row_mt_mutex_);
#endif
    set_firstpass_encode_done(cpi);
    return 0;
  }
  error_info->setjmp = 1;

  AV1_COMMON *const cm = &cpi->common;
  int cur_tile_id = enc_row_mt->thread_id_to_tile_id[thread_id];
  assert(cur_tile_id != -1);

  const BLOCK_SIZE fp_block_size = cpi->fp_block_size;
  const int unit_height = mi_size_high[fp_block_size];
  int end_of_frame = 0;
  while (1) {
    int current_mi_row = -1;
#if CONFIG_MULTITHREAD
    pthread_mutex_lock(enc_row_mt_mutex_);
#endif
    bool firstpass_mt_exit = enc_row_mt->firstpass_mt_exit;
    if (!firstpass_mt_exit && !get_next_job(&cpi->tile_data[cur_tile_id],
                                            &current_mi_row, unit_height)) {
      // No jobs are available for the current tile. Query for the status of
      // other tiles and get the next job if available.
      switch_tile_and_get_next_job(cm, cpi->tile_data, &cur_tile_id,
                                   &current_mi_row, &end_of_frame, 1,
                                   fp_block_size);
    }
#if CONFIG_MULTITHREAD
    pthread_mutex_unlock(enc_row_mt_mutex_);
#endif
    // When firstpass_mt_exit is set to true, other workers need not pursue
    // any further jobs.
    if (firstpass_mt_exit || end_of_frame) break;

    TileDataEnc *const this_tile = &cpi->tile_data[cur_tile_id];
    AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
    ThreadData *td = thread_data->td;

    assert(current_mi_row != -1 &&
           current_mi_row < this_tile->tile_info.mi_row_end);

    const int unit_height_log2 = mi_size_high_log2[fp_block_size];
    av1_first_pass_row(cpi, td, this_tile, current_mi_row >> unit_height_log2,
                       fp_block_size);
#if CONFIG_MULTITHREAD
    pthread_mutex_lock(enc_row_mt_mutex_);
#endif
    row_mt_sync->num_threads_working--;
#if CONFIG_MULTITHREAD
    pthread_mutex_unlock(enc_row_mt_mutex_);
#endif
  }
  error_info->setjmp = 0;
  return 1;
}
#endif

static void launch_loop_filter_rows(AV1_COMMON *cm, EncWorkerData *thread_data,
                                    AV1EncRowMultiThreadInfo *enc_row_mt,
                                    int mib_size_log2) {
  AV1LfSync *const lf_sync = (AV1LfSync *)thread_data->lf_sync;
  const int sb_rows = get_sb_rows_in_frame(cm);
  AV1LfMTInfo *cur_job_info;
  bool row_mt_exit = false;
  (void)enc_row_mt;
#if CONFIG_MULTITHREAD
  pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_;
#endif

  while ((cur_job_info = get_lf_job_info(lf_sync)) != NULL) {
    LFWorkerData *const lf_data = (LFWorkerData *)thread_data->lf_data;
    const int lpf_opt_level = cur_job_info->lpf_opt_level;
    (void)sb_rows;
#if CONFIG_MULTITHREAD
    const int cur_sb_row = cur_job_info->mi_row >> mib_size_log2;
    const int next_sb_row = AOMMIN(sb_rows - 1, cur_sb_row + 1);
    // Wait for the current and next superblock row to finish encoding.
    pthread_mutex_lock(enc_row_mt_mutex_);
    while (!enc_row_mt->row_mt_exit &&
           (enc_row_mt->num_tile_cols_done[cur_sb_row] < cm->tiles.cols ||
            enc_row_mt->num_tile_cols_done[next_sb_row] < cm->tiles.cols)) {
      pthread_cond_wait(enc_row_mt->cond_, enc_row_mt_mutex_);
    }
    row_mt_exit = enc_row_mt->row_mt_exit;
    pthread_mutex_unlock(enc_row_mt_mutex_);
#endif
    if (row_mt_exit) return;

    av1_thread_loop_filter_rows(
        lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->xd,
        cur_job_info->mi_row, cur_job_info->plane, cur_job_info->dir,
        lpf_opt_level, lf_sync, &thread_data->error_info, lf_data->params_buf,
        lf_data->tx_buf, mib_size_log2);
  }
}
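
// Pipelining note: launch_loop_filter_rows() filters superblock row n only
// once every tile column of rows n and n + 1 has finished encoding. The
// encode workers make this visible by incrementing num_tile_cols_done[] and
// broadcasting on enc_row_mt->cond_ (see enc_row_mt_worker_hook() below),
// which releases any filter worker waiting in the loop above.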

static void set_encoding_done(AV1_COMP *cpi) {
  AV1_COMMON *const cm = &cpi->common;
  const int tile_cols = cm->tiles.cols;
  const int tile_rows = cm->tiles.rows;
  AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
  const int mib_size = cm->seq_params->mib_size;

  // In the case of row-multithreading, due to the top-right dependency, the
  // worker on an SB row waits for the completion of the encode of the top
  // and top-right SBs. Hence, in case a thread (main/worker) encounters an
  // error, mark the encoding of every SB row in the frame as complete in
  // order to avoid the dependent workers of every tile waiting indefinitely.
  for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
    for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
      TileDataEnc *const this_tile =
          &cpi->tile_data[tile_row * tile_cols + tile_col];
      const TileInfo *const tile_info = &this_tile->tile_info;
      AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
      const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info);
      for (int mi_row = tile_info->mi_row_start, sb_row_in_tile = 0;
           mi_row < tile_info->mi_row_end;
           mi_row += mib_size, sb_row_in_tile++) {
        enc_row_mt->sync_write_ptr(row_mt_sync, sb_row_in_tile,
                                   sb_cols_in_tile - 1, sb_cols_in_tile);
      }
    }
  }
}

static bool lpf_mt_with_enc_enabled(int pipeline_lpf_mt_with_enc,
                                    const int filter_level[2]) {
  return pipeline_lpf_mt_with_enc && (filter_level[0] || filter_level[1]);
}

static int enc_row_mt_worker_hook(void *arg1, void *unused) {
  EncWorkerData *const thread_data = (EncWorkerData *)arg1;
  AV1_COMP *const cpi = thread_data->cpi;
  int thread_id = thread_data->thread_id;
  AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
#if CONFIG_MULTITHREAD
  pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_;
#endif
  (void)unused;

  struct aom_internal_error_info *const error_info = &thread_data->error_info;
  AV1LfSync *const lf_sync = thread_data->lf_sync;
  MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
  xd->error_info = error_info;
  AV1_COMMON *volatile const cm = &cpi->common;
  volatile const bool do_pipelined_lpf_mt_with_enc = lpf_mt_with_enc_enabled(
      cpi->mt_info.pipeline_lpf_mt_with_enc, cm->lf.filter_level);

  // The jmp_buf is valid only for the duration of the function that calls
  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
  // before it returns.
  if (setjmp(error_info->jmp)) {
    error_info->setjmp = 0;
#if CONFIG_MULTITHREAD
    pthread_mutex_lock(enc_row_mt_mutex_);
    enc_row_mt->row_mt_exit = true;
    // Wake up all the workers waiting in launch_loop_filter_rows() to exit
    // in case of an error.
    pthread_cond_broadcast(enc_row_mt->cond_);
    pthread_mutex_unlock(enc_row_mt_mutex_);
#endif
    set_encoding_done(cpi);

    if (do_pipelined_lpf_mt_with_enc) {
#if CONFIG_MULTITHREAD
      pthread_mutex_lock(lf_sync->job_mutex);
      lf_sync->lf_mt_exit = true;
      pthread_mutex_unlock(lf_sync->job_mutex);
#endif
      av1_set_vert_loop_filter_done(&cpi->common, lf_sync,
                                    cpi->common.seq_params->mib_size_log2);
    }
    return 0;
  }
  error_info->setjmp = 1;

  const int mib_size_log2 = cm->seq_params->mib_size_log2;
  int cur_tile_id = enc_row_mt->thread_id_to_tile_id[thread_id];

  // Preallocate the pc_tree for realtime coding to reduce the cost of memory
  // allocation.
  if (cpi->sf.rt_sf.use_nonrd_pick_mode) {
    thread_data->td->pc_root = av1_alloc_pc_tree_node(cm->seq_params->sb_size);
    if (!thread_data->td->pc_root)
      aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
                         "Failed to allocate PC_TREE");
  } else {
    thread_data->td->pc_root = NULL;
  }

  assert(cur_tile_id != -1);

  const BLOCK_SIZE fp_block_size = cpi->fp_block_size;
  int end_of_frame = 0;
  bool row_mt_exit = false;

  // When the master thread does not have a valid job to process, xd->tile_ctx
  // is not set and contains a NULL pointer. This can result in a NULL pointer
  // access violation if it is accessed beyond the encode stage. Hence,
  // thread_data->td->mb.e_mbd.tile_ctx is initialized with the common frame
  // context here to avoid NULL pointer access in subsequent stages.
  thread_data->td->mb.e_mbd.tile_ctx = cm->fc;
  while (1) {
    int current_mi_row = -1;
#if CONFIG_MULTITHREAD
    pthread_mutex_lock(enc_row_mt_mutex_);
#endif
    row_mt_exit = enc_row_mt->row_mt_exit;
    // The row_mt_exit check here could be avoided, as it is checked after
    // sync_read_ptr() in encode_sb_row(). However, checking it here lets the
    // thread return early, without calling get_next_job().
    if (!row_mt_exit &&
        !get_next_job(&cpi->tile_data[cur_tile_id], &current_mi_row,
                      cm->seq_params->mib_size)) {
      // No jobs are available for the current tile. Query for the status of
      // other tiles and get the next job if available.
      switch_tile_and_get_next_job(cm, cpi->tile_data, &cur_tile_id,
                                   &current_mi_row, &end_of_frame, 0,
                                   fp_block_size);
    }
#if CONFIG_MULTITHREAD
    pthread_mutex_unlock(enc_row_mt_mutex_);
#endif
    // When row_mt_exit is set to true, other workers need not pursue any
    // further jobs.
    if (row_mt_exit) {
      error_info->setjmp = 0;
      return 1;
    }

    if (end_of_frame) break;

    TileDataEnc *const this_tile = &cpi->tile_data[cur_tile_id];
    AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
    const TileInfo *const tile_info = &this_tile->tile_info;
    const int tile_row = tile_info->tile_row;
    const int tile_col = tile_info->tile_col;
    ThreadData *td = thread_data->td;
    const int sb_row = current_mi_row >> mib_size_log2;

    assert(current_mi_row != -1 && current_mi_row <= tile_info->mi_row_end);

    td->mb.e_mbd.tile_ctx = td->tctx;
    td->mb.tile_pb_ctx = &this_tile->tctx;
    td->abs_sum_level = 0;

    if (this_tile->allow_update_cdf) {
      td->mb.row_ctx = this_tile->row_ctx;
      if (current_mi_row == tile_info->mi_row_start)
        *td->mb.e_mbd.tile_ctx = this_tile->tctx;
    } else {
      *td->mb.e_mbd.tile_ctx = this_tile->tctx;
    }

    av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row,
                           &td->mb.e_mbd);
#if !CONFIG_REALTIME_ONLY
    cfl_init(&td->mb.e_mbd.cfl, cm->seq_params);
#endif
    if (td->mb.txfm_search_info.mb_rd_record != NULL) {
      av1_crc32c_calculator_init(
          &td->mb.txfm_search_info.mb_rd_record->crc_calculator);
    }

    av1_encode_sb_row(cpi, td, tile_row, tile_col, current_mi_row);
#if CONFIG_MULTITHREAD
    pthread_mutex_lock(enc_row_mt_mutex_);
#endif
    this_tile->abs_sum_level += td->abs_sum_level;
    row_mt_sync->num_threads_working--;
    enc_row_mt->num_tile_cols_done[sb_row]++;
#if CONFIG_MULTITHREAD
    pthread_cond_broadcast(enc_row_mt->cond_);
    pthread_mutex_unlock(enc_row_mt_mutex_);
#endif
  }
  if (do_pipelined_lpf_mt_with_enc) {
    // Loop-filter a superblock row if encoding of the current and next
    // superblock row is complete.
    // TODO(deepa.kg @ittiam.com) Evaluate encoder speed by interleaving
    // encoding and loop filter stage.
    launch_loop_filter_rows(cm, thread_data, enc_row_mt, mib_size_log2);
  }
  av1_free_pc_tree_recursive(thread_data->td->pc_root, av1_num_planes(cm), 0,
                             0, cpi->sf.part_sf.partition_search_type);
  thread_data->td->pc_root = NULL;
  error_info->setjmp = 0;
  return 1;
}

static int enc_worker_hook(void *arg1, void *unused) {
  EncWorkerData *const thread_data = (EncWorkerData *)arg1;
  AV1_COMP *const cpi = thread_data->cpi;
  MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
  struct aom_internal_error_info *const error_info = &thread_data->error_info;
  const AV1_COMMON *const cm = &cpi->common;
  const int tile_cols = cm->tiles.cols;
  const int tile_rows = cm->tiles.rows;
  int t;

  (void)unused;

  xd->error_info = error_info;

  // The jmp_buf is valid only for the duration of the function that calls
  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
  // before it returns.
  if (setjmp(error_info->jmp)) {
    error_info->setjmp = 0;
    return 0;
  }
  error_info->setjmp = 1;

  // Preallocate the pc_tree for realtime coding to reduce the cost of memory
  // allocation.
  if (cpi->sf.rt_sf.use_nonrd_pick_mode) {
    thread_data->td->pc_root = av1_alloc_pc_tree_node(cm->seq_params->sb_size);
    if (!thread_data->td->pc_root)
      aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
                         "Failed to allocate PC_TREE");
  } else {
    thread_data->td->pc_root = NULL;
  }

  for (t = thread_data->start; t < tile_rows * tile_cols;
       t += cpi->mt_info.num_workers) {
    int tile_row = t / tile_cols;
    int tile_col = t % tile_cols;

    TileDataEnc *const this_tile =
        &cpi->tile_data[tile_row * cm->tiles.cols + tile_col];
    thread_data->td->mb.e_mbd.tile_ctx = &this_tile->tctx;
    thread_data->td->mb.tile_pb_ctx = &this_tile->tctx;
    av1_encode_tile(cpi, thread_data->td, tile_row, tile_col);
  }

  av1_free_pc_tree_recursive(thread_data->td->pc_root, av1_num_planes(cm), 0,
                             0, cpi->sf.part_sf.partition_search_type);
  thread_data->td->pc_root = NULL;
  error_info->setjmp = 0;
  return 1;
}
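
// Note the contrast with enc_row_mt_worker_hook() above: in tile-based MT,
// each worker owns whole tiles (tile t, t + num_workers, ...), so no
// row-level synchronization is needed within a tile, whereas row-based MT
// shares each tile's superblock rows among workers at the cost of the
// sync_read/sync_write handshake.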

void av1_init_frame_mt(AV1_PRIMARY *ppi, AV1_COMP *cpi) {
  cpi->mt_info.workers = ppi->p_mt_info.workers;
  cpi->mt_info.num_workers = ppi->p_mt_info.num_workers;
  cpi->mt_info.tile_thr_data = ppi->p_mt_info.tile_thr_data;
  int i;
  for (i = MOD_FP; i < NUM_MT_MODULES; i++) {
    cpi->mt_info.num_mod_workers[i] =
        AOMMIN(cpi->mt_info.num_workers, ppi->p_mt_info.num_mod_workers[i]);
  }
}

void av1_init_cdef_worker(AV1_COMP *cpi) {
  // The allocation is done only for level 0 parallel frames. No change
  // in config is supported in the middle of a parallel encode set, since the
  // rest of the MT modules also do not support dynamic change of config.
  if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) return;
  PrimaryMultiThreadInfo *const p_mt_info = &cpi->ppi->p_mt_info;
  int num_cdef_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_CDEF);

  av1_alloc_cdef_buffers(&cpi->common, &p_mt_info->cdef_worker,
                         &cpi->mt_info.cdef_sync, num_cdef_workers, 1);
  cpi->mt_info.cdef_worker = p_mt_info->cdef_worker;
}

#if !CONFIG_REALTIME_ONLY
void av1_init_lr_mt_buffers(AV1_COMP *cpi) {
  AV1_COMMON *const cm = &cpi->common;
  AV1LrSync *lr_sync = &cpi->mt_info.lr_row_sync;
  if (lr_sync->sync_range) {
    if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
      return;
    int num_lr_workers =
        av1_get_num_mod_workers_for_alloc(&cpi->ppi->p_mt_info, MOD_LR);
    assert(num_lr_workers <= lr_sync->num_workers);
    lr_sync->lrworkerdata[num_lr_workers - 1].rst_tmpbuf = cm->rst_tmpbuf;
    lr_sync->lrworkerdata[num_lr_workers - 1].rlbs = cm->rlbs;
  }
}
#endif

#if CONFIG_MULTITHREAD
void av1_init_mt_sync(AV1_COMP *cpi, int is_first_pass) {
  AV1_COMMON *const cm = &cpi->common;
  MultiThreadInfo *const mt_info = &cpi->mt_info;

  if (setjmp(cm->error->jmp)) {
    cm->error->setjmp = 0;
    aom_internal_error_copy(&cpi->ppi->error, cm->error);
  }
  cm->error->setjmp = 1;
  // Initialize enc row MT object.
  if (is_first_pass || cpi->oxcf.row_mt == 1) {
    AV1EncRowMultiThreadInfo *enc_row_mt = &mt_info->enc_row_mt;
    if (enc_row_mt->mutex_ == NULL) {
      CHECK_MEM_ERROR(cm, enc_row_mt->mutex_,
                      aom_malloc(sizeof(*(enc_row_mt->mutex_))));
      if (enc_row_mt->mutex_) pthread_mutex_init(enc_row_mt->mutex_, NULL);
    }
    if (enc_row_mt->cond_ == NULL) {
      CHECK_MEM_ERROR(cm, enc_row_mt->cond_,
                      aom_malloc(sizeof(*(enc_row_mt->cond_))));
      if (enc_row_mt->cond_) pthread_cond_init(enc_row_mt->cond_, NULL);
    }
  }

  if (!is_first_pass) {
    // Initialize global motion MT object.
    AV1GlobalMotionSync *gm_sync = &mt_info->gm_sync;
    if (gm_sync->mutex_ == NULL) {
      CHECK_MEM_ERROR(cm, gm_sync->mutex_,
                      aom_malloc(sizeof(*(gm_sync->mutex_))));
      if (gm_sync->mutex_) pthread_mutex_init(gm_sync->mutex_, NULL);
    }
#if !CONFIG_REALTIME_ONLY
    // Initialize temporal filtering MT object.
    AV1TemporalFilterSync *tf_sync = &mt_info->tf_sync;
    if (tf_sync->mutex_ == NULL) {
      CHECK_MEM_ERROR(cm, tf_sync->mutex_,
                      aom_malloc(sizeof(*tf_sync->mutex_)));
      if (tf_sync->mutex_) pthread_mutex_init(tf_sync->mutex_, NULL);
    }
#endif  // !CONFIG_REALTIME_ONLY
    // Initialize CDEF MT object.
    AV1CdefSync *cdef_sync = &mt_info->cdef_sync;
    if (cdef_sync->mutex_ == NULL) {
      CHECK_MEM_ERROR(cm, cdef_sync->mutex_,
                      aom_malloc(sizeof(*(cdef_sync->mutex_))));
      if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL);
    }

    // Initialize loop filter MT object.
    AV1LfSync *lf_sync = &mt_info->lf_row_sync;
    // Number of superblock rows
    const int sb_rows =
        CEIL_POWER_OF_TWO(cm->height >> MI_SIZE_LOG2, MAX_MIB_SIZE_LOG2);
    PrimaryMultiThreadInfo *const p_mt_info = &cpi->ppi->p_mt_info;
    int num_lf_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_LPF);

    if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
        num_lf_workers > lf_sync->num_workers) {
      av1_loop_filter_dealloc(lf_sync);
      av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_lf_workers);
    }
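
    // The same allocate-once pattern continues below: each sync object's
    // mutex is created on first use (the NULL checks above and below) and is
    // then reused across frames, so av1_init_mt_sync() can safely be called
    // per frame without re-creating or leaking synchronization primitives.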

    // Initialize tpl MT object.
    AV1TplRowMultiThreadInfo *tpl_row_mt = &mt_info->tpl_row_mt;
    if (tpl_row_mt->mutex_ == NULL) {
      CHECK_MEM_ERROR(cm, tpl_row_mt->mutex_,
                      aom_malloc(sizeof(*(tpl_row_mt->mutex_))));
      if (tpl_row_mt->mutex_) pthread_mutex_init(tpl_row_mt->mutex_, NULL);
    }

#if !CONFIG_REALTIME_ONLY
    if (is_restoration_used(cm)) {
      // Initialize loop restoration MT object.
      AV1LrSync *lr_sync = &mt_info->lr_row_sync;
      int rst_unit_size = cpi->sf.lpf_sf.min_lr_unit_size;
      int num_rows_lr = av1_lr_count_units(rst_unit_size, cm->height);
      int num_lr_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_LR);
      if (!lr_sync->sync_range || num_rows_lr > lr_sync->rows ||
          num_lr_workers > lr_sync->num_workers ||
          MAX_MB_PLANE > lr_sync->num_planes) {
        av1_loop_restoration_dealloc(lr_sync);
        av1_loop_restoration_alloc(lr_sync, cm, num_lr_workers, num_rows_lr,
                                   MAX_MB_PLANE, cm->width);
      }
    }
#endif

    // Initialization of pack bitstream MT object.
    AV1EncPackBSSync *pack_bs_sync = &mt_info->pack_bs_sync;
    if (pack_bs_sync->mutex_ == NULL) {
      CHECK_MEM_ERROR(cm, pack_bs_sync->mutex_,
                      aom_malloc(sizeof(*pack_bs_sync->mutex_)));
      if (pack_bs_sync->mutex_) pthread_mutex_init(pack_bs_sync->mutex_, NULL);
    }
  }
  cm->error->setjmp = 0;
}
#endif  // CONFIG_MULTITHREAD

// Computes the number of workers to be considered while allocating memory
// for a multi-threaded module under FPMT.
int av1_get_num_mod_workers_for_alloc(const PrimaryMultiThreadInfo *p_mt_info,
                                      MULTI_THREADED_MODULES mod_name) {
  int num_mod_workers = p_mt_info->num_mod_workers[mod_name];
  if (p_mt_info->num_mod_workers[MOD_FRAME_ENC] > 1) {
    // TODO(anyone): Change num_mod_workers to num_mod_workers[MOD_FRAME_ENC].
    // As frame parallel jobs will only perform multi-threading for the
    // encode stage, we can limit the allocations according to
    // num_enc_workers per frame parallel encode (a.k.a.
    // num_mod_workers[MOD_FRAME_ENC]).
    num_mod_workers = p_mt_info->num_workers;
  }
  return num_mod_workers;
}

void av1_init_tile_thread_data(AV1_PRIMARY *ppi, int is_first_pass) {
  PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;

  assert(p_mt_info->workers != NULL);
  assert(p_mt_info->tile_thr_data != NULL);

  int num_workers = p_mt_info->num_workers;
  int num_enc_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_ENC);
  assert(num_enc_workers <= num_workers);
  for (int i = num_workers - 1; i >= 0; i--) {
    EncWorkerData *const thread_data = &p_mt_info->tile_thr_data[i];

    if (i > 0) {
      // Allocate thread data.
      ThreadData *td;
      AOM_CHECK_MEM_ERROR(&ppi->error, td, aom_memalign(32, sizeof(*td)));
      av1_zero(*td);
      thread_data->original_td = thread_data->td = td;

      // Set up shared coeff buffers.
      av1_setup_shared_coeff_buffer(&ppi->seq_params, &td->shared_coeff_buf,
                                    &ppi->error);
      AOM_CHECK_MEM_ERROR(&ppi->error, td->tmp_conv_dst,
                          aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE *
                                               sizeof(*td->tmp_conv_dst)));

      if (i < p_mt_info->num_mod_workers[MOD_FP]) {
        // Set up firstpass PICK_MODE_CONTEXT.
        td->firstpass_ctx =
            av1_alloc_pmc(ppi->cpi, BLOCK_16X16, &td->shared_coeff_buf);
        if (!td->firstpass_ctx)
          aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR,
                             "Failed to allocate PICK_MODE_CONTEXT");
      }

      if (!is_first_pass && i < num_enc_workers) {
        // Set up sms_tree.
        if (av1_setup_sms_tree(ppi->cpi, td)) {
          aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR,
                             "Failed to allocate SMS tree");
        }

        for (int x = 0; x < 2; x++) {
          AOM_CHECK_MEM_ERROR(
              &ppi->error, td->hash_value_buffer[x],
              (uint32_t *)aom_malloc(AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
                                     sizeof(*td->hash_value_buffer[x])));
        }

        // Allocate frame counters in thread data.
        AOM_CHECK_MEM_ERROR(&ppi->error, td->counts,
                            aom_calloc(1, sizeof(*td->counts)));

        // Allocate buffers used by palette coding mode.
        AOM_CHECK_MEM_ERROR(&ppi->error, td->palette_buffer,
                            aom_memalign(16, sizeof(*td->palette_buffer)));

        // The buffers 'tmp_pred_bufs[]', 'comp_rd_buffer' and 'obmc_buffer'
        // are used in inter frames to store intermediate inter mode
        // prediction results and are not required for allintra encoding
        // mode. Hence, the memory allocations for these buffers are avoided
        // for allintra encoding mode.
        if (ppi->cpi->oxcf.kf_cfg.key_freq_max != 0) {
          alloc_obmc_buffers(&td->obmc_buffer, &ppi->error);

          alloc_compound_type_rd_buffers(&ppi->error, &td->comp_rd_buffer);

          for (int j = 0; j < 2; ++j) {
            AOM_CHECK_MEM_ERROR(
                &ppi->error, td->tmp_pred_bufs[j],
                aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
                                     sizeof(*td->tmp_pred_bufs[j])));
          }
        }

        if (is_gradient_caching_for_hog_enabled(ppi->cpi)) {
          const int plane_types = PLANE_TYPES >> ppi->seq_params.monochrome;
          AOM_CHECK_MEM_ERROR(&ppi->error, td->pixel_gradient_info,
                              aom_malloc(sizeof(*td->pixel_gradient_info) *
                                         plane_types * MAX_SB_SQUARE));
        }

        if (is_src_var_for_4x4_sub_blocks_caching_enabled(ppi->cpi)) {
          const BLOCK_SIZE sb_size = ppi->cpi->common.seq_params->sb_size;
          const int mi_count_in_sb =
              mi_size_wide[sb_size] * mi_size_high[sb_size];

          AOM_CHECK_MEM_ERROR(
              &ppi->error, td->src_var_info_of_4x4_sub_blocks,
              aom_malloc(sizeof(*td->src_var_info_of_4x4_sub_blocks) *
                         mi_count_in_sb));
        }

        if (ppi->cpi->sf.part_sf.partition_search_type ==
            VAR_BASED_PARTITION) {
          const int num_64x64_blocks =
              (ppi->seq_params.sb_size == BLOCK_64X64) ? 1 : 4;
          AOM_CHECK_MEM_ERROR(
              &ppi->error, td->vt64x64,
              aom_malloc(sizeof(*td->vt64x64) * num_64x64_blocks));
        }
      }
    }

    if (!is_first_pass && ppi->cpi->oxcf.row_mt == 1 && i < num_enc_workers) {
      if (i == 0) {
        for (int j = 0; j < ppi->num_fp_contexts; j++) {
          AOM_CHECK_MEM_ERROR(&ppi->error, ppi->parallel_cpi[j]->td.tctx,
                              (FRAME_CONTEXT *)aom_memalign(
                                  16, sizeof(*ppi->parallel_cpi[j]->td.tctx)));
        }
      } else {
        AOM_CHECK_MEM_ERROR(
            &ppi->error, thread_data->td->tctx,
            (FRAME_CONTEXT *)aom_memalign(16, sizeof(*thread_data->td->tctx)));
      }
    }
  }

  // Record the number of workers in encode stage multi-threading for which
  // allocation is done.
  p_mt_info->prev_num_enc_workers = num_enc_workers;
}

void av1_create_workers(AV1_PRIMARY *ppi, int num_workers) {
  PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  assert(p_mt_info->num_workers == 0);

  AOM_CHECK_MEM_ERROR(&ppi->error, p_mt_info->workers,
                      aom_malloc(num_workers * sizeof(*p_mt_info->workers)));

  AOM_CHECK_MEM_ERROR(
      &ppi->error, p_mt_info->tile_thr_data,
      aom_calloc(num_workers, sizeof(*p_mt_info->tile_thr_data)));

  for (int i = 0; i < num_workers; ++i) {
    AVxWorker *const worker = &p_mt_info->workers[i];
    EncWorkerData *const thread_data = &p_mt_info->tile_thr_data[i];

    winterface->init(worker);
    worker->thread_name = "aom enc worker";

    thread_data->thread_id = i;
    // Set the starting tile for each thread.
    thread_data->start = i;

    if (i > 0) {
      // Create threads.
      if (!winterface->reset(worker))
        aom_internal_error(&ppi->error, AOM_CODEC_ERROR,
                           "Tile encoder thread creation failed");
    }
    winterface->sync(worker);

    ++p_mt_info->num_workers;
  }
}

// This function will change the state and free the mutex of the corresponding
// workers and terminate the object. The object cannot be reused unless a call
// to reset() is made.
void av1_terminate_workers(AV1_PRIMARY *ppi) {
  PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
  for (int t = 0; t < p_mt_info->num_workers; ++t) {
    AVxWorker *const worker = &p_mt_info->workers[t];
    aom_get_worker_interface()->end(worker);
  }
}

// This function returns 1 if frame parallel encode is supported for
// the current configuration. Returns 0 otherwise.
static inline int is_fpmt_config(const AV1_PRIMARY *ppi,
                                 const AV1EncoderConfig *oxcf) {
  // FPMT is enabled for AOM_Q and AOM_VBR.
  // TODO(Tarun): Test and enable resize config.
  if (oxcf->rc_cfg.mode == AOM_CBR || oxcf->rc_cfg.mode == AOM_CQ) {
    return 0;
  }
  if (ppi->use_svc) {
    return 0;
  }
  if (oxcf->tile_cfg.enable_large_scale_tile) {
    return 0;
  }
  if (oxcf->dec_model_cfg.timing_info_present) {
    return 0;
  }
  if (oxcf->mode != GOOD) {
    return 0;
  }
  if (oxcf->tool_cfg.error_resilient_mode) {
    return 0;
  }
  if (oxcf->resize_cfg.resize_mode) {
    return 0;
  }
  if (oxcf->pass != AOM_RC_SECOND_PASS) {
    return 0;
  }
  if (oxcf->max_threads < 2) {
    return 0;
  }
  if (!oxcf->fp_mt) {
    return 0;
  }

  return 1;
}

int av1_check_fpmt_config(AV1_PRIMARY *const ppi,
                          const AV1EncoderConfig *const oxcf) {
  if (is_fpmt_config(ppi, oxcf)) return 1;
  // Reset frame parallel configuration for unsupported config.
  if (ppi->num_fp_contexts > 1) {
    for (int i = 1; i < ppi->num_fp_contexts; i++) {
      // Release the previously-used frame-buffer.
      if (ppi->parallel_cpi[i]->common.cur_frame != NULL) {
        --ppi->parallel_cpi[i]->common.cur_frame->ref_count;
        ppi->parallel_cpi[i]->common.cur_frame = NULL;
      }
    }

    int cur_gf_index = ppi->cpi->gf_frame_index;
    int reset_size = AOMMAX(0, ppi->gf_group.size - cur_gf_index);
    av1_zero_array(&ppi->gf_group.frame_parallel_level[cur_gf_index],
                   reset_size);
    av1_zero_array(&ppi->gf_group.is_frame_non_ref[cur_gf_index], reset_size);
    av1_zero_array(&ppi->gf_group.src_offset[cur_gf_index], reset_size);
    memset(&ppi->gf_group.skip_frame_refresh[cur_gf_index][0], INVALID_IDX,
           sizeof(ppi->gf_group.skip_frame_refresh[cur_gf_index][0]) *
               reset_size * REF_FRAMES);
    memset(&ppi->gf_group.skip_frame_as_ref[cur_gf_index], INVALID_IDX,
           sizeof(ppi->gf_group.skip_frame_as_ref[cur_gf_index]) * reset_size);
    ppi->num_fp_contexts = 1;
  }
  return 0;
}

// A large value for threads used to compute the max num_enc_workers
// possible for each resolution.
#define MAX_THREADS 100

// Computes the max number of enc workers possible for each resolution.
static inline int compute_max_num_enc_workers(
    CommonModeInfoParams *const mi_params, int mib_size_log2) {
  int num_sb_rows = CEIL_POWER_OF_TWO(mi_params->mi_rows, mib_size_log2);
  int num_sb_cols = CEIL_POWER_OF_TWO(mi_params->mi_cols, mib_size_log2);

  return AOMMIN((num_sb_cols + 1) >> 1, num_sb_rows);
}
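
// Worked example (illustrative): 1920x1080 with 128x128 superblocks gives
// mi_cols = 480, mi_rows = 270 and mib_size_log2 = 5, so num_sb_cols = 15,
// num_sb_rows = 9, and the limit is AOMMIN((15 + 1) >> 1, 9) = 8 workers,
// mirroring the per-tile top-right-dependency bound used in
// switch_tile_and_get_next_job().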

// Computes the number of frame parallel (fp) contexts to be created
// based on the number of max_enc_workers.
int av1_compute_num_fp_contexts(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf) {
  ppi->p_mt_info.num_mod_workers[MOD_FRAME_ENC] = 0;
  if (!av1_check_fpmt_config(ppi, oxcf)) {
    return 1;
  }
  int max_num_enc_workers = compute_max_num_enc_workers(
      &ppi->cpi->common.mi_params, ppi->cpi->common.seq_params->mib_size_log2);
  // Scaling factors and rounding factors used to tune worker_per_frame
  // computation.
  int rounding_factor[2] = { 2, 4 };
  int scaling_factor[2] = { 4, 8 };
  int is_480p_or_lesser =
      AOMMIN(oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height) <= 480;
  int is_sb_64 = 0;
  if (ppi->cpi != NULL)
    is_sb_64 = ppi->cpi->common.seq_params->sb_size == BLOCK_64X64;
  // A parallel frame encode has at least 1/4th the theoretical limit of max
  // enc workers in the default case. For resolutions larger than 480p, if
  // the SB size is 64x64, optimal performance is obtained with a limit of
  // 1/8.
  int index = (!is_480p_or_lesser && is_sb_64) ? 1 : 0;
  int workers_per_frame =
      AOMMAX(1, (max_num_enc_workers + rounding_factor[index]) /
                    scaling_factor[index]);
  int max_threads = oxcf->max_threads;
  int num_fp_contexts = max_threads / workers_per_frame;
  // Based on empirical results, FPMT gains with multi-tile are significant
  // when more parallel frames are available. Use FPMT with multi-tile encode
  // only when sufficient threads are available for parallel encode of
  // MAX_PARALLEL_FRAMES frames.
  if (oxcf->tile_cfg.tile_columns > 0 || oxcf->tile_cfg.tile_rows > 0) {
    if (num_fp_contexts < MAX_PARALLEL_FRAMES) num_fp_contexts = 1;
  }

  num_fp_contexts = clamp(num_fp_contexts, 1, MAX_PARALLEL_FRAMES);
  // Limit the recalculated num_fp_contexts to ppi->num_fp_contexts.
  num_fp_contexts = (ppi->num_fp_contexts == 1)
                        ? num_fp_contexts
                        : AOMMIN(num_fp_contexts, ppi->num_fp_contexts);
  if (num_fp_contexts > 1) {
    ppi->p_mt_info.num_mod_workers[MOD_FRAME_ENC] =
        AOMMIN(max_num_enc_workers * num_fp_contexts, oxcf->max_threads);
  }
  return num_fp_contexts;
}
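
// Illustrative numbers for the computation above: 1080p with 128x128 SBs and
// max_threads = 16 gives max_num_enc_workers = 8; index 0 then yields
// workers_per_frame = AOMMAX(1, (8 + 2) / 4) = 2, so num_fp_contexts =
// 16 / 2 = 8, which the clamp() limits to MAX_PARALLEL_FRAMES.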

// Computes the number of workers to process each of the parallel frames.
static inline int compute_num_workers_per_frame(
    const int num_workers, const int parallel_frame_count) {
  // Number of level 2 workers per frame context (floor division).
  int workers_per_frame = (num_workers / parallel_frame_count);
  return workers_per_frame;
}

static inline void restore_workers_after_fpmt(AV1_PRIMARY *ppi,
                                              int parallel_frame_count,
                                              int num_fpmt_workers_prepared);

// Prepare level 1 workers. This function is only called for
// parallel_frame_count > 1. This function populates the mt_info structure of
// frame level contexts appropriately by dividing the total number of
// available workers amongst the frames as level 2 workers. It also populates
// the hook and data members of level 1 workers.
static inline void prepare_fpmt_workers(AV1_PRIMARY *ppi,
                                        AV1_COMP_DATA *first_cpi_data,
                                        AVxWorkerHook hook,
                                        int parallel_frame_count) {
  assert(parallel_frame_count <= ppi->num_fp_contexts &&
         parallel_frame_count > 1);

  PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
  int num_workers = p_mt_info->num_workers;

  volatile int frame_idx = 0;
  volatile int i = 0;
  while (i < num_workers) {
    // Assign level 1 worker.
    AVxWorker *frame_worker = p_mt_info->p_workers[frame_idx] =
        &p_mt_info->workers[i];
    AV1_COMP *cur_cpi = ppi->parallel_cpi[frame_idx];
    MultiThreadInfo *mt_info = &cur_cpi->mt_info;
    // This 'aom_internal_error_info' pointer is not derived from the local
    // pointer ('AV1_COMMON *const cm') to silence the compiler warning
    // "variable 'cm' might be clobbered by 'longjmp' or 'vfork'
    // [-Wclobbered]".
    struct aom_internal_error_info *const error = cur_cpi->common.error;

    // The jmp_buf is valid only within the scope of the function that calls
    // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
    // before it returns.
    if (setjmp(error->jmp)) {
      error->setjmp = 0;
      restore_workers_after_fpmt(ppi, parallel_frame_count, i);
      aom_internal_error_copy(&ppi->error, error);
    }
    error->setjmp = 1;

    AV1_COMMON *const cm = &cur_cpi->common;
    // Assign start of level 2 worker pool.
    mt_info->workers = &p_mt_info->workers[i];
    mt_info->tile_thr_data = &p_mt_info->tile_thr_data[i];
    // Assign number of workers for each frame in the parallel encode set.
    mt_info->num_workers = compute_num_workers_per_frame(
        num_workers - i, parallel_frame_count - frame_idx);
    for (int j = MOD_FP; j < NUM_MT_MODULES; j++) {
      mt_info->num_mod_workers[j] =
          AOMMIN(mt_info->num_workers, p_mt_info->num_mod_workers[j]);
    }
    if (p_mt_info->cdef_worker != NULL) {
      mt_info->cdef_worker = &p_mt_info->cdef_worker[i];

      // Back up the original cdef_worker pointers.
      mt_info->restore_state_buf.cdef_srcbuf = mt_info->cdef_worker->srcbuf;
      const int num_planes = av1_num_planes(cm);
      for (int plane = 0; plane < num_planes; plane++)
        mt_info->restore_state_buf.cdef_colbuf[plane] =
            mt_info->cdef_worker->colbuf[plane];
    }
#if !CONFIG_REALTIME_ONLY
    if (is_restoration_used(cm)) {
      // Back up the original LR buffers before update.
      int idx = i + mt_info->num_workers - 1;
      assert(idx < mt_info->lr_row_sync.num_workers);
      mt_info->restore_state_buf.rst_tmpbuf =
          mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf;
      mt_info->restore_state_buf.rlbs =
          mt_info->lr_row_sync.lrworkerdata[idx].rlbs;

      // Update LR buffers.
      mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf = cm->rst_tmpbuf;
      mt_info->lr_row_sync.lrworkerdata[idx].rlbs = cm->rlbs;
    }
#endif

    i += mt_info->num_workers;

    // At this stage, the thread specific CDEF buffers for the current
    // frame's 'common' and 'cdef_sync' only need to be allocated.
    // 'cdef_worker' has already been allocated across parallel frames.
    av1_alloc_cdef_buffers(cm, &p_mt_info->cdef_worker, &mt_info->cdef_sync,
                           p_mt_info->num_workers, 0);

    frame_worker->hook = hook;
    frame_worker->data1 = cur_cpi;
    frame_worker->data2 = (frame_idx == 0)
                              ? first_cpi_data
                              : &ppi->parallel_frames_data[frame_idx - 1];
    frame_idx++;
    error->setjmp = 0;
  }
  p_mt_info->p_num_workers = parallel_frame_count;
}

// Launch level 1 workers to perform frame parallel encode.
static inline void launch_fpmt_workers(AV1_PRIMARY *ppi) {
  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  int num_workers = ppi->p_mt_info.p_num_workers;

  for (int i = num_workers - 1; i >= 0; i--) {
    AVxWorker *const worker = ppi->p_mt_info.p_workers[i];
    if (i == 0)
      winterface->execute(worker);
    else
      winterface->launch(worker);
  }
}
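
// In this scheme the level 1 worker for each frame is also the first level 2
// worker of that frame's pool: prepare_fpmt_workers() points p_workers[k] at
// the start of frame k's worker range, and worker 0 runs synchronously on
// the calling thread via execute() while the others are launch()ed.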

// Restore worker states after parallel encode.
static inline void restore_workers_after_fpmt(AV1_PRIMARY *ppi,
                                              int parallel_frame_count,
                                              int num_fpmt_workers_prepared) {
  assert(parallel_frame_count <= ppi->num_fp_contexts &&
         parallel_frame_count > 1);
  (void)parallel_frame_count;

  PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;

  int frame_idx = 0;
  int i = 0;
  while (i < num_fpmt_workers_prepared) {
    AV1_COMP *cur_cpi = ppi->parallel_cpi[frame_idx];
    MultiThreadInfo *mt_info = &cur_cpi->mt_info;
    const AV1_COMMON *const cm = &cur_cpi->common;
    const int num_planes = av1_num_planes(cm);

    // Restore the original cdef_worker pointers.
    if (p_mt_info->cdef_worker != NULL) {
      mt_info->cdef_worker->srcbuf = mt_info->restore_state_buf.cdef_srcbuf;
      for (int plane = 0; plane < num_planes; plane++)
        mt_info->cdef_worker->colbuf[plane] =
            mt_info->restore_state_buf.cdef_colbuf[plane];
    }
#if !CONFIG_REALTIME_ONLY
    if (is_restoration_used(cm)) {
      // Restore the original LR buffers.
      int idx = i + mt_info->num_workers - 1;
      assert(idx < mt_info->lr_row_sync.num_workers);
      mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf =
          mt_info->restore_state_buf.rst_tmpbuf;
      mt_info->lr_row_sync.lrworkerdata[idx].rlbs =
          mt_info->restore_state_buf.rlbs;
    }
#endif

    frame_idx++;
    i += mt_info->num_workers;
  }
}

// Synchronize level 1 workers.
static inline void sync_fpmt_workers(AV1_PRIMARY *ppi,
                                     int frames_in_parallel_set) {
  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  int num_workers = ppi->p_mt_info.p_num_workers;
  int had_error = 0;
  // Points to the error in the earliest display order frame in the parallel
  // set.
  const struct aom_internal_error_info *error = NULL;

  // Encoding ends.
  for (int i = num_workers - 1; i >= 0; --i) {
    AVxWorker *const worker = ppi->p_mt_info.p_workers[i];
    if (!winterface->sync(worker)) {
      had_error = 1;
      error = ppi->parallel_cpi[i]->common.error;
    }
  }

  restore_workers_after_fpmt(ppi, frames_in_parallel_set,
                             ppi->p_mt_info.num_workers);

  if (had_error) aom_internal_error_copy(&ppi->error, error);
}

static int get_compressed_data_hook(void *arg1, void *arg2) {
  AV1_COMP *cpi = (AV1_COMP *)arg1;
  AV1_COMP_DATA *cpi_data = (AV1_COMP_DATA *)arg2;
  int status = av1_get_compressed_data(cpi, cpi_data);

  // AOM_CODEC_OK (0) means no error.
  return !status;
}

// This function encodes the raw frame data for each frame in the parallel
// encode set, and outputs the frame bit stream to the designated buffers.
void av1_compress_parallel_frames(AV1_PRIMARY *const ppi,
                                  AV1_COMP_DATA *const first_cpi_data) {
  // Bitmask for the frame buffers referenced by cpi->scaled_ref_buf
  // corresponding to frames in the current parallel encode set.
  int ref_buffers_used_map = 0;
  int frames_in_parallel_set = av1_init_parallel_frame_context(
      first_cpi_data, ppi, &ref_buffers_used_map);
  prepare_fpmt_workers(ppi, first_cpi_data, get_compressed_data_hook,
                       frames_in_parallel_set);
  launch_fpmt_workers(ppi);
  sync_fpmt_workers(ppi, frames_in_parallel_set);

  // Release cpi->scaled_ref_buf corresponding to frames in the current
  // parallel encode set.
  for (int i = 0; i < frames_in_parallel_set; ++i) {
    av1_release_scaled_references_fpmt(ppi->parallel_cpi[i]);
  }
  av1_decrement_ref_counts_fpmt(ppi->cpi->common.buffer_pool,
                                ref_buffers_used_map);
}

static inline void launch_workers(MultiThreadInfo *const mt_info,
                                  int num_workers) {
  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  for (int i = num_workers - 1; i >= 0; i--) {
    AVxWorker *const worker = &mt_info->workers[i];
    worker->had_error = 0;
    if (i == 0)
      winterface->execute(worker);
    else
      winterface->launch(worker);
  }
}

static inline void sync_enc_workers(MultiThreadInfo *const mt_info,
                                    AV1_COMMON *const cm, int num_workers) {
  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  const AVxWorker *const worker_main = &mt_info->workers[0];
  int had_error = worker_main->had_error;
  struct aom_internal_error_info error_info;

  // Read the error_info of the main thread.
  if (had_error) {
    error_info = ((EncWorkerData *)worker_main->data1)->error_info;
  }

  // Encoding ends.
  for (int i = num_workers - 1; i > 0; i--) {
    AVxWorker *const worker = &mt_info->workers[i];
    if (!winterface->sync(worker)) {
      had_error = 1;
      error_info = ((EncWorkerData *)worker->data1)->error_info;
    }
  }

  if (had_error) aom_internal_error_copy(cm->error, &error_info);

  // Restore xd->error_info of the main thread back to cm->error so that the
  // multithreaded code, when executed using a single thread, has a valid
  // xd->error_info.
  MACROBLOCKD *const xd = &((EncWorkerData *)worker_main->data1)->td->mb.e_mbd;
  xd->error_info = cm->error;
}

static inline void accumulate_counters_enc_workers(AV1_COMP *cpi,
                                                   int num_workers) {
  for (int i = num_workers - 1; i >= 0; i--) {
    AVxWorker *const worker = &cpi->mt_info.workers[i];
    EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
    cpi->intrabc_used |= thread_data->td->intrabc_used;
    cpi->deltaq_used |= thread_data->td->deltaq_used;
    // Accumulate rtc counters.
    if (!frame_is_intra_only(&cpi->common))
      av1_accumulate_rtc_counters(cpi, &thread_data->td->mb);
    cpi->palette_pixel_num += thread_data->td->mb.palette_pixels;
    if (thread_data->td != &cpi->td) {
      // Keep these conditional expressions in sync with the corresponding
      // ones in prepare_enc_workers().
      if (cpi->sf.inter_sf.mv_cost_upd_level != INTERNAL_COST_UPD_OFF) {
        aom_free(thread_data->td->mv_costs_alloc);
        thread_data->td->mv_costs_alloc = NULL;
      }
      if (cpi->sf.intra_sf.dv_cost_upd_level != INTERNAL_COST_UPD_OFF) {
        aom_free(thread_data->td->dv_costs_alloc);
        thread_data->td->dv_costs_alloc = NULL;
      }
    }
    av1_dealloc_mb_data(&thread_data->td->mb, av1_num_planes(&cpi->common));

    // Accumulate counters.
    if (i > 0) {
      av1_accumulate_frame_counts(&cpi->counts, thread_data->td->counts);
      accumulate_rd_opt(&cpi->td, thread_data->td);
      cpi->td.mb.txfm_search_info.txb_split_count +=
          thread_data->td->mb.txfm_search_info.txb_split_count;
#if CONFIG_SPEED_STATS
      cpi->td.mb.txfm_search_info.tx_search_count +=
          thread_data->td->mb.txfm_search_info.tx_search_count;
#endif  // CONFIG_SPEED_STATS
    }
  }
}
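
// prepare_enc_workers() below and accumulate_counters_enc_workers() above
// are mirror images: worker 0 reuses cpi->td directly, so only workers with
// their own ThreadData (i > 0) copy state in before encoding and fold their
// counters back into cpi->td afterwards.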
1543 if (i > 0) { 1544 av1_accumulate_frame_counts(&cpi->counts, thread_data->td->counts); 1545 accumulate_rd_opt(&cpi->td, thread_data->td); 1546 cpi->td.mb.txfm_search_info.txb_split_count += 1547 thread_data->td->mb.txfm_search_info.txb_split_count; 1548 #if CONFIG_SPEED_STATS 1549 cpi->td.mb.txfm_search_info.tx_search_count += 1550 thread_data->td->mb.txfm_search_info.tx_search_count; 1551 #endif // CONFIG_SPEED_STATS 1552 } 1553 } 1554 } 1555 1556 static inline void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, 1557 int num_workers) { 1558 MultiThreadInfo *const mt_info = &cpi->mt_info; 1559 AV1_COMMON *const cm = &cpi->common; 1560 for (int i = num_workers - 1; i >= 0; i--) { 1561 AVxWorker *const worker = &mt_info->workers[i]; 1562 EncWorkerData *const thread_data = &mt_info->tile_thr_data[i]; 1563 1564 worker->hook = hook; 1565 worker->data1 = thread_data; 1566 worker->data2 = NULL; 1567 1568 thread_data->thread_id = i; 1569 // Set the starting tile for each thread. 1570 thread_data->start = i; 1571 1572 thread_data->cpi = cpi; 1573 if (i == 0) { 1574 thread_data->td = &cpi->td; 1575 } else { 1576 thread_data->td = thread_data->original_td; 1577 } 1578 1579 thread_data->td->intrabc_used = 0; 1580 thread_data->td->deltaq_used = 0; 1581 thread_data->td->abs_sum_level = 0; 1582 thread_data->td->rd_counts.seg_tmp_pred_cost[0] = 0; 1583 thread_data->td->rd_counts.seg_tmp_pred_cost[1] = 0; 1584 1585 // Before encoding a frame, copy the thread data from cpi. 1586 if (thread_data->td != &cpi->td) { 1587 thread_data->td->mb = cpi->td.mb; 1588 thread_data->td->rd_counts = cpi->td.rd_counts; 1589 thread_data->td->mb.obmc_buffer = thread_data->td->obmc_buffer; 1590 1591 for (int x = 0; x < 2; x++) { 1592 thread_data->td->mb.intrabc_hash_info.hash_value_buffer[x] = 1593 thread_data->td->hash_value_buffer[x]; 1594 } 1595 // Keep these conditional expressions in sync with the corresponding ones 1596 // in accumulate_counters_enc_workers(). 1597 if (cpi->sf.inter_sf.mv_cost_upd_level != INTERNAL_COST_UPD_OFF) { 1598 CHECK_MEM_ERROR( 1599 cm, thread_data->td->mv_costs_alloc, 1600 (MvCosts *)aom_malloc(sizeof(*thread_data->td->mv_costs_alloc))); 1601 thread_data->td->mb.mv_costs = thread_data->td->mv_costs_alloc; 1602 *thread_data->td->mb.mv_costs = *cpi->td.mb.mv_costs; 1603 } 1604 if (cpi->sf.intra_sf.dv_cost_upd_level != INTERNAL_COST_UPD_OFF) { 1605 // Reset dv_costs to NULL for worker threads when dv cost update is 1606 // enabled so that only dv_cost_upd_level needs to be checked before the 1607 // aom_free() call for the same. 1608 thread_data->td->mb.dv_costs = NULL; 1609 if (av1_need_dv_costs(cpi)) { 1610 CHECK_MEM_ERROR(cm, thread_data->td->dv_costs_alloc, 1611 (IntraBCMVCosts *)aom_malloc( 1612 sizeof(*thread_data->td->dv_costs_alloc))); 1613 thread_data->td->mb.dv_costs = thread_data->td->dv_costs_alloc; 1614 *thread_data->td->mb.dv_costs = *cpi->td.mb.dv_costs; 1615 } 1616 } 1617 } 1618 av1_alloc_mb_data(cpi, &thread_data->td->mb); 1619 1620 // Reset rtc counters. 
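// These per-thread counters are merged back into cpi by
// av1_accumulate_rtc_counters() in accumulate_counters_enc_workers().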
1621 av1_init_rtc_counters(&thread_data->td->mb); 1622 1623 thread_data->td->mb.palette_pixels = 0; 1624 1625 if (thread_data->td->counts != &cpi->counts) { 1626 *thread_data->td->counts = cpi->counts; 1627 } 1628 1629 if (i > 0) { 1630 thread_data->td->mb.palette_buffer = thread_data->td->palette_buffer; 1631 thread_data->td->mb.comp_rd_buffer = thread_data->td->comp_rd_buffer; 1632 thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst; 1633 for (int j = 0; j < 2; ++j) { 1634 thread_data->td->mb.tmp_pred_bufs[j] = 1635 thread_data->td->tmp_pred_bufs[j]; 1636 } 1637 thread_data->td->mb.pixel_gradient_info = 1638 thread_data->td->pixel_gradient_info; 1639 1640 thread_data->td->mb.src_var_info_of_4x4_sub_blocks = 1641 thread_data->td->src_var_info_of_4x4_sub_blocks; 1642 1643 thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst; 1644 for (int j = 0; j < 2; ++j) { 1645 thread_data->td->mb.e_mbd.tmp_obmc_bufs[j] = 1646 thread_data->td->mb.tmp_pred_bufs[j]; 1647 } 1648 } 1649 } 1650 } 1651 1652 #if !CONFIG_REALTIME_ONLY 1653 static inline void fp_prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, 1654 int num_workers) { 1655 AV1_COMMON *const cm = &cpi->common; 1656 MultiThreadInfo *const mt_info = &cpi->mt_info; 1657 for (int i = num_workers - 1; i >= 0; i--) { 1658 AVxWorker *const worker = &mt_info->workers[i]; 1659 EncWorkerData *const thread_data = &mt_info->tile_thr_data[i]; 1660 1661 worker->hook = hook; 1662 worker->data1 = thread_data; 1663 worker->data2 = NULL; 1664 1665 thread_data->thread_id = i; 1666 // Set the starting tile for each thread. 1667 thread_data->start = i; 1668 1669 thread_data->cpi = cpi; 1670 if (i == 0) { 1671 thread_data->td = &cpi->td; 1672 } else { 1673 thread_data->td = thread_data->original_td; 1674 // Before encoding a frame, copy the thread data from cpi. 
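// Unlike prepare_enc_workers(), only the MACROBLOCK context is copied here;
// the first pass does not need the rd counts or cost buffers prepared for the
// final encode stage.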
1675 thread_data->td->mb = cpi->td.mb; 1676 } 1677 av1_alloc_src_diff_buf(cm, &thread_data->td->mb); 1678 } 1679 } 1680 #endif 1681 1682 // Computes the number of workers for row multi-threading of encoding stage 1683 static inline int compute_num_enc_row_mt_workers(const AV1_COMMON *cm, 1684 int max_threads) { 1685 TileInfo tile_info; 1686 const int tile_cols = cm->tiles.cols; 1687 const int tile_rows = cm->tiles.rows; 1688 int total_num_threads_row_mt = 0; 1689 for (int row = 0; row < tile_rows; row++) { 1690 for (int col = 0; col < tile_cols; col++) { 1691 av1_tile_init(&tile_info, cm, row, col); 1692 const int num_sb_rows_in_tile = av1_get_sb_rows_in_tile(cm, &tile_info); 1693 const int num_sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, &tile_info); 1694 total_num_threads_row_mt += 1695 AOMMIN((num_sb_cols_in_tile + 1) >> 1, num_sb_rows_in_tile); 1696 } 1697 } 1698 return AOMMIN(max_threads, total_num_threads_row_mt); 1699 } 1700 1701 // Computes the number of workers for tile multi-threading of encoding stage 1702 static inline int compute_num_enc_tile_mt_workers(const AV1_COMMON *cm, 1703 int max_threads) { 1704 const int tile_cols = cm->tiles.cols; 1705 const int tile_rows = cm->tiles.rows; 1706 return AOMMIN(max_threads, tile_cols * tile_rows); 1707 } 1708 1709 // Find max worker of all MT stages 1710 int av1_get_max_num_workers(const AV1_COMP *cpi) { 1711 int max_num_workers = 0; 1712 for (int i = MOD_FP; i < NUM_MT_MODULES; i++) 1713 max_num_workers = 1714 AOMMAX(cpi->ppi->p_mt_info.num_mod_workers[i], max_num_workers); 1715 assert(max_num_workers >= 1); 1716 return AOMMIN(max_num_workers, cpi->oxcf.max_threads); 1717 } 1718 1719 // Computes the number of workers for encoding stage (row/tile multi-threading) 1720 static int compute_num_enc_workers(const AV1_COMP *cpi, int max_workers) { 1721 if (max_workers <= 1) return 1; 1722 if (cpi->oxcf.row_mt) 1723 return compute_num_enc_row_mt_workers(&cpi->common, max_workers); 1724 else 1725 return compute_num_enc_tile_mt_workers(&cpi->common, max_workers); 1726 } 1727 1728 void av1_encode_tiles_mt(AV1_COMP *cpi) { 1729 AV1_COMMON *const cm = &cpi->common; 1730 MultiThreadInfo *const mt_info = &cpi->mt_info; 1731 const int tile_cols = cm->tiles.cols; 1732 const int tile_rows = cm->tiles.rows; 1733 int num_workers = mt_info->num_mod_workers[MOD_ENC]; 1734 1735 assert(IMPLIES(cpi->tile_data == NULL, 1736 cpi->allocated_tiles < tile_cols * tile_rows)); 1737 if (cpi->allocated_tiles < tile_cols * tile_rows) av1_alloc_tile_data(cpi); 1738 1739 av1_init_tile_data(cpi); 1740 num_workers = AOMMIN(num_workers, mt_info->num_workers); 1741 1742 prepare_enc_workers(cpi, enc_worker_hook, num_workers); 1743 launch_workers(&cpi->mt_info, num_workers); 1744 sync_enc_workers(&cpi->mt_info, cm, num_workers); 1745 accumulate_counters_enc_workers(cpi, num_workers); 1746 } 1747 1748 // Accumulate frame counts. FRAME_COUNTS consist solely of 'unsigned int' 1749 // members, so we treat it as an array, and sum over the whole length. 
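// For example, with n_counts = sizeof(FRAME_COUNTS) / sizeof(unsigned int),
// the flat loop below is equivalent to summing every field individually and
// stays correct as fields are added, provided FRAME_COUNTS keeps only
// 'unsigned int' members.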
1750 void av1_accumulate_frame_counts(FRAME_COUNTS *acc_counts, 1751 const FRAME_COUNTS *counts) { 1752 unsigned int *const acc = (unsigned int *)acc_counts; 1753 const unsigned int *const cnt = (const unsigned int *)counts; 1754 1755 const unsigned int n_counts = sizeof(FRAME_COUNTS) / sizeof(unsigned int); 1756 1757 for (unsigned int i = 0; i < n_counts; i++) acc[i] += cnt[i]; 1758 } 1759 1760 // Computes the maximum number of sb rows and sb_cols across tiles which are 1761 // used to allocate memory for multi-threaded encoding with row-mt=1. 1762 static inline void compute_max_sb_rows_cols(const AV1_COMMON *cm, 1763 int *max_sb_rows_in_tile, 1764 int *max_sb_cols_in_tile) { 1765 const int tile_rows = cm->tiles.rows; 1766 const int mib_size_log2 = cm->seq_params->mib_size_log2; 1767 const int num_mi_rows = cm->mi_params.mi_rows; 1768 const int *const row_start_sb = cm->tiles.row_start_sb; 1769 for (int row = 0; row < tile_rows; row++) { 1770 const int mi_row_start = row_start_sb[row] << mib_size_log2; 1771 const int mi_row_end = 1772 AOMMIN(row_start_sb[row + 1] << mib_size_log2, num_mi_rows); 1773 const int num_sb_rows_in_tile = 1774 CEIL_POWER_OF_TWO(mi_row_end - mi_row_start, mib_size_log2); 1775 *max_sb_rows_in_tile = AOMMAX(*max_sb_rows_in_tile, num_sb_rows_in_tile); 1776 } 1777 1778 const int tile_cols = cm->tiles.cols; 1779 const int num_mi_cols = cm->mi_params.mi_cols; 1780 const int *const col_start_sb = cm->tiles.col_start_sb; 1781 for (int col = 0; col < tile_cols; col++) { 1782 const int mi_col_start = col_start_sb[col] << mib_size_log2; 1783 const int mi_col_end = 1784 AOMMIN(col_start_sb[col + 1] << mib_size_log2, num_mi_cols); 1785 const int num_sb_cols_in_tile = 1786 CEIL_POWER_OF_TWO(mi_col_end - mi_col_start, mib_size_log2); 1787 *max_sb_cols_in_tile = AOMMAX(*max_sb_cols_in_tile, num_sb_cols_in_tile); 1788 } 1789 } 1790 1791 #if !CONFIG_REALTIME_ONLY 1792 // Computes the number of workers for firstpass stage (row/tile multi-threading) 1793 int av1_fp_compute_num_enc_workers(AV1_COMP *cpi) { 1794 AV1_COMMON *cm = &cpi->common; 1795 const int tile_cols = cm->tiles.cols; 1796 const int tile_rows = cm->tiles.rows; 1797 int total_num_threads_row_mt = 0; 1798 TileInfo tile_info; 1799 1800 if (cpi->oxcf.max_threads <= 1) return 1; 1801 1802 for (int row = 0; row < tile_rows; row++) { 1803 for (int col = 0; col < tile_cols; col++) { 1804 av1_tile_init(&tile_info, cm, row, col); 1805 const int num_mb_rows_in_tile = 1806 av1_get_unit_rows_in_tile(&tile_info, cpi->fp_block_size); 1807 const int num_mb_cols_in_tile = 1808 av1_get_unit_cols_in_tile(&tile_info, cpi->fp_block_size); 1809 total_num_threads_row_mt += 1810 AOMMIN((num_mb_cols_in_tile + 1) >> 1, num_mb_rows_in_tile); 1811 } 1812 } 1813 return AOMMIN(cpi->oxcf.max_threads, total_num_threads_row_mt); 1814 } 1815 1816 // Computes the maximum number of mb_rows for row multi-threading of firstpass 1817 // stage 1818 static inline int fp_compute_max_mb_rows(const AV1_COMMON *cm, 1819 BLOCK_SIZE fp_block_size) { 1820 const int tile_rows = cm->tiles.rows; 1821 const int unit_height_log2 = mi_size_high_log2[fp_block_size]; 1822 const int mib_size_log2 = cm->seq_params->mib_size_log2; 1823 const int num_mi_rows = cm->mi_params.mi_rows; 1824 const int *const row_start_sb = cm->tiles.row_start_sb; 1825 int max_mb_rows = 0; 1826 1827 for (int row = 0; row < tile_rows; row++) { 1828 const int mi_row_start = row_start_sb[row] << mib_size_log2; 1829 const int mi_row_end = 1830 AOMMIN(row_start_sb[row + 1] << mib_size_log2, num_mi_rows); 
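// Convert the tile's extent in mi units to first-pass block rows, rounding
// up so that a partial block row still counts as a full row.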
1831 const int num_mb_rows_in_tile = 1832 CEIL_POWER_OF_TWO(mi_row_end - mi_row_start, unit_height_log2); 1833 max_mb_rows = AOMMAX(max_mb_rows, num_mb_rows_in_tile); 1834 } 1835 return max_mb_rows; 1836 } 1837 #endif 1838 1839 static void lpf_pipeline_mt_init(AV1_COMP *cpi, int num_workers) { 1840 // Pipelining of loop-filtering after encoding is enabled when loop-filter 1841 // level is chosen based on quantizer and frame type. It is disabled in case 1842 // of 'LOOPFILTER_SELECTIVELY' as the stats collected during encoding stage 1843 // decides the filter level. Loop-filtering is disabled in case 1844 // of non-reference frames and for frames with intra block copy tool enabled. 1845 AV1_COMMON *cm = &cpi->common; 1846 const int use_loopfilter = is_loopfilter_used(cm); 1847 const int use_superres = av1_superres_scaled(cm); 1848 const int use_cdef = is_cdef_used(cm); 1849 const int use_restoration = is_restoration_used(cm); 1850 MultiThreadInfo *const mt_info = &cpi->mt_info; 1851 MACROBLOCKD *xd = &cpi->td.mb.e_mbd; 1852 1853 const unsigned int skip_apply_postproc_filters = 1854 derive_skip_apply_postproc_filters(cpi, use_loopfilter, use_cdef, 1855 use_superres, use_restoration); 1856 mt_info->pipeline_lpf_mt_with_enc = 1857 (cpi->oxcf.mode == REALTIME) && (cpi->oxcf.speed >= 5) && 1858 (cpi->sf.lpf_sf.lpf_pick == LPF_PICK_FROM_Q) && 1859 (cpi->oxcf.algo_cfg.loopfilter_control != LOOPFILTER_SELECTIVELY) && 1860 !cpi->ppi->rtc_ref.non_reference_frame && !cm->features.allow_intrabc && 1861 ((skip_apply_postproc_filters & SKIP_APPLY_LOOPFILTER) == 0); 1862 1863 if (!mt_info->pipeline_lpf_mt_with_enc) return; 1864 1865 set_postproc_filter_default_params(cm); 1866 1867 if (!use_loopfilter) return; 1868 1869 const LPF_PICK_METHOD method = cpi->sf.lpf_sf.lpf_pick; 1870 assert(method == LPF_PICK_FROM_Q); 1871 assert(cpi->oxcf.algo_cfg.loopfilter_control != LOOPFILTER_SELECTIVELY); 1872 1873 av1_pick_filter_level(cpi->source, cpi, method); 1874 1875 struct loopfilter *lf = &cm->lf; 1876 const int plane_start = 0; 1877 const int plane_end = av1_num_planes(cm); 1878 int planes_to_lf[MAX_MB_PLANE]; 1879 if (lpf_mt_with_enc_enabled(cpi->mt_info.pipeline_lpf_mt_with_enc, 1880 lf->filter_level)) { 1881 set_planes_to_loop_filter(lf, planes_to_lf, plane_start, plane_end); 1882 int lpf_opt_level = get_lpf_opt_level(&cpi->sf); 1883 assert(lpf_opt_level == 2); 1884 1885 const int start_mi_row = 0; 1886 const int end_mi_row = start_mi_row + cm->mi_params.mi_rows; 1887 1888 av1_loop_filter_frame_init(cm, plane_start, plane_end); 1889 1890 assert(mt_info->num_mod_workers[MOD_ENC] == 1891 mt_info->num_mod_workers[MOD_LPF]); 1892 loop_filter_frame_mt_init(cm, start_mi_row, end_mi_row, planes_to_lf, 1893 mt_info->num_mod_workers[MOD_LPF], 1894 &mt_info->lf_row_sync, lpf_opt_level, 1895 cm->seq_params->mib_size_log2); 1896 1897 for (int i = num_workers - 1; i >= 0; i--) { 1898 EncWorkerData *const thread_data = &mt_info->tile_thr_data[i]; 1899 // Initialize loopfilter data 1900 thread_data->lf_sync = &mt_info->lf_row_sync; 1901 thread_data->lf_data = &thread_data->lf_sync->lfdata[i]; 1902 loop_filter_data_reset(thread_data->lf_data, &cm->cur_frame->buf, cm, xd); 1903 } 1904 } 1905 } 1906 1907 void av1_encode_tiles_row_mt(AV1_COMP *cpi) { 1908 AV1_COMMON *const cm = &cpi->common; 1909 MultiThreadInfo *const mt_info = &cpi->mt_info; 1910 AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt; 1911 const int tile_cols = cm->tiles.cols; 1912 const int tile_rows = cm->tiles.rows; 1913 const int 
sb_rows_in_frame = get_sb_rows_in_frame(cm); 1914 int *thread_id_to_tile_id = enc_row_mt->thread_id_to_tile_id; 1915 int max_sb_rows_in_tile = 0, max_sb_cols_in_tile = 0; 1916 int num_workers = mt_info->num_mod_workers[MOD_ENC]; 1917 1918 compute_max_sb_rows_cols(cm, &max_sb_rows_in_tile, &max_sb_cols_in_tile); 1919 const bool alloc_row_mt_mem = 1920 (enc_row_mt->allocated_tile_cols != tile_cols || 1921 enc_row_mt->allocated_tile_rows != tile_rows || 1922 enc_row_mt->allocated_rows != max_sb_rows_in_tile || 1923 enc_row_mt->allocated_cols != (max_sb_cols_in_tile - 1) || 1924 enc_row_mt->allocated_sb_rows != sb_rows_in_frame); 1925 const bool alloc_tile_data = cpi->allocated_tiles < tile_cols * tile_rows; 1926 1927 assert(IMPLIES(cpi->tile_data == NULL, alloc_tile_data)); 1928 if (alloc_tile_data) { 1929 av1_alloc_tile_data(cpi); 1930 } 1931 1932 assert(IMPLIES(alloc_tile_data, alloc_row_mt_mem)); 1933 if (alloc_row_mt_mem) { 1934 row_mt_mem_alloc(cpi, max_sb_rows_in_tile, max_sb_cols_in_tile, 1935 cpi->oxcf.algo_cfg.cdf_update_mode); 1936 } 1937 1938 num_workers = AOMMIN(num_workers, mt_info->num_workers); 1939 lpf_pipeline_mt_init(cpi, num_workers); 1940 1941 av1_init_tile_data(cpi); 1942 1943 memset(thread_id_to_tile_id, -1, 1944 sizeof(*thread_id_to_tile_id) * MAX_NUM_THREADS); 1945 memset(enc_row_mt->num_tile_cols_done, 0, 1946 sizeof(*enc_row_mt->num_tile_cols_done) * sb_rows_in_frame); 1947 enc_row_mt->row_mt_exit = false; 1948 1949 for (int tile_row = 0; tile_row < tile_rows; tile_row++) { 1950 for (int tile_col = 0; tile_col < tile_cols; tile_col++) { 1951 int tile_index = tile_row * tile_cols + tile_col; 1952 TileDataEnc *const this_tile = &cpi->tile_data[tile_index]; 1953 AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync; 1954 1955 // Initialize num_finished_cols to -1 for all rows. 
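// (A memset of -1 writes 0xff into every byte, which reads back as -1 in
// each int entry, i.e. no columns finished yet.)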
1956 memset(row_mt_sync->num_finished_cols, -1, 1957 sizeof(*row_mt_sync->num_finished_cols) * max_sb_rows_in_tile); 1958 row_mt_sync->next_mi_row = this_tile->tile_info.mi_row_start; 1959 row_mt_sync->num_threads_working = 0; 1960 row_mt_sync->intrabc_extra_top_right_sb_delay = 1961 av1_get_intrabc_extra_top_right_sb_delay(cm); 1962 1963 av1_inter_mode_data_init(this_tile); 1964 av1_zero_above_context(cm, &cpi->td.mb.e_mbd, 1965 this_tile->tile_info.mi_col_start, 1966 this_tile->tile_info.mi_col_end, tile_row); 1967 } 1968 } 1969 1970 assign_tile_to_thread(thread_id_to_tile_id, tile_cols * tile_rows, 1971 num_workers); 1972 prepare_enc_workers(cpi, enc_row_mt_worker_hook, num_workers); 1973 launch_workers(&cpi->mt_info, num_workers); 1974 sync_enc_workers(&cpi->mt_info, cm, num_workers); 1975 if (cm->delta_q_info.delta_lf_present_flag) update_delta_lf_for_row_mt(cpi); 1976 accumulate_counters_enc_workers(cpi, num_workers); 1977 } 1978 1979 #if !CONFIG_REALTIME_ONLY 1980 static void dealloc_thread_data_src_diff_buf(AV1_COMP *cpi, int num_workers) { 1981 for (int i = num_workers - 1; i >= 0; --i) { 1982 EncWorkerData *const thread_data = &cpi->mt_info.tile_thr_data[i]; 1983 if (thread_data->td != &cpi->td) 1984 av1_dealloc_src_diff_buf(&thread_data->td->mb, 1985 av1_num_planes(&cpi->common)); 1986 } 1987 } 1988 1989 void av1_fp_encode_tiles_row_mt(AV1_COMP *cpi) { 1990 AV1_COMMON *const cm = &cpi->common; 1991 MultiThreadInfo *const mt_info = &cpi->mt_info; 1992 AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt; 1993 const int tile_cols = cm->tiles.cols; 1994 const int tile_rows = cm->tiles.rows; 1995 int *thread_id_to_tile_id = enc_row_mt->thread_id_to_tile_id; 1996 int num_workers = 0; 1997 int max_mb_rows = 0; 1998 1999 max_mb_rows = fp_compute_max_mb_rows(cm, cpi->fp_block_size); 2000 const bool alloc_row_mt_mem = enc_row_mt->allocated_tile_cols != tile_cols || 2001 enc_row_mt->allocated_tile_rows != tile_rows || 2002 enc_row_mt->allocated_rows != max_mb_rows; 2003 const bool alloc_tile_data = cpi->allocated_tiles < tile_cols * tile_rows; 2004 2005 assert(IMPLIES(cpi->tile_data == NULL, alloc_tile_data)); 2006 if (alloc_tile_data) { 2007 av1_alloc_tile_data(cpi); 2008 } 2009 2010 assert(IMPLIES(alloc_tile_data, alloc_row_mt_mem)); 2011 if (alloc_row_mt_mem) { 2012 row_mt_mem_alloc(cpi, max_mb_rows, -1, 0); 2013 } 2014 2015 av1_init_tile_data(cpi); 2016 2017 // For pass = 1, compute the no. of workers needed. For single-pass encode 2018 // (pass = 0), no. of workers are already computed. 2019 if (mt_info->num_mod_workers[MOD_FP] == 0) 2020 num_workers = av1_fp_compute_num_enc_workers(cpi); 2021 else 2022 num_workers = mt_info->num_mod_workers[MOD_FP]; 2023 2024 memset(thread_id_to_tile_id, -1, 2025 sizeof(*thread_id_to_tile_id) * MAX_NUM_THREADS); 2026 enc_row_mt->firstpass_mt_exit = false; 2027 2028 for (int tile_row = 0; tile_row < tile_rows; tile_row++) { 2029 for (int tile_col = 0; tile_col < tile_cols; tile_col++) { 2030 int tile_index = tile_row * tile_cols + tile_col; 2031 TileDataEnc *const this_tile = &cpi->tile_data[tile_index]; 2032 AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync; 2033 2034 // Initialize num_finished_cols to -1 for all rows. 2035 memset(row_mt_sync->num_finished_cols, -1, 2036 sizeof(*row_mt_sync->num_finished_cols) * max_mb_rows); 2037 row_mt_sync->next_mi_row = this_tile->tile_info.mi_row_start; 2038 row_mt_sync->num_threads_working = 0; 2039 2040 // intraBC mode is not evaluated during first-pass encoding. 
Hence, no 2041 // additional top-right delay is required. 2042 row_mt_sync->intrabc_extra_top_right_sb_delay = 0; 2043 } 2044 } 2045 2046 num_workers = AOMMIN(num_workers, mt_info->num_workers); 2047 assign_tile_to_thread(thread_id_to_tile_id, tile_cols * tile_rows, 2048 num_workers); 2049 fp_prepare_enc_workers(cpi, fp_enc_row_mt_worker_hook, num_workers); 2050 launch_workers(&cpi->mt_info, num_workers); 2051 sync_enc_workers(&cpi->mt_info, cm, num_workers); 2052 dealloc_thread_data_src_diff_buf(cpi, num_workers); 2053 } 2054 2055 void av1_tpl_row_mt_sync_read_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync, 2056 int r, int c) { 2057 (void)tpl_mt_sync; 2058 (void)r; 2059 (void)c; 2060 } 2061 2062 void av1_tpl_row_mt_sync_write_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync, 2063 int r, int c, int cols) { 2064 (void)tpl_mt_sync; 2065 (void)r; 2066 (void)c; 2067 (void)cols; 2068 } 2069 2070 void av1_tpl_row_mt_sync_read(AV1TplRowMultiThreadSync *tpl_row_mt_sync, int r, 2071 int c) { 2072 #if CONFIG_MULTITHREAD 2073 int nsync = tpl_row_mt_sync->sync_range; 2074 2075 if (r) { 2076 pthread_mutex_t *const mutex = &tpl_row_mt_sync->mutex_[r - 1]; 2077 pthread_mutex_lock(mutex); 2078 2079 while (c > tpl_row_mt_sync->num_finished_cols[r - 1] - nsync) 2080 pthread_cond_wait(&tpl_row_mt_sync->cond_[r - 1], mutex); 2081 pthread_mutex_unlock(mutex); 2082 } 2083 #else 2084 (void)tpl_row_mt_sync; 2085 (void)r; 2086 (void)c; 2087 #endif // CONFIG_MULTITHREAD 2088 } 2089 2090 void av1_tpl_row_mt_sync_write(AV1TplRowMultiThreadSync *tpl_row_mt_sync, int r, 2091 int c, int cols) { 2092 #if CONFIG_MULTITHREAD 2093 int nsync = tpl_row_mt_sync->sync_range; 2094 int cur; 2095 // Only signal when there are enough encoded blocks for next row to run. 2096 int sig = 1; 2097 2098 if (c < cols - 1) { 2099 cur = c; 2100 if (c % nsync) sig = 0; 2101 } else { 2102 cur = cols + nsync; 2103 } 2104 2105 if (sig) { 2106 pthread_mutex_lock(&tpl_row_mt_sync->mutex_[r]); 2107 2108 // When a thread encounters an error, num_finished_cols[r] is set to maximum 2109 // column number. In this case, the AOMMAX operation here ensures that 2110 // num_finished_cols[r] is not overwritten with a smaller value thus 2111 // preventing the infinite waiting of threads in the relevant sync_read() 2112 // function. 2113 tpl_row_mt_sync->num_finished_cols[r] = 2114 AOMMAX(tpl_row_mt_sync->num_finished_cols[r], cur); 2115 2116 pthread_cond_signal(&tpl_row_mt_sync->cond_[r]); 2117 pthread_mutex_unlock(&tpl_row_mt_sync->mutex_[r]); 2118 } 2119 #else 2120 (void)tpl_row_mt_sync; 2121 (void)r; 2122 (void)c; 2123 (void)cols; 2124 #endif // CONFIG_MULTITHREAD 2125 } 2126 2127 static inline void set_mode_estimation_done(AV1_COMP *cpi) { 2128 const CommonModeInfoParams *const mi_params = &cpi->common.mi_params; 2129 TplParams *const tpl_data = &cpi->ppi->tpl_data; 2130 const BLOCK_SIZE bsize = 2131 convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d); 2132 const int mi_height = mi_size_high[bsize]; 2133 AV1TplRowMultiThreadInfo *const tpl_row_mt = &cpi->mt_info.tpl_row_mt; 2134 const int tplb_cols_in_tile = 2135 ROUND_POWER_OF_TWO(mi_params->mi_cols, mi_size_wide_log2[bsize]); 2136 // In case of tpl row-multithreading, due to top-right dependency, the worker 2137 // on an mb_row waits for the completion of the tpl processing of the top and 2138 // top-right blocks. 
Hence, in case a thread (main/worker) encounters an 2139 // error, update that the tpl processing of every mb_row in the frame is 2140 // complete in order to avoid dependent workers waiting indefinitely. 2141 for (int mi_row = 0, tplb_row = 0; mi_row < mi_params->mi_rows; 2142 mi_row += mi_height, tplb_row++) { 2143 (*tpl_row_mt->sync_write_ptr)(&tpl_data->tpl_mt_sync, tplb_row, 2144 tplb_cols_in_tile - 1, tplb_cols_in_tile); 2145 } 2146 } 2147 2148 // Each worker calls tpl_worker_hook() and computes the tpl data. 2149 static int tpl_worker_hook(void *arg1, void *unused) { 2150 (void)unused; 2151 EncWorkerData *thread_data = (EncWorkerData *)arg1; 2152 AV1_COMP *cpi = thread_data->cpi; 2153 AV1_COMMON *cm = &cpi->common; 2154 MACROBLOCK *x = &thread_data->td->mb; 2155 MACROBLOCKD *xd = &x->e_mbd; 2156 TplTxfmStats *tpl_txfm_stats = &thread_data->td->tpl_txfm_stats; 2157 TplBuffers *tpl_tmp_buffers = &thread_data->td->tpl_tmp_buffers; 2158 CommonModeInfoParams *mi_params = &cm->mi_params; 2159 int num_active_workers = cpi->ppi->tpl_data.tpl_mt_sync.num_threads_working; 2160 2161 struct aom_internal_error_info *const error_info = &thread_data->error_info; 2162 xd->error_info = error_info; 2163 AV1TplRowMultiThreadInfo *const tpl_row_mt = &cpi->mt_info.tpl_row_mt; 2164 (void)tpl_row_mt; 2165 #if CONFIG_MULTITHREAD 2166 pthread_mutex_t *tpl_error_mutex_ = tpl_row_mt->mutex_; 2167 #endif 2168 2169 // The jmp_buf is valid only for the duration of the function that calls 2170 // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 2171 // before it returns. 2172 if (setjmp(error_info->jmp)) { 2173 error_info->setjmp = 0; 2174 #if CONFIG_MULTITHREAD 2175 pthread_mutex_lock(tpl_error_mutex_); 2176 tpl_row_mt->tpl_mt_exit = true; 2177 pthread_mutex_unlock(tpl_error_mutex_); 2178 #endif 2179 set_mode_estimation_done(cpi); 2180 return 0; 2181 } 2182 error_info->setjmp = 1; 2183 2184 BLOCK_SIZE bsize = convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d); 2185 TX_SIZE tx_size = max_txsize_lookup[bsize]; 2186 int mi_height = mi_size_high[bsize]; 2187 2188 av1_init_tpl_txfm_stats(tpl_txfm_stats); 2189 2190 for (int mi_row = thread_data->start * mi_height; mi_row < mi_params->mi_rows; 2191 mi_row += num_active_workers * mi_height) { 2192 // Motion estimation row boundary 2193 av1_set_mv_row_limits(mi_params, &x->mv_limits, mi_row, mi_height, 2194 cpi->oxcf.border_in_pixels); 2195 xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE); 2196 xd->mb_to_bottom_edge = 2197 GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE); 2198 av1_mc_flow_dispenser_row(cpi, tpl_txfm_stats, tpl_tmp_buffers, x, mi_row, 2199 bsize, tx_size); 2200 } 2201 error_info->setjmp = 0; 2202 return 1; 2203 } 2204 2205 // Deallocate tpl synchronization related mutex and data. 2206 void av1_tpl_dealloc(AV1TplRowMultiThreadSync *tpl_sync) { 2207 assert(tpl_sync != NULL); 2208 2209 #if CONFIG_MULTITHREAD 2210 if (tpl_sync->mutex_ != NULL) { 2211 for (int i = 0; i < tpl_sync->rows; ++i) 2212 pthread_mutex_destroy(&tpl_sync->mutex_[i]); 2213 aom_free(tpl_sync->mutex_); 2214 } 2215 if (tpl_sync->cond_ != NULL) { 2216 for (int i = 0; i < tpl_sync->rows; ++i) 2217 pthread_cond_destroy(&tpl_sync->cond_[i]); 2218 aom_free(tpl_sync->cond_); 2219 } 2220 #endif // CONFIG_MULTITHREAD 2221 2222 aom_free(tpl_sync->num_finished_cols); 2223 // clear the structure as the source of this call may be a resize in which 2224 // case this call will be followed by an _alloc() which may fail. 
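// Zeroing also nulls the mutex_/cond_ pointers, so a later dealloc after a
// failed alloc cannot touch stale pointers.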
2225 av1_zero(*tpl_sync); 2226 } 2227 2228 // Allocate memory for tpl row synchronization. 2229 static void av1_tpl_alloc(AV1TplRowMultiThreadSync *tpl_sync, AV1_COMMON *cm, 2230 int mb_rows) { 2231 tpl_sync->rows = mb_rows; 2232 #if CONFIG_MULTITHREAD 2233 { 2234 CHECK_MEM_ERROR(cm, tpl_sync->mutex_, 2235 aom_malloc(sizeof(*tpl_sync->mutex_) * mb_rows)); 2236 if (tpl_sync->mutex_) { 2237 for (int i = 0; i < mb_rows; ++i) 2238 pthread_mutex_init(&tpl_sync->mutex_[i], NULL); 2239 } 2240 2241 CHECK_MEM_ERROR(cm, tpl_sync->cond_, 2242 aom_malloc(sizeof(*tpl_sync->cond_) * mb_rows)); 2243 if (tpl_sync->cond_) { 2244 for (int i = 0; i < mb_rows; ++i) 2245 pthread_cond_init(&tpl_sync->cond_[i], NULL); 2246 } 2247 } 2248 #endif // CONFIG_MULTITHREAD 2249 CHECK_MEM_ERROR(cm, tpl_sync->num_finished_cols, 2250 aom_malloc(sizeof(*tpl_sync->num_finished_cols) * mb_rows)); 2251 2252 // Set up nsync. 2253 tpl_sync->sync_range = 1; 2254 } 2255 2256 // Each worker is prepared by assigning the hook function and individual thread 2257 // data. 2258 static inline void prepare_tpl_workers(AV1_COMP *cpi, AVxWorkerHook hook, 2259 int num_workers) { 2260 MultiThreadInfo *mt_info = &cpi->mt_info; 2261 for (int i = num_workers - 1; i >= 0; i--) { 2262 AVxWorker *worker = &mt_info->workers[i]; 2263 EncWorkerData *thread_data = &mt_info->tile_thr_data[i]; 2264 2265 worker->hook = hook; 2266 worker->data1 = thread_data; 2267 worker->data2 = NULL; 2268 2269 thread_data->thread_id = i; 2270 // Set the starting tile for each thread. 2271 thread_data->start = i; 2272 2273 thread_data->cpi = cpi; 2274 if (i == 0) { 2275 thread_data->td = &cpi->td; 2276 } else { 2277 thread_data->td = thread_data->original_td; 2278 } 2279 2280 // Before encoding a frame, copy the thread data from cpi. 2281 if (thread_data->td != &cpi->td) { 2282 thread_data->td->mb = cpi->td.mb; 2283 // OBMC buffers are used only to init MS params and remain unused when 2284 // called from tpl, hence set the buffers to defaults. 2285 av1_init_obmc_buffer(&thread_data->td->mb.obmc_buffer); 2286 if (!tpl_alloc_temp_buffers(&thread_data->td->tpl_tmp_buffers, 2287 cpi->ppi->tpl_data.tpl_bsize_1d)) { 2288 aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR, 2289 "Error allocating tpl data"); 2290 } 2291 thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst; 2292 thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst; 2293 } 2294 } 2295 } 2296 2297 #if CONFIG_BITRATE_ACCURACY 2298 // Accumulate transform stats after tpl. 2299 static void tpl_accumulate_txfm_stats(ThreadData *main_td, 2300 const MultiThreadInfo *mt_info, 2301 int num_workers) { 2302 TplTxfmStats *accumulated_stats = &main_td->tpl_txfm_stats; 2303 for (int i = num_workers - 1; i >= 0; i--) { 2304 AVxWorker *const worker = &mt_info->workers[i]; 2305 EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; 2306 ThreadData *td = thread_data->td; 2307 if (td != main_td) { 2308 const TplTxfmStats *tpl_txfm_stats = &td->tpl_txfm_stats; 2309 av1_accumulate_tpl_txfm_stats(tpl_txfm_stats, accumulated_stats); 2310 } 2311 } 2312 } 2313 #endif // CONFIG_BITRATE_ACCURACY 2314 2315 // Implements multi-threading for tpl. 
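// The flow mirrors the other MT stages: prepare_tpl_workers() assigns the
// hook and per-thread data, launch_workers() starts the workers, and
// sync_enc_workers() joins them before the per-thread tpl buffers are freed.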
2316 void av1_mc_flow_dispenser_mt(AV1_COMP *cpi) { 2317 AV1_COMMON *cm = &cpi->common; 2318 CommonModeInfoParams *mi_params = &cm->mi_params; 2319 MultiThreadInfo *mt_info = &cpi->mt_info; 2320 TplParams *tpl_data = &cpi->ppi->tpl_data; 2321 AV1TplRowMultiThreadSync *tpl_sync = &tpl_data->tpl_mt_sync; 2322 int mb_rows = mi_params->mb_rows; 2323 int num_workers = 2324 AOMMIN(mt_info->num_mod_workers[MOD_TPL], mt_info->num_workers); 2325 2326 if (mb_rows != tpl_sync->rows) { 2327 av1_tpl_dealloc(tpl_sync); 2328 av1_tpl_alloc(tpl_sync, cm, mb_rows); 2329 } 2330 tpl_sync->num_threads_working = num_workers; 2331 mt_info->tpl_row_mt.tpl_mt_exit = false; 2332 2333 // Initialize cur_mb_col to -1 for all MB rows. 2334 memset(tpl_sync->num_finished_cols, -1, 2335 sizeof(*tpl_sync->num_finished_cols) * mb_rows); 2336 2337 prepare_tpl_workers(cpi, tpl_worker_hook, num_workers); 2338 launch_workers(&cpi->mt_info, num_workers); 2339 sync_enc_workers(&cpi->mt_info, cm, num_workers); 2340 #if CONFIG_BITRATE_ACCURACY 2341 tpl_accumulate_txfm_stats(&cpi->td, &cpi->mt_info, num_workers); 2342 #endif // CONFIG_BITRATE_ACCURACY 2343 for (int i = num_workers - 1; i >= 0; i--) { 2344 EncWorkerData *thread_data = &mt_info->tile_thr_data[i]; 2345 ThreadData *td = thread_data->td; 2346 if (td != &cpi->td) tpl_dealloc_temp_buffers(&td->tpl_tmp_buffers); 2347 } 2348 } 2349 2350 // Deallocate memory for temporal filter multi-thread synchronization. 2351 void av1_tf_mt_dealloc(AV1TemporalFilterSync *tf_sync) { 2352 assert(tf_sync != NULL); 2353 #if CONFIG_MULTITHREAD 2354 if (tf_sync->mutex_ != NULL) { 2355 pthread_mutex_destroy(tf_sync->mutex_); 2356 aom_free(tf_sync->mutex_); 2357 } 2358 #endif // CONFIG_MULTITHREAD 2359 tf_sync->next_tf_row = 0; 2360 } 2361 2362 // Checks if a job is available. If job is available, 2363 // populates next_tf_row and returns 1, else returns 0. 2364 static inline int tf_get_next_job(AV1TemporalFilterSync *tf_mt_sync, 2365 int *current_mb_row, int mb_rows) { 2366 int do_next_row = 0; 2367 #if CONFIG_MULTITHREAD 2368 pthread_mutex_t *tf_mutex_ = tf_mt_sync->mutex_; 2369 pthread_mutex_lock(tf_mutex_); 2370 #endif 2371 if (!tf_mt_sync->tf_mt_exit && tf_mt_sync->next_tf_row < mb_rows) { 2372 *current_mb_row = tf_mt_sync->next_tf_row; 2373 tf_mt_sync->next_tf_row++; 2374 do_next_row = 1; 2375 } 2376 #if CONFIG_MULTITHREAD 2377 pthread_mutex_unlock(tf_mutex_); 2378 #endif 2379 return do_next_row; 2380 } 2381 2382 // Hook function for each thread in temporal filter multi-threading. 2383 static int tf_worker_hook(void *arg1, void *unused) { 2384 (void)unused; 2385 EncWorkerData *thread_data = (EncWorkerData *)arg1; 2386 AV1_COMP *cpi = thread_data->cpi; 2387 ThreadData *td = thread_data->td; 2388 TemporalFilterCtx *tf_ctx = &cpi->tf_ctx; 2389 AV1TemporalFilterSync *tf_sync = &cpi->mt_info.tf_sync; 2390 const struct scale_factors *scale = &cpi->tf_ctx.sf; 2391 2392 #if CONFIG_MULTITHREAD 2393 pthread_mutex_t *tf_mutex_ = tf_sync->mutex_; 2394 #endif 2395 MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd; 2396 struct aom_internal_error_info *const error_info = &thread_data->error_info; 2397 xd->error_info = error_info; 2398 2399 // The jmp_buf is valid only for the duration of the function that calls 2400 // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 2401 // before it returns. 
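// On error, tf_mt_exit is set under the mutex so that the remaining workers
// stop taking new rows from tf_get_next_job().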
2402 if (setjmp(error_info->jmp)) {
2403 error_info->setjmp = 0;
2404 #if CONFIG_MULTITHREAD
2405 pthread_mutex_lock(tf_mutex_);
2406 tf_sync->tf_mt_exit = true;
2407 pthread_mutex_unlock(tf_mutex_);
2408 #endif
2409 return 0;
2410 }
2411 error_info->setjmp = 1;
2412
2413 const int num_planes = av1_num_planes(&cpi->common);
2414 assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
2415
2416 MACROBLOCKD *mbd = &td->mb.e_mbd;
2417 uint8_t *input_buffer[MAX_MB_PLANE];
2418 MB_MODE_INFO **input_mb_mode_info;
2419 tf_save_state(mbd, &input_mb_mode_info, input_buffer, num_planes);
2420 tf_setup_macroblockd(mbd, &td->tf_data, scale);
2421
2422 int current_mb_row = -1;
2423
2424 while (tf_get_next_job(tf_sync, &current_mb_row, tf_ctx->mb_rows))
2425 av1_tf_do_filtering_row(cpi, td, current_mb_row);
2426
2427 tf_restore_state(mbd, input_mb_mode_info, input_buffer, num_planes);
2428
2429 error_info->setjmp = 0;
2430 return 1;
2431 }
2432
2433 // Assigns temporal filter hook function and thread data to each worker.
2434 static void prepare_tf_workers(AV1_COMP *cpi, AVxWorkerHook hook,
2435 int num_workers, int is_highbitdepth) {
2436 MultiThreadInfo *mt_info = &cpi->mt_info;
2437 mt_info->tf_sync.next_tf_row = 0;
2438 mt_info->tf_sync.tf_mt_exit = false;
2439 for (int i = num_workers - 1; i >= 0; i--) {
2440 AVxWorker *worker = &mt_info->workers[i];
2441 EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
2442
2443 worker->hook = hook;
2444 worker->data1 = thread_data;
2445 worker->data2 = NULL;
2446
2447 thread_data->thread_id = i;
2448 // Set the starting tile for each thread.
2449 thread_data->start = i;
2450
2451 thread_data->cpi = cpi;
2452 if (i == 0) {
2453 thread_data->td = &cpi->td;
2454 } else {
2455 thread_data->td = thread_data->original_td;
2456 }
2457
2458 // Before encoding a frame, copy the thread data from cpi.
2459 if (thread_data->td != &cpi->td) {
2460 thread_data->td->mb = cpi->td.mb;
2461 // OBMC buffers are used only to init MS params and remain unused when
2462 // called from tf, hence set the buffers to defaults.
2463 av1_init_obmc_buffer(&thread_data->td->mb.obmc_buffer);
2464 if (!tf_alloc_and_reset_data(&thread_data->td->tf_data,
2465 cpi->tf_ctx.num_pels, is_highbitdepth)) {
2466 aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
2467 "Error allocating temporal filter data");
2468 }
2469 }
2470 }
2471 }
2472
2473 // Deallocate thread-specific data for temporal filter.
2474 static void tf_dealloc_thread_data(AV1_COMP *cpi, int num_workers,
2475 int is_highbitdepth) {
2476 MultiThreadInfo *mt_info = &cpi->mt_info;
2477 for (int i = num_workers - 1; i >= 0; i--) {
2478 EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
2479 ThreadData *td = thread_data->td;
2480 if (td != &cpi->td) tf_dealloc_data(&td->tf_data, is_highbitdepth);
2481 }
2482 }
2483
2484 // Accumulate sse and sum after temporal filtering.
2485 static void tf_accumulate_frame_diff(AV1_COMP *cpi, int num_workers) {
2486 FRAME_DIFF *total_diff = &cpi->td.tf_data.diff;
2487 for (int i = num_workers - 1; i >= 0; i--) {
2488 AVxWorker *const worker = &cpi->mt_info.workers[i];
2489 EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
2490 ThreadData *td = thread_data->td;
2491 FRAME_DIFF *diff = &td->tf_data.diff;
2492 if (td != &cpi->td) {
2493 total_diff->sse += diff->sse;
2494 total_diff->sum += diff->sum;
2495 }
2496 }
2497 }
2498
2499 // Implements multi-threading for temporal filter.
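// Rows are handed out dynamically through tf_get_next_job() rather than
// statically striped across threads, so faster workers simply pick up more
// rows.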
2500 void av1_tf_do_filtering_mt(AV1_COMP *cpi) { 2501 AV1_COMMON *cm = &cpi->common; 2502 MultiThreadInfo *mt_info = &cpi->mt_info; 2503 const int is_highbitdepth = cpi->tf_ctx.is_highbitdepth; 2504 2505 int num_workers = 2506 AOMMIN(mt_info->num_mod_workers[MOD_TF], mt_info->num_workers); 2507 2508 prepare_tf_workers(cpi, tf_worker_hook, num_workers, is_highbitdepth); 2509 launch_workers(mt_info, num_workers); 2510 sync_enc_workers(mt_info, cm, num_workers); 2511 tf_accumulate_frame_diff(cpi, num_workers); 2512 tf_dealloc_thread_data(cpi, num_workers, is_highbitdepth); 2513 } 2514 2515 // Checks if a job is available in the current direction. If a job is available, 2516 // frame_idx will be populated and returns 1, else returns 0. 2517 static inline int get_next_gm_job(AV1_COMP *cpi, int *frame_idx, int cur_dir) { 2518 GlobalMotionInfo *gm_info = &cpi->gm_info; 2519 GlobalMotionJobInfo *job_info = &cpi->mt_info.gm_sync.job_info; 2520 2521 int total_refs = gm_info->num_ref_frames[cur_dir]; 2522 int8_t cur_frame_to_process = job_info->next_frame_to_process[cur_dir]; 2523 2524 if (cur_frame_to_process < total_refs && !job_info->early_exit[cur_dir]) { 2525 *frame_idx = gm_info->reference_frames[cur_dir][cur_frame_to_process].frame; 2526 job_info->next_frame_to_process[cur_dir] += 1; 2527 return 1; 2528 } 2529 return 0; 2530 } 2531 2532 // Switches the current direction and calls the function get_next_gm_job() if 2533 // the speed feature 'prune_ref_frame_for_gm_search' is not set. 2534 static inline void switch_direction(AV1_COMP *cpi, int *frame_idx, 2535 int *cur_dir) { 2536 if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search) return; 2537 // Switch the direction and get next job 2538 *cur_dir = !(*cur_dir); 2539 get_next_gm_job(cpi, frame_idx, *(cur_dir)); 2540 } 2541 2542 // Hook function for each thread in global motion multi-threading. 2543 static int gm_mt_worker_hook(void *arg1, void *unused) { 2544 (void)unused; 2545 2546 EncWorkerData *thread_data = (EncWorkerData *)arg1; 2547 AV1_COMP *cpi = thread_data->cpi; 2548 GlobalMotionInfo *gm_info = &cpi->gm_info; 2549 AV1GlobalMotionSync *gm_sync = &cpi->mt_info.gm_sync; 2550 GlobalMotionJobInfo *job_info = &gm_sync->job_info; 2551 int thread_id = thread_data->thread_id; 2552 GlobalMotionData *gm_thread_data = &thread_data->td->gm_data; 2553 #if CONFIG_MULTITHREAD 2554 pthread_mutex_t *gm_mt_mutex_ = gm_sync->mutex_; 2555 #endif 2556 2557 MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd; 2558 struct aom_internal_error_info *const error_info = &thread_data->error_info; 2559 xd->error_info = error_info; 2560 2561 // The jmp_buf is valid only for the duration of the function that calls 2562 // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 2563 // before it returns. 2564 if (setjmp(error_info->jmp)) { 2565 error_info->setjmp = 0; 2566 #if CONFIG_MULTITHREAD 2567 pthread_mutex_lock(gm_mt_mutex_); 2568 gm_sync->gm_mt_exit = true; 2569 pthread_mutex_unlock(gm_mt_mutex_); 2570 #endif 2571 return 0; 2572 } 2573 error_info->setjmp = 1; 2574 2575 int cur_dir = job_info->thread_id_to_dir[thread_id]; 2576 bool gm_mt_exit = false; 2577 while (1) { 2578 int ref_buf_idx = -1; 2579 2580 #if CONFIG_MULTITHREAD 2581 pthread_mutex_lock(gm_mt_mutex_); 2582 #endif 2583 2584 gm_mt_exit = gm_sync->gm_mt_exit; 2585 // Populates ref_buf_idx(the reference frame type) for which global motion 2586 // estimation will be done. 
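// Jobs are drained per direction (past/future); once a direction runs out,
// switch_direction() may hand the worker a job from the other direction.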
2587 if (!gm_mt_exit && !get_next_gm_job(cpi, &ref_buf_idx, cur_dir)) { 2588 // No jobs are available for the current direction. Switch 2589 // to other direction and get the next job, if available. 2590 switch_direction(cpi, &ref_buf_idx, &cur_dir); 2591 } 2592 2593 #if CONFIG_MULTITHREAD 2594 pthread_mutex_unlock(gm_mt_mutex_); 2595 #endif 2596 2597 // When gm_mt_exit is set to true, other workers need not pursue any 2598 // further jobs. 2599 if (gm_mt_exit || ref_buf_idx == -1) break; 2600 2601 // Compute global motion for the given ref_buf_idx. 2602 av1_compute_gm_for_valid_ref_frames( 2603 cpi, error_info, gm_info->ref_buf, ref_buf_idx, 2604 gm_thread_data->motion_models, gm_thread_data->segment_map, 2605 gm_info->segment_map_w, gm_info->segment_map_h); 2606 2607 #if CONFIG_MULTITHREAD 2608 pthread_mutex_lock(gm_mt_mutex_); 2609 #endif 2610 // If global motion w.r.t. current ref frame is 2611 // INVALID/TRANSLATION/IDENTITY, skip the evaluation of global motion w.r.t 2612 // the remaining ref frames in that direction. 2613 if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search && 2614 cpi->common.global_motion[ref_buf_idx].wmtype <= TRANSLATION) 2615 job_info->early_exit[cur_dir] = 1; 2616 2617 #if CONFIG_MULTITHREAD 2618 pthread_mutex_unlock(gm_mt_mutex_); 2619 #endif 2620 } 2621 error_info->setjmp = 0; 2622 return 1; 2623 } 2624 2625 // Assigns global motion hook function and thread data to each worker. 2626 static inline void prepare_gm_workers(AV1_COMP *cpi, AVxWorkerHook hook, 2627 int num_workers) { 2628 MultiThreadInfo *mt_info = &cpi->mt_info; 2629 mt_info->gm_sync.gm_mt_exit = false; 2630 for (int i = num_workers - 1; i >= 0; i--) { 2631 AVxWorker *worker = &mt_info->workers[i]; 2632 EncWorkerData *thread_data = &mt_info->tile_thr_data[i]; 2633 2634 worker->hook = hook; 2635 worker->data1 = thread_data; 2636 worker->data2 = NULL; 2637 2638 thread_data->thread_id = i; 2639 // Set the starting tile for each thread. 2640 thread_data->start = i; 2641 2642 thread_data->cpi = cpi; 2643 if (i == 0) { 2644 thread_data->td = &cpi->td; 2645 } else { 2646 thread_data->td = thread_data->original_td; 2647 } 2648 2649 if (thread_data->td != &cpi->td) 2650 gm_alloc_data(cpi, &thread_data->td->gm_data); 2651 } 2652 } 2653 2654 // Assigns available threads to past/future direction. 2655 static inline void assign_thread_to_dir(int8_t *thread_id_to_dir, 2656 int num_workers) { 2657 int8_t frame_dir_idx = 0; 2658 2659 for (int i = 0; i < num_workers; i++) { 2660 thread_id_to_dir[i] = frame_dir_idx++; 2661 if (frame_dir_idx == MAX_DIRECTIONS) frame_dir_idx = 0; 2662 } 2663 } 2664 2665 // Computes number of workers for global motion multi-threading. 2666 static inline int compute_gm_workers(const AV1_COMP *cpi) { 2667 int total_refs = 2668 cpi->gm_info.num_ref_frames[0] + cpi->gm_info.num_ref_frames[1]; 2669 int num_gm_workers = cpi->sf.gm_sf.prune_ref_frame_for_gm_search 2670 ? AOMMIN(MAX_DIRECTIONS, total_refs) 2671 : total_refs; 2672 num_gm_workers = AOMMIN(num_gm_workers, cpi->mt_info.num_workers); 2673 return (num_gm_workers); 2674 } 2675 2676 // Frees the memory allocated for each worker in global motion multi-threading. 
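// Only worker-local ThreadData is released here; the main thread's buffers
// in cpi->td are managed separately.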
2677 static inline void gm_dealloc_thread_data(AV1_COMP *cpi, int num_workers) { 2678 MultiThreadInfo *mt_info = &cpi->mt_info; 2679 for (int j = 0; j < num_workers; j++) { 2680 EncWorkerData *thread_data = &mt_info->tile_thr_data[j]; 2681 ThreadData *td = thread_data->td; 2682 if (td != &cpi->td) gm_dealloc_data(&td->gm_data); 2683 } 2684 } 2685 2686 // Implements multi-threading for global motion. 2687 void av1_global_motion_estimation_mt(AV1_COMP *cpi) { 2688 GlobalMotionJobInfo *job_info = &cpi->mt_info.gm_sync.job_info; 2689 2690 av1_zero(*job_info); 2691 2692 int num_workers = compute_gm_workers(cpi); 2693 2694 assign_thread_to_dir(job_info->thread_id_to_dir, num_workers); 2695 prepare_gm_workers(cpi, gm_mt_worker_hook, num_workers); 2696 launch_workers(&cpi->mt_info, num_workers); 2697 sync_enc_workers(&cpi->mt_info, &cpi->common, num_workers); 2698 gm_dealloc_thread_data(cpi, num_workers); 2699 } 2700 #endif // !CONFIG_REALTIME_ONLY 2701 2702 static inline int get_next_job_allintra( 2703 AV1EncRowMultiThreadSync *const row_mt_sync, const int mi_row_end, 2704 int *current_mi_row, int mib_size) { 2705 if (row_mt_sync->next_mi_row < mi_row_end) { 2706 *current_mi_row = row_mt_sync->next_mi_row; 2707 row_mt_sync->num_threads_working++; 2708 row_mt_sync->next_mi_row += mib_size; 2709 return 1; 2710 } 2711 return 0; 2712 } 2713 2714 static inline void prepare_wiener_var_workers(AV1_COMP *const cpi, 2715 AVxWorkerHook hook, 2716 const int num_workers) { 2717 MultiThreadInfo *const mt_info = &cpi->mt_info; 2718 for (int i = num_workers - 1; i >= 0; i--) { 2719 AVxWorker *const worker = &mt_info->workers[i]; 2720 EncWorkerData *const thread_data = &mt_info->tile_thr_data[i]; 2721 2722 worker->hook = hook; 2723 worker->data1 = thread_data; 2724 worker->data2 = NULL; 2725 2726 thread_data->thread_id = i; 2727 // Set the starting tile for each thread, in this case the preprocessing 2728 // stage does not need tiles. So we set it to 0. 2729 thread_data->start = 0; 2730 2731 thread_data->cpi = cpi; 2732 if (i == 0) { 2733 thread_data->td = &cpi->td; 2734 } else { 2735 thread_data->td = thread_data->original_td; 2736 } 2737 2738 if (thread_data->td != &cpi->td) { 2739 thread_data->td->mb = cpi->td.mb; 2740 av1_alloc_mb_wiener_var_pred_buf(&cpi->common, thread_data->td); 2741 } 2742 } 2743 } 2744 2745 static void set_mb_wiener_var_calc_done(AV1_COMP *const cpi) { 2746 const CommonModeInfoParams *const mi_params = &cpi->common.mi_params; 2747 const BLOCK_SIZE bsize = cpi->weber_bsize; 2748 const int mb_step = mi_size_wide[bsize]; 2749 assert(MB_WIENER_MT_UNIT_SIZE < BLOCK_SIZES_ALL); 2750 const int mt_unit_step = mi_size_wide[MB_WIENER_MT_UNIT_SIZE]; 2751 const int mt_unit_cols = 2752 (mi_params->mi_cols + (mt_unit_step >> 1)) / mt_unit_step; 2753 const AV1EncAllIntraMultiThreadInfo *const intra_mt = &cpi->mt_info.intra_mt; 2754 AV1EncRowMultiThreadSync *const intra_row_mt_sync = 2755 &cpi->ppi->intra_row_mt_sync; 2756 2757 // Update the wiener variance computation of every row in the frame to 2758 // indicate that it is complete in order to avoid dependent workers waiting 2759 // indefinitely. 
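// Writing the final column index for every row below releases any reader
// still blocked in the corresponding sync_read().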
2760 for (int mi_row = 0, mt_thread_id = 0; mi_row < mi_params->mi_rows;
2761 mi_row += mb_step, ++mt_thread_id) {
2762 intra_mt->intra_sync_write_ptr(intra_row_mt_sync, mt_thread_id,
2763 mt_unit_cols - 1, mt_unit_cols);
2764 }
2765 }
2766
2767 static int cal_mb_wiener_var_hook(void *arg1, void *unused) {
2768 (void)unused;
2769 EncWorkerData *const thread_data = (EncWorkerData *)arg1;
2770 AV1_COMP *const cpi = thread_data->cpi;
2771 MACROBLOCK *x = &thread_data->td->mb;
2772 MACROBLOCKD *xd = &x->e_mbd;
2773 const BLOCK_SIZE bsize = cpi->weber_bsize;
2774 const int mb_step = mi_size_wide[bsize];
2775 AV1EncRowMultiThreadSync *const intra_row_mt_sync =
2776 &cpi->ppi->intra_row_mt_sync;
2777 AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
2778 (void)enc_row_mt;
2779 #if CONFIG_MULTITHREAD
2780 pthread_mutex_t *enc_row_mt_mutex = enc_row_mt->mutex_;
2781 #endif
2782
2783 struct aom_internal_error_info *const error_info = &thread_data->error_info;
2784 xd->error_info = error_info;
2785
2786 // The jmp_buf is valid only for the duration of the function that calls
2787 // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
2788 // before it returns.
2789 if (setjmp(error_info->jmp)) {
2790 error_info->setjmp = 0;
2791 #if CONFIG_MULTITHREAD
2792 pthread_mutex_lock(enc_row_mt_mutex);
2793 enc_row_mt->mb_wiener_mt_exit = true;
2794 pthread_mutex_unlock(enc_row_mt_mutex);
2795 #endif
2796 set_mb_wiener_var_calc_done(cpi);
2797 return 0;
2798 }
2799 error_info->setjmp = 1;
2800 DECLARE_ALIGNED(32, int16_t, src_diff[32 * 32]);
2801 DECLARE_ALIGNED(32, tran_low_t, coeff[32 * 32]);
2802 DECLARE_ALIGNED(32, tran_low_t, qcoeff[32 * 32]);
2803 DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]);
2804 double sum_rec_distortion = 0;
2805 double sum_est_rate = 0;
2806 while (1) {
2807 int current_mi_row = -1;
2808 #if CONFIG_MULTITHREAD
2809 pthread_mutex_lock(enc_row_mt_mutex);
2810 #endif
2811 int has_jobs = enc_row_mt->mb_wiener_mt_exit
2812 ? 0
2813 : get_next_job_allintra(intra_row_mt_sync,
2814 cpi->common.mi_params.mi_rows,
2815 &current_mi_row, mb_step);
2816 #if CONFIG_MULTITHREAD
2817 pthread_mutex_unlock(enc_row_mt_mutex);
2818 #endif
2819 if (!has_jobs) break;
2820 // TODO(chengchen): properly accumulate the distortion and rate.
2821 av1_calc_mb_wiener_var_row(cpi, x, xd, current_mi_row, src_diff, coeff,
2822 qcoeff, dqcoeff, &sum_rec_distortion,
2823 &sum_est_rate,
2824 thread_data->td->wiener_tmp_pred_buf);
2825 #if CONFIG_MULTITHREAD
2826 pthread_mutex_lock(enc_row_mt_mutex);
2827 #endif
2828 intra_row_mt_sync->num_threads_working--;
2829 #if CONFIG_MULTITHREAD
2830 pthread_mutex_unlock(enc_row_mt_mutex);
2831 #endif
2832 }
2833 error_info->setjmp = 0;
2834 return 1;
2835 }
2836
2837 static void dealloc_mb_wiener_var_mt_data(AV1_COMP *cpi, int num_workers) {
2838 av1_row_mt_sync_mem_dealloc(&cpi->ppi->intra_row_mt_sync);
2839
2840 MultiThreadInfo *mt_info = &cpi->mt_info;
2841 for (int j = 0; j < num_workers; ++j) {
2842 EncWorkerData *thread_data = &mt_info->tile_thr_data[j];
2843 ThreadData *td = thread_data->td;
2844 if (td != &cpi->td) av1_dealloc_mb_wiener_var_pred_buf(td);
2845 }
2846 }
2847
2848 // This function is the multi-threading version of computing the wiener
2849 // variance.
2850 // Note that the wiener variance is used for allintra mode (1 pass) and its
2851 // computation happens before frame encoding, so we don't need to consider
2852 // the number of tiles; instead we allocate all available threads to
2853 // the computation.
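// Note: sum_rec_distortion and sum_est_rate are currently unused here; see
// the TODO in cal_mb_wiener_var_hook() about accumulating them properly.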
2854 void av1_calc_mb_wiener_var_mt(AV1_COMP *cpi, int num_workers, 2855 double *sum_rec_distortion, 2856 double *sum_est_rate) { 2857 (void)sum_rec_distortion; 2858 (void)sum_est_rate; 2859 AV1_COMMON *const cm = &cpi->common; 2860 MultiThreadInfo *const mt_info = &cpi->mt_info; 2861 AV1EncRowMultiThreadSync *const intra_row_mt_sync = 2862 &cpi->ppi->intra_row_mt_sync; 2863 2864 // TODO(chengchen): the memory usage could be improved. 2865 const int mi_rows = cm->mi_params.mi_rows; 2866 row_mt_sync_mem_alloc(intra_row_mt_sync, cm, mi_rows); 2867 2868 intra_row_mt_sync->intrabc_extra_top_right_sb_delay = 0; 2869 intra_row_mt_sync->num_threads_working = num_workers; 2870 intra_row_mt_sync->next_mi_row = 0; 2871 memset(intra_row_mt_sync->num_finished_cols, -1, 2872 sizeof(*intra_row_mt_sync->num_finished_cols) * mi_rows); 2873 mt_info->enc_row_mt.mb_wiener_mt_exit = false; 2874 2875 prepare_wiener_var_workers(cpi, cal_mb_wiener_var_hook, num_workers); 2876 launch_workers(mt_info, num_workers); 2877 sync_enc_workers(mt_info, cm, num_workers); 2878 dealloc_mb_wiener_var_mt_data(cpi, num_workers); 2879 } 2880 2881 // Compare and order tiles based on absolute sum of tx coeffs. 2882 static int compare_tile_order(const void *a, const void *b) { 2883 const PackBSTileOrder *const tile_a = (const PackBSTileOrder *)a; 2884 const PackBSTileOrder *const tile_b = (const PackBSTileOrder *)b; 2885 2886 if (tile_a->abs_sum_level > tile_b->abs_sum_level) 2887 return -1; 2888 else if (tile_a->abs_sum_level == tile_b->abs_sum_level) 2889 return (tile_a->tile_idx > tile_b->tile_idx ? 1 : -1); 2890 else 2891 return 1; 2892 } 2893 2894 // Get next tile index to be processed for pack bitstream 2895 static inline int get_next_pack_bs_tile_idx( 2896 AV1EncPackBSSync *const pack_bs_sync, const int num_tiles) { 2897 assert(pack_bs_sync->next_job_idx <= num_tiles); 2898 if (pack_bs_sync->next_job_idx == num_tiles) return -1; 2899 2900 return pack_bs_sync->pack_bs_tile_order[pack_bs_sync->next_job_idx++] 2901 .tile_idx; 2902 } 2903 2904 // Calculates bitstream chunk size based on total buffer size and tile or tile 2905 // group size. 2906 static inline size_t get_bs_chunk_size(int tg_or_tile_size, 2907 const int frame_or_tg_size, 2908 size_t *remain_buf_size, 2909 size_t max_buf_size, int is_last_chunk) { 2910 size_t this_chunk_size; 2911 assert(*remain_buf_size > 0); 2912 if (is_last_chunk) { 2913 this_chunk_size = *remain_buf_size; 2914 *remain_buf_size = 0; 2915 } else { 2916 const uint64_t size_scale = (uint64_t)max_buf_size * tg_or_tile_size; 2917 this_chunk_size = (size_t)(size_scale / frame_or_tg_size); 2918 *remain_buf_size -= this_chunk_size; 2919 assert(*remain_buf_size > 0); 2920 } 2921 assert(this_chunk_size > 0); 2922 return this_chunk_size; 2923 } 2924 2925 // Initializes params required for pack bitstream tile. 2926 static void init_tile_pack_bs_params(AV1_COMP *const cpi, uint8_t *const dst, 2927 struct aom_write_bit_buffer *saved_wb, 2928 PackBSParams *const pack_bs_params_arr, 2929 uint8_t obu_extn_header) { 2930 MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; 2931 AV1_COMMON *const cm = &cpi->common; 2932 const CommonTileParams *const tiles = &cm->tiles; 2933 const int num_tiles = tiles->cols * tiles->rows; 2934 // Fixed size tile groups for the moment 2935 const int num_tg_hdrs = cpi->num_tg; 2936 // Tile group size in terms of number of tiles. 
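// This is a ceiling division: e.g. 7 tiles packed into 3 tile groups gives
// (7 + 3 - 1) / 3 = 3 tiles per group, so the last group carries only 1.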
2937 const int tg_size_in_tiles = (num_tiles + num_tg_hdrs - 1) / num_tg_hdrs; 2938 uint8_t *tile_dst = dst; 2939 uint8_t *tile_data_curr = dst; 2940 // Max tile group count can not be more than MAX_TILES. 2941 int tg_size_mi[MAX_TILES] = { 0 }; // Size of tile group in mi units 2942 int tile_idx; 2943 int tg_idx = 0; 2944 int tile_count_in_tg = 0; 2945 int new_tg = 1; 2946 2947 // Populate pack bitstream params of all tiles. 2948 for (tile_idx = 0; tile_idx < num_tiles; tile_idx++) { 2949 const TileInfo *const tile_info = &cpi->tile_data[tile_idx].tile_info; 2950 PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx]; 2951 // Calculate tile size in mi units. 2952 const int tile_size_mi = (tile_info->mi_col_end - tile_info->mi_col_start) * 2953 (tile_info->mi_row_end - tile_info->mi_row_start); 2954 int is_last_tile_in_tg = 0; 2955 tile_count_in_tg++; 2956 if (tile_count_in_tg == tg_size_in_tiles || tile_idx == (num_tiles - 1)) 2957 is_last_tile_in_tg = 1; 2958 2959 // Populate pack bitstream params of this tile. 2960 pack_bs_params->curr_tg_hdr_size = 0; 2961 pack_bs_params->obu_extn_header = obu_extn_header; 2962 pack_bs_params->saved_wb = saved_wb; 2963 pack_bs_params->obu_header_size = 0; 2964 pack_bs_params->is_last_tile_in_tg = is_last_tile_in_tg; 2965 pack_bs_params->new_tg = new_tg; 2966 pack_bs_params->tile_col = tile_info->tile_col; 2967 pack_bs_params->tile_row = tile_info->tile_row; 2968 pack_bs_params->tile_size_mi = tile_size_mi; 2969 tg_size_mi[tg_idx] += tile_size_mi; 2970 2971 if (new_tg) new_tg = 0; 2972 if (is_last_tile_in_tg) { 2973 tile_count_in_tg = 0; 2974 new_tg = 1; 2975 tg_idx++; 2976 } 2977 } 2978 2979 assert(cpi->available_bs_size > 0); 2980 size_t tg_buf_size[MAX_TILES] = { 0 }; 2981 size_t max_buf_size = cpi->available_bs_size; 2982 size_t remain_buf_size = max_buf_size; 2983 const int frame_size_mi = cm->mi_params.mi_rows * cm->mi_params.mi_cols; 2984 2985 tile_idx = 0; 2986 // Prepare obu, tile group and frame header of each tile group. 2987 for (tg_idx = 0; tg_idx < cpi->num_tg; tg_idx++) { 2988 PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx]; 2989 int is_last_tg = tg_idx == cpi->num_tg - 1; 2990 // Prorate bitstream buffer size based on tile group size and available 2991 // buffer size. This buffer will be used to store headers and tile data. 2992 tg_buf_size[tg_idx] = 2993 get_bs_chunk_size(tg_size_mi[tg_idx], frame_size_mi, &remain_buf_size, 2994 max_buf_size, is_last_tg); 2995 2996 pack_bs_params->dst = tile_dst; 2997 pack_bs_params->tile_data_curr = tile_dst; 2998 2999 // Write obu, tile group and frame header at first tile in the tile 3000 // group. 3001 av1_write_obu_tg_tile_headers(cpi, xd, pack_bs_params, tile_idx); 3002 tile_dst += tg_buf_size[tg_idx]; 3003 3004 // Exclude headers from tile group buffer size. 3005 tg_buf_size[tg_idx] -= pack_bs_params->curr_tg_hdr_size; 3006 tile_idx += tg_size_in_tiles; 3007 } 3008 3009 tg_idx = 0; 3010 // Calculate bitstream buffer size of each tile in the tile group. 3011 for (tile_idx = 0; tile_idx < num_tiles; tile_idx++) { 3012 PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx]; 3013 3014 if (pack_bs_params->new_tg) { 3015 max_buf_size = tg_buf_size[tg_idx]; 3016 remain_buf_size = max_buf_size; 3017 } 3018 3019 // Prorate bitstream buffer size of this tile based on tile size and 3020 // available buffer size. For this proration, header size is not accounted. 
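// E.g. a tile covering a quarter of the tile group's mi units receives about
// a quarter of the group's buffer; the last tile in the group simply takes
// whatever bytes remain.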
// Worker hook function for pack bitstream multi-threading.
static int pack_bs_worker_hook(void *arg1, void *arg2) {
  EncWorkerData *const thread_data = (EncWorkerData *)arg1;
  PackBSParams *const pack_bs_params = (PackBSParams *)arg2;
  AV1_COMP *const cpi = thread_data->cpi;
  AV1_COMMON *const cm = &cpi->common;
  AV1EncPackBSSync *const pack_bs_sync = &cpi->mt_info.pack_bs_sync;
  const CommonTileParams *const tiles = &cm->tiles;
  const int num_tiles = tiles->cols * tiles->rows;

#if CONFIG_MULTITHREAD
  pthread_mutex_t *const pack_bs_mutex = pack_bs_sync->mutex_;
#endif
  MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
  struct aom_internal_error_info *const error_info = &thread_data->error_info;
  xd->error_info = error_info;

  // The jmp_buf is valid only for the duration of the function that calls
  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
  // before it returns.
  if (setjmp(error_info->jmp)) {
    error_info->setjmp = 0;
#if CONFIG_MULTITHREAD
    pthread_mutex_lock(pack_bs_mutex);
    pack_bs_sync->pack_bs_mt_exit = true;
    pthread_mutex_unlock(pack_bs_mutex);
#endif
    return 0;
  }
  error_info->setjmp = 1;

  while (1) {
#if CONFIG_MULTITHREAD
    pthread_mutex_lock(pack_bs_mutex);
#endif
    const int tile_idx =
        pack_bs_sync->pack_bs_mt_exit
            ? -1
            : get_next_pack_bs_tile_idx(pack_bs_sync, num_tiles);
#if CONFIG_MULTITHREAD
    pthread_mutex_unlock(pack_bs_mutex);
#endif
    // When pack_bs_mt_exit is set to true, other workers need not pursue any
    // further jobs.
    if (tile_idx == -1) break;
    TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
    thread_data->td->mb.e_mbd.tile_ctx = &this_tile->tctx;

    av1_pack_tile_info(cpi, thread_data->td, &pack_bs_params[tile_idx]);
  }

  error_info->setjmp = 0;
  return 1;
}
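// The loop above follows the job-dispatch pattern used throughout this file:
// take the sync mutex, pop the next job index (or -1 on error/exhaustion),
// release the mutex, then do the heavy work outside the lock. A minimal
// sketch of the same idea, with hypothetical names that are not part of
// libaom:
//
//   pthread_mutex_lock(&queue->mutex);
//   int job = queue->exit ? -1 : (queue->next < queue->count ? queue->next++
//                                                            : -1);
//   pthread_mutex_unlock(&queue->mutex);
//   if (job == -1) return;  // Nothing left, or another worker failed.
//   process(job);           // Heavy work runs without the lock held.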
// Prepares thread data and workers for pack bitstream multi-threading.
static void prepare_pack_bs_workers(AV1_COMP *const cpi,
                                    PackBSParams *const pack_bs_params,
                                    AVxWorkerHook hook, const int num_workers) {
  MultiThreadInfo *const mt_info = &cpi->mt_info;
  for (int i = num_workers - 1; i >= 0; i--) {
    AVxWorker *worker = &mt_info->workers[i];
    EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
    if (i == 0) {
      thread_data->td = &cpi->td;
    } else {
      thread_data->td = thread_data->original_td;
    }

    if (thread_data->td != &cpi->td) thread_data->td->mb = cpi->td.mb;

    thread_data->cpi = cpi;
    thread_data->start = i;
    thread_data->thread_id = i;
    av1_reset_pack_bs_thread_data(thread_data->td);

    worker->hook = hook;
    worker->data1 = thread_data;
    worker->data2 = pack_bs_params;
  }

  AV1_COMMON *const cm = &cpi->common;
  AV1EncPackBSSync *const pack_bs_sync = &mt_info->pack_bs_sync;
  const uint16_t num_tiles = cm->tiles.rows * cm->tiles.cols;
  pack_bs_sync->next_job_idx = 0;
  pack_bs_sync->pack_bs_mt_exit = false;

  PackBSTileOrder *const pack_bs_tile_order = pack_bs_sync->pack_bs_tile_order;
  // Reset tile order data of pack bitstream.
  av1_zero_array(pack_bs_tile_order, num_tiles);

  // Populate pack bitstream tile order structure.
  for (uint16_t tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
    pack_bs_tile_order[tile_idx].abs_sum_level =
        cpi->tile_data[tile_idx].abs_sum_level;
    pack_bs_tile_order[tile_idx].tile_idx = tile_idx;
  }

  // Sort tiles in descending order of absolute sum of transform coefficients.
  qsort(pack_bs_tile_order, num_tiles, sizeof(*pack_bs_tile_order),
        compare_tile_order);
}
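// Note on the ordering above: abs_sum_level serves as a proxy for the
// expected packed size of a tile, so the costliest tiles are dispatched
// first. This resembles the usual longest-job-first heuristic, which reduces
// the chance that one large tile runs alone at the end of the frame while
// the other workers sit idle.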
// Accumulates data after pack bitstream processing.
static void accumulate_pack_bs_data(
    AV1_COMP *const cpi, const PackBSParams *const pack_bs_params_arr,
    uint8_t *const dst, uint32_t *total_size, const FrameHeaderInfo *fh_info,
    int *const largest_tile_id, unsigned int *max_tile_size,
    uint32_t *const obu_header_size, uint8_t **tile_data_start,
    const int num_workers) {
  const AV1_COMMON *const cm = &cpi->common;
  const CommonTileParams *const tiles = &cm->tiles;
  const int tile_count = tiles->cols * tiles->rows;
  // Fixed size tile groups for the moment
  size_t curr_tg_data_size = 0;
  int is_first_tg = 1;
  uint8_t *curr_tg_start = dst;
  size_t src_offset = 0;
  size_t dst_offset = 0;

  for (int tile_idx = 0; tile_idx < tile_count; tile_idx++) {
    // PackBSParams stores all parameters required to pack tile and header
    // info.
    const PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx];
    uint32_t tile_size = 0;

    if (pack_bs_params->new_tg) {
      curr_tg_start = dst + *total_size;
      curr_tg_data_size = pack_bs_params->curr_tg_hdr_size;
      *tile_data_start += pack_bs_params->curr_tg_hdr_size;
      *obu_header_size = pack_bs_params->obu_header_size;
    }
    curr_tg_data_size +=
        pack_bs_params->buf.size + (pack_bs_params->is_last_tile_in_tg ? 0 : 4);

    if (pack_bs_params->buf.size > *max_tile_size) {
      *largest_tile_id = tile_idx;
      *max_tile_size = (unsigned int)pack_bs_params->buf.size;
    }
    tile_size +=
        (uint32_t)pack_bs_params->buf.size + *pack_bs_params->total_size;

    // Pack all the chunks of tile bitstreams together.
    if (tile_idx != 0) memmove(dst + dst_offset, dst + src_offset, tile_size);

    if (pack_bs_params->is_last_tile_in_tg)
      av1_write_last_tile_info(
          cpi, fh_info, pack_bs_params->saved_wb, &curr_tg_data_size,
          curr_tg_start, &tile_size, tile_data_start, largest_tile_id,
          &is_first_tg, *obu_header_size, pack_bs_params->obu_extn_header);
    src_offset += pack_bs_params->tile_buf_size;
    dst_offset += tile_size;
    *total_size += tile_size;
  }

  // Accumulate thread data.
  MultiThreadInfo *const mt_info = &cpi->mt_info;
  for (int idx = num_workers - 1; idx >= 0; idx--) {
    ThreadData const *td = mt_info->tile_thr_data[idx].td;
    av1_accumulate_pack_bs_thread_data(cpi, td);
  }
}

void av1_write_tile_obu_mt(
    AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size,
    struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header,
    const FrameHeaderInfo *fh_info, int *const largest_tile_id,
    unsigned int *max_tile_size, uint32_t *const obu_header_size,
    uint8_t **tile_data_start, const int num_workers) {
  MultiThreadInfo *const mt_info = &cpi->mt_info;

  PackBSParams pack_bs_params[MAX_TILES];
  uint32_t tile_size[MAX_TILES] = { 0 };

  for (int tile_idx = 0; tile_idx < MAX_TILES; tile_idx++)
    pack_bs_params[tile_idx].total_size = &tile_size[tile_idx];

  init_tile_pack_bs_params(cpi, dst, saved_wb, pack_bs_params, obu_extn_header);
  prepare_pack_bs_workers(cpi, pack_bs_params, pack_bs_worker_hook,
                          num_workers);
  launch_workers(mt_info, num_workers);
  sync_enc_workers(mt_info, &cpi->common, num_workers);
  accumulate_pack_bs_data(cpi, pack_bs_params, dst, total_size, fh_info,
                          largest_tile_id, max_tile_size, obu_header_size,
                          tile_data_start, num_workers);
}

// Deallocate memory for CDEF search multi-thread synchronization.
void av1_cdef_mt_dealloc(AV1CdefSync *cdef_sync) {
  (void)cdef_sync;
  assert(cdef_sync != NULL);
#if CONFIG_MULTITHREAD
  if (cdef_sync->mutex_ != NULL) {
    pthread_mutex_destroy(cdef_sync->mutex_);
    aom_free(cdef_sync->mutex_);
  }
#endif  // CONFIG_MULTITHREAD
}

// Updates the row and column indices of the next job to be processed.
// Also updates the end_of_frame flag when the processing of all blocks is
// complete.
static void update_next_job_info(AV1CdefSync *cdef_sync, int nvfb, int nhfb) {
  cdef_sync->fbc++;
  if (cdef_sync->fbc == nhfb) {
    cdef_sync->fbr++;
    if (cdef_sync->fbr == nvfb) {
      cdef_sync->end_of_frame = 1;
    } else {
      cdef_sync->fbc = 0;
    }
  }
}

// Initializes cdef_sync parameters.
static inline void cdef_reset_job_info(AV1CdefSync *cdef_sync) {
#if CONFIG_MULTITHREAD
  if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL);
#endif  // CONFIG_MULTITHREAD
  cdef_sync->end_of_frame = 0;
  cdef_sync->fbr = 0;
  cdef_sync->fbc = 0;
  cdef_sync->cdef_mt_exit = false;
}
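// Illustrative walk of update_next_job_info() (example numbers only): with
// nhfb = 3 and nvfb = 2, successive jobs visit (fbr, fbc) =
// (0,0) (0,1) (0,2) (1,0) (1,1) (1,2), after which end_of_frame is set.
// cdef_reset_job_info() rewinds this raster scan to (0,0).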
// Checks if a job is available. If a job is available, populates next job
// information and returns 1, else returns 0.
static inline int cdef_get_next_job(AV1CdefSync *cdef_sync,
                                    CdefSearchCtx *cdef_search_ctx,
                                    volatile int *cur_fbr,
                                    volatile int *cur_fbc,
                                    volatile int *sb_count) {
#if CONFIG_MULTITHREAD
  pthread_mutex_lock(cdef_sync->mutex_);
#endif  // CONFIG_MULTITHREAD
  int do_next_block = 0;
  const int nvfb = cdef_search_ctx->nvfb;
  const int nhfb = cdef_search_ctx->nhfb;

  // If a block is skipped, do not process it and check the skip condition
  // for the next block.
  while (!cdef_sync->cdef_mt_exit && !cdef_sync->end_of_frame &&
         cdef_sb_skip(cdef_search_ctx->mi_params, cdef_sync->fbr,
                      cdef_sync->fbc)) {
    update_next_job_info(cdef_sync, nvfb, nhfb);
  }

  // Populate the information needed for the current job and update the row
  // and column indices of the next block to be processed.
  if (!cdef_sync->cdef_mt_exit && cdef_sync->end_of_frame == 0) {
    do_next_block = 1;
    *cur_fbr = cdef_sync->fbr;
    *cur_fbc = cdef_sync->fbc;
    *sb_count = cdef_search_ctx->sb_count;
    cdef_search_ctx->sb_count++;
    update_next_job_info(cdef_sync, nvfb, nhfb);
  }
#if CONFIG_MULTITHREAD
  pthread_mutex_unlock(cdef_sync->mutex_);
#endif  // CONFIG_MULTITHREAD
  return do_next_block;
}

// Hook function for each thread in CDEF search multi-threading.
static int cdef_filter_block_worker_hook(void *arg1, void *arg2) {
  EncWorkerData *thread_data = (EncWorkerData *)arg1;
  AV1CdefSync *const cdef_sync = (AV1CdefSync *)arg2;

#if CONFIG_MULTITHREAD
  pthread_mutex_t *cdef_mutex_ = cdef_sync->mutex_;
#endif
  struct aom_internal_error_info *const error_info = &thread_data->error_info;
  CdefSearchCtx *cdef_search_ctx = thread_data->cpi->cdef_search_ctx;

  // The jmp_buf is valid only for the duration of the function that calls
  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
  // before it returns.
  if (setjmp(error_info->jmp)) {
    error_info->setjmp = 0;
#if CONFIG_MULTITHREAD
    pthread_mutex_lock(cdef_mutex_);
    cdef_sync->cdef_mt_exit = true;
    pthread_mutex_unlock(cdef_mutex_);
#endif
    return 0;
  }
  error_info->setjmp = 1;

  volatile int cur_fbr, cur_fbc, sb_count;
  while (cdef_get_next_job(cdef_sync, cdef_search_ctx, &cur_fbr, &cur_fbc,
                           &sb_count)) {
    av1_cdef_mse_calc_block(cdef_search_ctx, error_info, cur_fbr, cur_fbc,
                            sb_count);
  }
  error_info->setjmp = 0;
  return 1;
}

// Assigns CDEF search hook function and thread data to each worker.
static void prepare_cdef_workers(AV1_COMP *cpi, AVxWorkerHook hook,
                                 int num_workers) {
  MultiThreadInfo *mt_info = &cpi->mt_info;
  for (int i = num_workers - 1; i >= 0; i--) {
    AVxWorker *worker = &mt_info->workers[i];
    EncWorkerData *thread_data = &mt_info->tile_thr_data[i];

    thread_data->cpi = cpi;
    worker->hook = hook;
    worker->data1 = thread_data;
    worker->data2 = &mt_info->cdef_sync;
  }
}
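// Note that data2 above points at the single shared AV1CdefSync instance
// while data1 is per-thread state: all workers drain one job queue guarded
// by cdef_sync->mutex_ rather than owning fixed ranges of filter blocks.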
// Implements multi-threading for CDEF search.
void av1_cdef_mse_calc_frame_mt(AV1_COMP *cpi) {
  MultiThreadInfo *mt_info = &cpi->mt_info;
  AV1CdefSync *cdef_sync = &mt_info->cdef_sync;
  const int num_workers = mt_info->num_mod_workers[MOD_CDEF_SEARCH];

  cdef_reset_job_info(cdef_sync);
  prepare_cdef_workers(cpi, cdef_filter_block_worker_hook, num_workers);
  launch_workers(mt_info, num_workers);
  sync_enc_workers(mt_info, &cpi->common, num_workers);
}

// Computes num_workers for temporal filter multi-threading.
static inline int compute_num_tf_workers(const AV1_COMP *cpi) {
  // For single-pass encode, using a worker count derived from the tf block
  // size was not found to improve speed. Hence the thread assignment for
  // single-pass encode is kept based on compute_num_enc_workers().
  if (cpi->oxcf.pass < AOM_RC_SECOND_PASS)
    return compute_num_enc_workers(cpi, cpi->oxcf.max_threads);

  if (cpi->oxcf.max_threads <= 1) return 1;

  const int frame_height = cpi->common.height;
  const BLOCK_SIZE block_size = TF_BLOCK_SIZE;
  const int mb_height = block_size_high[block_size];
  const int mb_rows = get_num_blocks(frame_height, mb_height);
  return AOMMIN(cpi->oxcf.max_threads, mb_rows);
}

// Computes num_workers for tpl multi-threading.
static inline int compute_num_tpl_workers(AV1_COMP *cpi) {
  return compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
}

// Computes num_workers for loop filter multi-threading.
static inline int compute_num_lf_workers(AV1_COMP *cpi) {
  return compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
}

// Computes num_workers for cdef multi-threading.
static inline int compute_num_cdef_workers(AV1_COMP *cpi) {
  return compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
}

// Computes num_workers for loop-restoration multi-threading.
static inline int compute_num_lr_workers(AV1_COMP *cpi) {
  return compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
}

// Computes num_workers for pack bitstream multi-threading.
static inline int compute_num_pack_bs_workers(AV1_COMP *cpi) {
  if (cpi->oxcf.max_threads <= 1) return 1;
  return compute_num_enc_tile_mt_workers(&cpi->common, cpi->oxcf.max_threads);
}
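// Example of the row-based capping in compute_num_tf_workers() above
// (illustrative numbers only): assuming TF_BLOCK_SIZE is 32x32, a 1080-row
// frame has ceil(1080 / 32) = 34 temporal filter block rows, so requesting
// 64 threads still yields at most 34 workers; block rows are the unit of
// parallelism here, and any extra threads would simply idle.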
// Computes num_workers for all intra multi-threading.
static inline int compute_num_ai_workers(AV1_COMP *cpi) {
  if (cpi->oxcf.max_threads <= 1) return 1;
  // The multi-threading implementation of deltaq-mode = 3 in allintra
  // mode is based on row multi-threading.
  if (!cpi->oxcf.row_mt) return 1;
  cpi->weber_bsize = BLOCK_8X8;
  const BLOCK_SIZE bsize = cpi->weber_bsize;
  const int mb_step = mi_size_wide[bsize];
  const int num_mb_rows = cpi->common.mi_params.mi_rows / mb_step;
  return AOMMIN(num_mb_rows, cpi->oxcf.max_threads);
}

static int compute_num_mod_workers(AV1_COMP *cpi,
                                   MULTI_THREADED_MODULES mod_name) {
  int num_mod_workers = 0;
  switch (mod_name) {
    case MOD_FP:
      if (cpi->oxcf.pass >= AOM_RC_SECOND_PASS)
        num_mod_workers = 0;
      else
        num_mod_workers = compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
      break;
    case MOD_TF: num_mod_workers = compute_num_tf_workers(cpi); break;
    case MOD_TPL: num_mod_workers = compute_num_tpl_workers(cpi); break;
    case MOD_GME: num_mod_workers = 1; break;
    case MOD_ENC:
      num_mod_workers = compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
      break;
    case MOD_LPF: num_mod_workers = compute_num_lf_workers(cpi); break;
    case MOD_CDEF_SEARCH:
      num_mod_workers = compute_num_cdef_workers(cpi);
      break;
    case MOD_CDEF: num_mod_workers = compute_num_cdef_workers(cpi); break;
    case MOD_LR: num_mod_workers = compute_num_lr_workers(cpi); break;
    case MOD_PACK_BS: num_mod_workers = compute_num_pack_bs_workers(cpi); break;
    case MOD_FRAME_ENC:
      num_mod_workers = cpi->ppi->p_mt_info.num_mod_workers[MOD_FRAME_ENC];
      break;
    case MOD_AI:
      if (cpi->oxcf.pass == AOM_RC_ONE_PASS) {
        num_mod_workers = compute_num_ai_workers(cpi);
      } else {
        num_mod_workers = 0;
      }
      break;
    default: assert(0); break;
  }
  return num_mod_workers;
}

// Computes the number of workers for each MT module in the encoder.
void av1_compute_num_workers_for_mt(AV1_COMP *cpi) {
  for (int i = MOD_FP; i < NUM_MT_MODULES; i++) {
    cpi->ppi->p_mt_info.num_mod_workers[i] =
        compute_num_mod_workers(cpi, (MULTI_THREADED_MODULES)i);
  }
}
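// Usage sketch (illustrative only, not part of libaom): after the call
// above, each module's worker budget can be read back from
// cpi->ppi->p_mt_info, e.g.:
//
//   av1_compute_num_workers_for_mt(cpi);
//   for (int m = MOD_FP; m < NUM_MT_MODULES; m++)
//     printf("module %d: %d workers\n", m,
//            cpi->ppi->p_mt_info.num_mod_workers[m]);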