tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

nonrd_opt.c (39728B)


      1 /*
      2 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include "config/aom_dsp_rtcd.h"
     13 #include "config/av1_rtcd.h"
     14 
     15 #include "av1/common/reconinter.h"
     16 
     17 #include "av1/encoder/encodemv.h"
     18 #include "av1/encoder/nonrd_opt.h"
     19 #include "av1/encoder/rdopt.h"
     20 
     21 static const SCAN_ORDER av1_fast_idtx_scan_order_16x16 = {
     22  av1_fast_idtx_scan_16x16, av1_fast_idtx_iscan_16x16
     23 };
     24 
     25 #define DECLARE_BLOCK_YRD_BUFFERS()                      \
     26  DECLARE_ALIGNED(64, tran_low_t, dqcoeff_buf[16 * 16]); \
     27  DECLARE_ALIGNED(64, tran_low_t, qcoeff_buf[16 * 16]);  \
     28  DECLARE_ALIGNED(64, tran_low_t, coeff_buf[16 * 16]);   \
     29  uint16_t eob[1];
     30 
     31 #define DECLARE_BLOCK_YRD_VARS()                                          \
     32  /* When is_tx_8x8_dual_applicable is true, we compute the txfm for the  \
     33   * entire bsize and write macroblock_plane::coeff. So low_coeff is kept \
     34   * as a non-const so we can reassign it to macroblock_plane::coeff. */  \
     35  int16_t *low_coeff = (int16_t *)coeff_buf;                              \
     36  int16_t *const low_qcoeff = (int16_t *)qcoeff_buf;                      \
     37  int16_t *const low_dqcoeff = (int16_t *)dqcoeff_buf;                    \
     38  const int diff_stride = bw;
     39 
     40 #define DECLARE_LOOP_VARS_BLOCK_YRD() \
     41  const int16_t *src_diff = &p->src_diff[(r * diff_stride + c) << 2];
     42 
     43 static AOM_FORCE_INLINE void update_yrd_loop_vars(
     44    MACROBLOCK *x, int *skippable, int step, int ncoeffs,
     45    int16_t *const low_coeff, int16_t *const low_qcoeff,
     46    int16_t *const low_dqcoeff, RD_STATS *this_rdc, int *eob_cost,
     47    int tx_blk_id) {
     48  const int is_txfm_skip = (ncoeffs == 0);
     49  *skippable &= is_txfm_skip;
     50  x->txfm_search_info.blk_skip[tx_blk_id] = is_txfm_skip;
     51  *eob_cost += get_msb(ncoeffs + 1);
     52  if (ncoeffs == 1)
     53    this_rdc->rate += (int)abs(low_qcoeff[0]);
     54  else if (ncoeffs > 1)
     55    this_rdc->rate += aom_satd_lp(low_qcoeff, step << 4);
     56 
     57  this_rdc->dist += av1_block_error_lp(low_coeff, low_dqcoeff, step << 4) >> 2;
     58 }
     59 
     60 static inline void aom_process_hadamard_lp_8x16(MACROBLOCK *x,
     61                                                int max_blocks_high,
     62                                                int max_blocks_wide,
     63                                                int num_4x4_w, int step,
     64                                                int block_step) {
     65  struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
     66  const int bw = 4 * num_4x4_w;
     67  const int num_4x4 = AOMMIN(num_4x4_w, max_blocks_wide);
     68  int block = 0;
     69 
     70  for (int r = 0; r < max_blocks_high; r += block_step) {
     71    for (int c = 0; c < num_4x4; c += 2 * block_step) {
     72      const int16_t *src_diff = &p->src_diff[(r * bw + c) << 2];
     73      int16_t *low_coeff = (int16_t *)p->coeff + BLOCK_OFFSET(block);
     74      aom_hadamard_lp_8x8_dual(src_diff, (ptrdiff_t)bw, low_coeff);
     75      block += 2 * step;
     76    }
     77  }
     78 }
     79 
     80 #if CONFIG_AV1_HIGHBITDEPTH
     81 #define DECLARE_BLOCK_YRD_HBD_VARS()     \
     82  tran_low_t *const coeff = coeff_buf;   \
     83  tran_low_t *const qcoeff = qcoeff_buf; \
     84  tran_low_t *const dqcoeff = dqcoeff_buf;
     85 
     86 static AOM_FORCE_INLINE void update_yrd_loop_vars_hbd(
     87    MACROBLOCK *x, int *skippable, int step, int ncoeffs,
     88    tran_low_t *const coeff, tran_low_t *const qcoeff,
     89    tran_low_t *const dqcoeff, RD_STATS *this_rdc, int *eob_cost,
     90    int tx_blk_id) {
     91  const MACROBLOCKD *xd = &x->e_mbd;
     92  const int is_txfm_skip = (ncoeffs == 0);
     93  *skippable &= is_txfm_skip;
     94  x->txfm_search_info.blk_skip[tx_blk_id] = is_txfm_skip;
     95  *eob_cost += get_msb(ncoeffs + 1);
     96 
     97  int64_t dummy;
     98  if (ncoeffs == 1)
     99    this_rdc->rate += (int)abs(qcoeff[0]);
    100  else if (ncoeffs > 1)
    101    this_rdc->rate += aom_satd(qcoeff, step << 4);
    102  this_rdc->dist +=
    103      av1_highbd_block_error(coeff, dqcoeff, step << 4, &dummy, xd->bd) >> 2;
    104 }
    105 #endif
    106 
    107 /*!\brief Calculates RD Cost using Hadamard transform.
    108 *
    109 * \ingroup nonrd_mode_search
    110 * \callgraph
    111 * \callergraph
    112 * Calculates RD Cost using Hadamard transform. For low bit depth this function
    113 * uses low-precision set of functions (16-bit) and 32 bit for high bit depth
    114 * \param[in]    x              Pointer to structure holding all the data for
    115                                the current macroblock
    116 * \param[in]    this_rdc       Pointer to calculated RD Cost
    117 * \param[in]    skippable      Pointer to a flag indicating possible tx skip
    118 * \param[in]    bsize          Current block size
    119 * \param[in]    tx_size        Transform size
    120 * \param[in]    is_inter_mode  Flag to indicate inter mode
    121 *
    122 * \remark Nothing is returned. Instead, calculated RD cost is placed to
    123 * \c this_rdc. \c skippable flag is set if there is no non-zero quantized
    124 * coefficients for Hadamard transform
    125 */
    126 void av1_block_yrd(MACROBLOCK *x, RD_STATS *this_rdc, int *skippable,
    127                   BLOCK_SIZE bsize, TX_SIZE tx_size) {
    128  MACROBLOCKD *xd = &x->e_mbd;
    129  const struct macroblockd_plane *pd = &xd->plane[AOM_PLANE_Y];
    130  struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
    131  assert(bsize < BLOCK_SIZES_ALL);
    132  const int num_4x4_w = mi_size_wide[bsize];
    133  const int num_4x4_h = mi_size_high[bsize];
    134  const int step = 1 << (tx_size << 1);
    135  const int block_step = (1 << tx_size);
    136  const int row_step = step * num_4x4_w >> tx_size;
    137  int block = 0;
    138  const int max_blocks_wide =
    139      num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >> 5);
    140  const int max_blocks_high =
    141      num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >> 5);
    142  int eob_cost = 0;
    143  const int bw = 4 * num_4x4_w;
    144  const int bh = 4 * num_4x4_h;
    145  const int use_hbd = is_cur_buf_hbd(xd);
    146  int num_blk_skip_w = num_4x4_w;
    147 
    148 #if CONFIG_AV1_HIGHBITDEPTH
    149  if (use_hbd) {
    150    aom_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf,
    151                              p->src.stride, pd->dst.buf, pd->dst.stride);
    152  } else {
    153    aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
    154                       pd->dst.buf, pd->dst.stride);
    155  }
    156 #else
    157  aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
    158                     pd->dst.buf, pd->dst.stride);
    159 #endif
    160 
    161  // Keep the intermediate value on the stack here. Writing directly to
    162  // skippable causes speed regression due to load-and-store issues in
    163  // update_yrd_loop_vars.
    164  int temp_skippable = 1;
    165  this_rdc->dist = 0;
    166  this_rdc->rate = 0;
    167  // For block sizes 8x16 or above, Hadamard txfm of two adjacent 8x8 blocks
    168  // can be done per function call. Hence the call of Hadamard txfm is
    169  // abstracted here for the specified cases.
    170  int is_tx_8x8_dual_applicable =
    171      (tx_size == TX_8X8 && block_size_wide[bsize] >= 16 &&
    172       block_size_high[bsize] >= 8);
    173 
    174 #if CONFIG_AV1_HIGHBITDEPTH
    175  // As of now, dual implementation of hadamard txfm is available for low
    176  // bitdepth.
    177  if (use_hbd) is_tx_8x8_dual_applicable = 0;
    178 #endif
    179 
    180  if (is_tx_8x8_dual_applicable) {
    181    aom_process_hadamard_lp_8x16(x, max_blocks_high, max_blocks_wide, num_4x4_w,
    182                                 step, block_step);
    183  }
    184 
    185  const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT];
    186  DECLARE_BLOCK_YRD_BUFFERS()
    187  DECLARE_BLOCK_YRD_VARS()
    188 #if CONFIG_AV1_HIGHBITDEPTH
    189  DECLARE_BLOCK_YRD_HBD_VARS()
    190 #else
    191  (void)use_hbd;
    192 #endif
    193 
    194  // Keep track of the row and column of the blocks we use so that we know
    195  // if we are in the unrestricted motion border.
    196  for (int r = 0; r < max_blocks_high; r += block_step) {
    197    for (int c = 0, s = 0; c < max_blocks_wide; c += block_step, s += step) {
    198      DECLARE_LOOP_VARS_BLOCK_YRD()
    199 
    200      switch (tx_size) {
    201 #if CONFIG_AV1_HIGHBITDEPTH
    202        case TX_16X16:
    203          if (use_hbd) {
    204            aom_hadamard_16x16(src_diff, diff_stride, coeff);
    205            av1_quantize_fp(coeff, 16 * 16, p->zbin_QTX, p->round_fp_QTX,
    206                            p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
    207                            dqcoeff, p->dequant_QTX, eob,
    208                            // default_scan_fp_16x16_transpose and
    209                            // av1_default_iscan_fp_16x16_transpose have to be
    210                            // used together.
    211                            default_scan_fp_16x16_transpose,
    212                            av1_default_iscan_fp_16x16_transpose);
    213          } else {
    214            aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff);
    215            av1_quantize_lp(low_coeff, 16 * 16, p->round_fp_QTX,
    216                            p->quant_fp_QTX, low_qcoeff, low_dqcoeff,
    217                            p->dequant_QTX, eob,
    218                            // default_scan_lp_16x16_transpose and
    219                            // av1_default_iscan_lp_16x16_transpose have to be
    220                            // used together.
    221                            default_scan_lp_16x16_transpose,
    222                            av1_default_iscan_lp_16x16_transpose);
    223          }
    224          break;
    225        case TX_8X8:
    226          if (use_hbd) {
    227            aom_hadamard_8x8(src_diff, diff_stride, coeff);
    228            av1_quantize_fp(
    229                coeff, 8 * 8, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX,
    230                p->quant_shift_QTX, qcoeff, dqcoeff, p->dequant_QTX, eob,
    231                default_scan_8x8_transpose, av1_default_iscan_8x8_transpose);
    232          } else {
    233            if (is_tx_8x8_dual_applicable) {
    234              // The coeffs are pre-computed for the whole block, so re-assign
    235              // low_coeff to the appropriate location.
    236              const int block_offset = BLOCK_OFFSET(block + s);
    237              low_coeff = (int16_t *)p->coeff + block_offset;
    238            } else {
    239              aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff);
    240            }
    241            av1_quantize_lp(
    242                low_coeff, 8 * 8, p->round_fp_QTX, p->quant_fp_QTX, low_qcoeff,
    243                low_dqcoeff, p->dequant_QTX, eob,
    244                // default_scan_8x8_transpose and
    245                // av1_default_iscan_8x8_transpose have to be used together.
    246                default_scan_8x8_transpose, av1_default_iscan_8x8_transpose);
    247          }
    248          break;
    249        default:
    250          assert(tx_size == TX_4X4);
    251          // In tx_size=4x4 case, aom_fdct4x4 and aom_fdct4x4_lp generate
    252          // normal coefficients order, so we don't need to change the scan
    253          // order here.
    254          if (use_hbd) {
    255            aom_fdct4x4(src_diff, coeff, diff_stride);
    256            av1_quantize_fp(coeff, 4 * 4, p->zbin_QTX, p->round_fp_QTX,
    257                            p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
    258                            dqcoeff, p->dequant_QTX, eob, scan_order->scan,
    259                            scan_order->iscan);
    260          } else {
    261            aom_fdct4x4_lp(src_diff, low_coeff, diff_stride);
    262            av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX, p->quant_fp_QTX,
    263                            low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
    264                            scan_order->scan, scan_order->iscan);
    265          }
    266          break;
    267 #else
    268        case TX_16X16:
    269          aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff);
    270          av1_quantize_lp(low_coeff, 16 * 16, p->round_fp_QTX, p->quant_fp_QTX,
    271                          low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
    272                          default_scan_lp_16x16_transpose,
    273                          av1_default_iscan_lp_16x16_transpose);
    274          break;
    275        case TX_8X8:
    276          if (is_tx_8x8_dual_applicable) {
    277            // The coeffs are pre-computed for the whole block, so re-assign
    278            // low_coeff to the appropriate location.
    279            const int block_offset = BLOCK_OFFSET(block + s);
    280            low_coeff = (int16_t *)p->coeff + block_offset;
    281          } else {
    282            aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff);
    283          }
    284          av1_quantize_lp(low_coeff, 8 * 8, p->round_fp_QTX, p->quant_fp_QTX,
    285                          low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
    286                          default_scan_8x8_transpose,
    287                          av1_default_iscan_8x8_transpose);
    288          break;
    289        default:
    290          aom_fdct4x4_lp(src_diff, low_coeff, diff_stride);
    291          av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX, p->quant_fp_QTX,
    292                          low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
    293                          scan_order->scan, scan_order->iscan);
    294          break;
    295 #endif
    296      }
    297      assert(*eob <= 1024);
    298 #if CONFIG_AV1_HIGHBITDEPTH
    299      if (use_hbd)
    300        update_yrd_loop_vars_hbd(x, &temp_skippable, step, *eob, coeff, qcoeff,
    301                                 dqcoeff, this_rdc, &eob_cost,
    302                                 r * num_blk_skip_w + c);
    303      else
    304 #endif
    305        update_yrd_loop_vars(x, &temp_skippable, step, *eob, low_coeff,
    306                             low_qcoeff, low_dqcoeff, this_rdc, &eob_cost,
    307                             r * num_blk_skip_w + c);
    308    }
    309    block += row_step;
    310  }
    311 
    312  this_rdc->skip_txfm = *skippable = temp_skippable;
    313  if (this_rdc->sse < INT64_MAX) {
    314    this_rdc->sse = (this_rdc->sse << 6) >> 2;
    315    if (temp_skippable) {
    316      this_rdc->dist = 0;
    317      this_rdc->dist = this_rdc->sse;
    318      return;
    319    }
    320  }
    321 
    322  // If skippable is set, rate gets clobbered later.
    323  this_rdc->rate <<= (2 + AV1_PROB_COST_SHIFT);
    324  this_rdc->rate += (eob_cost << AV1_PROB_COST_SHIFT);
    325 }
    326 
    327 // Explicitly enumerate the cases so the compiler can generate SIMD for the
    328 // function. According to the disassembler, gcc generates SSE codes for each of
    329 // the possible block sizes. The hottest case is tx_width 16, which takes up
    330 // about 8% of the self cycle of av1_nonrd_pick_inter_mode_sb. Since
    331 // av1_nonrd_pick_inter_mode_sb takes up about 3% of total encoding time, the
    332 // potential room of improvement for writing AVX2 optimization is only 3% * 8% =
    333 // 0.24% of total encoding time.
    334 static inline void scale_square_buf_vals(int16_t *dst, int tx_width,
    335                                         const int16_t *src, int src_stride) {
    336 #define DO_SCALING                                                   \
    337  do {                                                               \
    338    for (int idy = 0; idy < tx_width; ++idy) {                       \
    339      for (int idx = 0; idx < tx_width; ++idx) {                     \
    340        dst[idy * tx_width + idx] = src[idy * src_stride + idx] * 8; \
    341      }                                                              \
    342    }                                                                \
    343  } while (0)
    344 
    345  if (tx_width == 4) {
    346    DO_SCALING;
    347  } else if (tx_width == 8) {
    348    DO_SCALING;
    349  } else if (tx_width == 16) {
    350    DO_SCALING;
    351  } else {
    352    assert(0);
    353  }
    354 
    355 #undef DO_SCALING
    356 }
    357 
    358 /*!\brief Calculates RD Cost when the block uses Identity transform.
    359 * Note that this function is only for low bit depth encoding, since it
    360 * is called in real-time mode for now, which sets high bit depth to 0:
    361 * -DCONFIG_AV1_HIGHBITDEPTH=0
    362 *
    363 * \ingroup nonrd_mode_search
    364 * \callgraph
    365 * \callergraph
    366 * Calculates RD Cost. For low bit depth this function
    367 * uses low-precision set of functions (16-bit) and 32 bit for high bit depth
    368 * \param[in]    x              Pointer to structure holding all the data for
    369                                the current macroblock
    370 * \param[in]    pred_buf       Pointer to the prediction buffer
    371 * \param[in]    pred_stride    Stride for the prediction buffer
    372 * \param[in]    this_rdc       Pointer to calculated RD Cost
    373 * \param[in]    skippable      Pointer to a flag indicating possible tx skip
    374 * \param[in]    bsize          Current block size
    375 * \param[in]    tx_size        Transform size
    376 *
    377 * \remark Nothing is returned. Instead, calculated RD cost is placed to
    378 * \c this_rdc. \c skippable flag is set if all coefficients are zero.
    379 */
    380 void av1_block_yrd_idtx(MACROBLOCK *x, const uint8_t *const pred_buf,
    381                        int pred_stride, RD_STATS *this_rdc, int *skippable,
    382                        BLOCK_SIZE bsize, TX_SIZE tx_size) {
    383  MACROBLOCKD *xd = &x->e_mbd;
    384  struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
    385  assert(bsize < BLOCK_SIZES_ALL);
    386  const int num_4x4_w = mi_size_wide[bsize];
    387  const int num_4x4_h = mi_size_high[bsize];
    388  const int step = 1 << (tx_size << 1);
    389  const int block_step = (1 << tx_size);
    390  const int max_blocks_wide =
    391      num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >> 5);
    392  const int max_blocks_high =
    393      num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >> 5);
    394  int eob_cost = 0;
    395  const int bw = 4 * num_4x4_w;
    396  const int bh = 4 * num_4x4_h;
    397  const int num_blk_skip_w = num_4x4_w;
    398  // Keep the intermediate value on the stack here. Writing directly to
    399  // skippable causes speed regression due to load-and-store issues in
    400  // update_yrd_loop_vars.
    401  int temp_skippable = 1;
    402  int tx_wd = 0;
    403  const SCAN_ORDER *scan_order = NULL;
    404  switch (tx_size) {
    405    case TX_64X64:
    406      assert(0);  // Not implemented
    407      break;
    408    case TX_32X32:
    409      assert(0);  // Not used
    410      break;
    411    case TX_16X16:
    412      scan_order = &av1_fast_idtx_scan_order_16x16;
    413      tx_wd = 16;
    414      break;
    415    case TX_8X8:
    416      scan_order = &av1_fast_idtx_scan_order_8x8;
    417      tx_wd = 8;
    418      break;
    419    default:
    420      assert(tx_size == TX_4X4);
    421      scan_order = &av1_fast_idtx_scan_order_4x4;
    422      tx_wd = 4;
    423      break;
    424  }
    425  assert(scan_order != NULL);
    426 
    427  this_rdc->dist = 0;
    428  this_rdc->rate = 0;
    429  aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
    430                     pred_buf, pred_stride);
    431  // Keep track of the row and column of the blocks we use so that we know
    432  // if we are in the unrestricted motion border.
    433  DECLARE_BLOCK_YRD_BUFFERS()
    434  DECLARE_BLOCK_YRD_VARS()
    435  for (int r = 0; r < max_blocks_high; r += block_step) {
    436    for (int c = 0, s = 0; c < max_blocks_wide; c += block_step, s += step) {
    437      DECLARE_LOOP_VARS_BLOCK_YRD()
    438      scale_square_buf_vals(low_coeff, tx_wd, src_diff, diff_stride);
    439      av1_quantize_lp(low_coeff, tx_wd * tx_wd, p->round_fp_QTX,
    440                      p->quant_fp_QTX, low_qcoeff, low_dqcoeff, p->dequant_QTX,
    441                      eob, scan_order->scan, scan_order->iscan);
    442      assert(*eob <= 1024);
    443      update_yrd_loop_vars(x, &temp_skippable, step, *eob, low_coeff,
    444                           low_qcoeff, low_dqcoeff, this_rdc, &eob_cost,
    445                           r * num_blk_skip_w + c);
    446    }
    447  }
    448  this_rdc->skip_txfm = *skippable = temp_skippable;
    449  if (this_rdc->sse < INT64_MAX) {
    450    this_rdc->sse = (this_rdc->sse << 6) >> 2;
    451    if (temp_skippable) {
    452      this_rdc->dist = 0;
    453      this_rdc->dist = this_rdc->sse;
    454      return;
    455    }
    456  }
    457  // If skippable is set, rate gets clobbered later.
    458  this_rdc->rate <<= (2 + AV1_PROB_COST_SHIFT);
    459  this_rdc->rate += (eob_cost << AV1_PROB_COST_SHIFT);
    460 }
    461 
    462 int64_t av1_model_rd_for_sb_uv(AV1_COMP *cpi, BLOCK_SIZE plane_bsize,
    463                               MACROBLOCK *x, MACROBLOCKD *xd,
    464                               RD_STATS *this_rdc, int start_plane,
    465                               int stop_plane) {
    466  // Note our transform coeffs are 8 times an orthogonal transform.
    467  // Hence quantizer step is also 8 times. To get effective quantizer
    468  // we need to divide by 8 before sending to modeling function.
    469  unsigned int sse;
    470  int rate;
    471  int64_t dist;
    472  int plane;
    473  int64_t tot_sse = 0;
    474 
    475  this_rdc->rate = 0;
    476  this_rdc->dist = 0;
    477  this_rdc->skip_txfm = 0;
    478 
    479  for (plane = start_plane; plane <= stop_plane; ++plane) {
    480    struct macroblock_plane *const p = &x->plane[plane];
    481    struct macroblockd_plane *const pd = &xd->plane[plane];
    482    const uint32_t dc_quant = p->dequant_QTX[0];
    483    const uint32_t ac_quant = p->dequant_QTX[1];
    484    const BLOCK_SIZE bs = plane_bsize;
    485    unsigned int var;
    486    if (!x->color_sensitivity[COLOR_SENS_IDX(plane)]) continue;
    487 
    488    var = cpi->ppi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf,
    489                                  pd->dst.stride, &sse);
    490    assert(sse >= var);
    491    tot_sse += sse;
    492 
    493    av1_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs],
    494                                 dc_quant >> 3, &rate, &dist);
    495 
    496    this_rdc->rate += rate >> 1;
    497    this_rdc->dist += dist << 3;
    498 
    499    av1_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs], ac_quant >> 3,
    500                                 &rate, &dist);
    501 
    502    this_rdc->rate += rate;
    503    this_rdc->dist += dist << 4;
    504  }
    505 
    506  if (this_rdc->rate == 0) {
    507    this_rdc->skip_txfm = 1;
    508  }
    509 
    510  if (RDCOST(x->rdmult, this_rdc->rate, this_rdc->dist) >=
    511      RDCOST(x->rdmult, 0, tot_sse << 4)) {
    512    this_rdc->rate = 0;
    513    this_rdc->dist = tot_sse << 4;
    514    this_rdc->skip_txfm = 1;
    515  }
    516 
    517  return tot_sse;
    518 }
    519 
    520 static void compute_intra_yprediction(const AV1_COMMON *cm,
    521                                      PREDICTION_MODE mode, BLOCK_SIZE bsize,
    522                                      MACROBLOCK *x, MACROBLOCKD *xd) {
    523  const SequenceHeader *seq_params = cm->seq_params;
    524  struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
    525  struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
    526  uint8_t *const src_buf_base = p->src.buf;
    527  uint8_t *const dst_buf_base = pd->dst.buf;
    528  const int src_stride = p->src.stride;
    529  const int dst_stride = pd->dst.stride;
    530  int plane = 0;
    531  int row, col;
    532  // block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
    533  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
    534  // transform size varies per plane, look it up in a common way.
    535  const TX_SIZE tx_size = max_txsize_lookup[bsize];
    536  const BLOCK_SIZE plane_bsize =
    537      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
    538  // If mb_to_right_edge is < 0 we are in a situation in which
    539  // the current block size extends into the UMV and we won't
    540  // visit the sub blocks that are wholly within the UMV.
    541  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
    542  const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
    543  // Keep track of the row and column of the blocks we use so that we know
    544  // if we are in the unrestricted motion border.
    545  for (row = 0; row < max_blocks_high; row += (1 << tx_size)) {
    546    // Skip visiting the sub blocks that are wholly within the UMV.
    547    for (col = 0; col < max_blocks_wide; col += (1 << tx_size)) {
    548      p->src.buf = &src_buf_base[4 * (row * (int64_t)src_stride + col)];
    549      pd->dst.buf = &dst_buf_base[4 * (row * (int64_t)dst_stride + col)];
    550      av1_predict_intra_block(
    551          xd, seq_params->sb_size, seq_params->enable_intra_edge_filter,
    552          block_size_wide[bsize], block_size_high[bsize], tx_size, mode, 0, 0,
    553          FILTER_INTRA_MODES, pd->dst.buf, dst_stride, pd->dst.buf, dst_stride,
    554          0, 0, plane);
    555    }
    556  }
    557  p->src.buf = src_buf_base;
    558  pd->dst.buf = dst_buf_base;
    559 }
    560 
    561 // Checks whether Intra mode needs to be pruned based on
    562 // 'intra_y_mode_bsize_mask_nrd' and 'prune_hv_pred_modes_using_blksad'
    563 // speed features.
    564 static inline bool is_prune_intra_mode(
    565    AV1_COMP *cpi, int mode_index, int force_intra_check, BLOCK_SIZE bsize,
    566    uint8_t segment_id, SOURCE_SAD source_sad_nonrd,
    567    uint8_t color_sensitivity[MAX_MB_PLANE - 1]) {
    568  const PREDICTION_MODE this_mode = intra_mode_list[mode_index];
    569  if (mode_index > 2 || force_intra_check == 0) {
    570    if (!((1 << this_mode) & cpi->sf.rt_sf.intra_y_mode_bsize_mask_nrd[bsize]))
    571      return true;
    572 
    573    if (this_mode == DC_PRED) return false;
    574 
    575    if (!cpi->sf.rt_sf.prune_hv_pred_modes_using_src_sad) return false;
    576 
    577    const bool has_color_sensitivity =
    578        color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] &&
    579        color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)];
    580    if (has_color_sensitivity &&
    581        (cpi->rc.frame_source_sad > 1.1 * cpi->rc.avg_source_sad ||
    582         cyclic_refresh_segment_id_boosted(segment_id) ||
    583         source_sad_nonrd > kMedSad))
    584      return false;
    585 
    586    return true;
    587  }
    588  return false;
    589 }
    590 
    591 /*!\brief Estimation of RD cost of an intra mode for Non-RD optimized case.
    592 *
    593 * \ingroup nonrd_mode_search
    594 * \callgraph
    595 * \callergraph
    596 * Calculates RD Cost for an intra mode for a single TX block using Hadamard
    597 * transform.
    598 * \param[in]    plane          Color plane
    599 * \param[in]    block          Index of a TX block in a prediction block
    600 * \param[in]    row            Row of a current TX block
    601 * \param[in]    col            Column of a current TX block
    602 * \param[in]    plane_bsize    Block size of a current prediction block
    603 * \param[in]    tx_size        Transform size
    604 * \param[in]    arg            Pointer to a structure that holds parameters
    605 *                              for intra mode search
    606 *
    607 * \remark Nothing is returned. Instead, best mode and RD Cost of the best mode
    608 * are set in \c args->rdc and \c args->mode
    609 */
    610 void av1_estimate_block_intra(int plane, int block, int row, int col,
    611                              BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
    612                              void *arg) {
    613  struct estimate_block_intra_args *const args = arg;
    614  AV1_COMP *const cpi = args->cpi;
    615  AV1_COMMON *const cm = &cpi->common;
    616  MACROBLOCK *const x = args->x;
    617  MACROBLOCKD *const xd = &x->e_mbd;
    618  struct macroblock_plane *const p = &x->plane[plane];
    619  struct macroblockd_plane *const pd = &xd->plane[plane];
    620  const BLOCK_SIZE bsize_tx = txsize_to_bsize[tx_size];
    621  uint8_t *const src_buf_base = p->src.buf;
    622  uint8_t *const dst_buf_base = pd->dst.buf;
    623  const int64_t src_stride = p->src.stride;
    624  const int64_t dst_stride = pd->dst.stride;
    625 
    626  (void)block;
    627 
    628  av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size);
    629 
    630  if (args->prune_mode_based_on_sad || args->prune_palette_sad) {
    631    unsigned int this_sad = cpi->ppi->fn_ptr[plane_bsize].sdf(
    632        p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride);
    633    const unsigned int sad_threshold =
    634        args->best_sad != UINT_MAX ? args->best_sad + (args->best_sad >> 4)
    635                                   : UINT_MAX;
    636    // Skip the evaluation of current mode if its SAD is more than a threshold.
    637    if (args->prune_mode_based_on_sad && this_sad > sad_threshold) {
    638      // For the current mode, set rate and distortion to maximum possible
    639      // values and return.
    640      // Note: args->rdc->rate is checked in av1_nonrd_pick_intra_mode() to skip
    641      // the evaluation of the current mode.
    642      args->rdc->rate = INT_MAX;
    643      args->rdc->dist = INT64_MAX;
    644      return;
    645    }
    646    if (this_sad < args->best_sad) {
    647      args->best_sad = this_sad;
    648    }
    649  }
    650 
    651  RD_STATS this_rdc;
    652  av1_invalid_rd_stats(&this_rdc);
    653 
    654  p->src.buf = &src_buf_base[4 * (row * src_stride + col)];
    655  pd->dst.buf = &dst_buf_base[4 * (row * dst_stride + col)];
    656 
    657  if (plane == 0) {
    658    av1_block_yrd(x, &this_rdc, &args->skippable, bsize_tx,
    659                  AOMMIN(tx_size, TX_16X16));
    660  } else {
    661    av1_model_rd_for_sb_uv(cpi, bsize_tx, x, xd, &this_rdc, plane, plane);
    662  }
    663 
    664  p->src.buf = src_buf_base;
    665  pd->dst.buf = dst_buf_base;
    666  assert(args->rdc->rate != INT_MAX && args->rdc->dist != INT64_MAX);
    667  args->rdc->rate += this_rdc.rate;
    668  args->rdc->dist += this_rdc.dist;
    669 }
    670 
    671 /*!\brief Estimates best intra mode for inter mode search
    672 *
    673 * \ingroup nonrd_mode_search
    674 * \callgraph
    675 * \callergraph
    676 *
    677 * Uses heuristics based on the best inter mode, block size, source variance
    678 * and source SAD to decide whether to check intra modes. If so, estimates and
    679 * selects best intra mode from the reduced set of intra modes (max 4 intra
        * modes checked).
    680 *
    681 * \param[in]    cpi                      Top-level encoder structure
    682 * \param[in,out] x                       Pointer to structure holding all the
    683 *                                        data for the current macroblock. The
        *                                        mode info reached through x->e_mbd
        *                                        (mode, ref_frame, tx_size, uv_mode,
        *                                        mv) and the AOM_PLANE_Y dst buffer
        *                                        are modified while modes are tested
    684 * \param[in]    bsize                    Current block size
    685 * \param[in]    best_early_term          Flag, indicating that TX for the
    686 *                                        best inter mode was skipped
    687 * \param[in]    ref_cost_intra           Cost of signalling intra mode
    688 * \param[in]    reuse_prediction         Flag, indicating prediction re-use
    689 * \param[in]    orig_dst                 Original destination buffer
    690 * \param[in]    tmp_buffers              Pointer to temporary buffers for
    691 *                                        prediction re-use
    692 * \param[out]   this_mode_pred           Pointer to store prediction buffer
    693 *                                        for prediction re-use
    694 * \param[in,out] best_rdc                Pointer to RD cost of the best mode
    695 *                                        so far; overwritten when a tested
        *                                        intra mode has a lower RD cost
    696 * \param[in,out] best_pickmode           Pointer to a structure containing
    697 *                                        best mode picked so far
    698 * \param[in,out] ctx                     Pointer to structure holding coding
    699 *                                        contexts and modes for the block;
        *                                        its blk_skip array may be cleared
        * \param[out]   best_sad_norm            Receives args.best_sad right-shifted
        *                                        by b_width_log2_lookup[bsize] +
        *                                        b_height_log2_lookup[bsize]
    700 *
    701 * \remark Nothing is returned. Instead, calculated RD cost is placed to
    702 * \c best_rdc and best selected mode is placed to \c best_pickmode.
        * On the early-return paths (intra check not required, or the known intra
        * rate alone already costs more than \c best_rdc) none of the outputs,
        * including \c best_sad_norm, are written.
    703 *
    704 */
    705 void av1_estimate_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
    706                             int best_early_term, unsigned int ref_cost_intra,
    707                             int reuse_prediction, struct buf_2d *orig_dst,
    708                             PRED_BUFFER *tmp_buffers,
    709                             PRED_BUFFER **this_mode_pred, RD_STATS *best_rdc,
    710                             BEST_PICKMODE *best_pickmode,
    711                             PICK_MODE_CONTEXT *ctx,
    712                             unsigned int *best_sad_norm) {
    713  AV1_COMMON *const cm = &cpi->common;
    714  MACROBLOCKD *const xd = &x->e_mbd;
    715  MB_MODE_INFO *const mi = xd->mi[0];
    716  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
    717  const unsigned char segment_id = mi->segment_id;
    718  const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize];
    719  const int *const rd_thresh_freq_fact = x->thresh_freq_fact[bsize];
    720  const bool is_screen_content =
    721      cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
    722  struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
    723  const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf;
    724 
    725  const CommonQuantParams *quant_params = &cm->quant_params;
    726 
    727  RD_STATS this_rdc;
    728 
         // inter_mode_thresh is the RD cost of the intra signalling rate alone
         // (ref_cost_intra + intra_cost_penalty) with zero distortion.
    729  int intra_cost_penalty = av1_get_intra_cost_penalty(
    730      quant_params->base_qindex, quant_params->y_dc_delta_q,
    731      cm->seq_params->bit_depth);
    732  int64_t inter_mode_thresh =
    733      RDCOST(x->rdmult, ref_cost_intra + intra_cost_penalty, 0);
    734  int perform_intra_pred = rt_sf->check_intra_pred_nonrd;
    735  int force_intra_check = 0;
    736  // For spatial enhancement layer: turn off intra prediction if the
    737  // previous spatial layer as golden ref is not chosen as best reference.
    738  // only do this for temporal enhancement layer and on non-key frames.
    739  if (cpi->svc.spatial_layer_id > 0 &&
    740      best_pickmode->best_ref_frame != GOLDEN_FRAME &&
    741      cpi->svc.temporal_layer_id > 0 &&
    742      !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)
    743    perform_intra_pred = 0;
    744 
    745  int do_early_exit_rdthresh = 1;
    746 
    747  uint32_t spatial_var_thresh = 50;
    748  int motion_thresh = 32;
    749  // Adjust thresholds to make intra mode likely tested if the other
    750  // references (golden, alt) are skipped/not checked. For now always
    751  // adjust for svc mode.
    752  if (cpi->ppi->use_svc || (rt_sf->use_nonrd_altref_frame == 0 &&
    753                            rt_sf->nonrd_prune_ref_frame_search > 0)) {
    754    spatial_var_thresh = 150;
    755    motion_thresh = 0;
    756  }
    757 
    758  // Some adjustments to checking intra mode based on source variance.
    759  if (x->source_variance < spatial_var_thresh) {
    760    // If the best inter mode is large motion or non-LAST ref reduce intra cost
    761    // penalty, so intra mode is more likely tested.
    762    if (best_rdc->rdcost != INT64_MAX &&
    763        (best_pickmode->best_ref_frame != LAST_FRAME ||
    764         abs(mi->mv[0].as_mv.row) >= motion_thresh ||
    765         abs(mi->mv[0].as_mv.col) >= motion_thresh)) {
    766      intra_cost_penalty = intra_cost_penalty >> 2;
    767      inter_mode_thresh =
    768          RDCOST(x->rdmult, ref_cost_intra + intra_cost_penalty, 0);
    769      do_early_exit_rdthresh = 0;
    770    }
    771    if ((x->source_variance < AOMMAX(50, (spatial_var_thresh >> 1)) &&
    772         x->content_state_sb.source_sad_nonrd >= kHighSad) ||
    773        (is_screen_content && x->source_variance < 50 &&
    774         ((bsize >= BLOCK_32X32 &&
    775           x->content_state_sb.source_sad_nonrd != kZeroSad) ||
    776          x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 ||
    777          x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 1)))
    778      force_intra_check = 1;
    779    // For big blocks worth checking intra (since only DC will be checked),
    780    // even if best_early_term is set.
    781    if (bsize >= BLOCK_32X32) best_early_term = 0;
    782  } else if (rt_sf->source_metrics_sb_nonrd &&
    783             x->content_state_sb.source_sad_nonrd <= kLowSad) {
    784    perform_intra_pred = 0;
    785  }
    786 
         // If the best mode so far is a transform skip that was flagged as an
         // initial skip, the skip_intra_pred speed feature may disable the intra
         // check: level 1 only when the best mode is not NEWMV, level 2 always.
    787  if (best_rdc->skip_txfm && best_pickmode->best_mode_initial_skip_flag) {
    788    if (rt_sf->skip_intra_pred == 1 && best_pickmode->best_mode != NEWMV)
    789      perform_intra_pred = 0;
    790    else if (rt_sf->skip_intra_pred == 2)
    791      perform_intra_pred = 0;
    792  }
    793 
         // Continue only if best_rdc->rdcost is still INT64_MAX, the intra check is
         // forced, or intra prediction is enabled, the best inter mode did not
         // terminate early and bsize does not exceed max_intra_bsize.
    794  if (!(best_rdc->rdcost == INT64_MAX || force_intra_check ||
    795        (perform_intra_pred && !best_early_term &&
    796         bsize <= cpi->sf.part_sf.max_intra_bsize))) {
    797    return;
    798  }
    799 
    800  // Early exit based on RD cost calculated using known rate. When
    801  // is_screen_content is true, more bias is given to intra modes. Hence,
    802  // considered conservative threshold in early exit for the same.
    803  const int64_t known_rd = is_screen_content
    804                               ? CALC_BIASED_RDCOST(inter_mode_thresh)
    805                               : inter_mode_thresh;
    806  if (known_rd > best_rdc->rdcost) return;
    807 
    808  struct estimate_block_intra_args args;
    809  init_estimate_block_intra_args(&args, cpi, x);
    810  if (prune_palette_testing_inter(cpi, x->source_variance))
    811    args.prune_palette_sad = true;
         // Transform size for the intra estimate: the smaller of the largest size
         // for bsize and the largest size allowed by the tx mode, capped at
         // TX_16X16. Screen content on a high-source-SAD frame with variance above
         // the spatial threshold uses TX_4X4 for blocks up to 16x16.
    812  TX_SIZE intra_tx_size = AOMMIN(
    813      AOMMIN(max_txsize_lookup[bsize],
    814             tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]),
    815      TX_16X16);
    816  if (is_screen_content && cpi->rc.high_source_sad &&
    817      x->source_variance > spatial_var_thresh && bsize <= BLOCK_16X16)
    818    intra_tx_size = TX_4X4;
    819 
         // With prediction re-use, if the best prediction so far is stored in
         // orig_dst, copy it into a free temporary buffer and make that the best
         // prediction. pd->dst is then pointed at orig_dst; presumably the intra
         // predictions below are written there, which would otherwise overwrite
         // the saved prediction (TODO confirm against compute_intra_yprediction).
    820  PRED_BUFFER *const best_pred = best_pickmode->best_pred;
    821  if (reuse_prediction && best_pred != NULL) {
    822    const int bh = block_size_high[bsize];
    823    const int bw = block_size_wide[bsize];
    824    if (best_pred->data == orig_dst->buf) {
    825      *this_mode_pred = &tmp_buffers[get_pred_buffer(tmp_buffers, 3)];
    826      aom_convolve_copy(best_pred->data, best_pred->stride,
    827                        (*this_mode_pred)->data, (*this_mode_pred)->stride, bw,
    828                        bh);
    829      best_pickmode->best_pred = *this_mode_pred;
    830    }
    831  }
    832  pd->dst = *orig_dst;
    833 
    834  for (int midx = 0; midx < RTC_INTRA_MODES; ++midx) {
    835    const PREDICTION_MODE this_mode = intra_mode_list[midx];
    836    const THR_MODES mode_index = mode_idx[INTRA_FRAME][mode_offset(this_mode)];
    837    const int64_t mode_rd_thresh = rd_threshes[mode_index];
    838 
    839    if (is_prune_intra_mode(cpi, midx, force_intra_check, bsize, segment_id,
    840                            x->content_state_sb.source_sad_nonrd,
    841                            x->color_sensitivity))
    842      continue;
    843 
    844    if (is_screen_content && rt_sf->source_metrics_sb_nonrd) {
    845      // For spatially flat blocks with zero motion only check
    846      // DC mode.
    847      if (x->content_state_sb.source_sad_nonrd == kZeroSad &&
    848          x->source_variance == 0 && this_mode != DC_PRED)
    849        continue;
    850      // Only test Intra for big blocks if spatial_variance is small.
    851      else if (bsize > BLOCK_32X32 && x->source_variance > 50)
    852        continue;
    853    }
    854 
           // Skip the mode when rd_less_than_thresh() holds. If the early exit was
           // disabled above (do_early_exit_rdthresh == 0), this skip still applies
           // to SMOOTH_PRED only.
    855    if (rd_less_than_thresh(best_rdc->rdcost, mode_rd_thresh,
    856                            rd_thresh_freq_fact[mode_index]) &&
    857        (do_early_exit_rdthresh || this_mode == SMOOTH_PRED)) {
    858      continue;
    859    }
    860    const BLOCK_SIZE uv_bsize =
    861        get_plane_block_size(bsize, xd->plane[AOM_PLANE_U].subsampling_x,
    862                             xd->plane[AOM_PLANE_U].subsampling_y);
    863 
    864    mi->mode = this_mode;
    865    mi->ref_frame[0] = INTRA_FRAME;
    866    mi->ref_frame[1] = NONE_FRAME;
    867 
    868    av1_invalid_rd_stats(&this_rdc);
    869    args.mode = this_mode;
    870    args.skippable = 1;
    871    args.rdc = &this_rdc;
    872    mi->tx_size = intra_tx_size;
    873    compute_intra_yprediction(cm, this_mode, bsize, x, xd);
    874    // Look into selecting tx_size here, based on prediction residual.
    875    av1_block_yrd(x, &this_rdc, &args.skippable, bsize, mi->tx_size);
    876    // TODO(kyslov@) Need to account for skippable
           // Chroma is estimated only for planes whose color_sensitivity flag is
           // set; av1_estimate_block_intra adds each block's rate and dist to
           // this_rdc through args.rdc.
    877    if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)]) {
    878      av1_foreach_transformed_block_in_plane(xd, uv_bsize, AOM_PLANE_U,
    879                                             av1_estimate_block_intra, &args);
    880    }
    881    if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]) {
    882      av1_foreach_transformed_block_in_plane(xd, uv_bsize, AOM_PLANE_V,
    883                                             av1_estimate_block_intra, &args);
    884    }
    885 
           // Extra mode signalling rate: angle delta cost for directional modes
           // (when angle deltas are used for bsize) and the filter-intra cost entry
           // [bsize][0] for DC_PRED when filter intra is allowed for bsize.
    886    int mode_cost = 0;
    887    if (av1_is_directional_mode(this_mode) && av1_use_angle_delta(bsize)) {
    888      mode_cost +=
    889          x->mode_costs.angle_delta_cost[this_mode - V_PRED]
    890                                        [MAX_ANGLE_DELTA +
    891                                         mi->angle_delta[PLANE_TYPE_Y]];
    892    }
    893    if (this_mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) {
    894      mode_cost += x->mode_costs.filter_intra_cost[bsize][0];
    895    }
    896    this_rdc.rate += ref_cost_intra;
    897    this_rdc.rate += intra_cost_penalty;
    898    this_rdc.rate += mode_cost;
    899    this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
    900 
    901    if (is_screen_content && rt_sf->source_metrics_sb_nonrd) {
    902      // For blocks with low spatial variance and color sad,
    903      // favor the intra-modes, only on scene/slide change.
    904      if (cpi->rc.high_source_sad && x->source_variance < 800 &&
    905          (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] ||
    906           x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]))
    907        this_rdc.rdcost = CALC_BIASED_RDCOST(this_rdc.rdcost);
    908      // Otherwise bias against intra for blocks with zero
    909      // motion and no color, on non-scene/slide changes.
    910      else if (!cpi->rc.high_source_sad && x->source_variance > 0 &&
    911               x->content_state_sb.source_sad_nonrd == kZeroSad &&
    912               x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
    913               x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 0)
    914        this_rdc.rdcost = (3 * this_rdc.rdcost) >> 1;
    915    }
    916 
           // Record this intra mode as the new best when its (possibly biased) RD
           // cost is lower than the best so far.
    917    if (this_rdc.rdcost < best_rdc->rdcost) {
    918      *best_rdc = this_rdc;
    919      best_pickmode->best_mode = this_mode;
    920      best_pickmode->best_tx_size = mi->tx_size;
    921      best_pickmode->best_ref_frame = INTRA_FRAME;
    922      best_pickmode->best_second_ref_frame = NONE;
    923      best_pickmode->best_mode_skip_txfm = this_rdc.skip_txfm;
    924      mi->uv_mode = this_mode;
    925      mi->mv[0].as_int = INVALID_MV;
    926      mi->mv[1].as_int = INVALID_MV;
    927      if (!this_rdc.skip_txfm)
    928        memset(ctx->blk_skip, 0,
    929               sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
    930    }
    931  }
         // If the best reference is INTRA_FRAME at this point, clear all
         // ctx->num_4x4_blk blk_skip entries (independent of skip_txfm).
    932  if (best_pickmode->best_ref_frame == INTRA_FRAME)
    933    memset(ctx->blk_skip, 0,
    934           sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
         // The loop overwrote mi->tx_size for every tested mode; restore the size
         // that belongs to the best mode.
    935  mi->tx_size = best_pickmode->best_tx_size;
    936 
         // NOTE(review): av1_estimate_block_intra updates args.best_sad only when
         // prune_mode_based_on_sad or prune_palette_sad is set, and it is invoked
         // above only for chroma planes with color_sensitivity set. Otherwise this
         // shifts whatever init_estimate_block_intra_args() stored - confirm that
         // callers expect that value.
    937  *best_sad_norm = args.best_sad >>
    938                   (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
    939 }