tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

mcomp.c (162025B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <assert.h>
     13 #include <limits.h>
     14 #include <math.h>
     15 #include <stdio.h>
     16 
     17 #include "config/aom_config.h"
     18 #include "config/aom_dsp_rtcd.h"
     19 
     20 #include "aom_dsp/aom_dsp_common.h"
     21 #include "aom_mem/aom_mem.h"
     22 #include "aom_ports/mem.h"
     23 
     24 #include "av1/common/av1_common_int.h"
     25 #include "av1/common/common.h"
     26 #include "av1/common/filter.h"
     27 #include "av1/common/mvref_common.h"
     28 #include "av1/common/reconinter.h"
     29 
     30 #include "av1/encoder/encoder.h"
     31 #include "av1/encoder/encodemv.h"
     32 #include "av1/encoder/mcomp.h"
     33 #include "av1/encoder/rdopt.h"
     34 #include "av1/encoder/reconinter_enc.h"
     35 
     36 static inline void init_mv_cost_params(MV_COST_PARAMS *mv_cost_params,
     37                                       const MvCosts *mv_costs,
     38                                       const MV *ref_mv, int errorperbit,
     39                                       int sadperbit) {
     40  mv_cost_params->ref_mv = ref_mv;
     41  mv_cost_params->full_ref_mv = get_fullmv_from_mv(ref_mv);
     42  mv_cost_params->mv_cost_type = MV_COST_ENTROPY;
     43  mv_cost_params->error_per_bit = errorperbit;
     44  mv_cost_params->sad_per_bit = sadperbit;
     45  // For allintra encoding mode, 'mv_costs' is not allocated. Hence, the
     46  // population of mvjcost and mvcost are avoided. In case of IntraBC, these
     47  // values are populated from 'dv_costs' in av1_set_ms_to_intra_mode().
     48  if (mv_costs != NULL) {
     49    mv_cost_params->mvjcost = mv_costs->nmv_joint_cost;
     50    mv_cost_params->mvcost[0] = mv_costs->mv_cost_stack[0];
     51    mv_cost_params->mvcost[1] = mv_costs->mv_cost_stack[1];
     52  }
     53 }
     54 
     55 static inline void init_ms_buffers(MSBuffers *ms_buffers, const MACROBLOCK *x) {
     56  ms_buffers->ref = &x->e_mbd.plane[0].pre[0];
     57  ms_buffers->src = &x->plane[0].src;
     58 
     59  av1_set_ms_compound_refs(ms_buffers, NULL, NULL, 0, 0);
     60 
     61  ms_buffers->wsrc = x->obmc_buffer.wsrc;
     62  ms_buffers->obmc_mask = x->obmc_buffer.mask;
     63 }
     64 
     65 void av1_init_obmc_buffer(OBMCBuffer *obmc_buffer) {
     66  obmc_buffer->wsrc = NULL;
     67  obmc_buffer->mask = NULL;
     68  obmc_buffer->above_pred = NULL;
     69  obmc_buffer->left_pred = NULL;
     70 }
     71 
     72 void av1_make_default_fullpel_ms_params(
     73    FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi,
     74    MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv, FULLPEL_MV start_mv,
     75    const search_site_config search_sites[NUM_DISTINCT_SEARCH_METHODS],
     76    SEARCH_METHODS search_method, int fine_search_interval) {
     77  const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf;
     78  const int is_key_frame =
     79      cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == KF_UPDATE;
     80 
     81  // High level params
     82  ms_params->bsize = bsize;
     83  ms_params->vfp = &cpi->ppi->fn_ptr[bsize];
     84 
     85  init_ms_buffers(&ms_params->ms_buffers, x);
     86 
     87  av1_set_mv_search_method(ms_params, search_sites, search_method);
     88 
     89  ms_params->mesh_patterns[0] = mv_sf->mesh_patterns;
     90  ms_params->mesh_patterns[1] = mv_sf->intrabc_mesh_patterns;
     91  ms_params->force_mesh_thresh = mv_sf->exhaustive_searches_thresh;
     92  ms_params->prune_mesh_search =
     93      (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_2) ? 1 : 0;
     94  ms_params->mesh_search_mv_diff_threshold = 4;
     95  ms_params->run_mesh_search = 0;
     96  ms_params->fine_search_interval = fine_search_interval;
     97 
     98  ms_params->is_intra_mode = 0;
     99 
    100  ms_params->fast_obmc_search = mv_sf->obmc_full_pixel_search_level;
    101 
    102  ms_params->mv_limits = x->mv_limits;
    103  av1_set_mv_search_range(&ms_params->mv_limits, ref_mv);
    104 
    105  if (cpi->oxcf.algo_cfg.sharpness == 3) {
    106    int top_margin = x->e_mbd.mi_row * MI_SIZE + 8;
    107    int left_margin = x->e_mbd.mi_col * MI_SIZE + 8;
    108    int bottom_margin =
    109        cpi->common.height - mi_size_high[bsize] * MI_SIZE - top_margin + 16;
    110    int right_margin =
    111        cpi->common.width - mi_size_wide[bsize] * MI_SIZE - left_margin + 16;
    112 
    113    bottom_margin = AOMMAX(bottom_margin, -top_margin);
    114    right_margin = AOMMAX(right_margin, -left_margin);
    115 
    116    FullMvLimits *mv_limits = &ms_params->mv_limits;
    117    mv_limits->row_min = AOMMAX(mv_limits->row_min, -top_margin);
    118    mv_limits->row_max = AOMMIN(mv_limits->row_max, bottom_margin);
    119    mv_limits->col_min = AOMMAX(mv_limits->col_min, -left_margin);
    120    mv_limits->col_max = AOMMIN(mv_limits->col_max, right_margin);
    121  }
    122 
    123  // Mvcost params
    124  init_mv_cost_params(&ms_params->mv_cost_params, x->mv_costs, ref_mv,
    125                      x->errorperbit, x->sadperbit);
    126 
    127  ms_params->sdf = ms_params->vfp->sdf;
    128  ms_params->sdx4df = ms_params->vfp->sdx4df;
    129  ms_params->sdx3df = ms_params->vfp->sdx3df;
    130 
    131  if (mv_sf->use_downsampled_sad == 2 && block_size_high[bsize] >= 16) {
    132    assert(ms_params->vfp->sdsf != NULL);
    133    ms_params->sdf = ms_params->vfp->sdsf;
    134    assert(ms_params->vfp->sdsx4df != NULL);
    135    ms_params->sdx4df = ms_params->vfp->sdsx4df;
    136    // Skip version of sadx3 is not available yet
    137    ms_params->sdx3df = ms_params->vfp->sdsx4df;
    138  } else if (mv_sf->use_downsampled_sad == 1 && block_size_high[bsize] >= 16 &&
    139             !is_key_frame) {
    140    FULLPEL_MV start_mv_clamped = start_mv;
    141    // adjust start_mv to make sure it is within MV range
    142    clamp_fullmv(&start_mv_clamped, &ms_params->mv_limits);
    143 
    144    const struct buf_2d *const ref = ms_params->ms_buffers.ref;
    145    const int ref_stride = ref->stride;
    146    const uint8_t *best_address = get_buf_from_fullmv(ref, &start_mv_clamped);
    147    const struct buf_2d *const src = ms_params->ms_buffers.src;
    148    const uint8_t *src_buf = src->buf;
    149    const int src_stride = src->stride;
    150 
    151    unsigned int start_mv_sad_even_rows, start_mv_sad_odd_rows;
    152    assert(ms_params->vfp->sdsf != NULL);
    153    start_mv_sad_even_rows =
    154        ms_params->vfp->sdsf(src_buf, src_stride, best_address, ref_stride);
    155    start_mv_sad_odd_rows =
    156        ms_params->vfp->sdsf(src_buf + src_stride, src_stride,
    157                             best_address + ref_stride, ref_stride);
    158 
    159    // If the absolute SAD difference computed between the pred-to-src of even
    160    // and odd rows is small, skip every other row in sad computation.
    161    const int odd_to_even_diff_sad =
    162        abs((int)start_mv_sad_even_rows - (int)start_mv_sad_odd_rows);
    163    const int mult_thresh = 4;
    164    if (odd_to_even_diff_sad * mult_thresh < (int)start_mv_sad_even_rows) {
    165      ms_params->sdf = ms_params->vfp->sdsf;
    166      assert(ms_params->vfp->sdsx4df != NULL);
    167      ms_params->sdx4df = ms_params->vfp->sdsx4df;
    168      ms_params->sdx3df = ms_params->vfp->sdsx4df;
    169    }
    170  }
    171 }
    172 
    173 void av1_set_ms_to_intra_mode(FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
    174                              const IntraBCMVCosts *dv_costs) {
    175  ms_params->is_intra_mode = 1;
    176 
    177  MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
    178 
    179  mv_cost_params->mvjcost = dv_costs->joint_mv;
    180  mv_cost_params->mvcost[0] = dv_costs->dv_costs[0];
    181  mv_cost_params->mvcost[1] = dv_costs->dv_costs[1];
    182 }
    183 
    184 void av1_make_default_subpel_ms_params(SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
    185                                       const struct AV1_COMP *cpi,
    186                                       const MACROBLOCK *x, BLOCK_SIZE bsize,
    187                                       const MV *ref_mv, const int *cost_list) {
    188  const AV1_COMMON *cm = &cpi->common;
    189  // High level params
    190  ms_params->allow_hp = cm->features.allow_high_precision_mv;
    191  ms_params->forced_stop = cpi->sf.mv_sf.subpel_force_stop;
    192  ms_params->iters_per_step = cpi->sf.mv_sf.subpel_iters_per_step;
    193  ms_params->cost_list = cond_cost_list_const(cpi, cost_list);
    194 
    195  av1_set_subpel_mv_search_range(&ms_params->mv_limits, &x->mv_limits, ref_mv);
    196 
    197  if (cpi->oxcf.algo_cfg.sharpness == 3) {
    198    int top_margin = GET_MV_SUBPEL(x->e_mbd.mi_row * MI_SIZE + 8);
    199    int left_margin = GET_MV_SUBPEL(x->e_mbd.mi_col * MI_SIZE + 8);
    200    int bottom_margin =
    201        GET_MV_SUBPEL(cpi->common.height - mi_size_high[bsize] * MI_SIZE -
    202                      x->e_mbd.mi_row * MI_SIZE + 8);
    203    int right_margin =
    204        GET_MV_SUBPEL(cpi->common.width - mi_size_wide[bsize] * MI_SIZE -
    205                      x->e_mbd.mi_col * MI_SIZE + 8);
    206 
    207    bottom_margin = AOMMAX(bottom_margin, -top_margin);
    208    right_margin = AOMMAX(right_margin, -left_margin);
    209 
    210    SubpelMvLimits *mv_limits = &ms_params->mv_limits;
    211    mv_limits->row_min = AOMMAX(mv_limits->row_min, -top_margin);
    212    mv_limits->row_max = AOMMIN(mv_limits->row_max, bottom_margin);
    213    mv_limits->col_min = AOMMAX(mv_limits->col_min, -left_margin);
    214    mv_limits->col_max = AOMMIN(mv_limits->col_max, right_margin);
    215  }
    216 
    217  // Mvcost params
    218  init_mv_cost_params(&ms_params->mv_cost_params, x->mv_costs, ref_mv,
    219                      x->errorperbit, x->sadperbit);
    220 
    221  // Subpel variance params
    222  ms_params->var_params.vfp = &cpi->ppi->fn_ptr[bsize];
    223  ms_params->var_params.subpel_search_type =
    224      cpi->sf.mv_sf.use_accurate_subpel_search;
    225  ms_params->var_params.w = block_size_wide[bsize];
    226  ms_params->var_params.h = block_size_high[bsize];
    227 
    228  // Ref and src buffers
    229  MSBuffers *ms_buffers = &ms_params->var_params.ms_buffers;
    230  init_ms_buffers(ms_buffers, x);
    231 }
    232 
    233 void av1_set_mv_search_range(FullMvLimits *mv_limits, const MV *mv) {
    234  // Calculate the outermost full-pixel MVs which are inside the limits set by
    235  // av1_set_subpel_mv_search_range().
    236  //
    237  // The subpel limits are simply mv->col +/- 8*MAX_FULL_PEL_VAL, and similar
    238  // for mv->row. We can then divide by 8 to find the fullpel MV limits. But
    239  // we have to be careful about the rounding. We want these bounds to be
    240  // at least as tight as the subpel limits, which means that we must round
    241  // the minimum values up and the maximum values down when dividing.
    242  int col_min = ((mv->col + 7) >> 3) - MAX_FULL_PEL_VAL;
    243  int row_min = ((mv->row + 7) >> 3) - MAX_FULL_PEL_VAL;
    244  int col_max = (mv->col >> 3) + MAX_FULL_PEL_VAL;
    245  int row_max = (mv->row >> 3) + MAX_FULL_PEL_VAL;
    246 
    247  col_min = AOMMAX(col_min, (MV_LOW >> 3) + 1);
    248  row_min = AOMMAX(row_min, (MV_LOW >> 3) + 1);
    249  col_max = AOMMIN(col_max, (MV_UPP >> 3) - 1);
    250  row_max = AOMMIN(row_max, (MV_UPP >> 3) - 1);
    251 
    252  // Get intersection of UMV window and valid MV window to reduce # of checks
    253  // in diamond search.
    254  mv_limits->col_min = AOMMAX(mv_limits->col_min, col_min);
    255  mv_limits->col_max = AOMMIN(mv_limits->col_max, col_max);
    256  mv_limits->row_min = AOMMAX(mv_limits->row_min, row_min);
    257  mv_limits->row_max = AOMMIN(mv_limits->row_max, row_max);
    258 
    259  mv_limits->col_max = AOMMAX(mv_limits->col_min, mv_limits->col_max);
    260  mv_limits->row_max = AOMMAX(mv_limits->row_min, mv_limits->row_max);
    261 }
    262 
    263 int av1_init_search_range(int size) {
    264  int sr = 0;
    265  // Minimum search size no matter what the passed in value.
    266  size = AOMMAX(16, size);
    267 
    268  while ((size << sr) < MAX_FULL_PEL_VAL) sr++;
    269 
    270  sr = AOMMIN(sr, MAX_MVSEARCH_STEPS - 2);
    271  return sr;
    272 }
    273 
    274 // ============================================================================
    275 //  Cost of motion vectors
    276 // ============================================================================
    277 // TODO(any): Adaptively adjust the regularization strength based on image size
    278 // and motion activity instead of using hard-coded values. It seems like we
    279 // roughly half the lambda for each increase in resolution
    280 // These are multiplier used to perform regularization in motion compensation
    281 // when x->mv_cost_type is set to MV_COST_L1.
    282 // LOWRES
    283 #define SSE_LAMBDA_LOWRES 2   // Used by mv_cost_err_fn
    284 #define SAD_LAMBDA_LOWRES 32  // Used by mvsad_err_cost during full pixel search
    285 // MIDRES
    286 #define SSE_LAMBDA_MIDRES 0   // Used by mv_cost_err_fn
    287 #define SAD_LAMBDA_MIDRES 15  // Used by mvsad_err_cost during full pixel search
    288 // HDRES
    289 #define SSE_LAMBDA_HDRES 1  // Used by mv_cost_err_fn
    290 #define SAD_LAMBDA_HDRES 8  // Used by mvsad_err_cost during full pixel search
    291 
    292 // Returns the rate of encoding the current motion vector based on the
    293 // joint_cost and comp_cost. joint_costs covers the cost of transmitting
    294 // JOINT_MV, and comp_cost covers the cost of transmitting the actual motion
    295 // vector.
    296 static inline int mv_cost(const MV *mv, const int *joint_cost,
    297                          const int *const comp_cost[2]) {
    298  return joint_cost[av1_get_mv_joint(mv)] + comp_cost[0][mv->row] +
    299         comp_cost[1][mv->col];
    300 }
    301 
    302 #define CONVERT_TO_CONST_MVCOST(ptr) ((const int *const *)(ptr))
    303 // Returns the cost of encoding the motion vector diff := *mv - *ref. The cost
    304 // is defined as the rate required to encode diff * weight, rounded to the
    305 // nearest 2 ** 7.
    306 // This is NOT used during motion compensation.
    307 int av1_mv_bit_cost(const MV *mv, const MV *ref_mv, const int *mvjcost,
    308                    int *const mvcost[2], int weight) {
    309  const MV diff = { mv->row - ref_mv->row, mv->col - ref_mv->col };
    310  return ROUND_POWER_OF_TWO(
    311      mv_cost(&diff, mvjcost, CONVERT_TO_CONST_MVCOST(mvcost)) * weight, 7);
    312 }
    313 
    314 // Returns the cost of using the current mv during the motion search. This is
    315 // used when var is used as the error metric.
    316 #define PIXEL_TRANSFORM_ERROR_SCALE 4
    317 static inline int mv_err_cost(const MV *mv, const MV *ref_mv,
    318                              const int *mvjcost, const int *const mvcost[2],
    319                              int error_per_bit, MV_COST_TYPE mv_cost_type) {
    320  const MV diff = { mv->row - ref_mv->row, mv->col - ref_mv->col };
    321  const MV abs_diff = { abs(diff.row), abs(diff.col) };
    322 
    323  switch (mv_cost_type) {
    324    case MV_COST_ENTROPY:
    325      if (mvcost) {
    326        return (int)ROUND_POWER_OF_TWO_64(
    327            (int64_t)mv_cost(&diff, mvjcost, mvcost) * error_per_bit,
    328            RDDIV_BITS + AV1_PROB_COST_SHIFT - RD_EPB_SHIFT +
    329                PIXEL_TRANSFORM_ERROR_SCALE);
    330      }
    331      return 0;
    332    case MV_COST_L1_LOWRES:
    333      return (SSE_LAMBDA_LOWRES * (abs_diff.row + abs_diff.col)) >> 3;
    334    case MV_COST_L1_MIDRES:
    335      return (SSE_LAMBDA_MIDRES * (abs_diff.row + abs_diff.col)) >> 3;
    336    case MV_COST_L1_HDRES:
    337      return (SSE_LAMBDA_HDRES * (abs_diff.row + abs_diff.col)) >> 3;
    338    case MV_COST_NONE: return 0;
    339    default: assert(0 && "Invalid rd_cost_type"); return 0;
    340  }
    341 }
    342 
    343 static inline int mv_err_cost_(const MV *mv,
    344                               const MV_COST_PARAMS *mv_cost_params) {
    345  if (mv_cost_params->mv_cost_type == MV_COST_NONE) {
    346    return 0;
    347  }
    348  return mv_err_cost(mv, mv_cost_params->ref_mv, mv_cost_params->mvjcost,
    349                     mv_cost_params->mvcost, mv_cost_params->error_per_bit,
    350                     mv_cost_params->mv_cost_type);
    351 }
    352 
    353 // Returns the cost of using the current mv during the motion search. This is
    354 // only used during full pixel motion search when sad is used as the error
    355 // metric
    356 static inline int mvsad_err_cost(const FULLPEL_MV *mv, const FULLPEL_MV *ref_mv,
    357                                 const int *mvjcost, const int *const mvcost[2],
    358                                 int sad_per_bit, MV_COST_TYPE mv_cost_type) {
    359  const MV diff = { GET_MV_SUBPEL(mv->row - ref_mv->row),
    360                    GET_MV_SUBPEL(mv->col - ref_mv->col) };
    361 
    362  switch (mv_cost_type) {
    363    case MV_COST_ENTROPY:
    364      return ROUND_POWER_OF_TWO(
    365          (unsigned)mv_cost(&diff, mvjcost, CONVERT_TO_CONST_MVCOST(mvcost)) *
    366              sad_per_bit,
    367          AV1_PROB_COST_SHIFT);
    368    case MV_COST_L1_LOWRES:
    369      return (SAD_LAMBDA_LOWRES * (abs(diff.row) + abs(diff.col))) >> 3;
    370    case MV_COST_L1_MIDRES:
    371      return (SAD_LAMBDA_MIDRES * (abs(diff.row) + abs(diff.col))) >> 3;
    372    case MV_COST_L1_HDRES:
    373      return (SAD_LAMBDA_HDRES * (abs(diff.row) + abs(diff.col))) >> 3;
    374    case MV_COST_NONE: return 0;
    375    default: assert(0 && "Invalid rd_cost_type"); return 0;
    376  }
    377 }
    378 
    379 static inline int mvsad_err_cost_(const FULLPEL_MV *mv,
    380                                  const MV_COST_PARAMS *mv_cost_params) {
    381  return mvsad_err_cost(mv, &mv_cost_params->full_ref_mv,
    382                        mv_cost_params->mvjcost, mv_cost_params->mvcost,
    383                        mv_cost_params->sad_per_bit,
    384                        mv_cost_params->mv_cost_type);
    385 }
    386 
    387 // =============================================================================
    388 //  Fullpixel Motion Search: Translational
    389 // =============================================================================
    390 #define MAX_PATTERN_SCALES 11
    391 #define MAX_PATTERN_CANDIDATES 8  // max number of candidates per scale
    392 #define PATTERN_CANDIDATES_REF 3  // number of refinement candidates
    393 
    394 // Search site initialization for DIAMOND / CLAMPED_DIAMOND search methods.
    395 // level = 0: DIAMOND, level = 1: CLAMPED_DIAMOND.
    396 static void init_dsmotion_compensation(search_site_config *cfg, int stride,
    397                                       int level) {
    398  int num_search_steps = 0;
    399  int stage_index = MAX_MVSEARCH_STEPS - 1;
    400 
    401  cfg->site[stage_index][0].mv.col = cfg->site[stage_index][0].mv.row = 0;
    402  cfg->site[stage_index][0].offset = 0;
    403  cfg->stride = stride;
    404 
    405  // Choose the initial step size depending on level.
    406  const int first_step = (level > 0) ? (MAX_FIRST_STEP / 4) : MAX_FIRST_STEP;
    407 
    408  for (int radius = first_step; radius > 0;) {
    409    int num_search_pts = 8;
    410 
    411    const FULLPEL_MV search_site_mvs[13] = {
    412      { 0, 0 },           { -radius, 0 },      { radius, 0 },
    413      { 0, -radius },     { 0, radius },       { -radius, -radius },
    414      { radius, radius }, { -radius, radius }, { radius, -radius },
    415    };
    416 
    417    int i;
    418    for (i = 0; i <= num_search_pts; ++i) {
    419      search_site *const site = &cfg->site[stage_index][i];
    420      site->mv = search_site_mvs[i];
    421      site->offset = get_offset_from_fullmv(&site->mv, stride);
    422    }
    423    cfg->searches_per_step[stage_index] = num_search_pts;
    424    cfg->radius[stage_index] = radius;
    425    // Update the search radius based on level.
    426    if (!level || ((stage_index < 9) && level)) radius /= 2;
    427    --stage_index;
    428    ++num_search_steps;
    429  }
    430  cfg->num_search_steps = num_search_steps;
    431 }
    432 
    433 void av1_init_motion_fpf(search_site_config *cfg, int stride) {
    434  int num_search_steps = 0;
    435  int stage_index = MAX_MVSEARCH_STEPS - 1;
    436 
    437  cfg->site[stage_index][0].mv.col = cfg->site[stage_index][0].mv.row = 0;
    438  cfg->site[stage_index][0].offset = 0;
    439  cfg->stride = stride;
    440 
    441  for (int radius = MAX_FIRST_STEP; radius > 0; radius /= 2) {
    442    // Generate offsets for 8 search sites per step.
    443    int tan_radius = AOMMAX((int)(0.41 * radius), 1);
    444    int num_search_pts = 12;
    445    if (radius == 1) num_search_pts = 8;
    446 
    447    const FULLPEL_MV search_site_mvs[13] = {
    448      { 0, 0 },
    449      { -radius, 0 },
    450      { radius, 0 },
    451      { 0, -radius },
    452      { 0, radius },
    453      { -radius, -tan_radius },
    454      { radius, tan_radius },
    455      { -tan_radius, radius },
    456      { tan_radius, -radius },
    457      { -radius, tan_radius },
    458      { radius, -tan_radius },
    459      { tan_radius, radius },
    460      { -tan_radius, -radius },
    461    };
    462 
    463    int i;
    464    for (i = 0; i <= num_search_pts; ++i) {
    465      search_site *const site = &cfg->site[stage_index][i];
    466      site->mv = search_site_mvs[i];
    467      site->offset = get_offset_from_fullmv(&site->mv, stride);
    468    }
    469    cfg->searches_per_step[stage_index] = num_search_pts;
    470    cfg->radius[stage_index] = radius;
    471    --stage_index;
    472    ++num_search_steps;
    473  }
    474  cfg->num_search_steps = num_search_steps;
    475 }
    476 
    477 // Search site initialization for NSTEP / NSTEP_8PT search methods.
    478 // level = 0: NSTEP, level = 1: NSTEP_8PT.
    479 static void init_motion_compensation_nstep(search_site_config *cfg, int stride,
    480                                           int level) {
    481  int num_search_steps = 0;
    482  int stage_index = 0;
    483  cfg->stride = stride;
    484  int radius = 1;
    485  const int num_stages = (level > 0) ? 16 : 15;
    486  for (stage_index = 0; stage_index < num_stages; ++stage_index) {
    487    int tan_radius = AOMMAX((int)(0.41 * radius), 1);
    488    int num_search_pts = 12;
    489    if ((radius <= 5) || (level > 0)) {
    490      tan_radius = radius;
    491      num_search_pts = 8;
    492    }
    493    const FULLPEL_MV search_site_mvs[13] = {
    494      { 0, 0 },
    495      { -radius, 0 },
    496      { radius, 0 },
    497      { 0, -radius },
    498      { 0, radius },
    499      { -radius, -tan_radius },
    500      { radius, tan_radius },
    501      { -tan_radius, radius },
    502      { tan_radius, -radius },
    503      { -radius, tan_radius },
    504      { radius, -tan_radius },
    505      { tan_radius, radius },
    506      { -tan_radius, -radius },
    507    };
    508 
    509    for (int i = 0; i <= num_search_pts; ++i) {
    510      search_site *const site = &cfg->site[stage_index][i];
    511      site->mv = search_site_mvs[i];
    512      site->offset = get_offset_from_fullmv(&site->mv, stride);
    513    }
    514    cfg->searches_per_step[stage_index] = num_search_pts;
    515    cfg->radius[stage_index] = radius;
    516    ++num_search_steps;
    517    if (stage_index < 12)
    518      radius = (int)AOMMAX((radius * 1.5 + 0.5), radius + 1);
    519  }
    520  cfg->num_search_steps = num_search_steps;
    521 }
    522 
    523 // Search site initialization for BIGDIA / FAST_BIGDIA / FAST_DIAMOND
    524 // search methods.
    525 static void init_motion_compensation_bigdia(search_site_config *cfg, int stride,
    526                                            int level) {
    527  (void)level;
    528  cfg->stride = stride;
    529  // First scale has 4-closest points, the rest have 8 points in diamond
    530  // shape at increasing scales
    531  static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = {
    532    4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
    533  };
    534 
    535  // BIGDIA search method candidates.
    536  // Note that the largest candidate step at each scale is 2^scale
    537  /* clang-format off */
    538  static const FULLPEL_MV
    539      site_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
    540          { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }, { 0, 0 }, { 0, 0 },
    541            { 0, 0 }, { 0, 0 } },
    542          { { -1, -1 }, { 0, -2 }, { 1, -1 }, { 2, 0 }, { 1, 1 }, { 0, 2 },
    543            { -1, 1 }, { -2, 0 } },
    544          { { -2, -2 }, { 0, -4 }, { 2, -2 }, { 4, 0 }, { 2, 2 }, { 0, 4 },
    545            { -2, 2 }, { -4, 0 } },
    546          { { -4, -4 }, { 0, -8 }, { 4, -4 }, { 8, 0 }, { 4, 4 }, { 0, 8 },
    547            { -4, 4 }, { -8, 0 } },
    548          { { -8, -8 }, { 0, -16 }, { 8, -8 }, { 16, 0 }, { 8, 8 }, { 0, 16 },
    549            { -8, 8 }, { -16, 0 } },
    550          { { -16, -16 }, { 0, -32 }, { 16, -16 }, { 32, 0 }, { 16, 16 },
    551            { 0, 32 }, { -16, 16 }, { -32, 0 } },
    552          { { -32, -32 }, { 0, -64 }, { 32, -32 }, { 64, 0 }, { 32, 32 },
    553            { 0, 64 }, { -32, 32 }, { -64, 0 } },
    554          { { -64, -64 }, { 0, -128 }, { 64, -64 }, { 128, 0 }, { 64, 64 },
    555            { 0, 128 }, { -64, 64 }, { -128, 0 } },
    556          { { -128, -128 }, { 0, -256 }, { 128, -128 }, { 256, 0 },
    557            { 128, 128 }, { 0, 256 }, { -128, 128 }, { -256, 0 } },
    558          { { -256, -256 }, { 0, -512 }, { 256, -256 }, { 512, 0 },
    559            { 256, 256 }, { 0, 512 }, { -256, 256 }, { -512, 0 } },
    560          { { -512, -512 }, { 0, -1024 }, { 512, -512 }, { 1024, 0 },
    561            { 512, 512 }, { 0, 1024 }, { -512, 512 }, { -1024, 0 } },
    562        };
    563 
    564  /* clang-format on */
    565  int radius = 1;
    566  for (int i = 0; i < MAX_PATTERN_SCALES; ++i) {
    567    cfg->searches_per_step[i] = bigdia_num_candidates[i];
    568    cfg->radius[i] = radius;
    569    for (int j = 0; j < MAX_PATTERN_CANDIDATES; ++j) {
    570      search_site *const site = &cfg->site[i][j];
    571      site->mv = site_candidates[i][j];
    572      site->offset = get_offset_from_fullmv(&site->mv, stride);
    573    }
    574    radius *= 2;
    575  }
    576  cfg->num_search_steps = MAX_PATTERN_SCALES;
    577 }
    578 
    579 // Search site initialization for SQUARE search method.
    580 static void init_motion_compensation_square(search_site_config *cfg, int stride,
    581                                            int level) {
    582  (void)level;
    583  cfg->stride = stride;
    584  // All scales have 8 closest points in square shape.
    585  static const int square_num_candidates[MAX_PATTERN_SCALES] = {
    586    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
    587  };
    588 
    589  // Square search method candidates.
    590  // Note that the largest candidate step at each scale is 2^scale.
    591  /* clang-format off */
    592    static const FULLPEL_MV
    593        square_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
    594             { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 },
    595               { -1, 1 }, { -1, 0 } },
    596             { { -2, -2 }, { 0, -2 }, { 2, -2 }, { 2, 0 }, { 2, 2 }, { 0, 2 },
    597               { -2, 2 }, { -2, 0 } },
    598             { { -4, -4 }, { 0, -4 }, { 4, -4 }, { 4, 0 }, { 4, 4 }, { 0, 4 },
    599               { -4, 4 }, { -4, 0 } },
    600             { { -8, -8 }, { 0, -8 }, { 8, -8 }, { 8, 0 }, { 8, 8 }, { 0, 8 },
    601               { -8, 8 }, { -8, 0 } },
    602             { { -16, -16 }, { 0, -16 }, { 16, -16 }, { 16, 0 }, { 16, 16 },
    603               { 0, 16 }, { -16, 16 }, { -16, 0 } },
    604             { { -32, -32 }, { 0, -32 }, { 32, -32 }, { 32, 0 }, { 32, 32 },
    605               { 0, 32 }, { -32, 32 }, { -32, 0 } },
    606             { { -64, -64 }, { 0, -64 }, { 64, -64 }, { 64, 0 }, { 64, 64 },
    607               { 0, 64 }, { -64, 64 }, { -64, 0 } },
    608             { { -128, -128 }, { 0, -128 }, { 128, -128 }, { 128, 0 },
    609               { 128, 128 }, { 0, 128 }, { -128, 128 }, { -128, 0 } },
    610             { { -256, -256 }, { 0, -256 }, { 256, -256 }, { 256, 0 },
    611               { 256, 256 }, { 0, 256 }, { -256, 256 }, { -256, 0 } },
    612             { { -512, -512 }, { 0, -512 }, { 512, -512 }, { 512, 0 },
    613               { 512, 512 }, { 0, 512 }, { -512, 512 }, { -512, 0 } },
    614             { { -1024, -1024 }, { 0, -1024 }, { 1024, -1024 }, { 1024, 0 },
    615               { 1024, 1024 }, { 0, 1024 }, { -1024, 1024 }, { -1024, 0 } },
    616    };
    617 
    618  /* clang-format on */
    619  int radius = 1;
    620  for (int i = 0; i < MAX_PATTERN_SCALES; ++i) {
    621    cfg->searches_per_step[i] = square_num_candidates[i];
    622    cfg->radius[i] = radius;
    623    for (int j = 0; j < MAX_PATTERN_CANDIDATES; ++j) {
    624      search_site *const site = &cfg->site[i][j];
    625      site->mv = square_candidates[i][j];
    626      site->offset = get_offset_from_fullmv(&site->mv, stride);
    627    }
    628    radius *= 2;
    629  }
    630  cfg->num_search_steps = MAX_PATTERN_SCALES;
    631 }
    632 
    633 // Search site initialization for HEX / FAST_HEX search methods.
    634 static void init_motion_compensation_hex(search_site_config *cfg, int stride,
    635                                         int level) {
    636  (void)level;
    637  cfg->stride = stride;
    638  // First scale has 8-closest points, the rest have 6 points in hex shape
    639  // at increasing scales.
    640  static const int hex_num_candidates[MAX_PATTERN_SCALES] = { 8, 6, 6, 6, 6, 6,
    641                                                              6, 6, 6, 6, 6 };
    642  // Note that the largest candidate step at each scale is 2^scale.
    643  /* clang-format off */
    644    static const FULLPEL_MV
    645        hex_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
    646        { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 },
    647          { -1, 1 }, { -1, 0 } },
    648        { { -1, -2 }, { 1, -2 }, { 2, 0 }, { 1, 2 }, { -1, 2 }, { -2, 0 } },
    649        { { -2, -4 }, { 2, -4 }, { 4, 0 }, { 2, 4 }, { -2, 4 }, { -4, 0 } },
    650        { { -4, -8 }, { 4, -8 }, { 8, 0 }, { 4, 8 }, { -4, 8 }, { -8, 0 } },
    651        { { -8, -16 }, { 8, -16 }, { 16, 0 }, { 8, 16 },
    652          { -8, 16 }, { -16, 0 } },
    653        { { -16, -32 }, { 16, -32 }, { 32, 0 }, { 16, 32 }, { -16, 32 },
    654          { -32, 0 } },
    655        { { -32, -64 }, { 32, -64 }, { 64, 0 }, { 32, 64 }, { -32, 64 },
    656          { -64, 0 } },
    657        { { -64, -128 }, { 64, -128 }, { 128, 0 }, { 64, 128 },
    658          { -64, 128 }, { -128, 0 } },
    659        { { -128, -256 }, { 128, -256 }, { 256, 0 }, { 128, 256 },
    660          { -128, 256 }, { -256, 0 } },
    661        { { -256, -512 }, { 256, -512 }, { 512, 0 }, { 256, 512 },
    662          { -256, 512 }, { -512, 0 } },
    663        { { -512, -1024 }, { 512, -1024 }, { 1024, 0 }, { 512, 1024 },
    664          { -512, 1024 }, { -1024, 0 } },
    665    };
    666 
    667  /* clang-format on */
    668  int radius = 1;
    669  for (int i = 0; i < MAX_PATTERN_SCALES; ++i) {
    670    cfg->searches_per_step[i] = hex_num_candidates[i];
    671    cfg->radius[i] = radius;
    672    for (int j = 0; j < hex_num_candidates[i]; ++j) {
    673      search_site *const site = &cfg->site[i][j];
    674      site->mv = hex_candidates[i][j];
    675      site->offset = get_offset_from_fullmv(&site->mv, stride);
    676    }
    677    radius *= 2;
    678  }
    679  cfg->num_search_steps = MAX_PATTERN_SCALES;
    680 }
    681 
    682 const av1_init_search_site_config
    683    av1_init_motion_compensation[NUM_DISTINCT_SEARCH_METHODS] = {
    684      init_dsmotion_compensation,     init_motion_compensation_nstep,
    685      init_motion_compensation_nstep, init_dsmotion_compensation,
    686      init_motion_compensation_hex,   init_motion_compensation_bigdia,
    687      init_motion_compensation_square
    688    };
    689 
    690 // Checks whether the mv is within range of the mv_limits
    691 static inline int check_bounds(const FullMvLimits *mv_limits, int row, int col,
    692                               int range) {
    693  return ((row - range) >= mv_limits->row_min) &
    694         ((row + range) <= mv_limits->row_max) &
    695         ((col - range) >= mv_limits->col_min) &
    696         ((col + range) <= mv_limits->col_max);
    697 }
    698 
    699 static inline int get_mvpred_var_cost(
    700    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv,
    701    FULLPEL_MV_STATS *mv_stats) {
    702  const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
    703  const MV sub_this_mv = get_mv_from_fullmv(this_mv);
    704  const struct buf_2d *const src = ms_params->ms_buffers.src;
    705  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
    706  const uint8_t *src_buf = src->buf;
    707  const int src_stride = src->stride;
    708  const int ref_stride = ref->stride;
    709 
    710  int bestsme;
    711 
    712  bestsme = vfp->vf(src_buf, src_stride, get_buf_from_fullmv(ref, this_mv),
    713                    ref_stride, &mv_stats->sse);
    714  mv_stats->distortion = bestsme;
    715 
    716  mv_stats->err_cost = mv_err_cost_(&sub_this_mv, &ms_params->mv_cost_params);
    717  bestsme += mv_stats->err_cost;
    718 
    719  return bestsme;
    720 }
    721 
    722 static inline int get_mvpred_sad(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
    723                                 const struct buf_2d *const src,
    724                                 const uint8_t *const ref_address,
    725                                 const int ref_stride) {
    726  const uint8_t *src_buf = src->buf;
    727  const int src_stride = src->stride;
    728 
    729  return ms_params->sdf(src_buf, src_stride, ref_address, ref_stride);
    730 }
    731 
    732 static inline int get_mvpred_compound_var_cost(
    733    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv,
    734    FULLPEL_MV_STATS *mv_stats) {
    735  const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
    736  const struct buf_2d *const src = ms_params->ms_buffers.src;
    737  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
    738  const uint8_t *src_buf = src->buf;
    739  const int src_stride = src->stride;
    740  const int ref_stride = ref->stride;
    741 
    742  const uint8_t *mask = ms_params->ms_buffers.mask;
    743  const uint8_t *second_pred = ms_params->ms_buffers.second_pred;
    744  const int mask_stride = ms_params->ms_buffers.mask_stride;
    745  const int invert_mask = ms_params->ms_buffers.inv_mask;
    746  int bestsme;
    747 
    748  if (mask) {
    749    bestsme = vfp->msvf(get_buf_from_fullmv(ref, this_mv), ref_stride, 0, 0,
    750                        src_buf, src_stride, second_pred, mask, mask_stride,
    751                        invert_mask, &mv_stats->sse);
    752  } else if (second_pred) {
    753    bestsme = vfp->svaf(get_buf_from_fullmv(ref, this_mv), ref_stride, 0, 0,
    754                        src_buf, src_stride, &mv_stats->sse, second_pred);
    755  } else {
    756    bestsme = vfp->vf(src_buf, src_stride, get_buf_from_fullmv(ref, this_mv),
    757                      ref_stride, &mv_stats->sse);
    758  }
    759  mv_stats->distortion = bestsme;
    760 
    761  const MV sub_this_mv = get_mv_from_fullmv(this_mv);
    762  mv_stats->err_cost = mv_err_cost_(&sub_this_mv, &ms_params->mv_cost_params);
    763  bestsme += mv_stats->err_cost;
    764 
    765  return bestsme;
    766 }
    767 
    768 static inline int get_mvpred_compound_sad(
    769    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
    770    const struct buf_2d *const src, const uint8_t *const ref_address,
    771    const int ref_stride) {
    772  const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
    773  const uint8_t *src_buf = src->buf;
    774  const int src_stride = src->stride;
    775 
    776  const uint8_t *mask = ms_params->ms_buffers.mask;
    777  const uint8_t *second_pred = ms_params->ms_buffers.second_pred;
    778  const int mask_stride = ms_params->ms_buffers.mask_stride;
    779  const int invert_mask = ms_params->ms_buffers.inv_mask;
    780 
    781  if (mask) {
    782    return vfp->msdf(src_buf, src_stride, ref_address, ref_stride, second_pred,
    783                     mask, mask_stride, invert_mask);
    784  } else if (second_pred) {
    785    assert(vfp->sdaf != NULL);
    786    return vfp->sdaf(src_buf, src_stride, ref_address, ref_stride, second_pred);
    787  } else {
    788    return ms_params->sdf(src_buf, src_stride, ref_address, ref_stride);
    789  }
    790 }
    791 
    792 // Calculates and returns a sad+mvcost list around an integer best pel during
    793 // fullpixel motion search. The resulting list can be used to speed up subpel
    794 // motion search later.
    795 #define USE_SAD_COSTLIST 1
    796 
    797 // calc_int_cost_list uses var to populate the costlist, which is more accurate
    798 // than sad but slightly slower.
    799 static AOM_FORCE_INLINE void calc_int_cost_list(
    800    const FULLPEL_MV best_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
    801    int *cost_list) {
    802  static const FULLPEL_MV neighbors[4] = {
    803    { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }
    804  };
    805  const int br = best_mv.row;
    806  const int bc = best_mv.col;
    807 
    808  FULLPEL_MV_STATS mv_stats;
    809  cost_list[0] = get_mvpred_var_cost(ms_params, &best_mv, &mv_stats);
    810 
    811  if (check_bounds(&ms_params->mv_limits, br, bc, 1)) {
    812    for (int i = 0; i < 4; i++) {
    813      const FULLPEL_MV neighbor_mv = { br + neighbors[i].row,
    814                                       bc + neighbors[i].col };
    815      cost_list[i + 1] =
    816          get_mvpred_var_cost(ms_params, &neighbor_mv, &mv_stats);
    817    }
    818  } else {
    819    for (int i = 0; i < 4; i++) {
    820      const FULLPEL_MV neighbor_mv = { br + neighbors[i].row,
    821                                       bc + neighbors[i].col };
    822      if (!av1_is_fullmv_in_range(&ms_params->mv_limits, neighbor_mv)) {
    823        cost_list[i + 1] = INT_MAX;
    824      } else {
    825        cost_list[i + 1] =
    826            get_mvpred_var_cost(ms_params, &neighbor_mv, &mv_stats);
    827      }
    828    }
    829  }
    830 }
    831 
    832 // calc_int_sad_list uses sad to populate the costlist, which is less accurate
    833 // than var but faster.
    834 static AOM_FORCE_INLINE void calc_int_sad_list(
    835    const FULLPEL_MV best_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
    836    int *cost_list, int costlist_has_sad) {
    837  static const FULLPEL_MV neighbors[4] = {
    838    { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }
    839  };
    840  const struct buf_2d *const src = ms_params->ms_buffers.src;
    841  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
    842  const int ref_stride = ref->stride;
    843  const int br = best_mv.row;
    844  const int bc = best_mv.col;
    845 
    846  assert(av1_is_fullmv_in_range(&ms_params->mv_limits, best_mv));
    847 
    848  // Refresh the costlist it does not contain valid sad
    849  if (!costlist_has_sad) {
    850    cost_list[0] = get_mvpred_sad(
    851        ms_params, src, get_buf_from_fullmv(ref, &best_mv), ref_stride);
    852 
    853    if (check_bounds(&ms_params->mv_limits, br, bc, 1)) {
    854      for (int i = 0; i < 4; i++) {
    855        const FULLPEL_MV this_mv = { br + neighbors[i].row,
    856                                     bc + neighbors[i].col };
    857        cost_list[i + 1] = get_mvpred_sad(
    858            ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
    859      }
    860    } else {
    861      for (int i = 0; i < 4; i++) {
    862        const FULLPEL_MV this_mv = { br + neighbors[i].row,
    863                                     bc + neighbors[i].col };
    864        if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) {
    865          cost_list[i + 1] = INT_MAX;
    866        } else {
    867          cost_list[i + 1] = get_mvpred_sad(
    868              ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
    869        }
    870      }
    871    }
    872  }
    873 
    874  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
    875  cost_list[0] += mvsad_err_cost_(&best_mv, mv_cost_params);
    876 
    877  for (int idx = 0; idx < 4; idx++) {
    878    if (cost_list[idx + 1] != INT_MAX) {
    879      const FULLPEL_MV this_mv = { br + neighbors[idx].row,
    880                                   bc + neighbors[idx].col };
    881      cost_list[idx + 1] += mvsad_err_cost_(&this_mv, mv_cost_params);
    882    }
    883  }
    884 }
    885 
    886 // Computes motion vector cost and adds to the sad cost.
    887 // Then updates the best sad and motion vectors.
    888 // Inputs:
    889 //   this_sad: the sad to be evaluated.
    890 //   mv: the current motion vector.
    891 //   mv_cost_params: a structure containing information to compute mv cost.
    892 //   best_sad: the current best sad.
    893 //   raw_best_sad (optional): the current best sad without calculating mv cost.
    894 //   best_mv: the current best motion vector.
    895 //   second_best_mv (optional): the second best motion vector up to now.
    896 // Modifies:
    897 //   best_sad, raw_best_sad, best_mv, second_best_mv
    898 //   If the current sad is lower than the current best sad.
    899 // Returns:
    900 //   Whether the input sad (mv) is better than the current best.
    901 static inline int update_mvs_and_sad(const unsigned int this_sad,
    902                                     const FULLPEL_MV *mv,
    903                                     const MV_COST_PARAMS *mv_cost_params,
    904                                     unsigned int *best_sad,
    905                                     unsigned int *raw_best_sad,
    906                                     FULLPEL_MV *best_mv,
    907                                     FULLPEL_MV *second_best_mv) {
    908  if (this_sad >= *best_sad) return 0;
    909 
    910  // Add the motion vector cost.
    911  const unsigned int sad = this_sad + mvsad_err_cost_(mv, mv_cost_params);
    912  if (sad < *best_sad) {
    913    if (raw_best_sad) *raw_best_sad = this_sad;
    914    *best_sad = sad;
    915    if (second_best_mv) *second_best_mv = *best_mv;
    916    *best_mv = *mv;
    917    return 1;
    918  }
    919  return 0;
    920 }
    921 
    922 // Calculate sad4 and update the bestmv information
    923 // in FAST_DIAMOND search method.
    924 static inline void calc_sad4_update_bestmv(
    925    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
    926    const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv,
    927    const FULLPEL_MV center_mv, const uint8_t *center_address,
    928    unsigned int *bestsad, unsigned int *raw_bestsad, int search_step,
    929    int *best_site, int cand_start, int *cost_list) {
    930  const struct buf_2d *const src = ms_params->ms_buffers.src;
    931  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
    932  const search_site *site = ms_params->search_sites->site[search_step];
    933 
    934  unsigned char const *block_offset[4];
    935  unsigned int sads_buf[4];
    936  unsigned int *sads;
    937  const uint8_t *src_buf = src->buf;
    938  const int src_stride = src->stride;
    939  if (cost_list) {
    940    sads = (unsigned int *)(cost_list + 1);
    941  } else {
    942    sads = sads_buf;
    943  }
    944  // Loop over number of candidates.
    945  for (int j = 0; j < 4; j++)
    946    block_offset[j] = site[cand_start + j].offset + center_address;
    947 
    948  // 4-point sad calculation.
    949  ms_params->sdx4df(src_buf, src_stride, block_offset, ref->stride, sads);
    950 
    951  for (int j = 0; j < 4; j++) {
    952    const FULLPEL_MV this_mv = { center_mv.row + site[cand_start + j].mv.row,
    953                                 center_mv.col + site[cand_start + j].mv.col };
    954    const int found_better_mv = update_mvs_and_sad(
    955        sads[j], &this_mv, mv_cost_params, bestsad, raw_bestsad, best_mv,
    956        /*second_best_mv=*/NULL);
    957    if (found_better_mv) *best_site = cand_start + j;
    958  }
    959 }
    960 
    961 static inline void calc_sad3_update_bestmv(
    962    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
    963    const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv,
    964    FULLPEL_MV center_mv, const uint8_t *center_address, unsigned int *bestsad,
    965    unsigned int *raw_bestsad, int search_step, int *best_site,
    966    const int *chkpts_indices, int *cost_list) {
    967  const struct buf_2d *const src = ms_params->ms_buffers.src;
    968  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
    969  const search_site *site = ms_params->search_sites->site[search_step];
    970  unsigned char const *block_offset[4] = {
    971    center_address + site[chkpts_indices[0]].offset,
    972    center_address + site[chkpts_indices[1]].offset,
    973    center_address + site[chkpts_indices[2]].offset,
    974    center_address,
    975  };
    976  unsigned int sads[4];
    977  ms_params->sdx3df(src->buf, src->stride, block_offset, ref->stride, sads);
    978  for (int j = 0; j < 3; j++) {
    979    const int index = chkpts_indices[j];
    980    const FULLPEL_MV this_mv = { center_mv.row + site[index].mv.row,
    981                                 center_mv.col + site[index].mv.col };
    982    const int found_better_mv = update_mvs_and_sad(
    983        sads[j], &this_mv, mv_cost_params, bestsad, raw_bestsad, best_mv,
    984        /*second_best_mv=*/NULL);
    985    if (found_better_mv) *best_site = j;
    986  }
    987  if (cost_list) {
    988    for (int j = 0; j < 3; j++) {
    989      int index = chkpts_indices[j];
    990      cost_list[index + 1] = sads[j];
    991    }
    992  }
    993 }
    994 
    995 // Calculate sad and update the bestmv information
    996 // in FAST_DIAMOND search method.
    997 static inline void calc_sad_update_bestmv(
    998    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
    999    const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv,
   1000    const FULLPEL_MV center_mv, const uint8_t *center_address,
   1001    unsigned int *bestsad, unsigned int *raw_bestsad, int search_step,
   1002    int *best_site, const int num_candidates, int cand_start, int *cost_list) {
   1003  const struct buf_2d *const src = ms_params->ms_buffers.src;
   1004  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
   1005  const search_site *site = ms_params->search_sites->site[search_step];
   1006  // Loop over number of candidates.
   1007  for (int i = cand_start; i < num_candidates; i++) {
   1008    const FULLPEL_MV this_mv = { center_mv.row + site[i].mv.row,
   1009                                 center_mv.col + site[i].mv.col };
   1010    if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) continue;
   1011    int thissad = get_mvpred_sad(ms_params, src,
   1012                                 center_address + site[i].offset, ref->stride);
   1013    if (cost_list) {
   1014      cost_list[i + 1] = thissad;
   1015    }
   1016    const int found_better_mv = update_mvs_and_sad(
   1017        thissad, &this_mv, mv_cost_params, bestsad, raw_bestsad, best_mv,
   1018        /*second_best_mv=*/NULL);
   1019    if (found_better_mv) *best_site = i;
   1020  }
   1021 }
   1022 
   1023 static inline void calc_sad_update_bestmv_with_indices(
   1024    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
   1025    const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv,
   1026    const FULLPEL_MV center_mv, const uint8_t *center_address,
   1027    unsigned int *bestsad, unsigned int *raw_bestsad, int search_step,
   1028    int *best_site, const int num_candidates, const int *chkpts_indices,
   1029    int *cost_list) {
   1030  const struct buf_2d *const src = ms_params->ms_buffers.src;
   1031  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
   1032  const search_site *site = ms_params->search_sites->site[search_step];
   1033  // Loop over number of candidates.
   1034  for (int i = 0; i < num_candidates; i++) {
   1035    int index = chkpts_indices[i];
   1036    const FULLPEL_MV this_mv = { center_mv.row + site[index].mv.row,
   1037                                 center_mv.col + site[index].mv.col };
   1038    if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) {
   1039      if (cost_list) {
   1040        cost_list[index + 1] = INT_MAX;
   1041      }
   1042      continue;
   1043    }
   1044    const int thissad = get_mvpred_sad(
   1045        ms_params, src, center_address + site[index].offset, ref->stride);
   1046    if (cost_list) {
   1047      cost_list[index + 1] = thissad;
   1048    }
   1049    const int found_better_mv = update_mvs_and_sad(
   1050        thissad, &this_mv, mv_cost_params, bestsad, raw_bestsad, best_mv,
   1051        /*second_best_mv=*/NULL);
   1052    if (found_better_mv) *best_site = i;
   1053  }
   1054 }
   1055 
   1056 // Generic pattern search function that searches over multiple scales.
   1057 // Each scale can have a different number of candidates and shape of
   1058 // candidates as indicated in the num_candidates and candidates arrays
   1059 // passed into this function
   1060 static int pattern_search(FULLPEL_MV start_mv,
   1061                          const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
   1062                          int search_step, const int do_init_search,
   1063                          int *cost_list, FULLPEL_MV *best_mv,
   1064                          FULLPEL_MV_STATS *best_mv_stats) {
   1065  static const int search_steps[MAX_MVSEARCH_STEPS] = {
   1066    10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
   1067  };
   1068  int i, s, t;
   1069 
   1070  const struct buf_2d *const src = ms_params->ms_buffers.src;
   1071  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
   1072  const search_site_config *search_sites = ms_params->search_sites;
   1073  const int *num_candidates = search_sites->searches_per_step;
   1074  const int ref_stride = ref->stride;
   1075  const int last_is_4 = num_candidates[0] == 4;
   1076  int br, bc;
   1077  unsigned int bestsad = UINT_MAX, raw_bestsad = UINT_MAX;
   1078  int k = -1;
   1079  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
   1080  search_step = AOMMIN(search_step, MAX_MVSEARCH_STEPS - 1);
   1081  assert(search_step >= 0);
   1082  int best_init_s = search_steps[search_step];
   1083  // adjust ref_mv to make sure it is within MV range
   1084  clamp_fullmv(&start_mv, &ms_params->mv_limits);
   1085  br = start_mv.row;
   1086  bc = start_mv.col;
   1087  if (cost_list != NULL) {
   1088    cost_list[0] = cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] =
   1089        INT_MAX;
   1090  }
   1091  int costlist_has_sad = 0;
   1092 
   1093  // Work out the start point for the search
   1094  raw_bestsad = get_mvpred_sad(ms_params, src,
   1095                               get_buf_from_fullmv(ref, &start_mv), ref_stride);
   1096  bestsad = raw_bestsad + mvsad_err_cost_(&start_mv, mv_cost_params);
   1097 
   1098  // Search all possible scales up to the search param around the center point
   1099  // pick the scale of the point that is best as the starting scale of
   1100  // further steps around it.
   1101  const uint8_t *center_address = get_buf_from_fullmv(ref, &start_mv);
   1102  if (do_init_search) {
   1103    s = best_init_s;
   1104    best_init_s = -1;
   1105    for (t = 0; t <= s; ++t) {
   1106      int best_site = -1;
   1107      FULLPEL_MV center_mv = { br, bc };
   1108      if (check_bounds(&ms_params->mv_limits, br, bc, 1 << t)) {
   1109        // Call 4-point sad for multiples of 4 candidates.
   1110        const int no_of_4_cand_loops = num_candidates[t] >> 2;
   1111        for (i = 0; i < no_of_4_cand_loops; i++) {
   1112          calc_sad4_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
   1113                                  center_address, &bestsad, &raw_bestsad, t,
   1114                                  &best_site, i * 4, /*cost_list=*/NULL);
   1115        }
   1116        // Rest of the candidates
   1117        const int remaining_cand = num_candidates[t] % 4;
   1118        calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
   1119                               center_address, &bestsad, &raw_bestsad, t,
   1120                               &best_site, remaining_cand,
   1121                               no_of_4_cand_loops * 4, NULL);
   1122      } else {
   1123        calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
   1124                               center_address, &bestsad, &raw_bestsad, t,
   1125                               &best_site, num_candidates[t], 0, NULL);
   1126      }
   1127      if (best_site == -1) {
   1128        continue;
   1129      } else {
   1130        best_init_s = t;
   1131        k = best_site;
   1132      }
   1133    }
   1134    if (best_init_s != -1) {
   1135      br += search_sites->site[best_init_s][k].mv.row;
   1136      bc += search_sites->site[best_init_s][k].mv.col;
   1137      center_address += search_sites->site[best_init_s][k].offset;
   1138    }
   1139  }
   1140 
   1141  // If the center point is still the best, just skip this and move to
   1142  // the refinement step.
   1143  if (best_init_s != -1) {
   1144    const int last_s = (last_is_4 && cost_list != NULL);
   1145    int best_site = -1;
   1146    s = best_init_s;
   1147 
   1148    for (; s >= last_s; s--) {
   1149      // No need to search all points the 1st time if initial search was used
   1150      if (!do_init_search || s != best_init_s) {
   1151        FULLPEL_MV center_mv = { br, bc };
   1152        if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
   1153          // Call 4-point sad for multiples of 4 candidates.
   1154          const int no_of_4_cand_loops = num_candidates[s] >> 2;
   1155          for (i = 0; i < no_of_4_cand_loops; i++) {
   1156            calc_sad4_update_bestmv(ms_params, mv_cost_params, best_mv,
   1157                                    center_mv, center_address, &bestsad,
   1158                                    &raw_bestsad, s, &best_site, i * 4,
   1159                                    /*cost_list=*/NULL);
   1160          }
   1161          // Rest of the candidates
   1162          const int remaining_cand = num_candidates[s] % 4;
   1163          calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
   1164                                 center_address, &bestsad, &raw_bestsad, s,
   1165                                 &best_site, remaining_cand,
   1166                                 no_of_4_cand_loops * 4, NULL);
   1167        } else {
   1168          calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
   1169                                 center_address, &bestsad, &raw_bestsad, s,
   1170                                 &best_site, num_candidates[s], 0, NULL);
   1171        }
   1172 
   1173        if (best_site == -1) {
   1174          continue;
   1175        } else {
   1176          br += search_sites->site[s][best_site].mv.row;
   1177          bc += search_sites->site[s][best_site].mv.col;
   1178          center_address += search_sites->site[s][best_site].offset;
   1179          k = best_site;
   1180        }
   1181      }
   1182 
   1183      do {
   1184        int next_chkpts_indices[PATTERN_CANDIDATES_REF];
   1185        best_site = -1;
   1186        next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
   1187        next_chkpts_indices[1] = k;
   1188        next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
   1189 
   1190        FULLPEL_MV center_mv = { br, bc };
   1191        if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
   1192          calc_sad3_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
   1193                                  center_address, &bestsad, &raw_bestsad, s,
   1194                                  &best_site, next_chkpts_indices, NULL);
   1195        } else {
   1196          calc_sad_update_bestmv_with_indices(
   1197              ms_params, mv_cost_params, best_mv, center_mv, center_address,
   1198              &bestsad, &raw_bestsad, s, &best_site, PATTERN_CANDIDATES_REF,
   1199              next_chkpts_indices, NULL);
   1200        }
   1201 
   1202        if (best_site != -1) {
   1203          k = next_chkpts_indices[best_site];
   1204          br += search_sites->site[s][k].mv.row;
   1205          bc += search_sites->site[s][k].mv.col;
   1206          center_address += search_sites->site[s][k].offset;
   1207        }
   1208      } while (best_site != -1);
   1209    }
   1210    // Note: If we enter the if below, then cost_list must be non-NULL.
   1211    if (s == 0) {
   1212      cost_list[0] = raw_bestsad;
   1213      costlist_has_sad = 1;
   1214      assert(num_candidates[s] == 4);
   1215      if (!do_init_search || s != best_init_s) {
   1216        FULLPEL_MV center_mv = { br, bc };
   1217        if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
   1218          calc_sad4_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
   1219                                  center_address, &bestsad, &raw_bestsad, s,
   1220                                  &best_site, 0, cost_list);
   1221        } else {
   1222          calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
   1223                                 center_address, &bestsad, &raw_bestsad, s,
   1224                                 &best_site, /*num_candidates=*/4,
   1225                                 /*cand_start=*/0, cost_list);
   1226        }
   1227 
   1228        if (best_site != -1) {
   1229          br += search_sites->site[s][best_site].mv.row;
   1230          bc += search_sites->site[s][best_site].mv.col;
   1231          center_address += search_sites->site[s][best_site].offset;
   1232          k = best_site;
   1233        }
   1234      }
   1235      while (best_site != -1) {
   1236        int next_chkpts_indices[PATTERN_CANDIDATES_REF];
   1237        best_site = -1;
   1238        next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
   1239        next_chkpts_indices[1] = k;
   1240        next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
   1241        cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = INT_MAX;
   1242        cost_list[((k + 2) % 4) + 1] = cost_list[0];
   1243        cost_list[0] = raw_bestsad;
   1244 
   1245        FULLPEL_MV center_mv = { br, bc };
   1246        if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
   1247          assert(PATTERN_CANDIDATES_REF == 3);
   1248          calc_sad3_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
   1249                                  center_address, &bestsad, &raw_bestsad, s,
   1250                                  &best_site, next_chkpts_indices, cost_list);
   1251        } else {
   1252          calc_sad_update_bestmv_with_indices(
   1253              ms_params, mv_cost_params, best_mv, center_mv, center_address,
   1254              &bestsad, &raw_bestsad, s, &best_site, PATTERN_CANDIDATES_REF,
   1255              next_chkpts_indices, cost_list);
   1256        }
   1257 
   1258        if (best_site != -1) {
   1259          k = next_chkpts_indices[best_site];
   1260          br += search_sites->site[s][k].mv.row;
   1261          bc += search_sites->site[s][k].mv.col;
   1262          center_address += search_sites->site[s][k].offset;
   1263        }
   1264      }
   1265    }
   1266  }
   1267  best_mv->row = br;
   1268  best_mv->col = bc;
   1269 
   1270  assert(center_address == get_buf_from_fullmv(ref, best_mv) &&
   1271         "center address is out of sync with best_mv!\n");
   1272 
   1273  // Returns the one-away integer pel cost/sad around the best as follows:
   1274  // cost_list[0]: cost/sad at the best integer pel
   1275  // cost_list[1]: cost/sad at delta {0, -1} (left)   from the best integer pel
   1276  // cost_list[2]: cost/sad at delta { 1, 0} (bottom) from the best integer pel
   1277  // cost_list[3]: cost/sad at delta { 0, 1} (right)  from the best integer pel
   1278  // cost_list[4]: cost/sad at delta {-1, 0} (top)    from the best integer pel
   1279  if (cost_list) {
   1280    if (USE_SAD_COSTLIST) {
   1281      calc_int_sad_list(*best_mv, ms_params, cost_list, costlist_has_sad);
   1282    } else {
   1283      calc_int_cost_list(*best_mv, ms_params, cost_list);
   1284    }
   1285  }
   1286 
   1287  const int var_cost = get_mvpred_var_cost(ms_params, best_mv, best_mv_stats);
   1288  return var_cost;
   1289 }
   1290 
   1291 // For the following foo_search, the input arguments are:
   1292 // start_mv: where we are starting our motion search
   1293 // ms_params: a collection of motion search parameters
   1294 // search_step: how many steps to skip in our motion search. For example,
   1295 //   a value 3 suggests that 3 search steps have already taken place prior to
   1296 //   this function call, so we jump directly to step 4 of the search process
   1297 // do_init_search: if on, do an initial search of all possible scales around the
   1298 //   start_mv, and then pick the best scale.
   1299 // cond_list: used to hold the cost around the best full mv so we can use it to
   1300 //   speed up subpel search later.
   1301 // best_mv: the best mv found in the motion search
   1302 static int hex_search(const FULLPEL_MV start_mv,
   1303                      const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
   1304                      const int search_step, const int do_init_search,
   1305                      int *cost_list, FULLPEL_MV *best_mv,
   1306                      FULLPEL_MV_STATS *best_mv_stats) {
   1307  return pattern_search(start_mv, ms_params, search_step, do_init_search,
   1308                        cost_list, best_mv, best_mv_stats);
   1309 }
   1310 
   1311 static int bigdia_search(const FULLPEL_MV start_mv,
   1312                         const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
   1313                         const int search_step, const int do_init_search,
   1314                         int *cost_list, FULLPEL_MV *best_mv,
   1315                         FULLPEL_MV_STATS *best_mv_stats) {
   1316  return pattern_search(start_mv, ms_params, search_step, do_init_search,
   1317                        cost_list, best_mv, best_mv_stats);
   1318 }
   1319 
   1320 static int square_search(const FULLPEL_MV start_mv,
   1321                         const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
   1322                         const int search_step, const int do_init_search,
   1323                         int *cost_list, FULLPEL_MV *best_mv,
   1324                         FULLPEL_MV_STATS *best_mv_stats) {
   1325  return pattern_search(start_mv, ms_params, search_step, do_init_search,
   1326                        cost_list, best_mv, best_mv_stats);
   1327 }
   1328 
   1329 static int fast_hex_search(const FULLPEL_MV start_mv,
   1330                           const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
   1331                           const int search_step, const int do_init_search,
   1332                           int *cost_list, FULLPEL_MV *best_mv,
   1333                           FULLPEL_MV_STATS *best_mv_stats) {
   1334  return hex_search(start_mv, ms_params,
   1335                    AOMMAX(MAX_MVSEARCH_STEPS - 2, search_step), do_init_search,
   1336                    cost_list, best_mv, best_mv_stats);
   1337 }
   1338 
   1339 static int vfast_dia_search(const FULLPEL_MV start_mv,
   1340                            const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
   1341                            const int search_step, const int do_init_search,
   1342                            int *cost_list, FULLPEL_MV *best_mv,
   1343                            FULLPEL_MV_STATS *best_mv_stats) {
   1344  return bigdia_search(start_mv, ms_params,
   1345                       AOMMAX(MAX_MVSEARCH_STEPS - 1, search_step),
   1346                       do_init_search, cost_list, best_mv, best_mv_stats);
   1347 }
   1348 
   1349 static int fast_dia_search(const FULLPEL_MV start_mv,
   1350                           const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
   1351                           const int search_step, const int do_init_search,
   1352                           int *cost_list, FULLPEL_MV *best_mv,
   1353                           FULLPEL_MV_STATS *best_mv_stats) {
   1354  return bigdia_search(start_mv, ms_params,
   1355                       AOMMAX(MAX_MVSEARCH_STEPS - 2, search_step),
   1356                       do_init_search, cost_list, best_mv, best_mv_stats);
   1357 }
   1358 
   1359 static int fast_bigdia_search(const FULLPEL_MV start_mv,
   1360                              const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
   1361                              const int search_step, const int do_init_search,
   1362                              int *cost_list, FULLPEL_MV *best_mv,
   1363                              FULLPEL_MV_STATS *best_mv_stats) {
   1364  return bigdia_search(start_mv, ms_params,
   1365                       AOMMAX(MAX_MVSEARCH_STEPS - 3, search_step),
   1366                       do_init_search, cost_list, best_mv, best_mv_stats);
   1367 }
   1368 
   1369 static int diamond_search_sad(FULLPEL_MV start_mv, unsigned int start_mv_sad,
   1370                              const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
   1371                              const int search_step, int *num00,
   1372                              FULLPEL_MV *best_mv, FULLPEL_MV *second_best_mv) {
   1373 #define UPDATE_SEARCH_STEP                                      \
   1374  do {                                                          \
   1375    if (best_site != 0) {                                       \
   1376      tmp_second_best_mv = *best_mv;                            \
   1377      best_mv->row += site[best_site].mv.row;                   \
   1378      best_mv->col += site[best_site].mv.col;                   \
   1379      best_address += site[best_site].offset;                   \
   1380      is_off_center = 1;                                        \
   1381    }                                                           \
   1382                                                                \
   1383    if (is_off_center == 0) num_center_steps++;                 \
   1384                                                                \
   1385    if (best_site == 0 && step > 2) {                           \
   1386      int next_step_size = cfg->radius[step - 1];               \
   1387      while (next_step_size == cfg->radius[step] && step > 2) { \
   1388        num_center_steps++;                                     \
   1389        --step;                                                 \
   1390        next_step_size = cfg->radius[step - 1];                 \
   1391      }                                                         \
   1392    }                                                           \
   1393  } while (0)
   1394 
   1395  const struct buf_2d *const src = ms_params->ms_buffers.src;
   1396  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
   1397 
   1398  const uint8_t *src_buf = src->buf;
   1399  const int src_stride = src->stride;
   1400  const int ref_stride = ref->stride;
   1401 
   1402  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
   1403 
   1404  const search_site_config *cfg = ms_params->search_sites;
   1405 
   1406  int is_off_center = 0;
   1407  // Number of times that we have stayed in the middle. This is used to skip
   1408  // search steps in the future if diamond_search_sad is called again.
   1409  int num_center_steps = 0;
   1410 
   1411  // search_step determines the length of the initial step and hence the number
   1412  // of iterations.
   1413  const int tot_steps = cfg->num_search_steps - search_step;
   1414  FULLPEL_MV tmp_second_best_mv;
   1415  if (second_best_mv) {
   1416    tmp_second_best_mv = *second_best_mv;
   1417  }
   1418 
   1419  *best_mv = start_mv;
   1420 
   1421  // Check the starting position
   1422  const uint8_t *best_address = get_buf_from_fullmv(ref, &start_mv);
   1423  unsigned int bestsad = start_mv_sad;
   1424 
   1425  // TODO(chiyotsai@google.com): Implement 4 points search for msdf&sdaf
   1426  if (ms_params->ms_buffers.second_pred) {
   1427    for (int step = tot_steps - 1; step >= 0; --step) {
   1428      const search_site *site = cfg->site[step];
   1429      const int num_searches = cfg->searches_per_step[step];
   1430      int best_site = 0;
   1431 
   1432      for (int idx = 1; idx <= num_searches; idx++) {
   1433        const FULLPEL_MV this_mv = { best_mv->row + site[idx].mv.row,
   1434                                     best_mv->col + site[idx].mv.col };
   1435 
   1436        if (av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) {
   1437          const uint8_t *const check_here = site[idx].offset + best_address;
   1438          unsigned int thissad =
   1439              get_mvpred_compound_sad(ms_params, src, check_here, ref_stride);
   1440 
   1441          if (thissad < bestsad) {
   1442            thissad += mvsad_err_cost_(&this_mv, mv_cost_params);
   1443            if (thissad < bestsad) {
   1444              bestsad = thissad;
   1445              best_site = idx;
   1446            }
   1447          }
   1448        }
   1449      }
   1450      UPDATE_SEARCH_STEP;
   1451    }
   1452  } else {
   1453    for (int step = tot_steps - 1; step >= 0; --step) {
   1454      const search_site *site = cfg->site[step];
   1455      const int num_searches = cfg->searches_per_step[step];
   1456      int best_site = 0;
   1457 
   1458      int all_in = 1;
   1459      // Trap illegal vectors
   1460      all_in &= best_mv->row + site[1].mv.row >= ms_params->mv_limits.row_min;
   1461      all_in &= best_mv->row + site[2].mv.row <= ms_params->mv_limits.row_max;
   1462      all_in &= best_mv->col + site[3].mv.col >= ms_params->mv_limits.col_min;
   1463      all_in &= best_mv->col + site[4].mv.col <= ms_params->mv_limits.col_max;
   1464 
   1465      if (all_in) {
   1466        for (int idx = 1; idx <= num_searches; idx += 4) {
   1467          unsigned char const *block_offset[4];
   1468          unsigned int sads[4];
   1469 
   1470          for (int j = 0; j < 4; j++)
   1471            block_offset[j] = site[idx + j].offset + best_address;
   1472 
   1473          ms_params->sdx4df(src_buf, src_stride, block_offset, ref_stride,
   1474                            sads);
   1475          for (int j = 0; j < 4; j++) {
   1476            if (sads[j] < bestsad) {
   1477              const FULLPEL_MV this_mv = { best_mv->row + site[idx + j].mv.row,
   1478                                           best_mv->col +
   1479                                               site[idx + j].mv.col };
   1480              unsigned int thissad =
   1481                  sads[j] + mvsad_err_cost_(&this_mv, mv_cost_params);
   1482              if (thissad < bestsad) {
   1483                bestsad = thissad;
   1484                best_site = idx + j;
   1485              }
   1486            }
   1487          }
   1488        }
   1489      } else {
   1490        for (int idx = 1; idx <= num_searches; idx++) {
   1491          const FULLPEL_MV this_mv = { best_mv->row + site[idx].mv.row,
   1492                                       best_mv->col + site[idx].mv.col };
   1493 
   1494          if (av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) {
   1495            const uint8_t *const check_here = site[idx].offset + best_address;
   1496            unsigned int thissad =
   1497                get_mvpred_sad(ms_params, src, check_here, ref_stride);
   1498 
   1499            if (thissad < bestsad) {
   1500              thissad += mvsad_err_cost_(&this_mv, mv_cost_params);
   1501              if (thissad < bestsad) {
   1502                bestsad = thissad;
   1503                best_site = idx;
   1504              }
   1505            }
   1506          }
   1507        }
   1508      }
   1509      UPDATE_SEARCH_STEP;
   1510    }
   1511  }
   1512 
   1513  *num00 = num_center_steps;
   1514  if (second_best_mv) {
   1515    *second_best_mv = tmp_second_best_mv;
   1516  }
   1517 
   1518  return bestsad;
   1519 
   1520 #undef UPDATE_SEARCH_STEP
   1521 }
   1522 
   1523 static inline unsigned int get_start_mvpred_sad_cost(
   1524    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, FULLPEL_MV start_mv) {
   1525  const struct buf_2d *const src = ms_params->ms_buffers.src;
   1526  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
   1527  const uint8_t *best_address = get_buf_from_fullmv(ref, &start_mv);
   1528 
   1529  unsigned int start_mv_sad =
   1530      mvsad_err_cost_(&start_mv, &ms_params->mv_cost_params);
   1531 
   1532  if (ms_params->ms_buffers.second_pred)
   1533    start_mv_sad +=
   1534        get_mvpred_compound_sad(ms_params, src, best_address, ref->stride);
   1535  else
   1536    start_mv_sad += get_mvpred_sad(ms_params, src, best_address, ref->stride);
   1537 
   1538  return start_mv_sad;
   1539 }
   1540 
   1541 static int full_pixel_diamond(FULLPEL_MV start_mv,
   1542                              const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
   1543                              const int step_param, int *cost_list,
   1544                              FULLPEL_MV *best_mv,
   1545                              FULLPEL_MV_STATS *best_mv_stats,
   1546                              FULLPEL_MV *second_best_mv) {
   1547  const search_site_config *cfg = ms_params->search_sites;
   1548  int thissme, n, num00 = 0;
   1549 
   1550  // Clamp start mv and calculate the cost
   1551  clamp_fullmv(&start_mv, &ms_params->mv_limits);
   1552  unsigned int start_mv_sad = get_start_mvpred_sad_cost(ms_params, start_mv);
   1553 
   1554  diamond_search_sad(start_mv, start_mv_sad, ms_params, step_param, &n, best_mv,
   1555                     second_best_mv);
   1556 
   1557  int bestsme = get_mvpred_compound_var_cost(ms_params, best_mv, best_mv_stats);
   1558 
   1559  // If there won't be more n-step search, check to see if refining search is
   1560  // needed.
   1561  const int further_steps = cfg->num_search_steps - 1 - step_param;
   1562  while (n < further_steps) {
   1563    ++n;
   1564 
   1565    // TODO(chiyotsai@google.com): There is another bug here where the second
   1566    // best mv gets incorrectly overwritten. Fix it later.
   1567    FULLPEL_MV tmp_best_mv;
   1568    FULLPEL_MV_STATS tmp_best_mv_stats;
   1569    diamond_search_sad(start_mv, start_mv_sad, ms_params, step_param + n,
   1570                       &num00, &tmp_best_mv, second_best_mv);
   1571 
   1572    thissme = get_mvpred_compound_var_cost(ms_params, &tmp_best_mv,
   1573                                           &tmp_best_mv_stats);
   1574 
   1575    if (thissme < bestsme) {
   1576      bestsme = thissme;
   1577      *best_mv = tmp_best_mv;
   1578      *best_mv_stats = tmp_best_mv_stats;
   1579    }
   1580 
   1581    if (num00) {
   1582      // Advance the loop by num00 steps
   1583      n += num00;
   1584      num00 = 0;
   1585    }
   1586  }
   1587 
   1588  // Return cost list.
   1589  if (cost_list) {
   1590    if (USE_SAD_COSTLIST) {
   1591      const int costlist_has_sad = 0;
   1592      calc_int_sad_list(*best_mv, ms_params, cost_list, costlist_has_sad);
   1593    } else {
   1594      calc_int_cost_list(*best_mv, ms_params, cost_list);
   1595    }
   1596  }
   1597  return bestsme;
   1598 }
   1599 
   1600 // Exhaustive motion search around a given centre position with a given
   1601 // step size.
   1602 static int exhaustive_mesh_search(FULLPEL_MV start_mv,
   1603                                  const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
   1604                                  const int range, const int step,
   1605                                  FULLPEL_MV *best_mv,
   1606                                  FULLPEL_MV *second_best_mv) {
   1607  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
   1608  const struct buf_2d *const src = ms_params->ms_buffers.src;
   1609  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
   1610  const int ref_stride = ref->stride;
   1611  unsigned int best_sad = INT_MAX;
   1612  int r, c, i;
   1613  int start_col, end_col, start_row, end_row;
   1614  const int col_step = (step > 1) ? step : 4;
   1615 
   1616  assert(step >= 1);
   1617 
   1618  clamp_fullmv(&start_mv, &ms_params->mv_limits);
   1619  *best_mv = start_mv;
   1620  best_sad = get_mvpred_sad(ms_params, src, get_buf_from_fullmv(ref, &start_mv),
   1621                            ref_stride);
   1622  best_sad += mvsad_err_cost_(&start_mv, mv_cost_params);
   1623  start_row = AOMMAX(-range, ms_params->mv_limits.row_min - start_mv.row);
   1624  start_col = AOMMAX(-range, ms_params->mv_limits.col_min - start_mv.col);
   1625  end_row = AOMMIN(range, ms_params->mv_limits.row_max - start_mv.row);
   1626  end_col = AOMMIN(range, ms_params->mv_limits.col_max - start_mv.col);
   1627 
   1628  for (r = start_row; r <= end_row; r += step) {
   1629    for (c = start_col; c <= end_col; c += col_step) {
   1630      // Step > 1 means we are not checking every location in this pass.
   1631      if (step > 1) {
   1632        const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c };
   1633        unsigned int sad = get_mvpred_sad(
   1634            ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride);
   1635        update_mvs_and_sad(sad, &mv, mv_cost_params, &best_sad,
   1636                           /*raw_best_sad=*/NULL, best_mv, second_best_mv);
   1637      } else {
   1638        // 4 sads in a single call if we are checking every location
   1639        if (c + 3 <= end_col) {
   1640          unsigned int sads[4];
   1641          const uint8_t *addrs[4];
   1642          for (i = 0; i < 4; ++i) {
   1643            const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i };
   1644            addrs[i] = get_buf_from_fullmv(ref, &mv);
   1645          }
   1646 
   1647          ms_params->sdx4df(src->buf, src->stride, addrs, ref_stride, sads);
   1648 
   1649          for (i = 0; i < 4; ++i) {
   1650            if (sads[i] < best_sad) {
   1651              const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i };
   1652              update_mvs_and_sad(sads[i], &mv, mv_cost_params, &best_sad,
   1653                                 /*raw_best_sad=*/NULL, best_mv,
   1654                                 second_best_mv);
   1655            }
   1656          }
   1657        } else {
   1658          for (i = 0; i < end_col - c; ++i) {
   1659            const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i };
   1660            unsigned int sad = get_mvpred_sad(
   1661                ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride);
   1662            update_mvs_and_sad(sad, &mv, mv_cost_params, &best_sad,
   1663                               /*raw_best_sad=*/NULL, best_mv, second_best_mv);
   1664          }
   1665        }
   1666      }
   1667    }
   1668  }
   1669 
   1670  return best_sad;
   1671 }
   1672 
   1673 // Runs an limited range exhaustive mesh search using a pattern set
   1674 // according to the encode speed profile.
   1675 static int full_pixel_exhaustive(const FULLPEL_MV start_mv,
   1676                                 const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
   1677                                 const struct MESH_PATTERN *const mesh_patterns,
   1678                                 int *cost_list, FULLPEL_MV *best_mv,
   1679                                 FULLPEL_MV_STATS *mv_stats,
   1680                                 FULLPEL_MV *second_best_mv) {
   1681  const int kMinRange = 7;
   1682  const int kMaxRange = 256;
   1683  const int kMinInterval = 1;
   1684 
   1685  int bestsme;
   1686  int i;
   1687  int interval = mesh_patterns[0].interval;
   1688  int range = mesh_patterns[0].range;
   1689  int baseline_interval_divisor;
   1690 
   1691  // TODO(chiyotsai@google.com): Currently exhaustive search calls single ref
   1692  // version of sad and variance function. We still need to check the
   1693  // performance when compound ref exhaustive search is enabled.
   1694  assert(!ms_params->ms_buffers.second_pred &&
   1695         "Mesh search does not support compound mode!");
   1696 
   1697  *best_mv = start_mv;
   1698 
   1699  // Trap illegal values for interval and range for this function.
   1700  if ((range < kMinRange) || (range > kMaxRange) || (interval < kMinInterval) ||
   1701      (interval > range))
   1702    return INT_MAX;
   1703 
   1704  baseline_interval_divisor = range / interval;
   1705 
   1706  // Check size of proposed first range against magnitude of the centre
   1707  // value used as a starting point.
   1708  range = AOMMAX(range, (5 * AOMMAX(abs(best_mv->row), abs(best_mv->col))) / 4);
   1709  range = AOMMIN(range, kMaxRange);
   1710  interval = AOMMAX(interval, range / baseline_interval_divisor);
   1711  // Use a small search step/interval for certain kind of clips.
   1712  // For example, screen content clips with a lot of texts.
   1713  // Large interval could lead to a false matching position, and it can't find
   1714  // the best global candidate in following iterations due to reduced search
   1715  // range. The solution here is to use a small search iterval in the beginning
   1716  // and thus reduces the chance of missing the best candidate.
   1717  if (ms_params->fine_search_interval) {
   1718    interval = AOMMIN(interval, 4);
   1719  }
   1720 
   1721  // initial search
   1722  bestsme = exhaustive_mesh_search(*best_mv, ms_params, range, interval,
   1723                                   best_mv, second_best_mv);
   1724 
   1725  if ((interval > kMinInterval) && (range > kMinRange)) {
   1726    // Progressive searches with range and step size decreasing each time
   1727    // till we reach a step size of 1. Then break out.
   1728    for (i = 1; i < MAX_MESH_STEP; ++i) {
   1729      // First pass with coarser step and longer range
   1730      bestsme = exhaustive_mesh_search(
   1731          *best_mv, ms_params, mesh_patterns[i].range,
   1732          mesh_patterns[i].interval, best_mv, second_best_mv);
   1733 
   1734      if (mesh_patterns[i].interval == 1) break;
   1735    }
   1736  }
   1737 
   1738  if (bestsme < INT_MAX) {
   1739    bestsme = get_mvpred_var_cost(ms_params, best_mv, mv_stats);
   1740  }
   1741 
   1742  // Return cost list.
   1743  if (cost_list) {
   1744    if (USE_SAD_COSTLIST) {
   1745      const int costlist_has_sad = 0;
   1746      calc_int_sad_list(*best_mv, ms_params, cost_list, costlist_has_sad);
   1747    } else {
   1748      calc_int_cost_list(*best_mv, ms_params, cost_list);
   1749    }
   1750  }
   1751  return bestsme;
   1752 }
   1753 
   1754 // This function is called when we do joint motion search in comp_inter_inter
   1755 // mode, or when searching for one component of an ext-inter compound mode.
   1756 int av1_refining_search_8p_c(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
   1757                             const FULLPEL_MV start_mv, FULLPEL_MV *best_mv) {
   1758  static const search_neighbors neighbors[8] = {
   1759    { { -1, 0 }, -1 * SEARCH_GRID_STRIDE_8P + 0 },
   1760    { { 0, -1 }, 0 * SEARCH_GRID_STRIDE_8P - 1 },
   1761    { { 0, 1 }, 0 * SEARCH_GRID_STRIDE_8P + 1 },
   1762    { { 1, 0 }, 1 * SEARCH_GRID_STRIDE_8P + 0 },
   1763    { { -1, -1 }, -1 * SEARCH_GRID_STRIDE_8P - 1 },
   1764    { { 1, -1 }, 1 * SEARCH_GRID_STRIDE_8P - 1 },
   1765    { { -1, 1 }, -1 * SEARCH_GRID_STRIDE_8P + 1 },
   1766    { { 1, 1 }, 1 * SEARCH_GRID_STRIDE_8P + 1 }
   1767  };
   1768 
   1769  uint8_t do_refine_search_grid[SEARCH_GRID_STRIDE_8P *
   1770                                SEARCH_GRID_STRIDE_8P] = { 0 };
   1771  int grid_center = SEARCH_GRID_CENTER_8P;
   1772  int grid_coord = grid_center;
   1773 
   1774  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
   1775  const FullMvLimits *mv_limits = &ms_params->mv_limits;
   1776  const MSBuffers *ms_buffers = &ms_params->ms_buffers;
   1777  const struct buf_2d *src = ms_buffers->src;
   1778  const struct buf_2d *ref = ms_buffers->ref;
   1779  const int ref_stride = ref->stride;
   1780 
   1781  *best_mv = start_mv;
   1782  clamp_fullmv(best_mv, mv_limits);
   1783 
   1784  unsigned int best_sad = get_mvpred_compound_sad(
   1785      ms_params, src, get_buf_from_fullmv(ref, best_mv), ref_stride);
   1786  best_sad += mvsad_err_cost_(best_mv, mv_cost_params);
   1787 
   1788  do_refine_search_grid[grid_coord] = 1;
   1789 
   1790  for (int i = 0; i < SEARCH_RANGE_8P; ++i) {
   1791    int best_site = -1;
   1792 
   1793    for (int j = 0; j < 8; ++j) {
   1794      grid_coord = grid_center + neighbors[j].coord_offset;
   1795      if (do_refine_search_grid[grid_coord] == 1) {
   1796        continue;
   1797      }
   1798      const FULLPEL_MV mv = { best_mv->row + neighbors[j].coord.row,
   1799                              best_mv->col + neighbors[j].coord.col };
   1800 
   1801      do_refine_search_grid[grid_coord] = 1;
   1802      if (av1_is_fullmv_in_range(mv_limits, mv)) {
   1803        unsigned int sad;
   1804        sad = get_mvpred_compound_sad(
   1805            ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride);
   1806        if (sad < best_sad) {
   1807          sad += mvsad_err_cost_(&mv, mv_cost_params);
   1808 
   1809          if (sad < best_sad) {
   1810            best_sad = sad;
   1811            best_site = j;
   1812          }
   1813        }
   1814      }
   1815    }
   1816 
   1817    if (best_site == -1) {
   1818      break;
   1819    } else {
   1820      best_mv->row += neighbors[best_site].coord.row;
   1821      best_mv->col += neighbors[best_site].coord.col;
   1822      grid_center += neighbors[best_site].coord_offset;
   1823    }
   1824  }
   1825  return best_sad;
   1826 }
   1827 
   1828 int av1_full_pixel_search(const FULLPEL_MV start_mv,
   1829                          const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
   1830                          const int step_param, int *cost_list,
   1831                          FULLPEL_MV *best_mv, FULLPEL_MV_STATS *best_mv_stats,
   1832                          FULLPEL_MV *second_best_mv) {
   1833  const BLOCK_SIZE bsize = ms_params->bsize;
   1834  const SEARCH_METHODS search_method = ms_params->search_method;
   1835 
   1836  const int is_intra_mode = ms_params->is_intra_mode;
   1837  int run_mesh_search = ms_params->run_mesh_search;
   1838 
   1839  int var = 0;
   1840  MARK_MV_INVALID(best_mv);
   1841  if (second_best_mv) {
   1842    MARK_MV_INVALID(second_best_mv);
   1843  }
   1844 
   1845  if (cost_list) {
   1846    cost_list[0] = INT_MAX;
   1847    cost_list[1] = INT_MAX;
   1848    cost_list[2] = INT_MAX;
   1849    cost_list[3] = INT_MAX;
   1850    cost_list[4] = INT_MAX;
   1851  }
   1852 
   1853  assert(ms_params->ms_buffers.ref->stride == ms_params->search_sites->stride);
   1854 
   1855  switch (search_method) {
   1856    case FAST_BIGDIA:
   1857      var = fast_bigdia_search(start_mv, ms_params, step_param, 0, cost_list,
   1858                               best_mv, best_mv_stats);
   1859      break;
   1860    case VFAST_DIAMOND:
   1861      var = vfast_dia_search(start_mv, ms_params, step_param, 0, cost_list,
   1862                             best_mv, best_mv_stats);
   1863      break;
   1864    case FAST_DIAMOND:
   1865      var = fast_dia_search(start_mv, ms_params, step_param, 0, cost_list,
   1866                            best_mv, best_mv_stats);
   1867      break;
   1868    case FAST_HEX:
   1869      var = fast_hex_search(start_mv, ms_params, step_param, 0, cost_list,
   1870                            best_mv, best_mv_stats);
   1871      break;
   1872    case HEX:
   1873      var = hex_search(start_mv, ms_params, step_param, 1, cost_list, best_mv,
   1874                       best_mv_stats);
   1875      break;
   1876    case SQUARE:
   1877      var = square_search(start_mv, ms_params, step_param, 1, cost_list,
   1878                          best_mv, best_mv_stats);
   1879      break;
   1880    case BIGDIA:
   1881      var = bigdia_search(start_mv, ms_params, step_param, 1, cost_list,
   1882                          best_mv, best_mv_stats);
   1883      break;
   1884    case NSTEP:
   1885    case NSTEP_8PT:
   1886    case DIAMOND:
   1887    case CLAMPED_DIAMOND:
   1888      var = full_pixel_diamond(start_mv, ms_params, step_param, cost_list,
   1889                               best_mv, best_mv_stats, second_best_mv);
   1890      break;
   1891    default: assert(0 && "Invalid search method.");
   1892  }
   1893 
   1894  // Should we allow a follow on exhaustive search?
   1895  if (!run_mesh_search &&
   1896      ((search_method == NSTEP) || (search_method == NSTEP_8PT)) &&
   1897      !ms_params->ms_buffers.second_pred) {
   1898    int exhaustive_thr = ms_params->force_mesh_thresh;
   1899    exhaustive_thr >>=
   1900        10 - (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]);
   1901    // Threshold variance for an exhaustive full search.
   1902    if (var > exhaustive_thr) run_mesh_search = 1;
   1903  }
   1904 
   1905  // TODO(yunqing): the following is used to reduce mesh search in temporal
   1906  // filtering. Can extend it to intrabc.
   1907  if (!is_intra_mode && ms_params->prune_mesh_search) {
   1908    const int full_pel_mv_diff = AOMMAX(abs(start_mv.row - best_mv->row),
   1909                                        abs(start_mv.col - best_mv->col));
   1910    if (full_pel_mv_diff <= ms_params->mesh_search_mv_diff_threshold) {
   1911      run_mesh_search = 0;
   1912    }
   1913  }
   1914 
   1915  if (ms_params->sdf != ms_params->vfp->sdf) {
   1916    // If we are skipping rows when we perform the motion search, we need to
   1917    // check the quality of skipping. If it's bad, then we run mesh search with
   1918    // skip row features off.
   1919    // TODO(chiyotsai@google.com): Handle the case where we have a vertical
   1920    // offset of 1 before we hit this statement to avoid having to redo
   1921    // motion search.
   1922    const struct buf_2d *src = ms_params->ms_buffers.src;
   1923    const struct buf_2d *ref = ms_params->ms_buffers.ref;
   1924    const int src_stride = src->stride;
   1925    const int ref_stride = ref->stride;
   1926 
   1927    const uint8_t *src_address = src->buf;
   1928    const uint8_t *best_address = get_buf_from_fullmv(ref, best_mv);
   1929    const int sad =
   1930        ms_params->vfp->sdf(src_address, src_stride, best_address, ref_stride);
   1931    const int skip_sad =
   1932        ms_params->vfp->sdsf(src_address, src_stride, best_address, ref_stride);
   1933    // We will keep the result of skipping rows if it's good enough. Here, good
   1934    // enough means the error is less than 1 per pixel.
   1935    const int kSADThresh =
   1936        1 << (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]);
   1937    if (sad > kSADThresh && abs(skip_sad - sad) * 10 >= AOMMAX(sad, 1) * 9) {
   1938      // There is a large discrepancy between skipping and not skipping, so we
   1939      // need to redo the motion search.
   1940      FULLPEL_MOTION_SEARCH_PARAMS new_ms_params = *ms_params;
   1941      new_ms_params.sdf = new_ms_params.vfp->sdf;
   1942      new_ms_params.sdx4df = new_ms_params.vfp->sdx4df;
   1943      new_ms_params.sdx3df = new_ms_params.vfp->sdx3df;
   1944 
   1945      return av1_full_pixel_search(start_mv, &new_ms_params, step_param,
   1946                                   cost_list, best_mv, best_mv_stats,
   1947                                   second_best_mv);
   1948    }
   1949  }
   1950 
   1951  if (run_mesh_search) {
   1952    int var_ex;
   1953    FULLPEL_MV tmp_mv_ex;
   1954    FULLPEL_MV_STATS tmp_mv_stats;
   1955    // Pick the mesh pattern for exhaustive search based on the toolset (intraBC
   1956    // or non-intraBC)
   1957    // TODO(chiyotsai@google.com):  There is a bug here where the second best mv
   1958    // gets overwritten without actually comparing the rdcost.
   1959    const MESH_PATTERN *const mesh_patterns =
   1960        ms_params->mesh_patterns[is_intra_mode];
   1961    // TODO(chiyotsai@google.com): the second best mv is not set correctly by
   1962    // full_pixel_exhaustive, which can incorrectly override it.
   1963    var_ex =
   1964        full_pixel_exhaustive(*best_mv, ms_params, mesh_patterns, cost_list,
   1965                              &tmp_mv_ex, &tmp_mv_stats, second_best_mv);
   1966    if (var_ex < var) {
   1967      var = var_ex;
   1968      *best_mv_stats = tmp_mv_stats;
   1969      *best_mv = tmp_mv_ex;
   1970    }
   1971  }
   1972 
   1973  return var;
   1974 }
   1975 
   1976 int av1_intrabc_hash_search(const AV1_COMP *cpi, const MACROBLOCKD *xd,
   1977                            const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
   1978                            IntraBCHashInfo *intrabc_hash_info,
   1979                            FULLPEL_MV *best_mv) {
   1980  if (!av1_use_hash_me(cpi)) return INT_MAX;
   1981 
   1982  const BLOCK_SIZE bsize = ms_params->bsize;
   1983  const int block_width = block_size_wide[bsize];
   1984  const int block_height = block_size_high[bsize];
   1985 
   1986  if (block_width != block_height) return INT_MAX;
   1987 
   1988  const FullMvLimits *mv_limits = &ms_params->mv_limits;
   1989  const MSBuffers *ms_buffer = &ms_params->ms_buffers;
   1990 
   1991  const uint8_t *src = ms_buffer->src->buf;
   1992  const int src_stride = ms_buffer->src->stride;
   1993 
   1994  const int mi_row = xd->mi_row;
   1995  const int mi_col = xd->mi_col;
   1996  const int x_pos = mi_col * MI_SIZE;
   1997  const int y_pos = mi_row * MI_SIZE;
   1998 
   1999  uint32_t hash_value1, hash_value2;
   2000  int best_hash_cost = INT_MAX;
   2001 
   2002  // for the hashMap
   2003  hash_table *ref_frame_hash = &intrabc_hash_info->intrabc_hash_table;
   2004 
   2005  av1_get_block_hash_value(intrabc_hash_info, src, src_stride, block_width,
   2006                           &hash_value1, &hash_value2, is_cur_buf_hbd(xd));
   2007 
   2008  int count = av1_hash_table_count(ref_frame_hash, hash_value1);
   2009  if (count <= 1) {
   2010    return INT_MAX;
   2011  }
   2012  if (cpi->sf.mv_sf.prune_intrabc_candidate_block_hash_search) {
   2013    count = AOMMIN(64, count);
   2014  }
   2015 
   2016  Iterator iterator = av1_hash_get_first_iterator(ref_frame_hash, hash_value1);
   2017  for (int i = 0; i < count; i++, aom_iterator_increment(&iterator)) {
   2018    block_hash ref_block_hash = *(block_hash *)(aom_iterator_get(&iterator));
   2019    if (hash_value2 == ref_block_hash.hash_value2) {
   2020      // Make sure the prediction is from valid area.
   2021      const MV dv = { GET_MV_SUBPEL(ref_block_hash.y - y_pos),
   2022                      GET_MV_SUBPEL(ref_block_hash.x - x_pos) };
   2023      if (!av1_is_dv_valid(dv, &cpi->common, xd, mi_row, mi_col, bsize,
   2024                           cpi->common.seq_params->mib_size_log2))
   2025        continue;
   2026 
   2027      FULLPEL_MV hash_mv;
   2028      hash_mv.col = ref_block_hash.x - x_pos;
   2029      hash_mv.row = ref_block_hash.y - y_pos;
   2030      if (!av1_is_fullmv_in_range(mv_limits, hash_mv)) continue;
   2031      FULLPEL_MV_STATS mv_stats;
   2032      const int refCost = get_mvpred_var_cost(ms_params, &hash_mv, &mv_stats);
   2033      if (refCost < best_hash_cost) {
   2034        best_hash_cost = refCost;
   2035        *best_mv = hash_mv;
   2036      }
   2037    }
   2038  }
   2039 
   2040  return best_hash_cost;
   2041 }
   2042 
   2043 int av1_vector_match(const int16_t *ref, const int16_t *src, int bwl,
   2044                     int search_size, int full_search, int *sad) {
   2045  int best_sad = INT_MAX;
   2046  int this_sad;
   2047  int d;
   2048  int center, offset = 0;
   2049  int bw = search_size << 1;
   2050 
   2051  if (full_search) {
   2052    for (d = 0; d <= bw; d++) {
   2053      this_sad = aom_vector_var(&ref[d], src, bwl);
   2054      if (this_sad < best_sad) {
   2055        best_sad = this_sad;
   2056        offset = d;
   2057      }
   2058    }
   2059    center = offset;
   2060    *sad = best_sad;
   2061    return (center - (bw >> 1));
   2062  }
   2063 
   2064  for (d = 0; d <= bw; d += 16) {
   2065    this_sad = aom_vector_var(&ref[d], src, bwl);
   2066    if (this_sad < best_sad) {
   2067      best_sad = this_sad;
   2068      offset = d;
   2069    }
   2070  }
   2071  center = offset;
   2072 
   2073  for (d = -8; d <= 8; d += 16) {
   2074    int this_pos = offset + d;
   2075    // check limit
   2076    if (this_pos < 0 || this_pos > bw) continue;
   2077    this_sad = aom_vector_var(&ref[this_pos], src, bwl);
   2078    if (this_sad < best_sad) {
   2079      best_sad = this_sad;
   2080      center = this_pos;
   2081    }
   2082  }
   2083  offset = center;
   2084 
   2085  for (d = -4; d <= 4; d += 8) {
   2086    int this_pos = offset + d;
   2087    // check limit
   2088    if (this_pos < 0 || this_pos > bw) continue;
   2089    this_sad = aom_vector_var(&ref[this_pos], src, bwl);
   2090    if (this_sad < best_sad) {
   2091      best_sad = this_sad;
   2092      center = this_pos;
   2093    }
   2094  }
   2095  offset = center;
   2096 
   2097  for (d = -2; d <= 2; d += 4) {
   2098    int this_pos = offset + d;
   2099    // check limit
   2100    if (this_pos < 0 || this_pos > bw) continue;
   2101    this_sad = aom_vector_var(&ref[this_pos], src, bwl);
   2102    if (this_sad < best_sad) {
   2103      best_sad = this_sad;
   2104      center = this_pos;
   2105    }
   2106  }
   2107  offset = center;
   2108 
   2109  for (d = -1; d <= 1; d += 2) {
   2110    int this_pos = offset + d;
   2111    // check limit
   2112    if (this_pos < 0 || this_pos > bw) continue;
   2113    this_sad = aom_vector_var(&ref[this_pos], src, bwl);
   2114    if (this_sad < best_sad) {
   2115      best_sad = this_sad;
   2116      center = this_pos;
   2117    }
   2118  }
   2119  *sad = best_sad;
   2120  return (center - (bw >> 1));
   2121 }
   2122 
   2123 // A special fast version of motion search used in rt mode.
   2124 // The search window along columns and row is given by:
   2125 //  +/- me_search_size_col/row.
   2126 unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x,
   2127                                           BLOCK_SIZE bsize, int mi_row,
   2128                                           int mi_col, const MV *ref_mv,
   2129                                           unsigned int *y_sad_zero,
   2130                                           int me_search_size_col,
   2131                                           int me_search_size_row) {
   2132  const AV1_COMMON *const cm = &cpi->common;
   2133  MACROBLOCKD *xd = &x->e_mbd;
   2134  MB_MODE_INFO *mi = xd->mi[0];
   2135  struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
   2136  int idx;
   2137  const int bw = block_size_wide[bsize];
   2138  const int bh = block_size_high[bsize];
   2139  const int is_screen = cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
   2140  const int full_search = is_screen;
   2141  const bool screen_scroll_superblock =
   2142      is_screen && bsize == cm->seq_params->sb_size;
   2143  // Keep border a multiple of 16.
   2144  const int border = (cpi->oxcf.border_in_pixels >> 4) << 4;
   2145  int search_size_width = me_search_size_col;
   2146  int search_size_height = me_search_size_row;
   2147  // Adjust based on boundary.
   2148  if (((mi_col << 2) - search_size_width < -border) ||
   2149      ((mi_col << 2) + search_size_width > cm->width + border))
   2150    search_size_width = border;
   2151  if (((mi_row << 2) - search_size_height < -border) ||
   2152      ((mi_row << 2) + search_size_height > cm->height + border))
   2153    search_size_height = border;
   2154  const int src_stride = x->plane[0].src.stride;
   2155  const int ref_stride = xd->plane[0].pre[0].stride;
   2156  uint8_t const *ref_buf, *src_buf;
   2157  int_mv *best_int_mv = &xd->mi[0]->mv[0];
   2158  unsigned int best_sad, tmp_sad, this_sad[4];
   2159  int best_sad_col, best_sad_row;
   2160  const int row_norm_factor = mi_size_high_log2[bsize] + 1;
   2161  const int col_norm_factor = 3 + (bw >> 5);
   2162  const YV12_BUFFER_CONFIG *scaled_ref_frame =
   2163      av1_get_scaled_ref_frame(cpi, mi->ref_frame[0]);
   2164  static const MV search_pos[4] = {
   2165    { -1, 0 },
   2166    { 0, -1 },
   2167    { 0, 1 },
   2168    { 1, 0 },
   2169  };
   2170 
   2171  if (scaled_ref_frame) {
   2172    int i;
   2173    // Swap out the reference frame for a version that's been scaled to
   2174    // match the resolution of the current frame, allowing the existing
   2175    // motion search code to be used without additional modifications.
   2176    for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0];
   2177    av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL,
   2178                         MAX_MB_PLANE);
   2179  }
   2180 
   2181  if (xd->bd != 8) {
   2182    best_int_mv->as_fullmv = kZeroFullMv;
   2183    best_sad = cpi->ppi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
   2184                                           xd->plane[0].pre[0].buf, ref_stride);
   2185 
   2186    if (scaled_ref_frame) {
   2187      int i;
   2188      for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
   2189    }
   2190    return best_sad;
   2191  }
   2192  const int width_ref_buf = (search_size_width << 1) + bw;
   2193  const int height_ref_buf = (search_size_height << 1) + bh;
   2194  int16_t *hbuf = (int16_t *)aom_malloc(width_ref_buf * sizeof(*hbuf));
   2195  int16_t *vbuf = (int16_t *)aom_malloc(height_ref_buf * sizeof(*vbuf));
   2196  int16_t *src_hbuf = (int16_t *)aom_malloc(bw * sizeof(*src_hbuf));
   2197  int16_t *src_vbuf = (int16_t *)aom_malloc(bh * sizeof(*src_vbuf));
   2198  if (!hbuf || !vbuf || !src_hbuf || !src_vbuf) {
   2199    aom_free(hbuf);
   2200    aom_free(vbuf);
   2201    aom_free(src_hbuf);
   2202    aom_free(src_vbuf);
   2203    aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
   2204                       "Failed to allocate hbuf, vbuf, src_hbuf, or src_vbuf");
   2205  }
   2206 
   2207  // Set up prediction 1-D reference set for rows.
   2208  ref_buf = xd->plane[0].pre[0].buf - search_size_width;
   2209  aom_int_pro_row(hbuf, ref_buf, ref_stride, width_ref_buf, bh,
   2210                  row_norm_factor);
   2211 
   2212  // Set up prediction 1-D reference set for cols
   2213  ref_buf = xd->plane[0].pre[0].buf - search_size_height * ref_stride;
   2214  aom_int_pro_col(vbuf, ref_buf, ref_stride, bw, height_ref_buf,
   2215                  col_norm_factor);
   2216 
   2217  // Set up src 1-D reference set
   2218  src_buf = x->plane[0].src.buf;
   2219  aom_int_pro_row(src_hbuf, src_buf, src_stride, bw, bh, row_norm_factor);
   2220  aom_int_pro_col(src_vbuf, src_buf, src_stride, bw, bh, col_norm_factor);
   2221 
   2222  // Find the best match per 1-D search
   2223  best_int_mv->as_fullmv.col =
   2224      av1_vector_match(hbuf, src_hbuf, mi_size_wide_log2[bsize],
   2225                       search_size_width, full_search, &best_sad_col);
   2226  best_int_mv->as_fullmv.row =
   2227      av1_vector_match(vbuf, src_vbuf, mi_size_high_log2[bsize],
   2228                       search_size_height, full_search, &best_sad_row);
   2229 
   2230  // For screen: select between horiz or vert motion.
   2231  if (is_screen) {
   2232    if (best_sad_col < best_sad_row)
   2233      best_int_mv->as_fullmv.row = 0;
   2234    else
   2235      best_int_mv->as_fullmv.col = 0;
   2236  }
   2237 
   2238  FULLPEL_MV this_mv = best_int_mv->as_fullmv;
   2239  src_buf = x->plane[0].src.buf;
   2240  ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &this_mv);
   2241  best_sad =
   2242      cpi->ppi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
   2243 
   2244  // Evaluate zero MV if found MV is non-zero.
   2245  if (best_int_mv->as_int != 0) {
   2246    tmp_sad = cpi->ppi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
   2247                                          xd->plane[0].pre[0].buf, ref_stride);
   2248    *y_sad_zero = tmp_sad;
   2249    if (tmp_sad < best_sad) {
   2250      best_int_mv->as_fullmv = kZeroFullMv;
   2251      this_mv = best_int_mv->as_fullmv;
   2252      ref_buf = xd->plane[0].pre[0].buf;
   2253      best_sad = tmp_sad;
   2254    }
   2255  } else {
   2256    *y_sad_zero = best_sad;
   2257  }
   2258 
   2259  if (!screen_scroll_superblock) {
   2260    const uint8_t *const pos[4] = {
   2261      ref_buf - ref_stride,
   2262      ref_buf - 1,
   2263      ref_buf + 1,
   2264      ref_buf + ref_stride,
   2265    };
   2266 
   2267    cpi->ppi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride,
   2268                                   this_sad);
   2269 
   2270    for (idx = 0; idx < 4; ++idx) {
   2271      if (this_sad[idx] < best_sad) {
   2272        best_sad = this_sad[idx];
   2273        best_int_mv->as_fullmv.row = search_pos[idx].row + this_mv.row;
   2274        best_int_mv->as_fullmv.col = search_pos[idx].col + this_mv.col;
   2275      }
   2276    }
   2277 
   2278    if (this_sad[0] < this_sad[3])
   2279      this_mv.row -= 1;
   2280    else
   2281      this_mv.row += 1;
   2282 
   2283    if (this_sad[1] < this_sad[2])
   2284      this_mv.col -= 1;
   2285    else
   2286      this_mv.col += 1;
   2287 
   2288    ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &this_mv);
   2289 
   2290    tmp_sad =
   2291        cpi->ppi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
   2292    if (best_sad > tmp_sad) {
   2293      best_int_mv->as_fullmv = this_mv;
   2294      best_sad = tmp_sad;
   2295    }
   2296  }
   2297 
   2298  FullMvLimits mv_limits = x->mv_limits;
   2299  av1_set_mv_search_range(&mv_limits, ref_mv);
   2300  clamp_fullmv(&best_int_mv->as_fullmv, &mv_limits);
   2301 
   2302  convert_fullmv_to_mv(best_int_mv);
   2303 
   2304  if (scaled_ref_frame) {
   2305    int i;
   2306    for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
   2307  }
   2308 
   2309  aom_free(hbuf);
   2310  aom_free(vbuf);
   2311  aom_free(src_hbuf);
   2312  aom_free(src_vbuf);
   2313  return best_sad;
   2314 }
   2315 
   2316 // =============================================================================
   2317 //  Fullpixel Motion Search: OBMC
   2318 // =============================================================================
   2319 static inline int get_obmc_mvpred_var(
   2320    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv) {
   2321  const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
   2322  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
   2323  const MSBuffers *ms_buffers = &ms_params->ms_buffers;
   2324  const int32_t *wsrc = ms_buffers->wsrc;
   2325  const int32_t *mask = ms_buffers->obmc_mask;
   2326  const struct buf_2d *ref_buf = ms_buffers->ref;
   2327 
   2328  const MV mv = get_mv_from_fullmv(this_mv);
   2329  unsigned int unused;
   2330 
   2331  return vfp->ovf(get_buf_from_fullmv(ref_buf, this_mv), ref_buf->stride, wsrc,
   2332                  mask, &unused) +
   2333         mv_err_cost_(&mv, mv_cost_params);
   2334 }
   2335 
   2336 static int obmc_refining_search_sad(
   2337    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, FULLPEL_MV *best_mv) {
   2338  const aom_variance_fn_ptr_t *fn_ptr = ms_params->vfp;
   2339  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
   2340  const MSBuffers *ms_buffers = &ms_params->ms_buffers;
   2341  const int32_t *wsrc = ms_buffers->wsrc;
   2342  const int32_t *mask = ms_buffers->obmc_mask;
   2343  const struct buf_2d *ref_buf = ms_buffers->ref;
   2344  const FULLPEL_MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } };
   2345  const int kSearchRange = 8;
   2346 
   2347  unsigned int best_sad = fn_ptr->osdf(get_buf_from_fullmv(ref_buf, best_mv),
   2348                                       ref_buf->stride, wsrc, mask) +
   2349                          mvsad_err_cost_(best_mv, mv_cost_params);
   2350 
   2351  for (int i = 0; i < kSearchRange; i++) {
   2352    int best_site = -1;
   2353 
   2354    for (int j = 0; j < 4; j++) {
   2355      const FULLPEL_MV mv = { best_mv->row + neighbors[j].row,
   2356                              best_mv->col + neighbors[j].col };
   2357      if (av1_is_fullmv_in_range(&ms_params->mv_limits, mv)) {
   2358        unsigned int sad = fn_ptr->osdf(get_buf_from_fullmv(ref_buf, &mv),
   2359                                        ref_buf->stride, wsrc, mask);
   2360        if (sad < best_sad) {
   2361          sad += mvsad_err_cost_(&mv, mv_cost_params);
   2362 
   2363          if (sad < best_sad) {
   2364            best_sad = sad;
   2365            best_site = j;
   2366          }
   2367        }
   2368      }
   2369    }
   2370 
   2371    if (best_site == -1) {
   2372      break;
   2373    } else {
   2374      best_mv->row += neighbors[best_site].row;
   2375      best_mv->col += neighbors[best_site].col;
   2376    }
   2377  }
   2378  return best_sad;
   2379 }
   2380 
   2381 static int obmc_diamond_search_sad(
   2382    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, FULLPEL_MV start_mv,
   2383    FULLPEL_MV *best_mv, int search_step, int *num00) {
   2384  const aom_variance_fn_ptr_t *fn_ptr = ms_params->vfp;
   2385  const search_site_config *cfg = ms_params->search_sites;
   2386  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
   2387  const MSBuffers *ms_buffers = &ms_params->ms_buffers;
   2388  const int32_t *wsrc = ms_buffers->wsrc;
   2389  const int32_t *mask = ms_buffers->obmc_mask;
   2390  const struct buf_2d *const ref_buf = ms_buffers->ref;
   2391 
   2392  // search_step determines the length of the initial step and hence the number
   2393  // of iterations.
   2394  const int tot_steps = cfg->num_search_steps - search_step;
   2395  const uint8_t *best_address, *init_ref;
   2396  int best_sad = INT_MAX;
   2397  int best_site = 0;
   2398 
   2399  clamp_fullmv(&start_mv, &ms_params->mv_limits);
   2400  best_address = init_ref = get_buf_from_fullmv(ref_buf, &start_mv);
   2401  *num00 = 0;
   2402  *best_mv = start_mv;
   2403 
   2404  // Check the starting position
   2405  best_sad = fn_ptr->osdf(best_address, ref_buf->stride, wsrc, mask) +
   2406             mvsad_err_cost_(best_mv, mv_cost_params);
   2407 
   2408  for (int step = tot_steps - 1; step >= 0; --step) {
   2409    const search_site *const site = cfg->site[step];
   2410    best_site = 0;
   2411    for (int idx = 1; idx <= cfg->searches_per_step[step]; ++idx) {
   2412      const FULLPEL_MV mv = { best_mv->row + site[idx].mv.row,
   2413                              best_mv->col + site[idx].mv.col };
   2414      if (av1_is_fullmv_in_range(&ms_params->mv_limits, mv)) {
   2415        int sad = fn_ptr->osdf(best_address + site[idx].offset, ref_buf->stride,
   2416                               wsrc, mask);
   2417        if (sad < best_sad) {
   2418          sad += mvsad_err_cost_(&mv, mv_cost_params);
   2419 
   2420          if (sad < best_sad) {
   2421            best_sad = sad;
   2422            best_site = idx;
   2423          }
   2424        }
   2425      }
   2426    }
   2427 
   2428    if (best_site != 0) {
   2429      best_mv->row += site[best_site].mv.row;
   2430      best_mv->col += site[best_site].mv.col;
   2431      best_address += site[best_site].offset;
   2432    } else if (best_address == init_ref) {
   2433      (*num00)++;
   2434    }
   2435  }
   2436  return best_sad;
   2437 }
   2438 
   2439 static int obmc_full_pixel_diamond(
   2440    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV start_mv,
   2441    int step_param, FULLPEL_MV *best_mv) {
   2442  const search_site_config *cfg = ms_params->search_sites;
   2443  FULLPEL_MV tmp_mv;
   2444  int thissme, n, num00 = 0;
   2445  int bestsme =
   2446      obmc_diamond_search_sad(ms_params, start_mv, &tmp_mv, step_param, &n);
   2447  if (bestsme < INT_MAX) bestsme = get_obmc_mvpred_var(ms_params, &tmp_mv);
   2448  *best_mv = tmp_mv;
   2449 
   2450  // If there won't be more n-step search, check to see if refining search is
   2451  // needed.
   2452  const int further_steps = cfg->num_search_steps - 1 - step_param;
   2453 
   2454  while (n < further_steps) {
   2455    ++n;
   2456 
   2457    if (num00) {
   2458      num00--;
   2459    } else {
   2460      thissme = obmc_diamond_search_sad(ms_params, start_mv, &tmp_mv,
   2461                                        step_param + n, &num00);
   2462      if (thissme < INT_MAX) thissme = get_obmc_mvpred_var(ms_params, &tmp_mv);
   2463 
   2464      if (thissme < bestsme) {
   2465        bestsme = thissme;
   2466        *best_mv = tmp_mv;
   2467      }
   2468    }
   2469  }
   2470 
   2471  return bestsme;
   2472 }
   2473 
   2474 int av1_obmc_full_pixel_search(const FULLPEL_MV start_mv,
   2475                               const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
   2476                               const int step_param, FULLPEL_MV *best_mv) {
   2477  if (!ms_params->fast_obmc_search) {
   2478    const int bestsme =
   2479        obmc_full_pixel_diamond(ms_params, start_mv, step_param, best_mv);
   2480    return bestsme;
   2481  } else {
   2482    *best_mv = start_mv;
   2483    clamp_fullmv(best_mv, &ms_params->mv_limits);
   2484    int thissme = obmc_refining_search_sad(ms_params, best_mv);
   2485    if (thissme < INT_MAX) thissme = get_obmc_mvpred_var(ms_params, best_mv);
   2486    return thissme;
   2487  }
   2488 }
   2489 
   2490 // =============================================================================
   2491 //  Subpixel Motion Search: Translational
   2492 // =============================================================================
   2493 #define INIT_SUBPEL_STEP_SIZE (4)
   2494 /*
   2495 * To avoid the penalty for crossing cache-line read, preload the reference
   2496 * area in a small buffer, which is aligned to make sure there won't be crossing
   2497 * cache-line read while reading from this buffer. This reduced the cpu
   2498 * cycles spent on reading ref data in sub-pixel filter functions.
   2499 * TODO: Currently, since sub-pixel search range here is -3 ~ 3, copy 22 rows x
   2500 * 32 cols area that is enough for 16x16 macroblock. Later, for SPLITMV, we
   2501 * could reduce the area.
   2502 */
   2503 
   2504 // Returns the subpel offset used by various subpel variance functions [m]sv[a]f
   2505 static inline int get_subpel_part(int x) { return x & 7; }
   2506 
   2507 // Gets the address of the ref buffer at subpel location (r, c), rounded to the
   2508 // nearest fullpel precision toward - \infty
   2509 static inline const uint8_t *get_buf_from_mv(const struct buf_2d *buf,
   2510                                             const MV mv) {
   2511  const int offset = (mv.row >> 3) * buf->stride + (mv.col >> 3);
   2512  return &buf->buf[offset];
   2513 }
   2514 
   2515 // Estimates the variance of prediction residue using bilinear filter for fast
   2516 // search.
   2517 static inline int estimated_pref_error(
   2518    const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
   2519    unsigned int *sse) {
   2520  const aom_variance_fn_ptr_t *vfp = var_params->vfp;
   2521 
   2522  const MSBuffers *ms_buffers = &var_params->ms_buffers;
   2523  const uint8_t *src = ms_buffers->src->buf;
   2524  const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv);
   2525  const int src_stride = ms_buffers->src->stride;
   2526  const int ref_stride = ms_buffers->ref->stride;
   2527  const uint8_t *second_pred = ms_buffers->second_pred;
   2528  const uint8_t *mask = ms_buffers->mask;
   2529  const int mask_stride = ms_buffers->mask_stride;
   2530  const int invert_mask = ms_buffers->inv_mask;
   2531 
   2532  const int subpel_x_q3 = get_subpel_part(this_mv->col);
   2533  const int subpel_y_q3 = get_subpel_part(this_mv->row);
   2534 
   2535  if (second_pred == NULL) {
   2536    return vfp->svf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, src_stride,
   2537                    sse);
   2538  } else if (mask) {
   2539    return vfp->msvf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, src_stride,
   2540                     second_pred, mask, mask_stride, invert_mask, sse);
   2541  } else {
   2542    return vfp->svaf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, src_stride,
   2543                     sse, second_pred);
   2544  }
   2545 }
   2546 
   2547 // Calculates the variance of prediction residue.
   2548 static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *cm,
   2549                                const MV *this_mv,
   2550                                const SUBPEL_SEARCH_VAR_PARAMS *var_params,
   2551                                unsigned int *sse) {
   2552  const aom_variance_fn_ptr_t *vfp = var_params->vfp;
   2553  const SUBPEL_SEARCH_TYPE subpel_search_type = var_params->subpel_search_type;
   2554 
   2555  const MSBuffers *ms_buffers = &var_params->ms_buffers;
   2556  const uint8_t *src = ms_buffers->src->buf;
   2557  const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv);
   2558  const int src_stride = ms_buffers->src->stride;
   2559  const int ref_stride = ms_buffers->ref->stride;
   2560  const uint8_t *second_pred = ms_buffers->second_pred;
   2561  const uint8_t *mask = ms_buffers->mask;
   2562  const int mask_stride = ms_buffers->mask_stride;
   2563  const int invert_mask = ms_buffers->inv_mask;
   2564  const int w = var_params->w;
   2565  const int h = var_params->h;
   2566 
   2567  const int mi_row = xd->mi_row;
   2568  const int mi_col = xd->mi_col;
   2569  const int subpel_x_q3 = get_subpel_part(this_mv->col);
   2570  const int subpel_y_q3 = get_subpel_part(this_mv->row);
   2571 
   2572  unsigned int besterr;
   2573 #if CONFIG_AV1_HIGHBITDEPTH
   2574  if (is_cur_buf_hbd(xd)) {
   2575    DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
   2576    uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred16);
   2577    if (second_pred != NULL) {
   2578      if (mask) {
   2579        aom_highbd_comp_mask_upsampled_pred(
   2580            xd, cm, mi_row, mi_col, this_mv, pred8, second_pred, w, h,
   2581            subpel_x_q3, subpel_y_q3, ref, ref_stride, mask, mask_stride,
   2582            invert_mask, xd->bd, subpel_search_type);
   2583      } else {
   2584        aom_highbd_comp_avg_upsampled_pred(
   2585            xd, cm, mi_row, mi_col, this_mv, pred8, second_pred, w, h,
   2586            subpel_x_q3, subpel_y_q3, ref, ref_stride, xd->bd,
   2587            subpel_search_type);
   2588      }
   2589    } else {
   2590      aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred8, w, h,
   2591                                subpel_x_q3, subpel_y_q3, ref, ref_stride,
   2592                                xd->bd, subpel_search_type);
   2593    }
   2594    besterr = vfp->vf(pred8, w, src, src_stride, sse);
   2595  } else {
   2596    DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
   2597    if (second_pred != NULL) {
   2598      if (mask) {
   2599        aom_comp_mask_upsampled_pred(
   2600            xd, cm, mi_row, mi_col, this_mv, pred, second_pred, w, h,
   2601            subpel_x_q3, subpel_y_q3, ref, ref_stride, mask, mask_stride,
   2602            invert_mask, subpel_search_type);
   2603      } else {
   2604        aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred,
   2605                                    second_pred, w, h, subpel_x_q3, subpel_y_q3,
   2606                                    ref, ref_stride, subpel_search_type);
   2607      }
   2608    } else {
   2609      aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h,
   2610                         subpel_x_q3, subpel_y_q3, ref, ref_stride,
   2611                         subpel_search_type);
   2612    }
   2613 
   2614    besterr = vfp->vf(pred, w, src, src_stride, sse);
   2615  }
   2616 #else
   2617  DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
   2618  if (second_pred != NULL) {
   2619    if (mask) {
   2620      aom_comp_mask_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred,
   2621                                   second_pred, w, h, subpel_x_q3, subpel_y_q3,
   2622                                   ref, ref_stride, mask, mask_stride,
   2623                                   invert_mask, subpel_search_type);
   2624    } else {
   2625      aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred,
   2626                                  second_pred, w, h, subpel_x_q3, subpel_y_q3,
   2627                                  ref, ref_stride, subpel_search_type);
   2628    }
   2629  } else {
   2630    aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3,
   2631                       subpel_y_q3, ref, ref_stride, subpel_search_type);
   2632  }
   2633 
   2634  besterr = vfp->vf(pred, w, src, src_stride, sse);
   2635 #endif
   2636  return besterr;
   2637 }
   2638 
   2639 // Estimates whether this_mv is better than best_mv. This function incorporates
   2640 // both prediction error and residue into account. It is suffixed "fast" because
   2641 // it uses bilinear filter to estimate the prediction.
   2642 static inline unsigned int check_better_fast(
   2643    MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv,
   2644    const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
   2645    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
   2646    unsigned int *sse1, int *distortion, int *has_better_mv, int is_scaled) {
   2647  unsigned int cost;
   2648  if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) {
   2649    unsigned int sse;
   2650    int thismse;
   2651    if (is_scaled) {
   2652      thismse = upsampled_pref_error(xd, cm, this_mv, var_params, &sse);
   2653    } else {
   2654      thismse = estimated_pref_error(this_mv, var_params, &sse);
   2655    }
   2656    cost = mv_err_cost_(this_mv, mv_cost_params);
   2657    cost += thismse;
   2658 
   2659    if (cost < *besterr) {
   2660      *besterr = cost;
   2661      *best_mv = *this_mv;
   2662      *distortion = thismse;
   2663      *sse1 = sse;
   2664      *has_better_mv |= 1;
   2665    }
   2666  } else {
   2667    cost = INT_MAX;
   2668  }
   2669  return cost;
   2670 }
   2671 
   2672 // Checks whether this_mv is better than best_mv. This function incorporates
   2673 // both prediction error and residue into account.
   2674 static AOM_FORCE_INLINE unsigned int check_better(
   2675    MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv,
   2676    const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
   2677    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
   2678    unsigned int *sse1, int *distortion, int *is_better) {
   2679  unsigned int cost;
   2680  if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) {
   2681    unsigned int sse;
   2682    int thismse;
   2683    thismse = upsampled_pref_error(xd, cm, this_mv, var_params, &sse);
   2684    cost = mv_err_cost_(this_mv, mv_cost_params);
   2685    cost += thismse;
   2686    if (cost < *besterr) {
   2687      *besterr = cost;
   2688      *best_mv = *this_mv;
   2689      *distortion = thismse;
   2690      *sse1 = sse;
   2691      *is_better |= 1;
   2692    }
   2693  } else {
   2694    cost = INT_MAX;
   2695  }
   2696  return cost;
   2697 }
   2698 
   2699 static inline MV get_best_diag_step(int step_size, unsigned int left_cost,
   2700                                    unsigned int right_cost,
   2701                                    unsigned int up_cost,
   2702                                    unsigned int down_cost) {
   2703  const MV diag_step = { up_cost <= down_cost ? -step_size : step_size,
   2704                         left_cost <= right_cost ? -step_size : step_size };
   2705 
   2706  return diag_step;
   2707 }
   2708 
   2709 // Searches the four cardinal direction for a better mv, then follows up with a
   2710 // search in the best quadrant. This uses bilinear filter to speed up the
   2711 // calculation.
   2712 static AOM_FORCE_INLINE MV first_level_check_fast(
   2713    MACROBLOCKD *xd, const AV1_COMMON *cm, const MV this_mv, MV *best_mv,
   2714    int hstep, const SubpelMvLimits *mv_limits,
   2715    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
   2716    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
   2717    unsigned int *sse1, int *distortion, int is_scaled) {
   2718  // Check the four cardinal directions
   2719  const MV left_mv = { this_mv.row, this_mv.col - hstep };
   2720  int dummy = 0;
   2721  const unsigned int left = check_better_fast(
   2722      xd, cm, &left_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr,
   2723      sse1, distortion, &dummy, is_scaled);
   2724 
   2725  const MV right_mv = { this_mv.row, this_mv.col + hstep };
   2726  const unsigned int right = check_better_fast(
   2727      xd, cm, &right_mv, best_mv, mv_limits, var_params, mv_cost_params,
   2728      besterr, sse1, distortion, &dummy, is_scaled);
   2729 
   2730  const MV top_mv = { this_mv.row - hstep, this_mv.col };
   2731  const unsigned int up = check_better_fast(
   2732      xd, cm, &top_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr,
   2733      sse1, distortion, &dummy, is_scaled);
   2734 
   2735  const MV bottom_mv = { this_mv.row + hstep, this_mv.col };
   2736  const unsigned int down = check_better_fast(
   2737      xd, cm, &bottom_mv, best_mv, mv_limits, var_params, mv_cost_params,
   2738      besterr, sse1, distortion, &dummy, is_scaled);
   2739 
   2740  const MV diag_step = get_best_diag_step(hstep, left, right, up, down);
   2741  const MV diag_mv = { this_mv.row + diag_step.row,
   2742                       this_mv.col + diag_step.col };
   2743 
   2744  // Check the diagonal direction with the best mv
   2745  check_better_fast(xd, cm, &diag_mv, best_mv, mv_limits, var_params,
   2746                    mv_cost_params, besterr, sse1, distortion, &dummy,
   2747                    is_scaled);
   2748 
   2749  return diag_step;
   2750 }
   2751 
   2752 // Performs a following up search after first_level_check_fast is called. This
   2753 // performs two extra chess pattern searches in the best quadrant.
   2754 static AOM_FORCE_INLINE void second_level_check_fast(
   2755    MACROBLOCKD *xd, const AV1_COMMON *cm, const MV this_mv, const MV diag_step,
   2756    MV *best_mv, int hstep, const SubpelMvLimits *mv_limits,
   2757    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
   2758    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
   2759    unsigned int *sse1, int *distortion, int is_scaled) {
   2760  assert(diag_step.row == hstep || diag_step.row == -hstep);
   2761  assert(diag_step.col == hstep || diag_step.col == -hstep);
   2762  const int tr = this_mv.row;
   2763  const int tc = this_mv.col;
   2764  const int br = best_mv->row;
   2765  const int bc = best_mv->col;
   2766  int dummy = 0;
   2767  if (tr != br && tc != bc) {
   2768    assert(diag_step.col == bc - tc);
   2769    assert(diag_step.row == br - tr);
   2770    const MV chess_mv_1 = { br, bc + diag_step.col };
   2771    const MV chess_mv_2 = { br + diag_step.row, bc };
   2772    check_better_fast(xd, cm, &chess_mv_1, best_mv, mv_limits, var_params,
   2773                      mv_cost_params, besterr, sse1, distortion, &dummy,
   2774                      is_scaled);
   2775 
   2776    check_better_fast(xd, cm, &chess_mv_2, best_mv, mv_limits, var_params,
   2777                      mv_cost_params, besterr, sse1, distortion, &dummy,
   2778                      is_scaled);
   2779  } else if (tr == br && tc != bc) {
   2780    assert(diag_step.col == bc - tc);
   2781    // Continue searching in the best direction
   2782    const MV bottom_long_mv = { br + hstep, bc + diag_step.col };
   2783    const MV top_long_mv = { br - hstep, bc + diag_step.col };
   2784    check_better_fast(xd, cm, &bottom_long_mv, best_mv, mv_limits, var_params,
   2785                      mv_cost_params, besterr, sse1, distortion, &dummy,
   2786                      is_scaled);
   2787    check_better_fast(xd, cm, &top_long_mv, best_mv, mv_limits, var_params,
   2788                      mv_cost_params, besterr, sse1, distortion, &dummy,
   2789                      is_scaled);
   2790 
   2791    // Search in the direction opposite of the best quadrant
   2792    const MV rev_mv = { br - diag_step.row, bc };
   2793    check_better_fast(xd, cm, &rev_mv, best_mv, mv_limits, var_params,
   2794                      mv_cost_params, besterr, sse1, distortion, &dummy,
   2795                      is_scaled);
   2796  } else if (tr != br && tc == bc) {
   2797    assert(diag_step.row == br - tr);
   2798    // Continue searching in the best direction
   2799    const MV right_long_mv = { br + diag_step.row, bc + hstep };
   2800    const MV left_long_mv = { br + diag_step.row, bc - hstep };
   2801    check_better_fast(xd, cm, &right_long_mv, best_mv, mv_limits, var_params,
   2802                      mv_cost_params, besterr, sse1, distortion, &dummy,
   2803                      is_scaled);
   2804    check_better_fast(xd, cm, &left_long_mv, best_mv, mv_limits, var_params,
   2805                      mv_cost_params, besterr, sse1, distortion, &dummy,
   2806                      is_scaled);
   2807 
   2808    // Search in the direction opposite of the best quadrant
   2809    const MV rev_mv = { br, bc - diag_step.col };
   2810    check_better_fast(xd, cm, &rev_mv, best_mv, mv_limits, var_params,
   2811                      mv_cost_params, besterr, sse1, distortion, &dummy,
   2812                      is_scaled);
   2813  }
   2814 }
   2815 
   2816 // Combines first level check and second level check when applicable. This first
   2817 // searches the four cardinal directions, and perform several
   2818 // diagonal/chess-pattern searches in the best quadrant.
   2819 static AOM_FORCE_INLINE void two_level_checks_fast(
   2820    MACROBLOCKD *xd, const AV1_COMMON *cm, const MV this_mv, MV *best_mv,
   2821    int hstep, const SubpelMvLimits *mv_limits,
   2822    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
   2823    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
   2824    unsigned int *sse1, int *distortion, int iters, int is_scaled) {
   2825  const MV diag_step = first_level_check_fast(
   2826      xd, cm, this_mv, best_mv, hstep, mv_limits, var_params, mv_cost_params,
   2827      besterr, sse1, distortion, is_scaled);
   2828  if (iters > 1) {
   2829    second_level_check_fast(xd, cm, this_mv, diag_step, best_mv, hstep,
   2830                            mv_limits, var_params, mv_cost_params, besterr,
   2831                            sse1, distortion, is_scaled);
   2832  }
   2833 }
   2834 
   2835 static AOM_FORCE_INLINE MV
   2836 first_level_check(MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv,
   2837                  MV *best_mv, const int hstep, const SubpelMvLimits *mv_limits,
   2838                  const SUBPEL_SEARCH_VAR_PARAMS *var_params,
   2839                  const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
   2840                  unsigned int *sse1, int *distortion) {
   2841  int dummy = 0;
   2842  const MV left_mv = { this_mv.row, this_mv.col - hstep };
   2843  const MV right_mv = { this_mv.row, this_mv.col + hstep };
   2844  const MV top_mv = { this_mv.row - hstep, this_mv.col };
   2845  const MV bottom_mv = { this_mv.row + hstep, this_mv.col };
   2846 
   2847  const unsigned int left =
   2848      check_better(xd, cm, &left_mv, best_mv, mv_limits, var_params,
   2849                   mv_cost_params, besterr, sse1, distortion, &dummy);
   2850  const unsigned int right =
   2851      check_better(xd, cm, &right_mv, best_mv, mv_limits, var_params,
   2852                   mv_cost_params, besterr, sse1, distortion, &dummy);
   2853  const unsigned int up =
   2854      check_better(xd, cm, &top_mv, best_mv, mv_limits, var_params,
   2855                   mv_cost_params, besterr, sse1, distortion, &dummy);
   2856  const unsigned int down =
   2857      check_better(xd, cm, &bottom_mv, best_mv, mv_limits, var_params,
   2858                   mv_cost_params, besterr, sse1, distortion, &dummy);
   2859 
   2860  const MV diag_step = get_best_diag_step(hstep, left, right, up, down);
   2861  const MV diag_mv = { this_mv.row + diag_step.row,
   2862                       this_mv.col + diag_step.col };
   2863 
   2864  // Check the diagonal direction with the best mv
   2865  check_better(xd, cm, &diag_mv, best_mv, mv_limits, var_params, mv_cost_params,
   2866               besterr, sse1, distortion, &dummy);
   2867 
   2868  return diag_step;
   2869 }
   2870 
   2871 // A newer version of second level check that gives better quality.
   2872 // TODO(chiyotsai@google.com): evaluate this on subpel_search_types different
   2873 // from av1_find_best_sub_pixel_tree
   2874 static AOM_FORCE_INLINE void second_level_check_v2(
   2875    MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, MV diag_step,
   2876    MV *best_mv, const SubpelMvLimits *mv_limits,
   2877    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
   2878    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
   2879    unsigned int *sse1, int *distortion, int is_scaled) {
   2880  assert(best_mv->row == this_mv.row + diag_step.row ||
   2881         best_mv->col == this_mv.col + diag_step.col);
   2882  if (CHECK_MV_EQUAL(this_mv, *best_mv)) {
   2883    return;
   2884  } else if (this_mv.row == best_mv->row) {
   2885    // Search away from diagonal step since diagonal search did not provide any
   2886    // improvement
   2887    diag_step.row *= -1;
   2888  } else if (this_mv.col == best_mv->col) {
   2889    diag_step.col *= -1;
   2890  }
   2891 
   2892  const MV row_bias_mv = { best_mv->row + diag_step.row, best_mv->col };
   2893  const MV col_bias_mv = { best_mv->row, best_mv->col + diag_step.col };
   2894  const MV diag_bias_mv = { best_mv->row + diag_step.row,
   2895                            best_mv->col + diag_step.col };
   2896  int has_better_mv = 0;
   2897 
   2898  if (var_params->subpel_search_type != USE_2_TAPS_ORIG) {
   2899    check_better(xd, cm, &row_bias_mv, best_mv, mv_limits, var_params,
   2900                 mv_cost_params, besterr, sse1, distortion, &has_better_mv);
   2901    check_better(xd, cm, &col_bias_mv, best_mv, mv_limits, var_params,
   2902                 mv_cost_params, besterr, sse1, distortion, &has_better_mv);
   2903 
   2904    // Do an additional search if the second iteration gives a better mv
   2905    if (has_better_mv) {
   2906      check_better(xd, cm, &diag_bias_mv, best_mv, mv_limits, var_params,
   2907                   mv_cost_params, besterr, sse1, distortion, &has_better_mv);
   2908    }
   2909  } else {
   2910    check_better_fast(xd, cm, &row_bias_mv, best_mv, mv_limits, var_params,
   2911                      mv_cost_params, besterr, sse1, distortion, &has_better_mv,
   2912                      is_scaled);
   2913    check_better_fast(xd, cm, &col_bias_mv, best_mv, mv_limits, var_params,
   2914                      mv_cost_params, besterr, sse1, distortion, &has_better_mv,
   2915                      is_scaled);
   2916 
   2917    // Do an additional search if the second iteration gives a better mv
   2918    if (has_better_mv) {
   2919      check_better_fast(xd, cm, &diag_bias_mv, best_mv, mv_limits, var_params,
   2920                        mv_cost_params, besterr, sse1, distortion,
   2921                        &has_better_mv, is_scaled);
   2922    }
   2923  }
   2924 }
   2925 
   2926 // Gets the error at the beginning when the mv has fullpel precision
   2927 static unsigned int setup_center_error(
   2928    const MACROBLOCKD *xd, const MV *bestmv,
   2929    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
   2930    const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) {
   2931  const aom_variance_fn_ptr_t *vfp = var_params->vfp;
   2932  const int w = var_params->w;
   2933  const int h = var_params->h;
   2934 
   2935  const MSBuffers *ms_buffers = &var_params->ms_buffers;
   2936  const uint8_t *src = ms_buffers->src->buf;
   2937  const uint8_t *y = get_buf_from_mv(ms_buffers->ref, *bestmv);
   2938  const int src_stride = ms_buffers->src->stride;
   2939  const int y_stride = ms_buffers->ref->stride;
   2940  const uint8_t *second_pred = ms_buffers->second_pred;
   2941  const uint8_t *mask = ms_buffers->mask;
   2942  const int mask_stride = ms_buffers->mask_stride;
   2943  const int invert_mask = ms_buffers->inv_mask;
   2944 
   2945  unsigned int besterr;
   2946 
   2947  if (second_pred != NULL) {
   2948 #if CONFIG_AV1_HIGHBITDEPTH
   2949    if (is_cur_buf_hbd(xd)) {
   2950      DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]);
   2951      uint8_t *comp_pred = CONVERT_TO_BYTEPTR(comp_pred16);
   2952      if (mask) {
   2953        aom_highbd_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride,
   2954                                  mask, mask_stride, invert_mask);
   2955      } else {
   2956        aom_highbd_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
   2957      }
   2958      besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
   2959    } else {
   2960      DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]);
   2961      if (mask) {
   2962        aom_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride, mask,
   2963                           mask_stride, invert_mask);
   2964      } else {
   2965        aom_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
   2966      }
   2967      besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
   2968    }
   2969 #else
   2970    (void)xd;
   2971    DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]);
   2972    if (mask) {
   2973      aom_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride, mask,
   2974                         mask_stride, invert_mask);
   2975    } else {
   2976      aom_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
   2977    }
   2978    besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
   2979 #endif
   2980  } else {
   2981    besterr = vfp->vf(y, y_stride, src, src_stride, sse1);
   2982  }
   2983  *distortion = besterr;
   2984  besterr += mv_err_cost_(bestmv, mv_cost_params);
   2985  return besterr;
   2986 }
   2987 
   2988 // Gets the error at the beginning when the mv has fullpel precision
   2989 static unsigned int upsampled_setup_center_error(
   2990    MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV *bestmv,
   2991    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
   2992    const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) {
   2993  unsigned int besterr = upsampled_pref_error(xd, cm, bestmv, var_params, sse1);
   2994  *distortion = besterr;
   2995  besterr += mv_err_cost_(bestmv, mv_cost_params);
   2996  return besterr;
   2997 }
   2998 
   2999 static inline int divide_and_round(int n, int d) {
   3000  return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d);
   3001 }
   3002 
   3003 static inline int is_cost_list_wellbehaved(const int *cost_list) {
   3004  return cost_list[0] < cost_list[1] && cost_list[0] < cost_list[2] &&
   3005         cost_list[0] < cost_list[3] && cost_list[0] < cost_list[4];
   3006 }
   3007 
   3008 // Returns surface minima estimate at given precision in 1/2^n bits.
   3009 // Assume a model for the cost surface: S = A(x - x0)^2 + B(y - y0)^2 + C
   3010 // For a given set of costs S0, S1, S2, S3, S4 at points
   3011 // (y, x) = (0, 0), (0, -1), (1, 0), (0, 1) and (-1, 0) respectively,
   3012 // the solution for the location of the minima (x0, y0) is given by:
   3013 // x0 = 1/2 (S1 - S3)/(S1 + S3 - 2*S0),
   3014 // y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0).
   3015 // The code below is an integerized version of that.
   3016 static inline void get_cost_surf_min(const int *cost_list, int *ir, int *ic,
   3017                                     int bits) {
   3018  *ic = divide_and_round((cost_list[1] - cost_list[3]) * (1 << (bits - 1)),
   3019                         (cost_list[1] - 2 * cost_list[0] + cost_list[3]));
   3020  *ir = divide_and_round((cost_list[4] - cost_list[2]) * (1 << (bits - 1)),
   3021                         (cost_list[4] - 2 * cost_list[0] + cost_list[2]));
   3022 }
   3023 
   3024 // Checks the list of mvs searched in the last iteration and see if we are
   3025 // repeating it. If so, return 1. Otherwise we update the last_mv_search_list
   3026 // with current_mv and return 0.
   3027 static inline int check_repeated_mv_and_update(int_mv *last_mv_search_list,
   3028                                               const MV current_mv, int iter) {
   3029  if (last_mv_search_list) {
   3030    if (CHECK_MV_EQUAL(last_mv_search_list[iter].as_mv, current_mv)) {
   3031      return 1;
   3032    }
   3033 
   3034    last_mv_search_list[iter].as_mv = current_mv;
   3035  }
   3036  return 0;
   3037 }
   3038 
   3039 static inline int setup_center_error_facade(
   3040    MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *bestmv,
   3041    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
   3042    const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion,
   3043    int is_scaled) {
   3044  if (is_scaled) {
   3045    return upsampled_setup_center_error(xd, cm, bestmv, var_params,
   3046                                        mv_cost_params, sse1, distortion);
   3047  } else {
   3048    return setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1,
   3049                              distortion);
   3050  }
   3051 }
   3052 
   3053 int av1_find_best_sub_pixel_tree_pruned_more(
   3054    MACROBLOCKD *xd, const AV1_COMMON *const cm,
   3055    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv,
   3056    const FULLPEL_MV_STATS *start_mv_stats, MV *bestmv, int *distortion,
   3057    unsigned int *sse1, int_mv *last_mv_search_list) {
   3058  (void)cm;
   3059  const int allow_hp = ms_params->allow_hp;
   3060  const int forced_stop = ms_params->forced_stop;
   3061  const int iters_per_step = ms_params->iters_per_step;
   3062  const int *cost_list = ms_params->cost_list;
   3063  const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
   3064  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
   3065  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
   3066 
   3067  // The iteration we are current searching for. Iter 0 corresponds to fullpel
   3068  // mv, iter 1 to half pel, and so on
   3069  int iter = 0;
   3070  int hstep = INIT_SUBPEL_STEP_SIZE;  // Step size, initialized to 4/8=1/2 pel
   3071  unsigned int besterr = INT_MAX;
   3072  *bestmv = start_mv;
   3073 
   3074  const struct scale_factors *const sf = is_intrabc_block(xd->mi[0])
   3075                                             ? &cm->sf_identity
   3076                                             : xd->block_ref_scale_factors[0];
   3077  const int is_scaled = av1_is_scaled(sf);
   3078 
   3079  if (start_mv_stats != NULL && !is_scaled) {
   3080    besterr = start_mv_stats->distortion + start_mv_stats->err_cost;
   3081    *distortion = start_mv_stats->distortion;
   3082    *sse1 = start_mv_stats->sse;
   3083  } else {
   3084    besterr =
   3085        setup_center_error_facade(xd, cm, bestmv, var_params, mv_cost_params,
   3086                                  sse1, distortion, is_scaled);
   3087  }
   3088 
   3089  // If forced_stop is FULL_PEL, return.
   3090  if (forced_stop == FULL_PEL) return besterr;
   3091 
   3092  if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
   3093    return INT_MAX;
   3094  }
   3095  iter++;
   3096 
   3097  if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
   3098      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
   3099      cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
   3100    int ir, ic;
   3101    get_cost_surf_min(cost_list, &ir, &ic, 1);
   3102    if (ir != 0 || ic != 0) {
   3103      const MV this_mv = { start_mv.row + ir * hstep,
   3104                           start_mv.col + ic * hstep };
   3105      int dummy = 0;
   3106      check_better_fast(xd, cm, &this_mv, bestmv, mv_limits, var_params,
   3107                        mv_cost_params, &besterr, sse1, distortion, &dummy,
   3108                        is_scaled);
   3109    }
   3110  } else {
   3111    two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
   3112                          var_params, mv_cost_params, &besterr, sse1,
   3113                          distortion, iters_per_step, is_scaled);
   3114  }
   3115 
   3116  // Each subsequent iteration checks at least one point in common with
   3117  // the last iteration could be 2 ( if diag selected) 1/4 pel
   3118  if (forced_stop < HALF_PEL) {
   3119    if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
   3120      return INT_MAX;
   3121    }
   3122    iter++;
   3123 
   3124    hstep >>= 1;
   3125    start_mv = *bestmv;
   3126    two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
   3127                          var_params, mv_cost_params, &besterr, sse1,
   3128                          distortion, iters_per_step, is_scaled);
   3129  }
   3130 
   3131  if (allow_hp && forced_stop == EIGHTH_PEL) {
   3132    if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
   3133      return INT_MAX;
   3134    }
   3135    iter++;
   3136 
   3137    hstep >>= 1;
   3138    start_mv = *bestmv;
   3139    two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
   3140                          var_params, mv_cost_params, &besterr, sse1,
   3141                          distortion, iters_per_step, is_scaled);
   3142  }
   3143 
   3144  return besterr;
   3145 }
   3146 
   3147 int av1_find_best_sub_pixel_tree_pruned(
   3148    MACROBLOCKD *xd, const AV1_COMMON *const cm,
   3149    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv,
   3150    const FULLPEL_MV_STATS *start_mv_stats, MV *bestmv, int *distortion,
   3151    unsigned int *sse1, int_mv *last_mv_search_list) {
   3152  (void)cm;
   3153  (void)start_mv_stats;
   3154  const int allow_hp = ms_params->allow_hp;
   3155  const int forced_stop = ms_params->forced_stop;
   3156  const int iters_per_step = ms_params->iters_per_step;
   3157  const int *cost_list = ms_params->cost_list;
   3158  const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
   3159  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
   3160  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
   3161 
   3162  // The iteration we are current searching for. Iter 0 corresponds to fullpel
   3163  // mv, iter 1 to half pel, and so on
   3164  int iter = 0;
   3165  int hstep = INIT_SUBPEL_STEP_SIZE;  // Step size, initialized to 4/8=1/2 pel
   3166  unsigned int besterr = INT_MAX;
   3167  *bestmv = start_mv;
   3168 
   3169  const struct scale_factors *const sf = is_intrabc_block(xd->mi[0])
   3170                                             ? &cm->sf_identity
   3171                                             : xd->block_ref_scale_factors[0];
   3172  const int is_scaled = av1_is_scaled(sf);
   3173 
   3174  if (start_mv_stats != NULL && !is_scaled) {
   3175    besterr = start_mv_stats->distortion + start_mv_stats->err_cost;
   3176    *distortion = start_mv_stats->distortion;
   3177    *sse1 = start_mv_stats->sse;
   3178  } else {
   3179    besterr =
   3180        setup_center_error_facade(xd, cm, bestmv, var_params, mv_cost_params,
   3181                                  sse1, distortion, is_scaled);
   3182  }
   3183 
   3184  // If forced_stop is FULL_PEL, return.
   3185  if (forced_stop == FULL_PEL) return besterr;
   3186 
   3187  if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
   3188    return INT_MAX;
   3189  }
   3190  iter++;
   3191 
   3192  if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
   3193      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
   3194      cost_list[4] != INT_MAX) {
   3195    const unsigned int whichdir = (cost_list[1] < cost_list[3] ? 0 : 1) +
   3196                                  (cost_list[2] < cost_list[4] ? 0 : 2);
   3197 
   3198    const MV left_mv = { start_mv.row, start_mv.col - hstep };
   3199    const MV right_mv = { start_mv.row, start_mv.col + hstep };
   3200    const MV bottom_mv = { start_mv.row + hstep, start_mv.col };
   3201    const MV top_mv = { start_mv.row - hstep, start_mv.col };
   3202 
   3203    const MV bottom_left_mv = { start_mv.row + hstep, start_mv.col - hstep };
   3204    const MV bottom_right_mv = { start_mv.row + hstep, start_mv.col + hstep };
   3205    const MV top_left_mv = { start_mv.row - hstep, start_mv.col - hstep };
   3206    const MV top_right_mv = { start_mv.row - hstep, start_mv.col + hstep };
   3207 
   3208    int dummy = 0;
   3209 
   3210    switch (whichdir) {
   3211      case 0:  // bottom left quadrant
   3212        check_better_fast(xd, cm, &left_mv, bestmv, mv_limits, var_params,
   3213                          mv_cost_params, &besterr, sse1, distortion, &dummy,
   3214                          is_scaled);
   3215        check_better_fast(xd, cm, &bottom_mv, bestmv, mv_limits, var_params,
   3216                          mv_cost_params, &besterr, sse1, distortion, &dummy,
   3217                          is_scaled);
   3218        check_better_fast(xd, cm, &bottom_left_mv, bestmv, mv_limits,
   3219                          var_params, mv_cost_params, &besterr, sse1,
   3220                          distortion, &dummy, is_scaled);
   3221        break;
   3222      case 1:  // bottom right quadrant
   3223        check_better_fast(xd, cm, &right_mv, bestmv, mv_limits, var_params,
   3224                          mv_cost_params, &besterr, sse1, distortion, &dummy,
   3225                          is_scaled);
   3226        check_better_fast(xd, cm, &bottom_mv, bestmv, mv_limits, var_params,
   3227                          mv_cost_params, &besterr, sse1, distortion, &dummy,
   3228                          is_scaled);
   3229        check_better_fast(xd, cm, &bottom_right_mv, bestmv, mv_limits,
   3230                          var_params, mv_cost_params, &besterr, sse1,
   3231                          distortion, &dummy, is_scaled);
   3232        break;
   3233      case 2:  // top left quadrant
   3234        check_better_fast(xd, cm, &left_mv, bestmv, mv_limits, var_params,
   3235                          mv_cost_params, &besterr, sse1, distortion, &dummy,
   3236                          is_scaled);
   3237        check_better_fast(xd, cm, &top_mv, bestmv, mv_limits, var_params,
   3238                          mv_cost_params, &besterr, sse1, distortion, &dummy,
   3239                          is_scaled);
   3240        check_better_fast(xd, cm, &top_left_mv, bestmv, mv_limits, var_params,
   3241                          mv_cost_params, &besterr, sse1, distortion, &dummy,
   3242                          is_scaled);
   3243        break;
   3244      case 3:  // top right quadrant
   3245        check_better_fast(xd, cm, &right_mv, bestmv, mv_limits, var_params,
   3246                          mv_cost_params, &besterr, sse1, distortion, &dummy,
   3247                          is_scaled);
   3248        check_better_fast(xd, cm, &top_mv, bestmv, mv_limits, var_params,
   3249                          mv_cost_params, &besterr, sse1, distortion, &dummy,
   3250                          is_scaled);
   3251        check_better_fast(xd, cm, &top_right_mv, bestmv, mv_limits, var_params,
   3252                          mv_cost_params, &besterr, sse1, distortion, &dummy,
   3253                          is_scaled);
   3254        break;
   3255    }
   3256  } else {
   3257    two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
   3258                          var_params, mv_cost_params, &besterr, sse1,
   3259                          distortion, iters_per_step, is_scaled);
   3260  }
   3261 
   3262  // Each subsequent iteration checks at least one point in common with
   3263  // the last iteration could be 2 ( if diag selected) 1/4 pel
   3264  if (forced_stop < HALF_PEL) {
   3265    if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
   3266      return INT_MAX;
   3267    }
   3268    iter++;
   3269 
   3270    hstep >>= 1;
   3271    start_mv = *bestmv;
   3272    two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
   3273                          var_params, mv_cost_params, &besterr, sse1,
   3274                          distortion, iters_per_step, is_scaled);
   3275  }
   3276 
   3277  if (allow_hp && forced_stop == EIGHTH_PEL) {
   3278    if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
   3279      return INT_MAX;
   3280    }
   3281    iter++;
   3282 
   3283    hstep >>= 1;
   3284    start_mv = *bestmv;
   3285    two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
   3286                          var_params, mv_cost_params, &besterr, sse1,
   3287                          distortion, iters_per_step, is_scaled);
   3288  }
   3289 
   3290  return besterr;
   3291 }
   3292 
   3293 int av1_find_best_sub_pixel_tree(MACROBLOCKD *xd, const AV1_COMMON *const cm,
   3294                                 const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
   3295                                 MV start_mv,
   3296                                 const FULLPEL_MV_STATS *start_mv_stats,
   3297                                 MV *bestmv, int *distortion,
   3298                                 unsigned int *sse1,
   3299                                 int_mv *last_mv_search_list) {
   3300  (void)start_mv_stats;
   3301  const int allow_hp = ms_params->allow_hp;
   3302  const int forced_stop = ms_params->forced_stop;
   3303  const int iters_per_step = ms_params->iters_per_step;
   3304  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
   3305  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
   3306  const SUBPEL_SEARCH_TYPE subpel_search_type =
   3307      ms_params->var_params.subpel_search_type;
   3308  const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
   3309 
   3310  // How many steps to take. A round of 0 means fullpel search only, 1 means
   3311  // half-pel, and so on.
   3312  const int round = AOMMIN(FULL_PEL - forced_stop, 3 - !allow_hp);
   3313  int hstep = INIT_SUBPEL_STEP_SIZE;  // Step size, initialized to 4/8=1/2 pel
   3314 
   3315  unsigned int besterr = INT_MAX;
   3316 
   3317  *bestmv = start_mv;
   3318 
   3319  const struct scale_factors *const sf = is_intrabc_block(xd->mi[0])
   3320                                             ? &cm->sf_identity
   3321                                             : xd->block_ref_scale_factors[0];
   3322  const int is_scaled = av1_is_scaled(sf);
   3323 
   3324  if (start_mv_stats != NULL && !is_scaled) {
   3325    besterr = start_mv_stats->distortion + start_mv_stats->err_cost;
   3326    *distortion = start_mv_stats->distortion;
   3327    *sse1 = start_mv_stats->sse;
   3328  } else {
   3329    if (subpel_search_type != USE_2_TAPS_ORIG) {
   3330      besterr = upsampled_setup_center_error(xd, cm, bestmv, var_params,
   3331                                             mv_cost_params, sse1, distortion);
   3332    } else {
   3333      besterr = setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1,
   3334                                   distortion);
   3335    }
   3336  }
   3337 
   3338  // If forced_stop is FULL_PEL, return.
   3339  if (!round) return besterr;
   3340 
   3341  for (int iter = 0; iter < round; ++iter) {
   3342    MV iter_center_mv = *bestmv;
   3343    if (check_repeated_mv_and_update(last_mv_search_list, iter_center_mv,
   3344                                     iter)) {
   3345      return INT_MAX;
   3346    }
   3347 
   3348    MV diag_step;
   3349    if (subpel_search_type != USE_2_TAPS_ORIG) {
   3350      diag_step = first_level_check(xd, cm, iter_center_mv, bestmv, hstep,
   3351                                    mv_limits, var_params, mv_cost_params,
   3352                                    &besterr, sse1, distortion);
   3353    } else {
   3354      diag_step = first_level_check_fast(xd, cm, iter_center_mv, bestmv, hstep,
   3355                                         mv_limits, var_params, mv_cost_params,
   3356                                         &besterr, sse1, distortion, is_scaled);
   3357    }
   3358 
   3359    // Check diagonal sub-pixel position
   3360    if (!CHECK_MV_EQUAL(iter_center_mv, *bestmv) && iters_per_step > 1) {
   3361      second_level_check_v2(xd, cm, iter_center_mv, diag_step, bestmv,
   3362                            mv_limits, var_params, mv_cost_params, &besterr,
   3363                            sse1, distortion, is_scaled);
   3364    }
   3365 
   3366    hstep >>= 1;
   3367  }
   3368 
   3369  return besterr;
   3370 }
   3371 
   3372 // Note(yunqingwang): The following 2 functions are only used in the motion
   3373 // vector unit test, which return extreme motion vectors allowed by the MV
   3374 // limits.
   3375 // Returns the maximum MV.
   3376 int av1_return_max_sub_pixel_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm,
   3377                                const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
   3378                                MV start_mv,
   3379                                const FULLPEL_MV_STATS *start_mv_stats,
   3380                                MV *bestmv, int *distortion, unsigned int *sse1,
   3381                                int_mv *last_mv_search_list) {
   3382  (void)xd;
   3383  (void)cm;
   3384  (void)start_mv;
   3385  (void)start_mv_stats;
   3386  (void)last_mv_search_list;
   3387 
   3388  const int allow_hp = ms_params->allow_hp;
   3389  const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
   3390 
   3391  bestmv->row = mv_limits->row_max;
   3392  bestmv->col = mv_limits->col_max;
   3393 
   3394  unsigned int besterr = 0;
   3395 
   3396  // In the sub-pel motion search, if hp is not used, then the last bit of mv
   3397  // has to be 0.
   3398  lower_mv_precision(bestmv, allow_hp, 0);
   3399  *distortion = besterr;
   3400  *sse1 = besterr;
   3401  return besterr;
   3402 }
   3403 
   3404 // Returns the minimum MV.
   3405 int av1_return_min_sub_pixel_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm,
   3406                                const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
   3407                                MV start_mv,
   3408                                const FULLPEL_MV_STATS *start_mv_stats,
   3409                                MV *bestmv, int *distortion, unsigned int *sse1,
   3410                                int_mv *last_mv_search_list) {
   3411  (void)xd;
   3412  (void)cm;
   3413  (void)start_mv;
   3414  (void)start_mv_stats;
   3415  (void)last_mv_search_list;
   3416 
   3417  const int allow_hp = ms_params->allow_hp;
   3418  const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
   3419 
   3420  bestmv->row = mv_limits->row_min;
   3421  bestmv->col = mv_limits->col_min;
   3422 
   3423  unsigned int besterr = 0;
   3424  // In the sub-pel motion search, if hp is not used, then the last bit of mv
   3425  // has to be 0.
   3426  lower_mv_precision(bestmv, allow_hp, 0);
   3427  *distortion = besterr;
   3428  *sse1 = besterr;
   3429  return besterr;
   3430 }
   3431 
   3432 #if !CONFIG_REALTIME_ONLY
   3433 // Computes the cost of the current predictor by going through the whole
   3434 // av1_enc_build_inter_predictor pipeline. This is mainly used by warped mv
   3435 // during motion_mode_rd. We are going through the whole
   3436 // av1_enc_build_inter_predictor because we might have changed the interpolation
   3437 // filter, etc before motion_mode_rd is called.
   3438 static inline unsigned int compute_motion_cost(
   3439    MACROBLOCKD *xd, const AV1_COMMON *const cm,
   3440    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, BLOCK_SIZE bsize,
   3441    const MV *this_mv) {
   3442  unsigned int mse;
   3443  unsigned int sse;
   3444  const int mi_row = xd->mi_row;
   3445  const int mi_col = xd->mi_col;
   3446 
   3447  av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
   3448                                AOM_PLANE_Y, AOM_PLANE_Y);
   3449 
   3450  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
   3451  const MSBuffers *ms_buffers = &var_params->ms_buffers;
   3452 
   3453  const uint8_t *const src = ms_buffers->src->buf;
   3454  const int src_stride = ms_buffers->src->stride;
   3455  const uint8_t *const dst = xd->plane[0].dst.buf;
   3456  const int dst_stride = xd->plane[0].dst.stride;
   3457  const aom_variance_fn_ptr_t *vfp = ms_params->var_params.vfp;
   3458 
   3459  mse = vfp->vf(dst, dst_stride, src, src_stride, &sse);
   3460  mse += mv_err_cost_(this_mv, &ms_params->mv_cost_params);
   3461  return mse;
   3462 }
   3463 
   3464 // Refines MV in a small range
   3465 
   3466 // Macros to build bitmasks which help us avoid redundant computations
   3467 //
   3468 // To explain the idea here, imagine that on the first iteration of the
   3469 // loop below, we step rightwards. Then, on the second iteration, the neighbors
   3470 // to consider are:
   3471 //     . . .
   3472 //     0 1 .
   3473 //     . . .
   3474 // Where 0 is the initial search point, 1 is the best candidate found in the
   3475 // first iteration, and the dots are the other neighbors of point 1.
   3476 //
   3477 // Naively, we would now need to scan all 8 neighbors of point 1 (point 0 and
   3478 // the seven points marked with dots), and compare them to see where to move
   3479 // next. However, we already evaluated 5 of those 8 neighbors in the last
   3480 // iteration, and decided that they are worse than point 1. So we don't need
   3481 // to re-consider these points. We only really need to consider the three
   3482 // points which are adjacent to point 1 but *not* to point 0.
   3483 //
   3484 // As the algorithm goes on, there are other ways that redundant evaluations
   3485 // can happen, if the search path curls back around on itself.
   3486 //
   3487 // To avoid all possible redundancies, we'd have to build a set containing
   3488 // every point we have already checked, and this would be quite expensive.
   3489 //
   3490 // So instead, we apply a 95%-effective solution with a much lower overhead:
   3491 // we prune out the points which were considered during the previous
   3492 // iteration, but we don't worry about any prior iteration. This can be done
   3493 // as follows:
   3494 //
   3495 // We build a static table, called neighbor_mask, which answers the question
   3496 // "if we moved in direction X last time, which neighbors are new, and which
   3497 //  were scanned last iteration?"
   3498 // Then we can query this table to quickly determine which points we need to
   3499 // evaluate, and which we can skip.
   3500 //
   3501 // To query the table, the logic is simply:
   3502 // neighbor_mask[i] & (1 << j) == "if we moved in direction i last iteration,
   3503 //                             do we need to scan neighbor j this iteration?"
   3504 #define NEIGHBOR_MASK_DIA(left, down, right, up) \
   3505  (left | (down << 1) | (right << 2) | (up << 3))
   3506 
   3507 #define NEIGHBOR_MASK_SQR(left, down, right, up, down_left, down_right, \
   3508                          up_left, up_right)                            \
   3509  (left | (down << 1) | (right << 2) | (up << 3) | (down_left << 4) |   \
   3510   (down_right << 5) | (up_left << 6) | (up_right << 7))
   3511 
   3512 static const warp_search_config warp_search_info[WARP_SEARCH_METHODS] = {
   3513  // WARP_SEARCH_DIAMOND
   3514  {
   3515    .num_neighbors = 4,
   3516    .neighbors = { {  0, -1 }, {  1,  0 }, {  0,  1 }, { -1,  0 } },
   3517    .neighbor_mask = {
   3518      // If we stepped left last time, consider all points except right
   3519      NEIGHBOR_MASK_DIA(1, 1, 0, 1),
   3520      // If we stepped down last time, consider all points except up
   3521      NEIGHBOR_MASK_DIA(1, 1, 1, 0),
   3522      // Stepped right last time
   3523      NEIGHBOR_MASK_DIA(0, 1, 1, 1),
   3524      // Stepped up last time
   3525      NEIGHBOR_MASK_DIA(1, 0, 1, 1),
   3526    },
   3527  },
   3528  // WARP_SEARCH_SQUARE
   3529  {
   3530    .num_neighbors = 8,
   3531    .neighbors = { {  0, -1 }, {  1,  0 }, {  0,  1 }, { -1,  0 },
   3532                   {  1, -1 }, {  1,  1 }, { -1, -1 }, { -1,  1 } },
   3533    .neighbor_mask = {
   3534      // If we stepped left last time, then we only need to consider 3 points:
   3535      // left, down+left, up+left
   3536      NEIGHBOR_MASK_SQR(1, 0, 0, 0, 1, 0, 1, 0),
   3537      // If we stepped down last time, then we only need to consider 3 points:
   3538      // down, down+left, down+right
   3539      NEIGHBOR_MASK_SQR(0, 1, 0, 0, 1, 1, 0, 0),
   3540      // Stepped right last time
   3541      NEIGHBOR_MASK_SQR(0, 0, 1, 0, 0, 1, 0, 1),
   3542      // Stepped up last time
   3543      NEIGHBOR_MASK_SQR(0, 0, 0, 1, 0, 0, 1, 1),
   3544 
   3545      // If we stepped down+left last time, then we need to consider 5 points:
   3546      // left, down, down+left, down+right, up+left
   3547      NEIGHBOR_MASK_SQR(1, 1, 0, 0, 1, 1, 1, 0),
   3548      // Stepped down+right last time
   3549      NEIGHBOR_MASK_SQR(0, 1, 1, 0, 1, 1, 0, 1),
   3550      // Stepped up+left last time
   3551      NEIGHBOR_MASK_SQR(1, 0, 0, 1, 1, 0, 1, 1),
   3552      // Stepped up+right last time
   3553      NEIGHBOR_MASK_SQR(0, 0, 1, 1, 0, 1, 1, 1),
   3554    },
   3555  },
   3556 };
   3557 
   3558 unsigned int av1_refine_warped_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm,
   3559                                  const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
   3560                                  BLOCK_SIZE bsize, const int *pts0,
   3561                                  const int *pts_inref0, int total_samples,
   3562                                  WARP_SEARCH_METHOD search_method,
   3563                                  int num_iterations) {
   3564  MB_MODE_INFO *mbmi = xd->mi[0];
   3565 
   3566  const MV *neighbors = warp_search_info[search_method].neighbors;
   3567  const int num_neighbors = warp_search_info[search_method].num_neighbors;
   3568  const uint8_t *neighbor_mask = warp_search_info[search_method].neighbor_mask;
   3569 
   3570  MV *best_mv = &mbmi->mv[0].as_mv;
   3571 
   3572  WarpedMotionParams best_wm_params = mbmi->wm_params;
   3573  int best_num_proj_ref = mbmi->num_proj_ref;
   3574  unsigned int bestmse;
   3575  const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
   3576 
   3577  const int mv_shift = ms_params->allow_hp ? 0 : 1;
   3578 
   3579  // Calculate the center position's error
   3580  assert(av1_is_subpelmv_in_range(mv_limits, *best_mv));
   3581  bestmse = compute_motion_cost(xd, cm, ms_params, bsize, best_mv);
   3582 
   3583  // MV search
   3584  int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
   3585  const int mi_row = xd->mi_row;
   3586  const int mi_col = xd->mi_col;
   3587 
   3588  // First step always scans all neighbors
   3589  uint8_t valid_neighbors = UINT8_MAX;
   3590 
   3591  for (int ite = 0; ite < num_iterations; ++ite) {
   3592    int best_idx = -1;
   3593 
   3594    for (int idx = 0; idx < num_neighbors; ++idx) {
   3595      if ((valid_neighbors & (1 << idx)) == 0) {
   3596        continue;
   3597      }
   3598 
   3599      unsigned int thismse;
   3600 
   3601      MV this_mv = { best_mv->row + neighbors[idx].row * (1 << mv_shift),
   3602                     best_mv->col + neighbors[idx].col * (1 << mv_shift) };
   3603      if (av1_is_subpelmv_in_range(mv_limits, this_mv)) {
   3604        memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
   3605        memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
   3606        if (total_samples > 1) {
   3607          mbmi->num_proj_ref =
   3608              av1_selectSamples(&this_mv, pts, pts_inref, total_samples, bsize);
   3609        }
   3610 
   3611        if (!av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize,
   3612                                 this_mv.row, this_mv.col, &mbmi->wm_params,
   3613                                 mi_row, mi_col)) {
   3614          thismse = compute_motion_cost(xd, cm, ms_params, bsize, &this_mv);
   3615 
   3616          if (thismse < bestmse) {
   3617            best_idx = idx;
   3618            best_wm_params = mbmi->wm_params;
   3619            best_num_proj_ref = mbmi->num_proj_ref;
   3620            bestmse = thismse;
   3621          }
   3622        }
   3623      }
   3624    }
   3625 
   3626    if (best_idx == -1) break;
   3627 
   3628    if (best_idx >= 0) {
   3629      best_mv->row += neighbors[best_idx].row * (1 << mv_shift);
   3630      best_mv->col += neighbors[best_idx].col * (1 << mv_shift);
   3631      valid_neighbors = neighbor_mask[best_idx];
   3632    }
   3633  }
   3634 
   3635  mbmi->wm_params = best_wm_params;
   3636  mbmi->num_proj_ref = best_num_proj_ref;
   3637  return bestmse;
   3638 }
   3639 
   3640 #endif  // !CONFIG_REALTIME_ONLY
   3641 // =============================================================================
   3642 //  Subpixel Motion Search: OBMC
   3643 // =============================================================================
   3644 // Estimates the variance of prediction residue
   3645 static inline int estimate_obmc_pref_error(
   3646    const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
   3647    unsigned int *sse) {
   3648  const aom_variance_fn_ptr_t *vfp = var_params->vfp;
   3649 
   3650  const MSBuffers *ms_buffers = &var_params->ms_buffers;
   3651  const int32_t *src = ms_buffers->wsrc;
   3652  const int32_t *mask = ms_buffers->obmc_mask;
   3653  const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv);
   3654  const int ref_stride = ms_buffers->ref->stride;
   3655 
   3656  const int subpel_x_q3 = get_subpel_part(this_mv->col);
   3657  const int subpel_y_q3 = get_subpel_part(this_mv->row);
   3658 
   3659  return vfp->osvf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, mask, sse);
   3660 }
   3661 
   3662 // Calculates the variance of prediction residue
   3663 static int upsampled_obmc_pref_error(MACROBLOCKD *xd, const AV1_COMMON *cm,
   3664                                     const MV *this_mv,
   3665                                     const SUBPEL_SEARCH_VAR_PARAMS *var_params,
   3666                                     unsigned int *sse) {
   3667  const aom_variance_fn_ptr_t *vfp = var_params->vfp;
   3668  const SUBPEL_SEARCH_TYPE subpel_search_type = var_params->subpel_search_type;
   3669  const int w = var_params->w;
   3670  const int h = var_params->h;
   3671 
   3672  const MSBuffers *ms_buffers = &var_params->ms_buffers;
   3673  const int32_t *wsrc = ms_buffers->wsrc;
   3674  const int32_t *mask = ms_buffers->obmc_mask;
   3675  const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv);
   3676  const int ref_stride = ms_buffers->ref->stride;
   3677 
   3678  const int subpel_x_q3 = get_subpel_part(this_mv->col);
   3679  const int subpel_y_q3 = get_subpel_part(this_mv->row);
   3680 
   3681  const int mi_row = xd->mi_row;
   3682  const int mi_col = xd->mi_col;
   3683 
   3684  unsigned int besterr;
   3685  DECLARE_ALIGNED(16, uint8_t, pred[2 * MAX_SB_SQUARE]);
   3686 #if CONFIG_AV1_HIGHBITDEPTH
   3687  if (is_cur_buf_hbd(xd)) {
   3688    uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred);
   3689    aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred8, w, h,
   3690                              subpel_x_q3, subpel_y_q3, ref, ref_stride, xd->bd,
   3691                              subpel_search_type);
   3692    besterr = vfp->ovf(pred8, w, wsrc, mask, sse);
   3693  } else {
   3694    aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3,
   3695                       subpel_y_q3, ref, ref_stride, subpel_search_type);
   3696 
   3697    besterr = vfp->ovf(pred, w, wsrc, mask, sse);
   3698  }
   3699 #else
   3700  aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3,
   3701                     subpel_y_q3, ref, ref_stride, subpel_search_type);
   3702 
   3703  besterr = vfp->ovf(pred, w, wsrc, mask, sse);
   3704 #endif
   3705  return besterr;
   3706 }
   3707 
   3708 static unsigned int setup_obmc_center_error(
   3709    const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
   3710    const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) {
   3711  // TODO(chiyotsai@google.com): There might be a bug here where we didn't use
   3712  // get_buf_from_mv(ref, *this_mv).
   3713  const MSBuffers *ms_buffers = &var_params->ms_buffers;
   3714  const int32_t *wsrc = ms_buffers->wsrc;
   3715  const int32_t *mask = ms_buffers->obmc_mask;
   3716  const uint8_t *ref = ms_buffers->ref->buf;
   3717  const int ref_stride = ms_buffers->ref->stride;
   3718  unsigned int besterr =
   3719      var_params->vfp->ovf(ref, ref_stride, wsrc, mask, sse1);
   3720  *distortion = besterr;
   3721  besterr += mv_err_cost_(this_mv, mv_cost_params);
   3722  return besterr;
   3723 }
   3724 
   3725 static unsigned int upsampled_setup_obmc_center_error(
   3726    MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV *this_mv,
   3727    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
   3728    const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) {
   3729  unsigned int besterr =
   3730      upsampled_obmc_pref_error(xd, cm, this_mv, var_params, sse1);
   3731  *distortion = besterr;
   3732  besterr += mv_err_cost_(this_mv, mv_cost_params);
   3733  return besterr;
   3734 }
   3735 
   3736 // Estimates the variance of prediction residue
   3737 // TODO(chiyotsai@google.com): the cost does does not match the cost in
   3738 // mv_cost_. Investigate this later.
   3739 static inline int estimate_obmc_mvcost(const MV *this_mv,
   3740                                       const MV_COST_PARAMS *mv_cost_params) {
   3741  const MV *ref_mv = mv_cost_params->ref_mv;
   3742  const int *mvjcost = mv_cost_params->mvjcost;
   3743  const int *const *mvcost = mv_cost_params->mvcost;
   3744  const int error_per_bit = mv_cost_params->error_per_bit;
   3745  const MV_COST_TYPE mv_cost_type = mv_cost_params->mv_cost_type;
   3746  const MV diff_mv = { GET_MV_SUBPEL(this_mv->row - ref_mv->row),
   3747                       GET_MV_SUBPEL(this_mv->col - ref_mv->col) };
   3748 
   3749  switch (mv_cost_type) {
   3750    case MV_COST_ENTROPY:
   3751      return (unsigned)((mv_cost(&diff_mv, mvjcost,
   3752                                 CONVERT_TO_CONST_MVCOST(mvcost)) *
   3753                             error_per_bit +
   3754                         4096) >>
   3755                        13);
   3756    case MV_COST_NONE: return 0;
   3757    default:
   3758      assert(0 && "L1 norm is not tuned for estimated obmc mvcost");
   3759      return 0;
   3760  }
   3761 }
   3762 
   3763 // Estimates whether this_mv is better than best_mv. This function incorporates
   3764 // both prediction error and residue into account.
   3765 static inline unsigned int obmc_check_better_fast(
   3766    const MV *this_mv, MV *best_mv, const SubpelMvLimits *mv_limits,
   3767    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
   3768    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
   3769    unsigned int *sse1, int *distortion, int *has_better_mv) {
   3770  unsigned int cost;
   3771  if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) {
   3772    unsigned int sse;
   3773    const int thismse = estimate_obmc_pref_error(this_mv, var_params, &sse);
   3774 
   3775    cost = estimate_obmc_mvcost(this_mv, mv_cost_params);
   3776    cost += thismse;
   3777 
   3778    if (cost < *besterr) {
   3779      *besterr = cost;
   3780      *best_mv = *this_mv;
   3781      *distortion = thismse;
   3782      *sse1 = sse;
   3783      *has_better_mv |= 1;
   3784    }
   3785  } else {
   3786    cost = INT_MAX;
   3787  }
   3788  return cost;
   3789 }
   3790 
   3791 // Estimates whether this_mv is better than best_mv. This function incorporates
   3792 // both prediction error and residue into account.
   3793 static inline unsigned int obmc_check_better(
   3794    MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv,
   3795    const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
   3796    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
   3797    unsigned int *sse1, int *distortion, int *has_better_mv) {
   3798  unsigned int cost;
   3799  if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) {
   3800    unsigned int sse;
   3801    const int thismse =
   3802        upsampled_obmc_pref_error(xd, cm, this_mv, var_params, &sse);
   3803    cost = mv_err_cost_(this_mv, mv_cost_params);
   3804 
   3805    cost += thismse;
   3806 
   3807    if (cost < *besterr) {
   3808      *besterr = cost;
   3809      *best_mv = *this_mv;
   3810      *distortion = thismse;
   3811      *sse1 = sse;
   3812      *has_better_mv |= 1;
   3813    }
   3814  } else {
   3815    cost = INT_MAX;
   3816  }
   3817  return cost;
   3818 }
   3819 
   3820 static AOM_FORCE_INLINE MV obmc_first_level_check(
   3821    MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, MV *best_mv,
   3822    const int hstep, const SubpelMvLimits *mv_limits,
   3823    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
   3824    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
   3825    unsigned int *sse1, int *distortion) {
   3826  int dummy = 0;
   3827  const MV left_mv = { this_mv.row, this_mv.col - hstep };
   3828  const MV right_mv = { this_mv.row, this_mv.col + hstep };
   3829  const MV top_mv = { this_mv.row - hstep, this_mv.col };
   3830  const MV bottom_mv = { this_mv.row + hstep, this_mv.col };
   3831 
   3832  if (var_params->subpel_search_type != USE_2_TAPS_ORIG) {
   3833    const unsigned int left =
   3834        obmc_check_better(xd, cm, &left_mv, best_mv, mv_limits, var_params,
   3835                          mv_cost_params, besterr, sse1, distortion, &dummy);
   3836    const unsigned int right =
   3837        obmc_check_better(xd, cm, &right_mv, best_mv, mv_limits, var_params,
   3838                          mv_cost_params, besterr, sse1, distortion, &dummy);
   3839    const unsigned int up =
   3840        obmc_check_better(xd, cm, &top_mv, best_mv, mv_limits, var_params,
   3841                          mv_cost_params, besterr, sse1, distortion, &dummy);
   3842    const unsigned int down =
   3843        obmc_check_better(xd, cm, &bottom_mv, best_mv, mv_limits, var_params,
   3844                          mv_cost_params, besterr, sse1, distortion, &dummy);
   3845 
   3846    const MV diag_step = get_best_diag_step(hstep, left, right, up, down);
   3847    const MV diag_mv = { this_mv.row + diag_step.row,
   3848                         this_mv.col + diag_step.col };
   3849 
   3850    // Check the diagonal direction with the best mv
   3851    obmc_check_better(xd, cm, &diag_mv, best_mv, mv_limits, var_params,
   3852                      mv_cost_params, besterr, sse1, distortion, &dummy);
   3853 
   3854    return diag_step;
   3855  } else {
   3856    const unsigned int left = obmc_check_better_fast(
   3857        &left_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1,
   3858        distortion, &dummy);
   3859    const unsigned int right = obmc_check_better_fast(
   3860        &right_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr,
   3861        sse1, distortion, &dummy);
   3862 
   3863    const unsigned int up = obmc_check_better_fast(
   3864        &top_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1,
   3865        distortion, &dummy);
   3866 
   3867    const unsigned int down = obmc_check_better_fast(
   3868        &bottom_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr,
   3869        sse1, distortion, &dummy);
   3870 
   3871    const MV diag_step = get_best_diag_step(hstep, left, right, up, down);
   3872    const MV diag_mv = { this_mv.row + diag_step.row,
   3873                         this_mv.col + diag_step.col };
   3874 
   3875    // Check the diagonal direction with the best mv
   3876    obmc_check_better_fast(&diag_mv, best_mv, mv_limits, var_params,
   3877                           mv_cost_params, besterr, sse1, distortion, &dummy);
   3878 
   3879    return diag_step;
   3880  }
   3881 }
   3882 
   3883 // A newer version of second level check for obmc that gives better quality.
   3884 static AOM_FORCE_INLINE void obmc_second_level_check_v2(
   3885    MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, MV diag_step,
   3886    MV *best_mv, const SubpelMvLimits *mv_limits,
   3887    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
   3888    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
   3889    unsigned int *sse1, int *distortion) {
   3890  assert(best_mv->row == this_mv.row + diag_step.row ||
   3891         best_mv->col == this_mv.col + diag_step.col);
   3892  if (CHECK_MV_EQUAL(this_mv, *best_mv)) {
   3893    return;
   3894  } else if (this_mv.row == best_mv->row) {
   3895    // Search away from diagonal step since diagonal search did not provide any
   3896    // improvement
   3897    diag_step.row *= -1;
   3898  } else if (this_mv.col == best_mv->col) {
   3899    diag_step.col *= -1;
   3900  }
   3901 
   3902  const MV row_bias_mv = { best_mv->row + diag_step.row, best_mv->col };
   3903  const MV col_bias_mv = { best_mv->row, best_mv->col + diag_step.col };
   3904  const MV diag_bias_mv = { best_mv->row + diag_step.row,
   3905                            best_mv->col + diag_step.col };
   3906  int has_better_mv = 0;
   3907 
   3908  if (var_params->subpel_search_type != USE_2_TAPS_ORIG) {
   3909    obmc_check_better(xd, cm, &row_bias_mv, best_mv, mv_limits, var_params,
   3910                      mv_cost_params, besterr, sse1, distortion,
   3911                      &has_better_mv);
   3912    obmc_check_better(xd, cm, &col_bias_mv, best_mv, mv_limits, var_params,
   3913                      mv_cost_params, besterr, sse1, distortion,
   3914                      &has_better_mv);
   3915 
   3916    // Do an additional search if the second iteration gives a better mv
   3917    if (has_better_mv) {
   3918      obmc_check_better(xd, cm, &diag_bias_mv, best_mv, mv_limits, var_params,
   3919                        mv_cost_params, besterr, sse1, distortion,
   3920                        &has_better_mv);
   3921    }
   3922  } else {
   3923    obmc_check_better_fast(&row_bias_mv, best_mv, mv_limits, var_params,
   3924                           mv_cost_params, besterr, sse1, distortion,
   3925                           &has_better_mv);
   3926    obmc_check_better_fast(&col_bias_mv, best_mv, mv_limits, var_params,
   3927                           mv_cost_params, besterr, sse1, distortion,
   3928                           &has_better_mv);
   3929 
   3930    // Do an additional search if the second iteration gives a better mv
   3931    if (has_better_mv) {
   3932      obmc_check_better_fast(&diag_bias_mv, best_mv, mv_limits, var_params,
   3933                             mv_cost_params, besterr, sse1, distortion,
   3934                             &has_better_mv);
   3935    }
   3936  }
   3937 }
   3938 
   3939 int av1_find_best_obmc_sub_pixel_tree_up(
   3940    MACROBLOCKD *xd, const AV1_COMMON *const cm,
   3941    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv,
   3942    const FULLPEL_MV_STATS *start_mv_stats, MV *bestmv, int *distortion,
   3943    unsigned int *sse1, int_mv *last_mv_search_list) {
   3944  (void)last_mv_search_list;
   3945  (void)start_mv_stats;
   3946  const int allow_hp = ms_params->allow_hp;
   3947  const int forced_stop = ms_params->forced_stop;
   3948  const int iters_per_step = ms_params->iters_per_step;
   3949  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
   3950  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
   3951  const SUBPEL_SEARCH_TYPE subpel_search_type =
   3952      ms_params->var_params.subpel_search_type;
   3953  const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
   3954 
   3955  int hstep = INIT_SUBPEL_STEP_SIZE;
   3956  const int round = AOMMIN(FULL_PEL - forced_stop, 3 - !allow_hp);
   3957 
   3958  unsigned int besterr = INT_MAX;
   3959  *bestmv = start_mv;
   3960 
   3961  if (subpel_search_type != USE_2_TAPS_ORIG)
   3962    besterr = upsampled_setup_obmc_center_error(
   3963        xd, cm, bestmv, var_params, mv_cost_params, sse1, distortion);
   3964  else
   3965    besterr = setup_obmc_center_error(bestmv, var_params, mv_cost_params, sse1,
   3966                                      distortion);
   3967 
   3968  for (int iter = 0; iter < round; ++iter) {
   3969    MV iter_center_mv = *bestmv;
   3970    MV diag_step = obmc_first_level_check(xd, cm, iter_center_mv, bestmv, hstep,
   3971                                          mv_limits, var_params, mv_cost_params,
   3972                                          &besterr, sse1, distortion);
   3973 
   3974    if (!CHECK_MV_EQUAL(iter_center_mv, *bestmv) && iters_per_step > 1) {
   3975      obmc_second_level_check_v2(xd, cm, iter_center_mv, diag_step, bestmv,
   3976                                 mv_limits, var_params, mv_cost_params,
   3977                                 &besterr, sse1, distortion);
   3978    }
   3979    hstep >>= 1;
   3980  }
   3981 
   3982  return besterr;
   3983 }
   3984 
   3985 // =============================================================================
   3986 //  Public cost function: mv_cost + pred error
   3987 // =============================================================================
   3988 int av1_get_mvpred_sse(const MV_COST_PARAMS *mv_cost_params,
   3989                       const FULLPEL_MV best_mv,
   3990                       const aom_variance_fn_ptr_t *vfp,
   3991                       const struct buf_2d *src, const struct buf_2d *pre) {
   3992  const MV mv = get_mv_from_fullmv(&best_mv);
   3993  unsigned int sse, var;
   3994 
   3995  var = vfp->vf(src->buf, src->stride, get_buf_from_fullmv(pre, &best_mv),
   3996                pre->stride, &sse);
   3997  (void)var;
   3998 
   3999  return sse + mv_err_cost_(&mv, mv_cost_params);
   4000 }