tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

convolve.c (63089B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <assert.h>
     13 #include <string.h>
     14 
     15 #include "config/aom_dsp_rtcd.h"
     16 #include "config/av1_rtcd.h"
     17 
     18 #include "av1/common/av1_common_int.h"
     19 #include "av1/common/blockd.h"
     20 #include "av1/common/convolve.h"
     21 #include "av1/common/filter.h"
     22 #include "av1/common/resize.h"
     23 #include "aom_dsp/aom_dsp_common.h"
     24 #include "aom_ports/mem.h"
     25 
     26 void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst,
     27                             int dst_stride, int w, int h,
     28                             const int16_t *x_filters, int x0_qn,
     29                             int x_step_qn) {
     30  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
     31  for (int y = 0; y < h; ++y) {
     32    int x_qn = x0_qn;
     33    for (int x = 0; x < w; ++x) {
     34      const uint8_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
     35      const int x_filter_idx =
     36          (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
     37      assert(x_filter_idx <= RS_SUBPEL_MASK);
     38      const int16_t *const x_filter =
     39          &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
     40      int sum = 0;
     41      for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
     42        sum += src_x[k] * x_filter[k];
     43      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
     44      x_qn += x_step_qn;
     45    }
     46    src += src_stride;
     47    dst += dst_stride;
     48  }
     49 }
     50 
     51 #if CONFIG_AV1_HIGHBITDEPTH
     52 void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride,
     53                                    uint16_t *dst, int dst_stride, int w, int h,
     54                                    const int16_t *x_filters, int x0_qn,
     55                                    int x_step_qn, int bd) {
     56  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
     57  for (int y = 0; y < h; ++y) {
     58    int x_qn = x0_qn;
     59    for (int x = 0; x < w; ++x) {
     60      const uint16_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
     61      const int x_filter_idx =
     62          (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
     63      assert(x_filter_idx <= RS_SUBPEL_MASK);
     64      const int16_t *const x_filter =
     65          &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
     66      int sum = 0;
     67      for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
     68        sum += src_x[k] * x_filter[k];
     69      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
     70      x_qn += x_step_qn;
     71    }
     72    src += src_stride;
     73    dst += dst_stride;
     74  }
     75 }
     76 #endif  // CONFIG_AV1_HIGHBITDEPTH
     77 
     78 void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
     79                          int dst_stride, int w, int h,
     80                          const InterpFilterParams *filter_params_x,
     81                          const InterpFilterParams *filter_params_y,
     82                          const int subpel_x_qn, const int subpel_y_qn,
     83                          ConvolveParams *conv_params) {
     84  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
     85  int im_h = h + filter_params_y->taps - 1;
     86  int im_stride = w;
     87  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
     88  const int fo_vert = filter_params_y->taps / 2 - 1;
     89  const int fo_horiz = filter_params_x->taps / 2 - 1;
     90  const int bd = 8;
     91  const int bits =
     92      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
     93 
     94  // horizontal filter
     95  const uint8_t *src_horiz = src - fo_vert * src_stride;
     96  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
     97      filter_params_x, subpel_x_qn & SUBPEL_MASK);
     98  for (int y = 0; y < im_h; ++y) {
     99    for (int x = 0; x < w; ++x) {
    100      int32_t sum = (1 << (bd + FILTER_BITS - 1));
    101      for (int k = 0; k < filter_params_x->taps; ++k) {
    102        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
    103      }
    104 
    105      // TODO(aomedia:3393): for 12-tap filter, in extreme cases, the result can
    106      // be beyond the following range. For better prediction, a clamping can be
    107      // added for 12 tap filter to ensure the horizontal filtering result is
    108      // within 16 bit. The same applies to the vertical filtering.
    109      assert(filter_params_x->taps > 8 ||
    110             (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
    111      im_block[y * im_stride + x] =
    112          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
    113    }
    114  }
    115 
    116  // vertical filter
    117  int16_t *src_vert = im_block + fo_vert * im_stride;
    118  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
    119      filter_params_y, subpel_y_qn & SUBPEL_MASK);
    120  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    121  for (int y = 0; y < h; ++y) {
    122    for (int x = 0; x < w; ++x) {
    123      int32_t sum = 1 << offset_bits;
    124      for (int k = 0; k < filter_params_y->taps; ++k) {
    125        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
    126      }
    127      assert(filter_params_y->taps > 8 ||
    128             (0 <= sum && sum < (1 << (offset_bits + 2))));
    129      int16_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
    130                    ((1 << (offset_bits - conv_params->round_1)) +
    131                     (1 << (offset_bits - conv_params->round_1 - 1)));
    132      dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));
    133    }
    134  }
    135 }
    136 
    137 void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
    138                         int dst_stride, int w, int h,
    139                         const InterpFilterParams *filter_params_y,
    140                         const int subpel_y_qn) {
    141  const int fo_vert = filter_params_y->taps / 2 - 1;
    142 
    143  // vertical filter
    144  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
    145      filter_params_y, subpel_y_qn & SUBPEL_MASK);
    146  for (int y = 0; y < h; ++y) {
    147    for (int x = 0; x < w; ++x) {
    148      int32_t res = 0;
    149      for (int k = 0; k < filter_params_y->taps; ++k) {
    150        res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
    151      }
    152      dst[y * dst_stride + x] =
    153          clip_pixel(ROUND_POWER_OF_TWO(res, FILTER_BITS));
    154    }
    155  }
    156 }
    157 
    158 void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
    159                         int dst_stride, int w, int h,
    160                         const InterpFilterParams *filter_params_x,
    161                         const int subpel_x_qn, ConvolveParams *conv_params) {
    162  const int fo_horiz = filter_params_x->taps / 2 - 1;
    163  const int bits = FILTER_BITS - conv_params->round_0;
    164 
    165  assert(bits >= 0);
    166  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
    167         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
    168 
    169  // horizontal filter
    170  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
    171      filter_params_x, subpel_x_qn & SUBPEL_MASK);
    172 
    173  for (int y = 0; y < h; ++y) {
    174    for (int x = 0; x < w; ++x) {
    175      int32_t res = 0;
    176      for (int k = 0; k < filter_params_x->taps; ++k) {
    177        res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
    178      }
    179      res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
    180      dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));
    181    }
    182  }
    183 }
    184 
    185 // This function is exactly the same as av1_convolve_2d_sr_c, and is an
    186 // optimized version for intrabc. Use the following 2-tap filter:
    187 // DECLARE_ALIGNED(256, static const int16_t,
    188 //                 av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = {
    189 //   128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    190 //   64,  64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    191 // };
    192 void av1_convolve_2d_sr_intrabc_c(const uint8_t *src, int src_stride,
    193                                  uint8_t *dst, int dst_stride, int w, int h,
    194                                  const InterpFilterParams *filter_params_x,
    195                                  const InterpFilterParams *filter_params_y,
    196                                  const int subpel_x_qn, const int subpel_y_qn,
    197                                  ConvolveParams *conv_params) {
    198  assert(subpel_x_qn == 8);
    199  assert(subpel_y_qn == 8);
    200  assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
    201  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
    202  (void)filter_params_x;
    203  (void)subpel_x_qn;
    204  (void)filter_params_y;
    205  (void)subpel_y_qn;
    206  (void)conv_params;
    207 
    208  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
    209  int im_h = h + 1;
    210  int im_stride = w;
    211  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
    212  const int bd = 8;
    213 
    214  // horizontal filter
    215  // explicitly operate for subpel_x_qn = 8.
    216  int16_t *im = im_block;
    217  for (int y = 0; y < im_h; ++y) {
    218    for (int x = 0; x < w; ++x) {
    219      const int32_t sum = (1 << bd) + src[x] + src[x + 1];
    220      assert(0 <= sum && sum < (1 << (bd + 2)));
    221      im[x] = sum;
    222    }
    223    src += src_stride;
    224    im += im_stride;
    225  }
    226 
    227  // vertical filter
    228  // explicitly operate for subpel_y_qn = 8.
    229  int16_t *src_vert = im_block;
    230  for (int y = 0; y < h; ++y) {
    231    for (int x = 0; x < w; ++x) {
    232      const int32_t sum =
    233          (1 << (bd + 2)) + src_vert[x] + src_vert[im_stride + x];
    234      assert(0 <= sum && sum < (1 << (bd + 4)));
    235      const int16_t res =
    236          ROUND_POWER_OF_TWO(sum, 2) - ((1 << bd) + (1 << (bd - 1)));
    237      dst[x] = clip_pixel(res);
    238    }
    239    src_vert += im_stride;
    240    dst += dst_stride;
    241  }
    242 }
    243 
    244 // This function is exactly the same as av1_convolve_y_sr_c, and is an
    245 // optimized version for intrabc.
    246 void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride,
    247                                 uint8_t *dst, int dst_stride, int w, int h,
    248                                 const InterpFilterParams *filter_params_y,
    249                                 const int subpel_y_qn) {
    250  assert(subpel_y_qn == 8);
    251  assert(filter_params_y->taps == 2);
    252  (void)filter_params_y;
    253  (void)subpel_y_qn;
    254 
    255  // vertical filter
    256  // explicitly operate for subpel_y_qn = 8.
    257  for (int y = 0; y < h; ++y) {
    258    for (int x = 0; x < w; ++x) {
    259      const int32_t res = src[x] + src[src_stride + x];
    260      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(res, 1));
    261    }
    262    src += src_stride;
    263    dst += dst_stride;
    264  }
    265 }
    266 
    267 // This function is exactly the same as av1_convolve_x_sr_c, and is an
    268 // optimized version for intrabc.
    269 void av1_convolve_x_sr_intrabc_c(const uint8_t *src, int src_stride,
    270                                 uint8_t *dst, int dst_stride, int w, int h,
    271                                 const InterpFilterParams *filter_params_x,
    272                                 const int subpel_x_qn,
    273                                 ConvolveParams *conv_params) {
    274  assert(subpel_x_qn == 8);
    275  assert(filter_params_x->taps == 2);
    276  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
    277  (void)filter_params_x;
    278  (void)subpel_x_qn;
    279  (void)conv_params;
    280 
    281  // horizontal filter
    282  // explicitly operate for subpel_x_qn = 8.
    283  for (int y = 0; y < h; ++y) {
    284    for (int x = 0; x < w; ++x) {
    285      const int32_t res = src[x] + src[x + 1];
    286      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(res, 1));
    287    }
    288    src += src_stride;
    289    dst += dst_stride;
    290  }
    291 }
    292 
    293 void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride,
    294                                uint8_t *dst, int dst_stride, int w, int h,
    295                                const InterpFilterParams *filter_params_x,
    296                                const InterpFilterParams *filter_params_y,
    297                                const int subpel_x_qn, const int subpel_y_qn,
    298                                ConvolveParams *conv_params) {
    299  CONV_BUF_TYPE *dst16 = conv_params->dst;
    300  int dst16_stride = conv_params->dst_stride;
    301  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
    302  int im_h = h + filter_params_y->taps - 1;
    303  int im_stride = w;
    304  const int fo_vert = filter_params_y->taps / 2 - 1;
    305  const int fo_horiz = filter_params_x->taps / 2 - 1;
    306  const int bd = 8;
    307  const int round_bits =
    308      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
    309 
    310  // horizontal filter
    311  const uint8_t *src_horiz = src - fo_vert * src_stride;
    312  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
    313      filter_params_x, subpel_x_qn & SUBPEL_MASK);
    314  for (int y = 0; y < im_h; ++y) {
    315    for (int x = 0; x < w; ++x) {
    316      int32_t sum = (1 << (bd + FILTER_BITS - 1));
    317      for (int k = 0; k < filter_params_x->taps; ++k) {
    318        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
    319      }
    320      assert(filter_params_x->taps > 8 ||
    321             (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
    322      im_block[y * im_stride + x] =
    323          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
    324    }
    325  }
    326 
    327  // vertical filter
    328  int16_t *src_vert = im_block + fo_vert * im_stride;
    329  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
    330      filter_params_y, subpel_y_qn & SUBPEL_MASK);
    331  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    332  for (int y = 0; y < h; ++y) {
    333    for (int x = 0; x < w; ++x) {
    334      int32_t sum = 1 << offset_bits;
    335      for (int k = 0; k < filter_params_y->taps; ++k) {
    336        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
    337      }
    338      assert(filter_params_y->taps > 8 ||
    339             (0 <= sum && sum < (1 << (offset_bits + 2))));
    340      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
    341      if (conv_params->do_average) {
    342        int32_t tmp = dst16[y * dst16_stride + x];
    343        if (conv_params->use_dist_wtd_comp_avg) {
    344          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
    345          tmp = tmp >> DIST_PRECISION_BITS;
    346        } else {
    347          tmp += res;
    348          tmp = tmp >> 1;
    349        }
    350        tmp -= (1 << (offset_bits - conv_params->round_1)) +
    351               (1 << (offset_bits - conv_params->round_1 - 1));
    352        dst[y * dst_stride + x] =
    353            clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
    354      } else {
    355        dst16[y * dst16_stride + x] = res;
    356      }
    357    }
    358  }
    359 }
    360 
    361 void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst,
    362                               int dst_stride, int w, int h,
    363                               const InterpFilterParams *filter_params_y,
    364                               const int subpel_y_qn,
    365                               ConvolveParams *conv_params) {
    366  CONV_BUF_TYPE *dst16 = conv_params->dst;
    367  int dst16_stride = conv_params->dst_stride;
    368  const int fo_vert = filter_params_y->taps / 2 - 1;
    369  const int bits = FILTER_BITS - conv_params->round_0;
    370  const int bd = 8;
    371  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    372  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
    373                           (1 << (offset_bits - conv_params->round_1 - 1));
    374  const int round_bits =
    375      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
    376 
    377  // vertical filter
    378  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
    379      filter_params_y, subpel_y_qn & SUBPEL_MASK);
    380  for (int y = 0; y < h; ++y) {
    381    for (int x = 0; x < w; ++x) {
    382      int32_t res = 0;
    383      for (int k = 0; k < filter_params_y->taps; ++k) {
    384        res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
    385      }
    386      res *= (1 << bits);
    387      res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset;
    388 
    389      if (conv_params->do_average) {
    390        int32_t tmp = dst16[y * dst16_stride + x];
    391        if (conv_params->use_dist_wtd_comp_avg) {
    392          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
    393          tmp = tmp >> DIST_PRECISION_BITS;
    394        } else {
    395          tmp += res;
    396          tmp = tmp >> 1;
    397        }
    398        tmp -= round_offset;
    399        dst[y * dst_stride + x] =
    400            clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
    401      } else {
    402        dst16[y * dst16_stride + x] = res;
    403      }
    404    }
    405  }
    406 }
    407 
    408 void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst,
    409                               int dst_stride, int w, int h,
    410                               const InterpFilterParams *filter_params_x,
    411                               const int subpel_x_qn,
    412                               ConvolveParams *conv_params) {
    413  CONV_BUF_TYPE *dst16 = conv_params->dst;
    414  int dst16_stride = conv_params->dst_stride;
    415  const int fo_horiz = filter_params_x->taps / 2 - 1;
    416  const int bits = FILTER_BITS - conv_params->round_1;
    417  const int bd = 8;
    418  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    419  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
    420                           (1 << (offset_bits - conv_params->round_1 - 1));
    421  const int round_bits =
    422      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
    423 
    424  // horizontal filter
    425  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
    426      filter_params_x, subpel_x_qn & SUBPEL_MASK);
    427  for (int y = 0; y < h; ++y) {
    428    for (int x = 0; x < w; ++x) {
    429      int32_t res = 0;
    430      for (int k = 0; k < filter_params_x->taps; ++k) {
    431        res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
    432      }
    433      res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);
    434      res += round_offset;
    435 
    436      if (conv_params->do_average) {
    437        int32_t tmp = dst16[y * dst16_stride + x];
    438        if (conv_params->use_dist_wtd_comp_avg) {
    439          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
    440          tmp = tmp >> DIST_PRECISION_BITS;
    441        } else {
    442          tmp += res;
    443          tmp = tmp >> 1;
    444        }
    445        tmp -= round_offset;
    446        dst[y * dst_stride + x] =
    447            clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
    448      } else {
    449        dst16[y * dst16_stride + x] = res;
    450      }
    451    }
    452  }
    453 }
    454 
    455 void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride,
    456                                     uint8_t *dst, int dst_stride, int w, int h,
    457                                     ConvolveParams *conv_params) {
    458  CONV_BUF_TYPE *dst16 = conv_params->dst;
    459  int dst16_stride = conv_params->dst_stride;
    460  const int bits =
    461      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
    462  const int bd = 8;
    463  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    464  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
    465                           (1 << (offset_bits - conv_params->round_1 - 1));
    466 
    467  for (int y = 0; y < h; ++y) {
    468    for (int x = 0; x < w; ++x) {
    469      CONV_BUF_TYPE res = src[y * src_stride + x] << bits;
    470      res += round_offset;
    471 
    472      if (conv_params->do_average) {
    473        int32_t tmp = dst16[y * dst16_stride + x];
    474        if (conv_params->use_dist_wtd_comp_avg) {
    475          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
    476          tmp = tmp >> DIST_PRECISION_BITS;
    477        } else {
    478          tmp += res;
    479          tmp = tmp >> 1;
    480        }
    481        tmp -= round_offset;
    482        dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
    483      } else {
    484        dst16[y * dst16_stride + x] = res;
    485      }
    486    }
    487  }
    488 }
    489 
    490 void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst,
    491                             int dst_stride, int w, int h,
    492                             const InterpFilterParams *filter_params_x,
    493                             const InterpFilterParams *filter_params_y,
    494                             const int subpel_x_qn, const int x_step_qn,
    495                             const int subpel_y_qn, const int y_step_qn,
    496                             ConvolveParams *conv_params) {
    497  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
    498  int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
    499             filter_params_y->taps;
    500  CONV_BUF_TYPE *dst16 = conv_params->dst;
    501  const int dst16_stride = conv_params->dst_stride;
    502  const int bits =
    503      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
    504  assert(bits >= 0);
    505  int im_stride = w;
    506  const int fo_vert = filter_params_y->taps / 2 - 1;
    507  const int fo_horiz = filter_params_x->taps / 2 - 1;
    508  const int bd = 8;
    509 
    510  // horizontal filter
    511  const uint8_t *src_horiz = src - fo_vert * src_stride;
    512  for (int y = 0; y < im_h; ++y) {
    513    int x_qn = subpel_x_qn;
    514    for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
    515      const uint8_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];
    516      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
    517      assert(x_filter_idx < SUBPEL_SHIFTS);
    518      const int16_t *x_filter =
    519          av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
    520      int32_t sum = (1 << (bd + FILTER_BITS - 1));
    521      for (int k = 0; k < filter_params_x->taps; ++k) {
    522        sum += x_filter[k] * src_x[k - fo_horiz];
    523      }
    524      assert(filter_params_x->taps > 8 ||
    525             (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
    526      im_block[y * im_stride + x] =
    527          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
    528    }
    529    src_horiz += src_stride;
    530  }
    531 
    532  // vertical filter
    533  int16_t *src_vert = im_block + fo_vert * im_stride;
    534  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    535  for (int x = 0; x < w; ++x) {
    536    int y_qn = subpel_y_qn;
    537    for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
    538      const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
    539      const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
    540      assert(y_filter_idx < SUBPEL_SHIFTS);
    541      const int16_t *y_filter =
    542          av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
    543      int32_t sum = 1 << offset_bits;
    544      for (int k = 0; k < filter_params_y->taps; ++k) {
    545        sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
    546      }
    547      assert(filter_params_y->taps > 8 ||
    548             (0 <= sum && sum < (1 << (offset_bits + 2))));
    549      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
    550      if (conv_params->is_compound) {
    551        if (conv_params->do_average) {
    552          int32_t tmp = dst16[y * dst16_stride + x];
    553          if (conv_params->use_dist_wtd_comp_avg) {
    554            tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
    555            tmp = tmp >> DIST_PRECISION_BITS;
    556          } else {
    557            tmp += res;
    558            tmp = tmp >> 1;
    559          }
    560          /* Subtract round offset and convolve round */
    561          tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
    562                       (1 << (offset_bits - conv_params->round_1 - 1)));
    563          dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
    564        } else {
    565          dst16[y * dst16_stride + x] = res;
    566        }
    567      } else {
    568        /* Subtract round offset and convolve round */
    569        int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
    570                             (1 << (offset_bits - conv_params->round_1 - 1)));
    571        dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
    572      }
    573    }
    574    src_vert++;
    575  }
    576 }
    577 
    578 static void convolve_2d_scale_wrapper(
    579    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    580    int h, const InterpFilterParams *filter_params_x,
    581    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    582    const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
    583    ConvolveParams *conv_params) {
    584  if (conv_params->is_compound) {
    585    assert(conv_params->dst != NULL);
    586  }
    587  av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, filter_params_x,
    588                        filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn,
    589                        y_step_qn, conv_params);
    590 }
    591 
    592 static void convolve_2d_facade_compound(
    593    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    594    int h, const InterpFilterParams *filter_params_x,
    595    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    596    const int subpel_y_qn, ConvolveParams *conv_params) {
    597  const bool need_x = subpel_x_qn != 0;
    598  const bool need_y = subpel_y_qn != 0;
    599  if (!need_x && !need_y) {
    600    av1_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h,
    601                                  conv_params);
    602  } else if (need_x && !need_y) {
    603    av1_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
    604                            filter_params_x, subpel_x_qn, conv_params);
    605  } else if (!need_x && need_y) {
    606    av1_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h,
    607                            filter_params_y, subpel_y_qn, conv_params);
    608  } else {
    609    assert(need_y && need_x);
    610    av1_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h,
    611                             filter_params_x, filter_params_y, subpel_x_qn,
    612                             subpel_y_qn, conv_params);
    613  }
    614 }
    615 
    616 static void convolve_2d_facade_single(
    617    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    618    int h, const InterpFilterParams *filter_params_x,
    619    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    620    const int subpel_y_qn, ConvolveParams *conv_params) {
    621  const bool need_x = subpel_x_qn != 0;
    622  const bool need_y = subpel_y_qn != 0;
    623  if (!need_x && !need_y) {
    624    aom_convolve_copy(src, src_stride, dst, dst_stride, w, h);
    625  } else if (need_x && !need_y) {
    626    av1_convolve_x_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
    627                      subpel_x_qn, conv_params);
    628  } else if (!need_x && need_y) {
    629    av1_convolve_y_sr(src, src_stride, dst, dst_stride, w, h, filter_params_y,
    630                      subpel_y_qn);
    631  } else {
    632    assert(need_x && need_y);
    633    av1_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
    634                       filter_params_y, subpel_x_qn, subpel_y_qn, conv_params);
    635  }
    636 }
    637 
    638 void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
    639                            int dst_stride, int w, int h,
    640                            const InterpFilterParams *interp_filters[2],
    641                            const int subpel_x_qn, int x_step_q4,
    642                            const int subpel_y_qn, int y_step_q4, int scaled,
    643                            ConvolveParams *conv_params) {
    644  (void)x_step_q4;
    645  (void)y_step_q4;
    646  (void)dst;
    647  (void)dst_stride;
    648 
    649  const InterpFilterParams *filter_params_x = interp_filters[0];
    650  const InterpFilterParams *filter_params_y = interp_filters[1];
    651 
    652  // TODO(jingning, yunqing): Add SIMD support to 2-tap filter case.
    653  // 2-tap filter indicates that it is for IntraBC.
    654  if (filter_params_x->taps == 2 || filter_params_y->taps == 2) {
    655    assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
    656    assert(!scaled);
    657    if (subpel_x_qn && subpel_y_qn) {
    658      av1_convolve_2d_sr_intrabc(src, src_stride, dst, dst_stride, w, h,
    659                                 filter_params_x, filter_params_y, subpel_x_qn,
    660                                 subpel_y_qn, conv_params);
    661      return;
    662    } else if (subpel_x_qn) {
    663      av1_convolve_x_sr_intrabc(src, src_stride, dst, dst_stride, w, h,
    664                                filter_params_x, subpel_x_qn, conv_params);
    665      return;
    666    } else if (subpel_y_qn) {
    667      av1_convolve_y_sr_intrabc(src, src_stride, dst, dst_stride, w, h,
    668                                filter_params_y, subpel_y_qn);
    669      return;
    670    }
    671  }
    672 
    673  if (scaled) {
    674    convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
    675                              filter_params_x, filter_params_y, subpel_x_qn,
    676                              x_step_q4, subpel_y_qn, y_step_q4, conv_params);
    677  } else if (conv_params->is_compound) {
    678    convolve_2d_facade_compound(src, src_stride, dst, dst_stride, w, h,
    679                                filter_params_x, filter_params_y, subpel_x_qn,
    680                                subpel_y_qn, conv_params);
    681  } else {
    682    convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h,
    683                              filter_params_x, filter_params_y, subpel_x_qn,
    684                              subpel_y_qn, conv_params);
    685  }
    686 }
    687 
    688 #if CONFIG_AV1_HIGHBITDEPTH
    689 void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
    690                                uint16_t *dst, int dst_stride, int w, int h,
    691                                const InterpFilterParams *filter_params_x,
    692                                const int subpel_x_qn,
    693                                ConvolveParams *conv_params, int bd) {
    694  const int fo_horiz = filter_params_x->taps / 2 - 1;
    695  const int bits = FILTER_BITS - conv_params->round_0;
    696 
    697  assert(bits >= 0);
    698  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
    699         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
    700 
    701  // horizontal filter
    702  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
    703      filter_params_x, subpel_x_qn & SUBPEL_MASK);
    704  for (int y = 0; y < h; ++y) {
    705    for (int x = 0; x < w; ++x) {
    706      int32_t res = 0;
    707      for (int k = 0; k < filter_params_x->taps; ++k) {
    708        res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
    709      }
    710      res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
    711      dst[y * dst_stride + x] =
    712          clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
    713    }
    714  }
    715 }
    716 
    717 void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
    718                                uint16_t *dst, int dst_stride, int w, int h,
    719                                const InterpFilterParams *filter_params_y,
    720                                const int subpel_y_qn, int bd) {
    721  const int fo_vert = filter_params_y->taps / 2 - 1;
    722  // vertical filter
    723  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
    724      filter_params_y, subpel_y_qn & SUBPEL_MASK);
    725  for (int y = 0; y < h; ++y) {
    726    for (int x = 0; x < w; ++x) {
    727      int32_t res = 0;
    728      for (int k = 0; k < filter_params_y->taps; ++k) {
    729        res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
    730      }
    731      dst[y * dst_stride + x] =
    732          clip_pixel_highbd(ROUND_POWER_OF_TWO(res, FILTER_BITS), bd);
    733    }
    734  }
    735 }
    736 
    737 void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
    738                                 uint16_t *dst, int dst_stride, int w, int h,
    739                                 const InterpFilterParams *filter_params_x,
    740                                 const InterpFilterParams *filter_params_y,
    741                                 const int subpel_x_qn, const int subpel_y_qn,
    742                                 ConvolveParams *conv_params, int bd) {
    743  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
    744  int im_h = h + filter_params_y->taps - 1;
    745  int im_stride = w;
    746  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
    747  const int fo_vert = filter_params_y->taps / 2 - 1;
    748  const int fo_horiz = filter_params_x->taps / 2 - 1;
    749  const int bits =
    750      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
    751  assert(bits >= 0);
    752 
    753  // horizontal filter
    754  const uint16_t *src_horiz = src - fo_vert * src_stride;
    755  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
    756      filter_params_x, subpel_x_qn & SUBPEL_MASK);
    757  for (int y = 0; y < im_h; ++y) {
    758    for (int x = 0; x < w; ++x) {
    759      int32_t sum = (1 << (bd + FILTER_BITS - 1));
    760      for (int k = 0; k < filter_params_x->taps; ++k) {
    761        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
    762      }
    763      assert(filter_params_x->taps > 8 ||
    764             (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
    765      im_block[y * im_stride + x] =
    766          ROUND_POWER_OF_TWO(sum, conv_params->round_0);
    767    }
    768  }
    769 
    770  // vertical filter
    771  int16_t *src_vert = im_block + fo_vert * im_stride;
    772  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
    773      filter_params_y, subpel_y_qn & SUBPEL_MASK);
    774  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    775  for (int y = 0; y < h; ++y) {
    776    for (int x = 0; x < w; ++x) {
    777      int32_t sum = 1 << offset_bits;
    778      for (int k = 0; k < filter_params_y->taps; ++k) {
    779        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
    780      }
    781      assert(filter_params_y->taps > 8 ||
    782             (0 <= sum && sum < (1 << (offset_bits + 2))));
    783      int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
    784                    ((1 << (offset_bits - conv_params->round_1)) +
    785                     (1 << (offset_bits - conv_params->round_1 - 1)));
    786      dst[y * dst_stride + x] =
    787          clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
    788    }
    789  }
    790 }
    791 
    792 // This function is exactly the same as av1_highbd_convolve_2d_sr_c, and is an
    793 // optimized version for intrabc. Use the following 2-tap filter:
    794 // DECLARE_ALIGNED(256, static const int16_t,
    795 //                 av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = {
    796 //   128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    797 //   64,  64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    798 // };
    799 void av1_highbd_convolve_2d_sr_intrabc_c(
    800    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    801    int h, const InterpFilterParams *filter_params_x,
    802    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    803    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
    804  const int bits =
    805      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
    806  assert(bits >= 0);
    807  assert(subpel_x_qn == 8);
    808  assert(subpel_y_qn == 8);
    809  assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
    810  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
    811  (void)filter_params_x;
    812  (void)subpel_x_qn;
    813  (void)filter_params_y;
    814  (void)subpel_y_qn;
    815  (void)conv_params;
    816 
    817  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
    818  int im_h = h + 1;
    819  int im_stride = w;
    820  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
    821 
    822  // horizontal filter
    823  // explicitly operate for subpel_x_qn = 8.
    824  int16_t *im = im_block;
    825  for (int y = 0; y < im_h; ++y) {
    826    for (int x = 0; x < w; ++x) {
    827      int32_t sum = (1 << (bd + FILTER_BITS - 1)) + 64 * (src[x] + src[x + 1]);
    828      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
    829      sum = ROUND_POWER_OF_TWO(sum, conv_params->round_0);
    830      im[x] = sum;
    831    }
    832    src += src_stride;
    833    im += im_stride;
    834  }
    835 
    836  // vertical filter
    837  // explicitly operate for subpel_y_qn = 8.
    838  int16_t *src_vert = im_block;
    839  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    840  for (int y = 0; y < h; ++y) {
    841    for (int x = 0; x < w; ++x) {
    842      const int32_t sum =
    843          (1 << offset_bits) + 64 * (src_vert[x] + src_vert[im_stride + x]);
    844      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
    845      const int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
    846                          ((1 << (offset_bits - conv_params->round_1)) +
    847                           (1 << (offset_bits - conv_params->round_1 - 1)));
    848 
    849      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
    850    }
    851    src_vert += im_stride;
    852    dst += dst_stride;
    853  }
    854 }
    855 
    856 // This function is exactly the same as av1_highbd_convolve_y_sr_c, and is an
    857 // optimized version for intrabc.
    858 void av1_highbd_convolve_y_sr_intrabc_c(
    859    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    860    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
    861    int bd) {
    862  assert(subpel_y_qn == 8);
    863  assert(filter_params_y->taps == 2);
    864  (void)filter_params_y;
    865  (void)subpel_y_qn;
    866 
    867  // vertical filter
    868  // explicitly operate for subpel_y_qn = 8.
    869  for (int y = 0; y < h; ++y) {
    870    for (int x = 0; x < w; ++x) {
    871      const int32_t res = src[x] + src[src_stride + x];
    872      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, 1), bd);
    873    }
    874    src += src_stride;
    875    dst += dst_stride;
    876  }
    877 }
    878 
    879 // This function is exactly the same as av1_highbd_convolve_x_sr_c, and is an
    880 // optimized version for intrabc.
    881 void av1_highbd_convolve_x_sr_intrabc_c(
    882    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    883    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
    884    ConvolveParams *conv_params, int bd) {
    885  const int bits = FILTER_BITS - conv_params->round_0;
    886  assert(bits >= 0);
    887  assert(subpel_x_qn == 8);
    888  assert(filter_params_x->taps == 2);
    889  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
    890  (void)filter_params_x;
    891  (void)subpel_x_qn;
    892 
    893  // horizontal filter
    894  // explicitly operate for subpel_x_qn = 8.
    895  for (int y = 0; y < h; ++y) {
    896    for (int x = 0; x < w; ++x) {
    897      int32_t res = 64 * (src[x] + src[x + 1]);
    898      res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
    899      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
    900    }
    901    src += src_stride;
    902    dst += dst_stride;
    903  }
    904 }
    905 
    906 void av1_highbd_dist_wtd_convolve_2d_c(
    907    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    908    int h, const InterpFilterParams *filter_params_x,
    909    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    910    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
    911  int x, y, k;
    912  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
    913  CONV_BUF_TYPE *dst16 = conv_params->dst;
    914  int dst16_stride = conv_params->dst_stride;
    915  int im_h = h + filter_params_y->taps - 1;
    916  int im_stride = w;
    917  const int fo_vert = filter_params_y->taps / 2 - 1;
    918  const int fo_horiz = filter_params_x->taps / 2 - 1;
    919  const int round_bits =
    920      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
    921  assert(round_bits >= 0);
    922 
    923  // horizontal filter
    924  const uint16_t *src_horiz = src - fo_vert * src_stride;
    925  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
    926      filter_params_x, subpel_x_qn & SUBPEL_MASK);
    927  for (y = 0; y < im_h; ++y) {
    928    for (x = 0; x < w; ++x) {
    929      int32_t sum = (1 << (bd + FILTER_BITS - 1));
    930      for (k = 0; k < filter_params_x->taps; ++k) {
    931        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
    932      }
    933      assert(filter_params_x->taps > 8 ||
    934             (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
    935      (void)bd;
    936      im_block[y * im_stride + x] =
    937          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
    938    }
    939  }
    940 
    941  // vertical filter
    942  int16_t *src_vert = im_block + fo_vert * im_stride;
    943  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    944  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
    945      filter_params_y, subpel_y_qn & SUBPEL_MASK);
    946  for (y = 0; y < h; ++y) {
    947    for (x = 0; x < w; ++x) {
    948      int32_t sum = 1 << offset_bits;
    949      for (k = 0; k < filter_params_y->taps; ++k) {
    950        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
    951      }
    952      assert(filter_params_y->taps > 8 ||
    953             (0 <= sum && sum < (1 << (offset_bits + 2))));
    954      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
    955      if (conv_params->do_average) {
    956        int32_t tmp = dst16[y * dst16_stride + x];
    957        if (conv_params->use_dist_wtd_comp_avg) {
    958          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
    959          tmp = tmp >> DIST_PRECISION_BITS;
    960        } else {
    961          tmp += res;
    962          tmp = tmp >> 1;
    963        }
    964        tmp -= (1 << (offset_bits - conv_params->round_1)) +
    965               (1 << (offset_bits - conv_params->round_1 - 1));
    966        dst[y * dst_stride + x] =
    967            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
    968      } else {
    969        dst16[y * dst16_stride + x] = res;
    970      }
    971    }
    972  }
    973 }
    974 
    975 void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride,
    976                                      uint16_t *dst, int dst_stride, int w,
    977                                      int h,
    978                                      const InterpFilterParams *filter_params_x,
    979                                      const int subpel_x_qn,
    980                                      ConvolveParams *conv_params, int bd) {
    981  CONV_BUF_TYPE *dst16 = conv_params->dst;
    982  int dst16_stride = conv_params->dst_stride;
    983  const int fo_horiz = filter_params_x->taps / 2 - 1;
    984  const int bits = FILTER_BITS - conv_params->round_1;
    985  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    986  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
    987                           (1 << (offset_bits - conv_params->round_1 - 1));
    988  const int round_bits =
    989      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
    990  assert(round_bits >= 0);
    991  assert(bits >= 0);
    992  // horizontal filter
    993  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
    994      filter_params_x, subpel_x_qn & SUBPEL_MASK);
    995  for (int y = 0; y < h; ++y) {
    996    for (int x = 0; x < w; ++x) {
    997      int32_t res = 0;
    998      for (int k = 0; k < filter_params_x->taps; ++k) {
    999        res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
   1000      }
   1001      res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);
   1002      res += round_offset;
   1003 
   1004      if (conv_params->do_average) {
   1005        int32_t tmp = dst16[y * dst16_stride + x];
   1006        if (conv_params->use_dist_wtd_comp_avg) {
   1007          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
   1008          tmp = tmp >> DIST_PRECISION_BITS;
   1009        } else {
   1010          tmp += res;
   1011          tmp = tmp >> 1;
   1012        }
   1013        tmp -= round_offset;
   1014        dst[y * dst_stride + x] =
   1015            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
   1016      } else {
   1017        dst16[y * dst16_stride + x] = res;
   1018      }
   1019    }
   1020  }
   1021 }
   1022 
   1023 void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride,
   1024                                      uint16_t *dst, int dst_stride, int w,
   1025                                      int h,
   1026                                      const InterpFilterParams *filter_params_y,
   1027                                      const int subpel_y_qn,
   1028                                      ConvolveParams *conv_params, int bd) {
   1029  CONV_BUF_TYPE *dst16 = conv_params->dst;
   1030  int dst16_stride = conv_params->dst_stride;
   1031  const int fo_vert = filter_params_y->taps / 2 - 1;
   1032  const int bits = FILTER_BITS - conv_params->round_0;
   1033  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
   1034  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
   1035                           (1 << (offset_bits - conv_params->round_1 - 1));
   1036  const int round_bits =
   1037      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
   1038  assert(round_bits >= 0);
   1039  assert(bits >= 0);
   1040  // vertical filter
   1041  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
   1042      filter_params_y, subpel_y_qn & SUBPEL_MASK);
   1043  for (int y = 0; y < h; ++y) {
   1044    for (int x = 0; x < w; ++x) {
   1045      int32_t res = 0;
   1046      for (int k = 0; k < filter_params_y->taps; ++k) {
   1047        res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
   1048      }
   1049      res *= (1 << bits);
   1050      res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset;
   1051 
   1052      if (conv_params->do_average) {
   1053        int32_t tmp = dst16[y * dst16_stride + x];
   1054        if (conv_params->use_dist_wtd_comp_avg) {
   1055          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
   1056          tmp = tmp >> DIST_PRECISION_BITS;
   1057        } else {
   1058          tmp += res;
   1059          tmp = tmp >> 1;
   1060        }
   1061        tmp -= round_offset;
   1062        dst[y * dst_stride + x] =
   1063            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
   1064      } else {
   1065        dst16[y * dst16_stride + x] = res;
   1066      }
   1067    }
   1068  }
   1069 }
   1070 
   1071 void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride,
   1072                                            uint16_t *dst, int dst_stride,
   1073                                            int w, int h,
   1074                                            ConvolveParams *conv_params,
   1075                                            int bd) {
   1076  CONV_BUF_TYPE *dst16 = conv_params->dst;
   1077  int dst16_stride = conv_params->dst_stride;
   1078  const int bits =
   1079      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
   1080  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
   1081  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
   1082                           (1 << (offset_bits - conv_params->round_1 - 1));
   1083  assert(bits >= 0);
   1084 
   1085  for (int y = 0; y < h; ++y) {
   1086    for (int x = 0; x < w; ++x) {
   1087      CONV_BUF_TYPE res = src[y * src_stride + x] << bits;
   1088      res += round_offset;
   1089      if (conv_params->do_average) {
   1090        int32_t tmp = dst16[y * dst16_stride + x];
   1091        if (conv_params->use_dist_wtd_comp_avg) {
   1092          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
   1093          tmp = tmp >> DIST_PRECISION_BITS;
   1094        } else {
   1095          tmp += res;
   1096          tmp = tmp >> 1;
   1097        }
   1098        tmp -= round_offset;
   1099        dst[y * dst_stride + x] =
   1100            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
   1101      } else {
   1102        dst16[y * dst16_stride + x] = res;
   1103      }
   1104    }
   1105  }
   1106 }
   1107 
   1108 void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
   1109                                    uint16_t *dst, int dst_stride, int w, int h,
   1110                                    const InterpFilterParams *filter_params_x,
   1111                                    const InterpFilterParams *filter_params_y,
   1112                                    const int subpel_x_qn, const int x_step_qn,
   1113                                    const int subpel_y_qn, const int y_step_qn,
   1114                                    ConvolveParams *conv_params, int bd) {
   1115  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
   1116  int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
   1117             filter_params_y->taps;
   1118  int im_stride = w;
   1119  const int fo_vert = filter_params_y->taps / 2 - 1;
   1120  const int fo_horiz = filter_params_x->taps / 2 - 1;
   1121  CONV_BUF_TYPE *dst16 = conv_params->dst;
   1122  const int dst16_stride = conv_params->dst_stride;
   1123  const int bits =
   1124      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
   1125  assert(bits >= 0);
   1126  // horizontal filter
   1127  const uint16_t *src_horiz = src - fo_vert * src_stride;
   1128  for (int y = 0; y < im_h; ++y) {
   1129    int x_qn = subpel_x_qn;
   1130    for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
   1131      const uint16_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];
   1132      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
   1133      assert(x_filter_idx < SUBPEL_SHIFTS);
   1134      const int16_t *x_filter =
   1135          av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
   1136      int32_t sum = (1 << (bd + FILTER_BITS - 1));
   1137      for (int k = 0; k < filter_params_x->taps; ++k) {
   1138        sum += x_filter[k] * src_x[k - fo_horiz];
   1139      }
   1140      assert(filter_params_x->taps > 8 ||
   1141             (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
   1142      im_block[y * im_stride + x] =
   1143          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
   1144    }
   1145    src_horiz += src_stride;
   1146  }
   1147 
   1148  // vertical filter
   1149  int16_t *src_vert = im_block + fo_vert * im_stride;
   1150  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
   1151  for (int x = 0; x < w; ++x) {
   1152    int y_qn = subpel_y_qn;
   1153    for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
   1154      const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
   1155      const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
   1156      assert(y_filter_idx < SUBPEL_SHIFTS);
   1157      const int16_t *y_filter =
   1158          av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
   1159      int32_t sum = 1 << offset_bits;
   1160      for (int k = 0; k < filter_params_y->taps; ++k) {
   1161        sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
   1162      }
   1163      assert(filter_params_y->taps > 8 ||
   1164             (0 <= sum && sum < (1 << (offset_bits + 2))));
   1165      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
   1166      if (conv_params->is_compound) {
   1167        if (conv_params->do_average) {
   1168          int32_t tmp = dst16[y * dst16_stride + x];
   1169          if (conv_params->use_dist_wtd_comp_avg) {
   1170            tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
   1171            tmp = tmp >> DIST_PRECISION_BITS;
   1172          } else {
   1173            tmp += res;
   1174            tmp = tmp >> 1;
   1175          }
   1176          /* Subtract round offset and convolve round */
   1177          tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
   1178                       (1 << (offset_bits - conv_params->round_1 - 1)));
   1179          dst[y * dst_stride + x] =
   1180              clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
   1181        } else {
   1182          dst16[y * dst16_stride + x] = res;
   1183        }
   1184      } else {
   1185        /* Subtract round offset and convolve round */
   1186        int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
   1187                             (1 << (offset_bits - conv_params->round_1 - 1)));
   1188        dst[y * dst_stride + x] =
   1189            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
   1190      }
   1191    }
   1192    src_vert++;
   1193  }
   1194 }
   1195 
   1196 static void highbd_convolve_2d_facade_compound(
   1197    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
   1198    const int w, const int h, const InterpFilterParams *filter_params_x,
   1199    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
   1200    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
   1201  const bool need_x = subpel_x_qn != 0;
   1202  const bool need_y = subpel_y_qn != 0;
   1203  if (!need_x && !need_y) {
   1204    av1_highbd_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h,
   1205                                         conv_params, bd);
   1206  } else if (need_x && !need_y) {
   1207    av1_highbd_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
   1208                                   filter_params_x, subpel_x_qn, conv_params,
   1209                                   bd);
   1210  } else if (!need_x && need_y) {
   1211    av1_highbd_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h,
   1212                                   filter_params_y, subpel_y_qn, conv_params,
   1213                                   bd);
   1214  } else {
   1215    assert(need_x && need_y);
   1216    av1_highbd_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h,
   1217                                    filter_params_x, filter_params_y,
   1218                                    subpel_x_qn, subpel_y_qn, conv_params, bd);
   1219  }
   1220 }
   1221 
   1222 static void highbd_convolve_2d_facade_single(
   1223    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
   1224    const int w, const int h, const InterpFilterParams *filter_params_x,
   1225    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
   1226    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
   1227  const bool need_x = subpel_x_qn != 0;
   1228  const bool need_y = subpel_y_qn != 0;
   1229 
   1230  if (!need_x && !need_y) {
   1231    aom_highbd_convolve_copy(src, src_stride, dst, dst_stride, w, h);
   1232  } else if (need_x && !need_y) {
   1233    av1_highbd_convolve_x_sr(src, src_stride, dst, dst_stride, w, h,
   1234                             filter_params_x, subpel_x_qn, conv_params, bd);
   1235  } else if (!need_x && need_y) {
   1236    av1_highbd_convolve_y_sr(src, src_stride, dst, dst_stride, w, h,
   1237                             filter_params_y, subpel_y_qn, bd);
   1238  } else {
   1239    assert(need_x && need_y);
   1240    av1_highbd_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h,
   1241                              filter_params_x, filter_params_y, subpel_x_qn,
   1242                              subpel_y_qn, conv_params, bd);
   1243  }
   1244 }
   1245 
   1246 void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
   1247                                   uint8_t *dst8, int dst_stride, int w, int h,
   1248                                   const InterpFilterParams *interp_filters[2],
   1249                                   const int subpel_x_qn, int x_step_q4,
   1250                                   const int subpel_y_qn, int y_step_q4,
   1251                                   int scaled, ConvolveParams *conv_params,
   1252                                   int bd) {
   1253  (void)x_step_q4;
   1254  (void)y_step_q4;
   1255  (void)dst_stride;
   1256  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
   1257 
   1258  const InterpFilterParams *filter_params_x = interp_filters[0];
   1259  const InterpFilterParams *filter_params_y = interp_filters[1];
   1260 
   1261  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   1262  // 2-tap filter indicates that it is for IntraBC.
   1263  if (filter_params_x->taps == 2 || filter_params_y->taps == 2) {
   1264    assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
   1265    assert(!scaled);
   1266    if (subpel_x_qn && subpel_y_qn) {
   1267      av1_highbd_convolve_2d_sr_intrabc_c(
   1268          src, src_stride, dst, dst_stride, w, h, filter_params_x,
   1269          filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd);
   1270      return;
   1271    } else if (subpel_x_qn) {
   1272      av1_highbd_convolve_x_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
   1273                                         filter_params_x, subpel_x_qn,
   1274                                         conv_params, bd);
   1275      return;
   1276    } else if (subpel_y_qn) {
   1277      av1_highbd_convolve_y_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
   1278                                         filter_params_y, subpel_y_qn, bd);
   1279      return;
   1280    }
   1281  }
   1282 
   1283  if (scaled) {
   1284    if (conv_params->is_compound) {
   1285      assert(conv_params->dst != NULL);
   1286    }
   1287    av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
   1288                                 filter_params_x, filter_params_y, subpel_x_qn,
   1289                                 x_step_q4, subpel_y_qn, y_step_q4, conv_params,
   1290                                 bd);
   1291  } else if (conv_params->is_compound) {
   1292    highbd_convolve_2d_facade_compound(
   1293        src, src_stride, dst, dst_stride, w, h, filter_params_x,
   1294        filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd);
   1295  } else {
   1296    highbd_convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h,
   1297                                     filter_params_x, filter_params_y,
   1298                                     subpel_x_qn, subpel_y_qn, conv_params, bd);
   1299  }
   1300 }
   1301 #endif  // CONFIG_AV1_HIGHBITDEPTH
   1302 
   1303 // Note: Fixed size intermediate buffers, place limits on parameters
   1304 // of some functions. 2d filtering proceeds in 2 steps:
   1305 //   (1) Interpolate horizontally into an intermediate buffer, temp.
   1306 //   (2) Interpolate temp vertically to derive the sub-pixel result.
    1307 // Deriving the maximum number of rows in the temp buffer (263):
   1308 // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
   1309 // --Largest block size is 128x128 pixels.
   1310 // --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the
   1311 //   original frame (in 1/16th pixel units).
   1312 // --Must round-up because block may be located at sub-pixel position.
   1313 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
   1314 // --((128 - 1) * 32 + 15) >> 4 + 8 = 263.
   1315 #define WIENER_MAX_EXT_SIZE 263
   1316 
   1317 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1318 static inline int horz_scalar_product(const uint8_t *a, const int16_t *b) {
   1319  int sum = 0;
   1320  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
   1321  return sum;
   1322 }
   1323 
   1324 #if CONFIG_AV1_HIGHBITDEPTH
   1325 static inline int highbd_horz_scalar_product(const uint16_t *a,
   1326                                             const int16_t *b) {
   1327  int sum = 0;
   1328  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
   1329  return sum;
   1330 }
   1331 #endif
   1332 
   1333 static inline int highbd_vert_scalar_product(const uint16_t *a,
   1334                                             ptrdiff_t a_stride,
   1335                                             const int16_t *b) {
   1336  int sum = 0;
   1337  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
   1338  return sum;
   1339 }
   1340 
   1341 static const InterpKernel *get_filter_base(const int16_t *filter) {
   1342  // NOTE: This assumes that the filter table is 256-byte aligned.
   1343  // TODO(agrange) Modify to make independent of table alignment.
   1344  return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
   1345 }
   1346 
   1347 static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
   1348  return (int)((const InterpKernel *)(intptr_t)f - base);
   1349 }
   1350 
   1351 static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
   1352                                       uint16_t *dst, ptrdiff_t dst_stride,
   1353                                       const InterpKernel *x_filters, int x0_q4,
   1354                                       int x_step_q4, int w, int h,
   1355                                       int round0_bits) {
   1356  const int bd = 8;
   1357  src -= SUBPEL_TAPS / 2 - 1;
   1358  for (int y = 0; y < h; ++y) {
   1359    int x_q4 = x0_q4;
   1360    for (int x = 0; x < w; ++x) {
   1361      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
   1362      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
   1363      const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
   1364                           (1 << (bd + FILTER_BITS - 1));
   1365      const int sum = horz_scalar_product(src_x, x_filter) + rounding;
   1366      dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
   1367                               WIENER_CLAMP_LIMIT(round0_bits, bd) - 1);
   1368      x_q4 += x_step_q4;
   1369    }
   1370    src += src_stride;
   1371    dst += dst_stride;
   1372  }
   1373 }
   1374 
   1375 static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
   1376                                      uint8_t *dst, ptrdiff_t dst_stride,
   1377                                      const InterpKernel *y_filters, int y0_q4,
   1378                                      int y_step_q4, int w, int h,
   1379                                      int round1_bits) {
   1380  const int bd = 8;
   1381  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
   1382 
   1383  for (int x = 0; x < w; ++x) {
   1384    int y_q4 = y0_q4;
   1385    for (int y = 0; y < h; ++y) {
   1386      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
   1387      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
   1388      const int rounding =
   1389          ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
   1390          (1 << (bd + round1_bits - 1));
   1391      const int sum =
   1392          highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
   1393      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits));
   1394      y_q4 += y_step_q4;
   1395    }
   1396    ++src;
   1397    ++dst;
   1398  }
   1399 }
   1400 
   1401 void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
   1402                                   uint8_t *dst, ptrdiff_t dst_stride,
   1403                                   const int16_t *filter_x, int x_step_q4,
   1404                                   const int16_t *filter_y, int y_step_q4,
   1405                                   int w, int h,
   1406                                   const WienerConvolveParams *conv_params) {
   1407  const InterpKernel *const filters_x = get_filter_base(filter_x);
   1408  const int x0_q4 = get_filter_offset(filter_x, filters_x);
   1409 
   1410  const InterpKernel *const filters_y = get_filter_base(filter_y);
   1411  const int y0_q4 = get_filter_offset(filter_y, filters_y);
   1412 
   1413  uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
   1414  const int intermediate_height =
   1415      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1;
   1416  memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
   1417 
   1418  assert(w <= MAX_SB_SIZE);
   1419  assert(h <= MAX_SB_SIZE);
   1420  assert(y_step_q4 <= 32);
   1421  assert(x_step_q4 <= 32);
   1422 
   1423  convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
   1424                             src_stride, temp, MAX_SB_SIZE, filters_x, x0_q4,
   1425                             x_step_q4, w, intermediate_height,
   1426                             conv_params->round_0);
   1427  convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
   1428                            MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4,
   1429                            y_step_q4, w, h, conv_params->round_1);
   1430 }
   1431 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1432 
   1433 #if CONFIG_AV1_HIGHBITDEPTH
   1434 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1435 static void highbd_convolve_add_src_horiz_hip(
   1436    const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
   1437    ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
   1438    int x_step_q4, int w, int h, int round0_bits, int bd) {
   1439  const int extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd);
   1440  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
   1441  src -= SUBPEL_TAPS / 2 - 1;
   1442  for (int y = 0; y < h; ++y) {
   1443    int x_q4 = x0_q4;
   1444    for (int x = 0; x < w; ++x) {
   1445      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
   1446      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
   1447      const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
   1448                           (1 << (bd + FILTER_BITS - 1));
   1449      const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding;
   1450      dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
   1451                               extraprec_clamp_limit - 1);
   1452      x_q4 += x_step_q4;
   1453    }
   1454    src += src_stride;
   1455    dst += dst_stride;
   1456  }
   1457 }
   1458 
   1459 static void highbd_convolve_add_src_vert_hip(
   1460    const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8,
   1461    ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
   1462    int y_step_q4, int w, int h, int round1_bits, int bd) {
   1463  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   1464  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
   1465  for (int x = 0; x < w; ++x) {
   1466    int y_q4 = y0_q4;
   1467    for (int y = 0; y < h; ++y) {
   1468      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
   1469      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
   1470      const int rounding =
   1471          ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
   1472          (1 << (bd + round1_bits - 1));
   1473      const int sum =
   1474          highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
   1475      dst[y * dst_stride] =
   1476          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd);
   1477      y_q4 += y_step_q4;
   1478    }
   1479    ++src;
   1480    ++dst;
   1481  }
   1482 }
   1483 
   1484 void av1_highbd_wiener_convolve_add_src_c(
   1485    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
   1486    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
   1487    const int16_t *filter_y, int y_step_q4, int w, int h,
   1488    const WienerConvolveParams *conv_params, int bd) {
   1489  const InterpKernel *const filters_x = get_filter_base(filter_x);
   1490  const int x0_q4 = get_filter_offset(filter_x, filters_x);
   1491 
   1492  const InterpKernel *const filters_y = get_filter_base(filter_y);
   1493  const int y0_q4 = get_filter_offset(filter_y, filters_y);
   1494 
   1495  uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
   1496  const int intermediate_height =
   1497      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
   1498 
   1499  assert(w <= MAX_SB_SIZE);
   1500  assert(h <= MAX_SB_SIZE);
   1501  assert(y_step_q4 <= 32);
   1502  assert(x_step_q4 <= 32);
   1503  assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
   1504 
   1505  highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
   1506                                    src_stride, temp, MAX_SB_SIZE, filters_x,
   1507                                    x0_q4, x_step_q4, w, intermediate_height,
   1508                                    conv_params->round_0, bd);
   1509  highbd_convolve_add_src_vert_hip(
   1510      temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride,
   1511      filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd);
   1512 }
   1513 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1514 #endif  // CONFIG_AV1_HIGHBITDEPTH