tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

restoration.c (62323B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 *
     11 */
     12 
     13 #include <math.h>
     14 #include <stddef.h>
     15 
     16 #include "config/aom_config.h"
     17 #include "config/aom_scale_rtcd.h"
     18 
     19 #include "aom/internal/aom_codec_internal.h"
     20 #include "aom_mem/aom_mem.h"
     21 #include "aom_dsp/aom_dsp_common.h"
     22 #include "aom_mem/aom_mem.h"
     23 #include "aom_ports/mem.h"
     24 #include "aom_util/aom_pthread.h"
     25 
     26 #include "av1/common/av1_common_int.h"
     27 #include "av1/common/convolve.h"
     28 #include "av1/common/enums.h"
     29 #include "av1/common/resize.h"
     30 #include "av1/common/restoration.h"
     31 #include "av1/common/thread_common.h"
     32 
     33 // The 's' values are calculated based on original 'r' and 'e' values in the
     34 // spec using GenSgrprojVtable().
     35 // Note: Setting r = 0 skips the filter; with corresponding s = -1 (invalid).
     36 const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS] = {
     37  { { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } },
     38  { { 2, 1 }, { 93, 1618 } },  { { 2, 1 }, { 80, 1438 } },
     39  { { 2, 1 }, { 70, 1295 } },  { { 2, 1 }, { 58, 1177 } },
     40  { { 2, 1 }, { 47, 1079 } },  { { 2, 1 }, { 37, 996 } },
     41  { { 2, 1 }, { 30, 925 } },   { { 2, 1 }, { 25, 863 } },
     42  { { 0, 1 }, { -1, 2589 } },  { { 0, 1 }, { -1, 1618 } },
     43  { { 0, 1 }, { -1, 1177 } },  { { 0, 1 }, { -1, 925 } },
     44  { { 2, 0 }, { 56, -1 } },    { { 2, 0 }, { 22, -1 } },
     45 };
     46 
     47 void av1_get_upsampled_plane_size(const AV1_COMMON *cm, int is_uv, int *plane_w,
     48                                  int *plane_h) {
     49  int ss_x = is_uv && cm->seq_params->subsampling_x;
     50  int ss_y = is_uv && cm->seq_params->subsampling_y;
     51  *plane_w = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
     52  *plane_h = ROUND_POWER_OF_TWO(cm->height, ss_y);
     53 }
     54 
     55 // Count horizontal or vertical units in a plane (use a width or height for
     56 // plane_size, respectively). We basically want to divide the plane size by the
     57 // size of a restoration unit. Rather than rounding up unconditionally as you
     58 // might expect, we round to nearest, which models the way a right or bottom
     59 // restoration unit can extend to up to 150% its normal width or height.
     60 //
     61 // The max with 1 is to deal with small frames, which may be smaller than
     62 // half of an LR unit in size.
     63 int av1_lr_count_units(int unit_size, int plane_size) {
     64  return AOMMAX((plane_size + (unit_size >> 1)) / unit_size, 1);
     65 }
     66 
     67 void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi,
     68                                  int is_uv) {
     69  int plane_w, plane_h;
     70  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
     71 
     72  const int unit_size = rsi->restoration_unit_size;
     73  const int horz_units = av1_lr_count_units(unit_size, plane_w);
     74  const int vert_units = av1_lr_count_units(unit_size, plane_h);
     75 
     76  rsi->num_rest_units = horz_units * vert_units;
     77  rsi->horz_units = horz_units;
     78  rsi->vert_units = vert_units;
     79 
     80  aom_free(rsi->unit_info);
     81  CHECK_MEM_ERROR(cm, rsi->unit_info,
     82                  (RestorationUnitInfo *)aom_memalign(
     83                      16, sizeof(*rsi->unit_info) * rsi->num_rest_units));
     84 }
     85 
     86 void av1_free_restoration_struct(RestorationInfo *rst_info) {
     87  aom_free(rst_info->unit_info);
     88  rst_info->unit_info = NULL;
     89 }
     90 
#if 0
// NOTE: everything up to the matching #endif is excluded from compilation.
// The comment on av1_sgr_params names GenSgrprojVtable() as the routine used
// to calculate that table's 's' values.
// NOTE(review): this generator reads params->e[], while the compiled code
// visible in this file only reads params->r[] and params->s[] -- confirm the
// field still exists before re-enabling this block.

// Pair of values for each sgrproj parameter:
// Index 0 corresponds to r[0], e[0]
// Index 1 corresponds to r[1], e[1]
int sgrproj_mtable[SGRPROJ_PARAMS][2];

// Fills sgrproj_mtable with round(2^SGRPROJ_MTABLE_BITS / (n^2 * e)) for each
// enabled filter, where n = (2r + 1)^2 is the window size, and with -1 for a
// filter that is disabled (r == 0).
static void GenSgrprojVtable(void) {
  for (int i = 0; i < SGRPROJ_PARAMS; ++i) {
    const sgr_params_type *const params = &av1_sgr_params[i];
    for (int j = 0; j < 2; ++j) {
      const int e = params->e[j];
      const int r = params->r[j];
      if (r == 0) {                 // filter is disabled
        sgrproj_mtable[i][j] = -1;  // mark invalid
      } else {                      // filter is enabled
        const int n = (2 * r + 1) * (2 * r + 1);
        const int n2e = n * n * e;
        assert(n2e != 0);
        // Adding n2e / 2 before dividing rounds the quotient to nearest.
        sgrproj_mtable[i][j] = (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e);
      }
    }
  }
}
#endif
    115 
// Precalculation entry point for loop restoration. Its only statement, the
// call to GenSgrprojVtable(), is compiled out by #if 0, so the function
// currently has an empty body.
void av1_loop_restoration_precal(void) {
#if 0
  GenSgrprojVtable();
#endif
}
    121 
    122 static void extend_frame_lowbd(uint8_t *data, int width, int height,
    123                               ptrdiff_t stride, int border_horz,
    124                               int border_vert) {
    125  uint8_t *data_p;
    126  int i;
    127  for (i = 0; i < height; ++i) {
    128    data_p = data + i * stride;
    129    memset(data_p - border_horz, data_p[0], border_horz);
    130    memset(data_p + width, data_p[width - 1], border_horz);
    131  }
    132  data_p = data - border_horz;
    133  for (i = -border_vert; i < 0; ++i) {
    134    memcpy(data_p + i * stride, data_p, width + 2 * border_horz);
    135  }
    136  for (i = height; i < height + border_vert; ++i) {
    137    memcpy(data_p + i * stride, data_p + (height - 1) * stride,
    138           width + 2 * border_horz);
    139  }
    140 }
    141 
    142 #if CONFIG_AV1_HIGHBITDEPTH
    143 static void extend_frame_highbd(uint16_t *data, int width, int height,
    144                                ptrdiff_t stride, int border_horz,
    145                                int border_vert) {
    146  uint16_t *data_p;
    147  int i, j;
    148  for (i = 0; i < height; ++i) {
    149    data_p = data + i * stride;
    150    for (j = -border_horz; j < 0; ++j) data_p[j] = data_p[0];
    151    for (j = width; j < width + border_horz; ++j) data_p[j] = data_p[width - 1];
    152  }
    153  data_p = data - border_horz;
    154  for (i = -border_vert; i < 0; ++i) {
    155    memcpy(data_p + i * stride, data_p,
    156           (width + 2 * border_horz) * sizeof(uint16_t));
    157  }
    158  for (i = height; i < height + border_vert; ++i) {
    159    memcpy(data_p + i * stride, data_p + (height - 1) * stride,
    160           (width + 2 * border_horz) * sizeof(uint16_t));
    161  }
    162 }
    163 
    164 static void copy_rest_unit_highbd(int width, int height, const uint16_t *src,
    165                                  int src_stride, uint16_t *dst,
    166                                  int dst_stride) {
    167  for (int i = 0; i < height; ++i)
    168    memcpy(dst + i * dst_stride, src + i * src_stride, width * sizeof(*dst));
    169 }
    170 #endif
    171 
    172 void av1_extend_frame(uint8_t *data, int width, int height, int stride,
    173                      int border_horz, int border_vert, int highbd) {
    174 #if CONFIG_AV1_HIGHBITDEPTH
    175  if (highbd) {
    176    extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride,
    177                        border_horz, border_vert);
    178    return;
    179  }
    180 #endif
    181  (void)highbd;
    182  extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
    183 }
    184 
    185 static void copy_rest_unit_lowbd(int width, int height, const uint8_t *src,
    186                                 int src_stride, uint8_t *dst, int dst_stride) {
    187  for (int i = 0; i < height; ++i)
    188    memcpy(dst + i * dst_stride, src + i * src_stride, width);
    189 }
    190 
    191 static void copy_rest_unit(int width, int height, const uint8_t *src,
    192                           int src_stride, uint8_t *dst, int dst_stride,
    193                           int highbd) {
    194 #if CONFIG_AV1_HIGHBITDEPTH
    195  if (highbd) {
    196    copy_rest_unit_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
    197                          CONVERT_TO_SHORTPTR(dst), dst_stride);
    198    return;
    199  }
    200 #endif
    201  (void)highbd;
    202  copy_rest_unit_lowbd(width, height, src, src_stride, dst, dst_stride);
    203 }
    204 
// Yields the address to pass to memcpy() for the pixel pointer d: when hbd is
// non-zero, d is run through CONVERT_TO_SHORTPTR() and cast back to
// uint8_t *; otherwise d is used unchanged.
#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
    206 
    207 // With striped loop restoration, the filtering for each 64-pixel stripe gets
    208 // most of its input from the output of CDEF (stored in data8), but we need to
    209 // fill out a border of 3 pixels above/below the stripe according to the
    210 // following rules:
    211 //
    212 // * At the top and bottom of the frame, we copy the outermost row of CDEF
    213 //   pixels three times. This extension is done by a call to av1_extend_frame()
    214 //   at the start of the loop restoration process, so the value of
    215 //   copy_above/copy_below doesn't strictly matter.
    216 //
    217 // * All other boundaries are stripe boundaries within the frame. In that case,
    218 //   we take 2 rows of deblocked pixels and extend them to 3 rows of context.
    219 static void get_stripe_boundary_info(const RestorationTileLimits *limits,
    220                                     int plane_w, int plane_h, int ss_y,
    221                                     int *copy_above, int *copy_below) {
    222  (void)plane_w;
    223 
    224  *copy_above = 1;
    225  *copy_below = 1;
    226 
    227  const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
    228  const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
    229 
    230  const int first_stripe_in_plane = (limits->v_start == 0);
    231  const int this_stripe_height =
    232      full_stripe_height - (first_stripe_in_plane ? runit_offset : 0);
    233  const int last_stripe_in_plane =
    234      (limits->v_start + this_stripe_height >= plane_h);
    235 
    236  if (first_stripe_in_plane) *copy_above = 0;
    237  if (last_stripe_in_plane) *copy_below = 0;
    238 }
    239 
    240 // Overwrite the border pixels around a processing stripe so that the conditions
    241 // listed above get_stripe_boundary_info() are preserved.
    242 // We save the pixels which get overwritten into a temporary buffer, so that
    243 // they can be restored by restore_processing_stripe_boundary() after we've
    244 // processed the stripe.
    245 //
    246 // limits gives the rectangular limits of the remaining stripes for the current
    247 // restoration unit. rsb is the stored stripe boundaries (taken from either
    248 // deblock or CDEF output as necessary).
    249 static void setup_processing_stripe_boundary(
    250    const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb,
    251    int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride,
    252    RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) {
    253  // Offsets within the line buffers. The buffer logically starts at column
    254  // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ)
    255  // has column x0 in the buffer.
    256  const int buf_stride = rsb->stripe_boundary_stride;
    257  const int buf_x0_off = limits->h_start;
    258  const int line_width =
    259      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
    260  const int line_size = line_width << use_highbd;
    261 
    262  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
    263 
    264  // Replace RESTORATION_BORDER pixels above the top of the stripe
    265  // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above
    266  // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by
    267  // duplicating the topmost of the 2 lines (see the AOMMAX call when
    268  // calculating src_row, which gets the values 0, 0, 1 for i = -3, -2, -1).
    269  if (!opt) {
    270    if (copy_above) {
    271      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
    272 
    273      for (int i = -RESTORATION_BORDER; i < 0; ++i) {
    274        const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0);
    275        const int buf_off = buf_x0_off + buf_row * buf_stride;
    276        const uint8_t *buf =
    277            rsb->stripe_boundary_above + (buf_off << use_highbd);
    278        uint8_t *dst8 = data8_tl + i * data_stride;
    279        // Save old pixels, then replace with data from stripe_boundary_above
    280        memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER],
    281               REAL_PTR(use_highbd, dst8), line_size);
    282        memcpy(REAL_PTR(use_highbd, dst8), buf, line_size);
    283      }
    284    }
    285 
    286    // Replace RESTORATION_BORDER pixels below the bottom of the stripe.
    287    // The second buffer row is repeated, so src_row gets the values 0, 1, 1
    288    // for i = 0, 1, 2.
    289    if (copy_below) {
    290      const int stripe_end = limits->v_start + h;
    291      uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
    292 
    293      for (int i = 0; i < RESTORATION_BORDER; ++i) {
    294        const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1);
    295        const int buf_off = buf_x0_off + buf_row * buf_stride;
    296        const uint8_t *src =
    297            rsb->stripe_boundary_below + (buf_off << use_highbd);
    298 
    299        uint8_t *dst8 = data8_bl + i * data_stride;
    300        // Save old pixels, then replace with data from stripe_boundary_below
    301        memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size);
    302        memcpy(REAL_PTR(use_highbd, dst8), src, line_size);
    303      }
    304    }
    305  } else {
    306    if (copy_above) {
    307      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
    308 
    309      // Only save and overwrite i=-RESTORATION_BORDER line.
    310      uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
    311      // Save old pixels, then replace with data from stripe_boundary_above
    312      memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size);
    313      memcpy(REAL_PTR(use_highbd, dst8),
    314             REAL_PTR(use_highbd,
    315                      data8_tl + (-RESTORATION_BORDER + 1) * data_stride),
    316             line_size);
    317    }
    318 
    319    if (copy_below) {
    320      const int stripe_end = limits->v_start + h;
    321      uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
    322 
    323      // Only save and overwrite i=2 line.
    324      uint8_t *dst8 = data8_bl + 2 * data_stride;
    325      // Save old pixels, then replace with data from stripe_boundary_below
    326      memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size);
    327      memcpy(REAL_PTR(use_highbd, dst8),
    328             REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size);
    329    }
    330  }
    331 }
    332 
    333 // Once a processing stripe is finished, this function sets the boundary
    334 // pixels which were overwritten by setup_processing_stripe_boundary()
    335 // back to their original values
    336 static void restore_processing_stripe_boundary(
    337    const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
    338    int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above,
    339    int copy_below, int opt) {
    340  const int line_width =
    341      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
    342  const int line_size = line_width << use_highbd;
    343 
    344  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
    345 
    346  if (!opt) {
    347    if (copy_above) {
    348      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
    349      for (int i = -RESTORATION_BORDER; i < 0; ++i) {
    350        uint8_t *dst8 = data8_tl + i * data_stride;
    351        memcpy(REAL_PTR(use_highbd, dst8),
    352               rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
    353      }
    354    }
    355 
    356    if (copy_below) {
    357      const int stripe_bottom = limits->v_start + h;
    358      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
    359 
    360      for (int i = 0; i < RESTORATION_BORDER; ++i) {
    361        if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;
    362 
    363        uint8_t *dst8 = data8_bl + i * data_stride;
    364        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
    365      }
    366    }
    367  } else {
    368    if (copy_above) {
    369      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
    370 
    371      // Only restore i=-RESTORATION_BORDER line.
    372      uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
    373      memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size);
    374    }
    375 
    376    if (copy_below) {
    377      const int stripe_bottom = limits->v_start + h;
    378      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
    379 
    380      // Only restore i=2 line.
    381      if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
    382        uint8_t *dst8 = data8_bl + 2 * data_stride;
    383        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size);
    384      }
    385    }
    386  }
    387 }
    388 
    389 static void wiener_filter_stripe(const RestorationUnitInfo *rui,
    390                                 int stripe_width, int stripe_height,
    391                                 int procunit_width, const uint8_t *src,
    392                                 int src_stride, uint8_t *dst, int dst_stride,
    393                                 int32_t *tmpbuf, int bit_depth,
    394                                 struct aom_internal_error_info *error_info) {
    395  (void)tmpbuf;
    396  (void)bit_depth;
    397  (void)error_info;
    398  assert(bit_depth == 8);
    399  const WienerConvolveParams conv_params = get_conv_params_wiener(8);
    400 
    401  for (int j = 0; j < stripe_width; j += procunit_width) {
    402    int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
    403    const uint8_t *src_p = src + j;
    404    uint8_t *dst_p = dst + j;
    405    av1_wiener_convolve_add_src(
    406        src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16,
    407        rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params);
    408  }
    409 }
    410 
    411 /* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
    412   over the input. The window is of size (2r + 1)x(2r + 1), and we
    413   specialize to r = 1, 2, 3. A default function is used for r > 3.
    414 
    415   Each loop follows the same format: We keep a window's worth of input
    416   in individual variables and select data out of that as appropriate.
    417 */
    418 static void boxsum1(int32_t *src, int width, int height, int src_stride,
    419                    int sqr, int32_t *dst, int dst_stride) {
    420  int i, j, a, b, c;
    421  assert(width > 2 * SGRPROJ_BORDER_HORZ);
    422  assert(height > 2 * SGRPROJ_BORDER_VERT);
    423 
    424  // Vertical sum over 3-pixel regions, from src into dst.
    425  if (!sqr) {
    426    for (j = 0; j < width; ++j) {
    427      a = src[j];
    428      b = src[src_stride + j];
    429      c = src[2 * src_stride + j];
    430 
    431      dst[j] = a + b;
    432      for (i = 1; i < height - 2; ++i) {
    433        // Loop invariant: At the start of each iteration,
    434        // a = src[(i - 1) * src_stride + j]
    435        // b = src[(i    ) * src_stride + j]
    436        // c = src[(i + 1) * src_stride + j]
    437        dst[i * dst_stride + j] = a + b + c;
    438        a = b;
    439        b = c;
    440        c = src[(i + 2) * src_stride + j];
    441      }
    442      dst[i * dst_stride + j] = a + b + c;
    443      dst[(i + 1) * dst_stride + j] = b + c;
    444    }
    445  } else {
    446    for (j = 0; j < width; ++j) {
    447      a = src[j] * src[j];
    448      b = src[src_stride + j] * src[src_stride + j];
    449      c = src[2 * src_stride + j] * src[2 * src_stride + j];
    450 
    451      dst[j] = a + b;
    452      for (i = 1; i < height - 2; ++i) {
    453        dst[i * dst_stride + j] = a + b + c;
    454        a = b;
    455        b = c;
    456        c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
    457      }
    458      dst[i * dst_stride + j] = a + b + c;
    459      dst[(i + 1) * dst_stride + j] = b + c;
    460    }
    461  }
    462 
    463  // Horizontal sum over 3-pixel regions of dst
    464  for (i = 0; i < height; ++i) {
    465    a = dst[i * dst_stride];
    466    b = dst[i * dst_stride + 1];
    467    c = dst[i * dst_stride + 2];
    468 
    469    dst[i * dst_stride] = a + b;
    470    for (j = 1; j < width - 2; ++j) {
    471      // Loop invariant: At the start of each iteration,
    472      // a = src[i * src_stride + (j - 1)]
    473      // b = src[i * src_stride + (j    )]
    474      // c = src[i * src_stride + (j + 1)]
    475      dst[i * dst_stride + j] = a + b + c;
    476      a = b;
    477      b = c;
    478      c = dst[i * dst_stride + (j + 2)];
    479    }
    480    dst[i * dst_stride + j] = a + b + c;
    481    dst[i * dst_stride + (j + 1)] = b + c;
    482  }
    483 }
    484 
    485 static void boxsum2(int32_t *src, int width, int height, int src_stride,
    486                    int sqr, int32_t *dst, int dst_stride) {
    487  int i, j, a, b, c, d, e;
    488  assert(width > 2 * SGRPROJ_BORDER_HORZ);
    489  assert(height > 2 * SGRPROJ_BORDER_VERT);
    490 
    491  // Vertical sum over 5-pixel regions, from src into dst.
    492  if (!sqr) {
    493    for (j = 0; j < width; ++j) {
    494      a = src[j];
    495      b = src[src_stride + j];
    496      c = src[2 * src_stride + j];
    497      d = src[3 * src_stride + j];
    498      e = src[4 * src_stride + j];
    499 
    500      dst[j] = a + b + c;
    501      dst[dst_stride + j] = a + b + c + d;
    502      for (i = 2; i < height - 3; ++i) {
    503        // Loop invariant: At the start of each iteration,
    504        // a = src[(i - 2) * src_stride + j]
    505        // b = src[(i - 1) * src_stride + j]
    506        // c = src[(i    ) * src_stride + j]
    507        // d = src[(i + 1) * src_stride + j]
    508        // e = src[(i + 2) * src_stride + j]
    509        dst[i * dst_stride + j] = a + b + c + d + e;
    510        a = b;
    511        b = c;
    512        c = d;
    513        d = e;
    514        e = src[(i + 3) * src_stride + j];
    515      }
    516      dst[i * dst_stride + j] = a + b + c + d + e;
    517      dst[(i + 1) * dst_stride + j] = b + c + d + e;
    518      dst[(i + 2) * dst_stride + j] = c + d + e;
    519    }
    520  } else {
    521    for (j = 0; j < width; ++j) {
    522      a = src[j] * src[j];
    523      b = src[src_stride + j] * src[src_stride + j];
    524      c = src[2 * src_stride + j] * src[2 * src_stride + j];
    525      d = src[3 * src_stride + j] * src[3 * src_stride + j];
    526      e = src[4 * src_stride + j] * src[4 * src_stride + j];
    527 
    528      dst[j] = a + b + c;
    529      dst[dst_stride + j] = a + b + c + d;
    530      for (i = 2; i < height - 3; ++i) {
    531        dst[i * dst_stride + j] = a + b + c + d + e;
    532        a = b;
    533        b = c;
    534        c = d;
    535        d = e;
    536        e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j];
    537      }
    538      dst[i * dst_stride + j] = a + b + c + d + e;
    539      dst[(i + 1) * dst_stride + j] = b + c + d + e;
    540      dst[(i + 2) * dst_stride + j] = c + d + e;
    541    }
    542  }
    543 
    544  // Horizontal sum over 5-pixel regions of dst
    545  for (i = 0; i < height; ++i) {
    546    a = dst[i * dst_stride];
    547    b = dst[i * dst_stride + 1];
    548    c = dst[i * dst_stride + 2];
    549    d = dst[i * dst_stride + 3];
    550    e = dst[i * dst_stride + 4];
    551 
    552    dst[i * dst_stride] = a + b + c;
    553    dst[i * dst_stride + 1] = a + b + c + d;
    554    for (j = 2; j < width - 3; ++j) {
    555      // Loop invariant: At the start of each iteration,
    556      // a = src[i * src_stride + (j - 2)]
    557      // b = src[i * src_stride + (j - 1)]
    558      // c = src[i * src_stride + (j    )]
    559      // d = src[i * src_stride + (j + 1)]
    560      // e = src[i * src_stride + (j + 2)]
    561      dst[i * dst_stride + j] = a + b + c + d + e;
    562      a = b;
    563      b = c;
    564      c = d;
    565      d = e;
    566      e = dst[i * dst_stride + (j + 3)];
    567    }
    568    dst[i * dst_stride + j] = a + b + c + d + e;
    569    dst[i * dst_stride + (j + 1)] = b + c + d + e;
    570    dst[i * dst_stride + (j + 2)] = c + d + e;
    571  }
    572 }
    573 
    574 static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
    575                   int sqr, int32_t *dst, int dst_stride) {
    576  if (r == 1)
    577    boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
    578  else if (r == 2)
    579    boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
    580  else
    581    assert(0 && "Invalid value of r in self-guided filter");
    582 }
    583 
    584 void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
    585  if (params->r[0] == 0) {
    586    xq[0] = 0;
    587    xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1];
    588  } else if (params->r[1] == 0) {
    589    xq[0] = xqd[0];
    590    xq[1] = 0;
    591  } else {
    592    xq[0] = xqd[0];
    593    xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
    594  }
    595 }
    596 
    597 const int32_t av1_x_by_xplus1[256] = {
    598  // Special case: Map 0 -> 1 (corresponding to a value of 1/256)
    599  // instead of 0. See comments in selfguided_restoration_internal() for why
    600  1,   128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
    601  240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
    602  248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
    603  250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
    604  252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
    605  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
    606  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254,
    607  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
    608  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
    609  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
    610  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
    611  254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    612  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    613  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    614  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    615  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    616  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    617  256,
    618 };
    619 
    620 const int32_t av1_one_by_x[MAX_NELEM] = {
    621  4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
    622  293,  273,  256,  241,  228, 216, 205, 195, 186, 178, 171, 164,
    623 };
    624 
    625 static void calculate_intermediate_result(int32_t *dgd, int width, int height,
    626                                          int dgd_stride, int bit_depth,
    627                                          int sgr_params_idx, int radius_idx,
    628                                          int pass, int32_t *A, int32_t *B) {
    629  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
    630  const int r = params->r[radius_idx];
    631  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
    632  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
    633  // Adjusting the stride of A and B here appears to avoid bad cache effects,
    634  // leading to a significant speed improvement.
    635  // We also align the stride to a multiple of 16 bytes, for consistency
    636  // with the SIMD version of this function.
    637  int buf_stride = ((width_ext + 3) & ~3) + 16;
    638  const int step = pass == 0 ? 1 : 2;
    639  int i, j;
    640 
    641  assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
    642  assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
    643         "Need SGRPROJ_BORDER_* >= r+1");
    644 
    645  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
    646         width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
    647  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
    648         width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
    649  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
    650  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
    651  // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
    652  // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
    653  for (i = -1; i < height + 1; i += step) {
    654    for (j = -1; j < width + 1; ++j) {
    655      const int k = i * buf_stride + j;
    656      const int n = (2 * r + 1) * (2 * r + 1);
    657 
    658      // a < 2^16 * n < 2^22 regardless of bit depth
    659      uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
    660      // b < 2^8 * n < 2^14 regardless of bit depth
    661      uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
    662 
    663      // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
    664      // and p itself satisfies p < 2^14 * n^2 < 2^26.
    665      // This bound on p is due to:
    666      // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
    667      //
    668      // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
    669      // This is an artefact of rounding, and can only happen if all pixels
    670      // are (almost) identical, so in this case we saturate to p=0.
    671      uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
    672 
    673      const uint32_t s = params->s[radius_idx];
    674 
    675      // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
    676      // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
    677      // (this holds even after accounting for the rounding in s)
    678      const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
    679 
    680      // Note: We have to be quite careful about the value of A[k].
    681      // This is used as a blend factor between individual pixel values and the
    682      // local mean. So it logically has a range of [0, 256], including both
    683      // endpoints.
    684      //
    685      // This is a pain for hardware, as we'd like something which can be stored
    686      // in exactly 8 bits.
    687      // Further, in the calculation of B[k] below, if z == 0 and r == 2,
    688      // then A[k] "should be" 0. But then we can end up setting B[k] to a value
    689      // slightly above 2^(8 + bit depth), due to rounding in the value of
    690      // av1_one_by_x[25-1].
    691      //
    692      // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
    693      // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
    694      // overflow), without significantly affecting the final result: z == 0
    695      // implies that the image is essentially "flat", so the local mean and
    696      // individual pixel values are very similar.
    697      //
    698      // Note that saturating on the other side, ie. requiring A[k] <= 255,
    699      // would be a bad idea, as that corresponds to the case where the image
    700      // is very variable, when we want to preserve the local pixel value as
    701      // much as possible.
    702      A[k] = av1_x_by_xplus1[AOMMIN(z, 255)];  // in range [1, 256]
    703 
    704      // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
    705      // av1_one_by_x[n - 1] = round(2^12 / n)
    706      // => the product here is < 2^(20 + bit_depth) <= 2^32,
    707      // and B[k] is set to a value < 2^(8 + bit depth)
    708      // This holds even with the rounding in av1_one_by_x and in the overall
    709      // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
    710      B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
    711                                             (uint32_t)B[k] *
    712                                             (uint32_t)av1_one_by_x[n - 1],
    713                                         SGRPROJ_RECIP_BITS);
    714    }
    715  }
    716 }
    717 
    718 static void selfguided_restoration_fast_internal(
    719    int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
    720    int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
    721  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
    722  const int r = params->r[radius_idx];
    723  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
    724  // Adjusting the stride of A and B here appears to avoid bad cache effects,
    725  // leading to a significant speed improvement.
    726  // We also align the stride to a multiple of 16 bytes, for consistency
    727  // with the SIMD version of this function.
    728  int buf_stride = ((width_ext + 3) & ~3) + 16;
    729  int32_t A_[RESTORATION_PROC_UNIT_PELS];
    730  int32_t B_[RESTORATION_PROC_UNIT_PELS];
    731  int32_t *A = A_;
    732  int32_t *B = B_;
    733  int i, j;
    734  calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
    735                                sgr_params_idx, radius_idx, 1, A, B);
    736  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
    737  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
    738 
    739  // Use the A[] and B[] arrays to calculate the filtered image
    740  (void)r;
    741  assert(r == 2);
    742  for (i = 0; i < height; ++i) {
    743    if (!(i & 1)) {  // even row
    744      for (j = 0; j < width; ++j) {
    745        const int k = i * buf_stride + j;
    746        const int l = i * dgd_stride + j;
    747        const int m = i * dst_stride + j;
    748        const int nb = 5;
    749        const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 +
    750                          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
    751                           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
    752                              5;
    753        const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 +
    754                          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
    755                           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
    756                              5;
    757        const int32_t v = a * dgd[l] + b;
    758        dst[m] =
    759            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
    760      }
    761    } else {  // odd row
    762      for (j = 0; j < width; ++j) {
    763        const int k = i * buf_stride + j;
    764        const int l = i * dgd_stride + j;
    765        const int m = i * dst_stride + j;
    766        const int nb = 4;
    767        const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5;
    768        const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5;
    769        const int32_t v = a * dgd[l] + b;
    770        dst[m] =
    771            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
    772      }
    773    }
    774  }
    775 }
    776 
    777 static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
    778                                            int dgd_stride, int32_t *dst,
    779                                            int dst_stride, int bit_depth,
    780                                            int sgr_params_idx,
    781                                            int radius_idx) {
    782  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
    783  // Adjusting the stride of A and B here appears to avoid bad cache effects,
    784  // leading to a significant speed improvement.
    785  // We also align the stride to a multiple of 16 bytes, for consistency
    786  // with the SIMD version of this function.
    787  int buf_stride = ((width_ext + 3) & ~3) + 16;
    788  int32_t A_[RESTORATION_PROC_UNIT_PELS];
    789  int32_t B_[RESTORATION_PROC_UNIT_PELS];
    790  int32_t *A = A_;
    791  int32_t *B = B_;
    792  int i, j;
    793  calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
    794                                sgr_params_idx, radius_idx, 0, A, B);
    795  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
    796  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
    797 
    798  // Use the A[] and B[] arrays to calculate the filtered image
    799  for (i = 0; i < height; ++i) {
    800    for (j = 0; j < width; ++j) {
    801      const int k = i * buf_stride + j;
    802      const int l = i * dgd_stride + j;
    803      const int m = i * dst_stride + j;
    804      const int nb = 5;
    805      const int32_t a =
    806          (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
    807              4 +
    808          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
    809           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
    810              3;
    811      const int32_t b =
    812          (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
    813              4 +
    814          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
    815           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
    816              3;
    817      const int32_t v = a * dgd[l] + b;
    818      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
    819    }
    820  }
    821 }
    822 
    823 int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
    824                                 int dgd_stride, int32_t *flt0, int32_t *flt1,
    825                                 int flt_stride, int sgr_params_idx,
    826                                 int bit_depth, int highbd) {
    827  int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
    828  const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
    829  int32_t *dgd32 =
    830      dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
    831 
    832  if (highbd) {
    833    const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8);
    834    for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
    835      for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
    836        dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j];
    837      }
    838    }
    839  } else {
    840    for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
    841      for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
    842        dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j];
    843      }
    844    }
    845  }
    846 
    847  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
    848  // If params->r == 0 we skip the corresponding filter. We only allow one of
    849  // the radii to be 0, as having both equal to 0 would be equivalent to
    850  // skipping SGR entirely.
    851  assert(!(params->r[0] == 0 && params->r[1] == 0));
    852 
    853  if (params->r[0] > 0)
    854    selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
    855                                         flt0, flt_stride, bit_depth,
    856                                         sgr_params_idx, 0);
    857  if (params->r[1] > 0)
    858    selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1,
    859                                    flt_stride, bit_depth, sgr_params_idx, 1);
    860  return 0;
    861 }
    862 
    863 int av1_apply_selfguided_restoration_c(const uint8_t *dat8, int width,
    864                                       int height, int stride, int eps,
    865                                       const int *xqd, uint8_t *dst8,
    866                                       int dst_stride, int32_t *tmpbuf,
    867                                       int bit_depth, int highbd) {
    868  int32_t *flt0 = tmpbuf;
    869  int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
    870  assert(width * height <= RESTORATION_UNITPELS_MAX);
    871 
    872  const int ret = av1_selfguided_restoration_c(
    873      dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
    874  if (ret != 0) return ret;
    875  const sgr_params_type *const params = &av1_sgr_params[eps];
    876  int xq[2];
    877  av1_decode_xq(xqd, xq, params);
    878  for (int i = 0; i < height; ++i) {
    879    for (int j = 0; j < width; ++j) {
    880      const int k = i * width + j;
    881      uint8_t *dst8ij = dst8 + i * dst_stride + j;
    882      const uint8_t *dat8ij = dat8 + i * stride + j;
    883 
    884      const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij;
    885      const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS;
    886      int32_t v = u << SGRPROJ_PRJ_BITS;
    887      // If params->r == 0 then we skipped the filtering in
    888      // av1_selfguided_restoration_c, i.e. flt[k] == u
    889      if (params->r[0] > 0) v += xq[0] * (flt0[k] - u);
    890      if (params->r[1] > 0) v += xq[1] * (flt1[k] - u);
    891      const int16_t w =
    892          (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
    893 
    894      const uint16_t out = clip_pixel_highbd(w, bit_depth);
    895      if (highbd)
    896        *CONVERT_TO_SHORTPTR(dst8ij) = out;
    897      else
    898        *dst8ij = (uint8_t)out;
    899    }
    900  }
    901  return 0;
    902 }
    903 
    904 static void sgrproj_filter_stripe(const RestorationUnitInfo *rui,
    905                                  int stripe_width, int stripe_height,
    906                                  int procunit_width, const uint8_t *src,
    907                                  int src_stride, uint8_t *dst, int dst_stride,
    908                                  int32_t *tmpbuf, int bit_depth,
    909                                  struct aom_internal_error_info *error_info) {
    910  (void)bit_depth;
    911  assert(bit_depth == 8);
    912 
    913  for (int j = 0; j < stripe_width; j += procunit_width) {
    914    int w = AOMMIN(procunit_width, stripe_width - j);
    915    if (av1_apply_selfguided_restoration(
    916            src + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
    917            rui->sgrproj_info.xqd, dst + j, dst_stride, tmpbuf, bit_depth,
    918            0) != 0) {
    919      aom_internal_error(
    920          error_info, AOM_CODEC_MEM_ERROR,
    921          "Error allocating buffer in av1_apply_selfguided_restoration");
    922    }
    923  }
    924 }
    925 
    926 #if CONFIG_AV1_HIGHBITDEPTH
    927 static void wiener_filter_stripe_highbd(
    928    const RestorationUnitInfo *rui, int stripe_width, int stripe_height,
    929    int procunit_width, const uint8_t *src8, int src_stride, uint8_t *dst8,
    930    int dst_stride, int32_t *tmpbuf, int bit_depth,
    931    struct aom_internal_error_info *error_info) {
    932  (void)tmpbuf;
    933  (void)error_info;
    934  const WienerConvolveParams conv_params = get_conv_params_wiener(bit_depth);
    935 
    936  for (int j = 0; j < stripe_width; j += procunit_width) {
    937    int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
    938    const uint8_t *src8_p = src8 + j;
    939    uint8_t *dst8_p = dst8 + j;
    940    av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride,
    941                                       rui->wiener_info.hfilter, 16,
    942                                       rui->wiener_info.vfilter, 16, w,
    943                                       stripe_height, &conv_params, bit_depth);
    944  }
    945 }
    946 
    947 static void sgrproj_filter_stripe_highbd(
    948    const RestorationUnitInfo *rui, int stripe_width, int stripe_height,
    949    int procunit_width, const uint8_t *src8, int src_stride, uint8_t *dst8,
    950    int dst_stride, int32_t *tmpbuf, int bit_depth,
    951    struct aom_internal_error_info *error_info) {
    952  for (int j = 0; j < stripe_width; j += procunit_width) {
    953    int w = AOMMIN(procunit_width, stripe_width - j);
    954    if (av1_apply_selfguided_restoration(
    955            src8 + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
    956            rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, bit_depth,
    957            1) != 0) {
    958      aom_internal_error(
    959          error_info, AOM_CODEC_MEM_ERROR,
    960          "Error allocating buffer in av1_apply_selfguided_restoration");
    961    }
    962  }
    963 }
    964 #endif  // CONFIG_AV1_HIGHBITDEPTH
    965 
    966 typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui,
    967                                  int stripe_width, int stripe_height,
    968                                  int procunit_width, const uint8_t *src,
    969                                  int src_stride, uint8_t *dst, int dst_stride,
    970                                  int32_t *tmpbuf, int bit_depth,
    971                                  struct aom_internal_error_info *error_info);
    972 
    973 #if CONFIG_AV1_HIGHBITDEPTH
    974 #define NUM_STRIPE_FILTERS 4
    975 static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
    976  wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd,
    977  sgrproj_filter_stripe_highbd
    978 };
    979 #else
    980 #define NUM_STRIPE_FILTERS 2
    981 static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
    982  wiener_filter_stripe, sgrproj_filter_stripe
    983 };
    984 #endif  // CONFIG_AV1_HIGHBITDEPTH
    985 
    986 // Filter one restoration unit
    987 void av1_loop_restoration_filter_unit(
    988    const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
    989    const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
    990    int plane_w, int plane_h, int ss_x, int ss_y, int highbd, int bit_depth,
    991    uint8_t *data8, int stride, uint8_t *dst8, int dst_stride, int32_t *tmpbuf,
    992    int optimized_lr, struct aom_internal_error_info *error_info) {
    993  RestorationType unit_rtype = rui->restoration_type;
    994 
    995  int unit_h = limits->v_end - limits->v_start;
    996  int unit_w = limits->h_end - limits->h_start;
    997  uint8_t *data8_tl =
    998      data8 + limits->v_start * (ptrdiff_t)stride + limits->h_start;
    999  uint8_t *dst8_tl =
   1000      dst8 + limits->v_start * (ptrdiff_t)dst_stride + limits->h_start;
   1001 
   1002  if (unit_rtype == RESTORE_NONE) {
   1003    copy_rest_unit(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride,
   1004                   highbd);
   1005    return;
   1006  }
   1007 
   1008  const int filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ);
   1009  assert(filter_idx < NUM_STRIPE_FILTERS);
   1010  const stripe_filter_fun stripe_filter = stripe_filters[filter_idx];
   1011 
   1012  const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
   1013 
   1014  // Filter the whole image one stripe at a time
   1015  RestorationTileLimits remaining_stripes = *limits;
   1016  int i = 0;
   1017  while (i < unit_h) {
   1018    int copy_above, copy_below;
   1019    remaining_stripes.v_start = limits->v_start + i;
   1020 
   1021    get_stripe_boundary_info(&remaining_stripes, plane_w, plane_h, ss_y,
   1022                             &copy_above, &copy_below);
   1023 
   1024    const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
   1025    const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
   1026 
   1027    // Work out where this stripe's boundaries are within
   1028    // rsb->stripe_boundary_{above,below}
   1029    const int frame_stripe =
   1030        (remaining_stripes.v_start + runit_offset) / full_stripe_height;
   1031    const int rsb_row = RESTORATION_CTX_VERT * frame_stripe;
   1032 
   1033    // Calculate this stripe's height, based on two rules:
   1034    // * The topmost stripe in the frame is 8 luma pixels shorter than usual.
   1035    // * We can't extend past the end of the current restoration unit
   1036    const int nominal_stripe_height =
   1037        full_stripe_height - ((frame_stripe == 0) ? runit_offset : 0);
   1038    const int h = AOMMIN(nominal_stripe_height,
   1039                         remaining_stripes.v_end - remaining_stripes.v_start);
   1040 
   1041    setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd,
   1042                                     h, data8, stride, rlbs, copy_above,
   1043                                     copy_below, optimized_lr);
   1044 
   1045    stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride,
   1046                  dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth,
   1047                  error_info);
   1048 
   1049    restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h,
   1050                                       data8, stride, copy_above, copy_below,
   1051                                       optimized_lr);
   1052 
   1053    i += h;
   1054  }
   1055 }
   1056 
   1057 static void filter_frame_on_unit(const RestorationTileLimits *limits,
   1058                                 int rest_unit_idx, void *priv, int32_t *tmpbuf,
   1059                                 RestorationLineBuffers *rlbs,
   1060                                 struct aom_internal_error_info *error_info) {
   1061  FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv;
   1062  const RestorationInfo *rsi = ctxt->rsi;
   1063 
   1064  av1_loop_restoration_filter_unit(
   1065      limits, &rsi->unit_info[rest_unit_idx], &rsi->boundaries, rlbs,
   1066      ctxt->plane_w, ctxt->plane_h, ctxt->ss_x, ctxt->ss_y, ctxt->highbd,
   1067      ctxt->bit_depth, ctxt->data8, ctxt->data_stride, ctxt->dst8,
   1068      ctxt->dst_stride, tmpbuf, rsi->optimized_lr, error_info);
   1069 }
   1070 
   1071 void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
   1072                                            YV12_BUFFER_CONFIG *frame,
   1073                                            AV1_COMMON *cm, int optimized_lr,
   1074                                            int num_planes) {
   1075  const SequenceHeader *const seq_params = cm->seq_params;
   1076  const int bit_depth = seq_params->bit_depth;
   1077  const int highbd = seq_params->use_highbitdepth;
   1078  lr_ctxt->dst = &cm->rst_frame;
   1079 
   1080  const int frame_width = frame->crop_widths[0];
   1081  const int frame_height = frame->crop_heights[0];
   1082  if (aom_realloc_frame_buffer(
   1083          lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x,
   1084          seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
   1085          cm->features.byte_alignment, NULL, NULL, NULL, false,
   1086          0) != AOM_CODEC_OK)
   1087    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
   1088                       "Failed to allocate restoration dst buffer");
   1089 
   1090  lr_ctxt->on_rest_unit = filter_frame_on_unit;
   1091  lr_ctxt->frame = frame;
   1092  for (int plane = 0; plane < num_planes; ++plane) {
   1093    RestorationInfo *rsi = &cm->rst_info[plane];
   1094    RestorationType rtype = rsi->frame_restoration_type;
   1095    rsi->optimized_lr = optimized_lr;
   1096    lr_ctxt->ctxt[plane].rsi = rsi;
   1097 
   1098    if (rtype == RESTORE_NONE) {
   1099      continue;
   1100    }
   1101 
   1102    const int is_uv = plane > 0;
   1103    int plane_w, plane_h;
   1104    av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
   1105    assert(plane_w == frame->crop_widths[is_uv]);
   1106    assert(plane_h == frame->crop_heights[is_uv]);
   1107 
   1108    av1_extend_frame(frame->buffers[plane], plane_w, plane_h,
   1109                     frame->strides[is_uv], RESTORATION_BORDER,
   1110                     RESTORATION_BORDER, highbd);
   1111 
   1112    FilterFrameCtxt *lr_plane_ctxt = &lr_ctxt->ctxt[plane];
   1113    lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x;
   1114    lr_plane_ctxt->ss_y = is_uv && seq_params->subsampling_y;
   1115    lr_plane_ctxt->plane_w = plane_w;
   1116    lr_plane_ctxt->plane_h = plane_h;
   1117    lr_plane_ctxt->highbd = highbd;
   1118    lr_plane_ctxt->bit_depth = bit_depth;
   1119    lr_plane_ctxt->data8 = frame->buffers[plane];
   1120    lr_plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane];
   1121    lr_plane_ctxt->data_stride = frame->strides[is_uv];
   1122    lr_plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv];
   1123  }
   1124 }
   1125 
   1126 static void loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
   1127                                         AV1_COMMON *cm, int num_planes) {
   1128  typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
   1129                           YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
   1130                           int vstart, int vend);
   1131  static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y,
   1132                                         aom_yv12_partial_coloc_copy_u,
   1133                                         aom_yv12_partial_coloc_copy_v };
   1134  assert(num_planes <= 3);
   1135  for (int plane = 0; plane < num_planes; ++plane) {
   1136    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
   1137    FilterFrameCtxt *lr_plane_ctxt = &loop_rest_ctxt->ctxt[plane];
   1138    copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, 0,
   1139                     lr_plane_ctxt->plane_w, 0, lr_plane_ctxt->plane_h);
   1140  }
   1141 }
   1142 
   1143 // Call on_rest_unit for each loop restoration unit in the plane.
   1144 static void foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
   1145                                       rest_unit_visitor_t on_rest_unit,
   1146                                       void *priv, int32_t *tmpbuf,
   1147                                       RestorationLineBuffers *rlbs) {
   1148  const RestorationInfo *rsi = &cm->rst_info[plane];
   1149  const int hnum_rest_units = rsi->horz_units;
   1150  const int vnum_rest_units = rsi->vert_units;
   1151  const int unit_size = rsi->restoration_unit_size;
   1152 
   1153  const int is_uv = plane > 0;
   1154  const int ss_y = is_uv && cm->seq_params->subsampling_y;
   1155  const int ext_size = unit_size * 3 / 2;
   1156  int plane_w, plane_h;
   1157  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
   1158 
   1159  int y0 = 0, i = 0;
   1160  while (y0 < plane_h) {
   1161    int remaining_h = plane_h - y0;
   1162    int h = (remaining_h < ext_size) ? remaining_h : unit_size;
   1163 
   1164    RestorationTileLimits limits;
   1165    limits.v_start = y0;
   1166    limits.v_end = y0 + h;
   1167    assert(limits.v_end <= plane_h);
   1168    // Offset upwards to align with the restoration processing stripe
   1169    const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
   1170    limits.v_start = AOMMAX(0, limits.v_start - voffset);
   1171    if (limits.v_end < plane_h) limits.v_end -= voffset;
   1172 
   1173    av1_foreach_rest_unit_in_row(&limits, plane_w, on_rest_unit, i, unit_size,
   1174                                 hnum_rest_units, vnum_rest_units, plane, priv,
   1175                                 tmpbuf, rlbs, av1_lr_sync_read_dummy,
   1176                                 av1_lr_sync_write_dummy, NULL, cm->error);
   1177 
   1178    y0 += h;
   1179    ++i;
   1180  }
   1181 }
   1182 
   1183 static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm,
   1184                                        int num_planes) {
   1185  FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
   1186 
   1187  for (int plane = 0; plane < num_planes; ++plane) {
   1188    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) {
   1189      continue;
   1190    }
   1191 
   1192    foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit, &ctxt[plane],
   1193                               cm->rst_tmpbuf, cm->rlbs);
   1194  }
   1195 }
   1196 
   1197 void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
   1198                                       AV1_COMMON *cm, int optimized_lr,
   1199                                       void *lr_ctxt) {
   1200  assert(!cm->features.all_lossless);
   1201  const int num_planes = av1_num_planes(cm);
   1202 
   1203  AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt;
   1204 
   1205  av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm,
   1206                                         optimized_lr, num_planes);
   1207 
   1208  foreach_rest_unit_in_planes(loop_rest_ctxt, cm, num_planes);
   1209 
   1210  loop_restoration_copy_planes(loop_rest_ctxt, cm, num_planes);
   1211 }
   1212 
   1213 void av1_foreach_rest_unit_in_row(
   1214    RestorationTileLimits *limits, int plane_w,
   1215    rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
   1216    int hnum_rest_units, int vnum_rest_units, int plane, void *priv,
   1217    int32_t *tmpbuf, RestorationLineBuffers *rlbs, sync_read_fn_t on_sync_read,
   1218    sync_write_fn_t on_sync_write, struct AV1LrSyncData *const lr_sync,
   1219    struct aom_internal_error_info *error_info) {
   1220  const int ext_size = unit_size * 3 / 2;
   1221  int x0 = 0, j = 0;
   1222  while (x0 < plane_w) {
   1223    int remaining_w = plane_w - x0;
   1224    int w = (remaining_w < ext_size) ? remaining_w : unit_size;
   1225 
   1226    limits->h_start = x0;
   1227    limits->h_end = x0 + w;
   1228    assert(limits->h_end <= plane_w);
   1229 
   1230    const int unit_idx = row_number * hnum_rest_units + j;
   1231 
   1232    // No sync for even numbered rows
   1233    // For odd numbered rows, Loop Restoration of current block requires the LR
   1234    // of top-right and bottom-right blocks to be completed
   1235 
   1236    // top-right sync
   1237    on_sync_read(lr_sync, row_number, j, plane);
   1238    if ((row_number + 1) < vnum_rest_units)
   1239      // bottom-right sync
   1240      on_sync_read(lr_sync, row_number + 2, j, plane);
   1241 
   1242 #if CONFIG_MULTITHREAD
   1243    if (lr_sync && lr_sync->num_workers > 1) {
   1244      pthread_mutex_lock(lr_sync->job_mutex);
   1245      const bool lr_mt_exit = lr_sync->lr_mt_exit;
   1246      pthread_mutex_unlock(lr_sync->job_mutex);
   1247      // Exit in case any worker has encountered an error.
   1248      if (lr_mt_exit) return;
   1249    }
   1250 #endif
   1251 
   1252    on_rest_unit(limits, unit_idx, priv, tmpbuf, rlbs, error_info);
   1253 
   1254    on_sync_write(lr_sync, row_number, j, hnum_rest_units, plane);
   1255 
   1256    x0 += w;
   1257    ++j;
   1258  }
   1259 }
   1260 
   1261 void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane) {
   1262  (void)lr_sync;
   1263  (void)r;
   1264  (void)c;
   1265  (void)plane;
   1266 }
   1267 
   1268 void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
   1269                             const int sb_cols, int plane) {
   1270  (void)lr_sync;
   1271  (void)r;
   1272  (void)c;
   1273  (void)sb_cols;
   1274  (void)plane;
   1275 }
   1276 
   1277 int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
   1278                                       int mi_row, int mi_col, BLOCK_SIZE bsize,
   1279                                       int *rcol0, int *rcol1, int *rrow0,
   1280                                       int *rrow1) {
   1281  assert(rcol0 && rcol1 && rrow0 && rrow1);
   1282 
   1283  if (bsize != cm->seq_params->sb_size) return 0;
   1284 
   1285  assert(!cm->features.all_lossless);
   1286 
   1287  const int is_uv = plane > 0;
   1288 
   1289  // Compute the mi-unit corners of the superblock
   1290  const int mi_row0 = mi_row;
   1291  const int mi_col0 = mi_col;
   1292  const int mi_row1 = mi_row0 + mi_size_high[bsize];
   1293  const int mi_col1 = mi_col0 + mi_size_wide[bsize];
   1294 
   1295  const RestorationInfo *rsi = &cm->rst_info[plane];
   1296  const int size = rsi->restoration_unit_size;
   1297  const int horz_units = rsi->horz_units;
   1298  const int vert_units = rsi->vert_units;
   1299 
   1300  // The size of an MI-unit on this plane of the image
   1301  const int ss_x = is_uv && cm->seq_params->subsampling_x;
   1302  const int ss_y = is_uv && cm->seq_params->subsampling_y;
   1303  const int mi_size_x = MI_SIZE >> ss_x;
   1304  const int mi_size_y = MI_SIZE >> ss_y;
   1305 
   1306  // Write m for the relative mi column or row, D for the superres denominator
   1307  // and N for the superres numerator. If u is the upscaled pixel offset then
   1308  // we can write the downscaled pixel offset in two ways as:
   1309  //
   1310  //   MI_SIZE * m = N / D u
   1311  //
   1312  // from which we get u = D * MI_SIZE * m / N
   1313  const int mi_to_num_x = av1_superres_scaled(cm)
   1314                              ? mi_size_x * cm->superres_scale_denominator
   1315                              : mi_size_x;
   1316  const int mi_to_num_y = mi_size_y;
   1317  const int denom_x = av1_superres_scaled(cm) ? size * SCALE_NUMERATOR : size;
   1318  const int denom_y = size;
   1319 
   1320  const int rnd_x = denom_x - 1;
   1321  const int rnd_y = denom_y - 1;
   1322 
   1323  // rcol0/rrow0 should be the first column/row of restoration units that
   1324  // doesn't start left/below of mi_col/mi_row. For this calculation, we need
   1325  // to round up the division (if the sb starts at runit column 10.1, the first
   1326  // matching runit has column index 11)
   1327  *rcol0 = (mi_col0 * mi_to_num_x + rnd_x) / denom_x;
   1328  *rrow0 = (mi_row0 * mi_to_num_y + rnd_y) / denom_y;
   1329 
   1330  // rel_col1/rel_row1 is the equivalent calculation, but for the superblock
   1331  // below-right. If we're at the bottom or right of the frame, this restoration
   1332  // unit might not exist, in which case we'll clamp accordingly.
   1333  *rcol1 = AOMMIN((mi_col1 * mi_to_num_x + rnd_x) / denom_x, horz_units);
   1334  *rrow1 = AOMMIN((mi_row1 * mi_to_num_y + rnd_y) / denom_y, vert_units);
   1335 
   1336  return *rcol0 < *rcol1 && *rrow0 < *rrow1;
   1337 }
   1338 
   1339 // Extend to left and right
   1340 static void extend_lines(uint8_t *buf, int width, int height, int stride,
   1341                         int extend, int use_highbitdepth) {
   1342  for (int i = 0; i < height; ++i) {
   1343    if (use_highbitdepth) {
   1344      uint16_t *buf16 = (uint16_t *)buf;
   1345      aom_memset16(buf16 - extend, buf16[0], extend);
   1346      aom_memset16(buf16 + width, buf16[width - 1], extend);
   1347    } else {
   1348      memset(buf - extend, buf[0], extend);
   1349      memset(buf + width, buf[width - 1], extend);
   1350    }
   1351    buf += stride;
   1352  }
   1353 }
   1354 
   1355 static void save_deblock_boundary_lines(
   1356    const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, int plane, int row,
   1357    int stripe, int use_highbd, int is_above,
   1358    RestorationStripeBoundaries *boundaries) {
   1359  const int is_uv = plane > 0;
   1360  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
   1361  const int src_stride = frame->strides[is_uv] << use_highbd;
   1362  const uint8_t *src_rows = src_buf + row * (ptrdiff_t)src_stride;
   1363 
   1364  uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
   1365                               : boundaries->stripe_boundary_below;
   1366  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
   1367  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
   1368  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
   1369 
   1370  // There is a rare case in which a processing stripe can end 1px above the
   1371  // crop border. In this case, we do want to use deblocked pixels from below
   1372  // the stripe (hence why we ended up in this function), but instead of
   1373  // fetching 2 "below" rows we need to fetch one and duplicate it.
   1374  // This is equivalent to clamping the sample locations against the crop border
   1375  const int lines_to_save =
   1376      AOMMIN(RESTORATION_CTX_VERT, frame->crop_heights[is_uv] - row);
   1377  assert(lines_to_save == 1 || lines_to_save == 2);
   1378 
   1379  int upscaled_width;
   1380  int line_bytes;
   1381  if (av1_superres_scaled(cm)) {
   1382    const int ss_x = is_uv && cm->seq_params->subsampling_x;
   1383    upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
   1384    line_bytes = upscaled_width << use_highbd;
   1385    if (use_highbd)
   1386      av1_upscale_normative_rows(
   1387          cm, CONVERT_TO_BYTEPTR(src_rows), frame->strides[is_uv],
   1388          CONVERT_TO_BYTEPTR(bdry_rows), boundaries->stripe_boundary_stride,
   1389          plane, lines_to_save);
   1390    else
   1391      av1_upscale_normative_rows(cm, src_rows, frame->strides[is_uv], bdry_rows,
   1392                                 boundaries->stripe_boundary_stride, plane,
   1393                                 lines_to_save);
   1394  } else {
   1395    upscaled_width = frame->crop_widths[is_uv];
   1396    line_bytes = upscaled_width << use_highbd;
   1397    for (int i = 0; i < lines_to_save; i++) {
   1398      memcpy(bdry_rows + i * bdry_stride, src_rows + i * src_stride,
   1399             line_bytes);
   1400    }
   1401  }
   1402  // If we only saved one line, then copy it into the second line buffer
   1403  if (lines_to_save == 1)
   1404    memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes);
   1405 
   1406  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
   1407               RESTORATION_EXTRA_HORZ, use_highbd);
   1408 }
   1409 
   1410 static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame,
   1411                                     const AV1_COMMON *cm, int plane, int row,
   1412                                     int stripe, int use_highbd, int is_above,
   1413                                     RestorationStripeBoundaries *boundaries) {
   1414  const int is_uv = plane > 0;
   1415  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
   1416  const int src_stride = frame->strides[is_uv] << use_highbd;
   1417  const uint8_t *src_rows = src_buf + row * (ptrdiff_t)src_stride;
   1418 
   1419  uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
   1420                               : boundaries->stripe_boundary_below;
   1421  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
   1422  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
   1423  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
   1424  const int src_width = frame->crop_widths[is_uv];
   1425 
   1426  // At the point where this function is called, we've already applied
   1427  // superres. So we don't need to extend the lines here, we can just
   1428  // pull directly from the topmost row of the upscaled frame.
   1429  const int ss_x = is_uv && cm->seq_params->subsampling_x;
   1430  const int upscaled_width = av1_superres_scaled(cm)
   1431                                 ? (cm->superres_upscaled_width + ss_x) >> ss_x
   1432                                 : src_width;
   1433  const int line_bytes = upscaled_width << use_highbd;
   1434  for (int i = 0; i < RESTORATION_CTX_VERT; i++) {
   1435    // Copy the line at 'src_rows' into both context lines
   1436    memcpy(bdry_rows + i * bdry_stride, src_rows, line_bytes);
   1437  }
   1438  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
   1439               RESTORATION_EXTRA_HORZ, use_highbd);
   1440 }
   1441 
   1442 static void save_boundary_lines(const YV12_BUFFER_CONFIG *frame, int use_highbd,
   1443                                int plane, AV1_COMMON *cm, int after_cdef) {
   1444  const int is_uv = plane > 0;
   1445  const int ss_y = is_uv && cm->seq_params->subsampling_y;
   1446  const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
   1447  const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;
   1448 
   1449  int plane_w, plane_h;
   1450  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
   1451 
   1452  RestorationStripeBoundaries *boundaries = &cm->rst_info[plane].boundaries;
   1453 
   1454  const int plane_height = ROUND_POWER_OF_TWO(cm->height, ss_y);
   1455 
   1456  int stripe_idx;
   1457  for (stripe_idx = 0;; ++stripe_idx) {
   1458    const int rel_y0 = AOMMAX(0, stripe_idx * stripe_height - stripe_off);
   1459    const int y0 = rel_y0;
   1460    if (y0 >= plane_h) break;
   1461 
   1462    const int rel_y1 = (stripe_idx + 1) * stripe_height - stripe_off;
   1463    const int y1 = AOMMIN(rel_y1, plane_h);
   1464 
   1465    // Extend using CDEF pixels at the top and bottom of the frame,
   1466    // and deblocked pixels at internal stripe boundaries
   1467    const int use_deblock_above = (stripe_idx > 0);
   1468    const int use_deblock_below = (y1 < plane_height);
   1469 
   1470    if (!after_cdef) {
   1471      // Save deblocked context at internal stripe boundaries
   1472      if (use_deblock_above) {
   1473        save_deblock_boundary_lines(frame, cm, plane, y0 - RESTORATION_CTX_VERT,
   1474                                    stripe_idx, use_highbd, 1, boundaries);
   1475      }
   1476      if (use_deblock_below) {
   1477        save_deblock_boundary_lines(frame, cm, plane, y1, stripe_idx,
   1478                                    use_highbd, 0, boundaries);
   1479      }
   1480    } else {
   1481      // Save CDEF context at frame boundaries
   1482      if (!use_deblock_above) {
   1483        save_cdef_boundary_lines(frame, cm, plane, y0, stripe_idx, use_highbd,
   1484                                 1, boundaries);
   1485      }
   1486      if (!use_deblock_below) {
   1487        save_cdef_boundary_lines(frame, cm, plane, y1 - 1, stripe_idx,
   1488                                 use_highbd, 0, boundaries);
   1489      }
   1490    }
   1491  }
   1492 }
   1493 
   1494 // For each RESTORATION_PROC_UNIT_SIZE pixel high stripe, save 4 scan
   1495 // lines to be used as boundary in the loop restoration process. The
   1496 // lines are saved in rst_internal.stripe_boundary_lines
   1497 void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
   1498                                              AV1_COMMON *cm, int after_cdef) {
   1499  const int num_planes = av1_num_planes(cm);
   1500  const int use_highbd = cm->seq_params->use_highbitdepth;
   1501  for (int p = 0; p < num_planes; ++p) {
   1502    save_boundary_lines(frame, use_highbd, p, cm, after_cdef);
   1503  }
   1504 }