tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

pickrst_sve.c (22284B)


      1 /*
      2 * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <arm_neon.h>
     13 #include <arm_sve.h>
     14 #include <assert.h>
     15 #include <string.h>
     16 
     17 #include "config/aom_config.h"
     18 #include "config/av1_rtcd.h"
     19 
     20 #include "aom_dsp/arm/aom_neon_sve_bridge.h"
     21 #include "aom_dsp/arm/mem_neon.h"
     22 #include "aom_dsp/arm/sum_neon.h"
     23 #include "aom_dsp/arm/transpose_neon.h"
     24 #include "av1/common/restoration.h"
     25 #include "av1/encoder/pickrst.h"
     26 #include "av1/encoder/arm/pickrst_sve.h"
     27 
     28 static inline uint8_t find_average_sve(const uint8_t *src, int src_stride,
     29                                       int width, int height) {
     30  uint32x4_t avg_u32 = vdupq_n_u32(0);
     31  uint8x16_t ones = vdupq_n_u8(1);
     32 
     33  // Use a predicate to compute the last columns.
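         // For example, with width == 20, width % 16 == 4, so only the first
         // four byte lanes are active for the predicated tail load below.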
     34  svbool_t pattern = svwhilelt_b8_u32(0, width % 16);
     35 
     36  int h = height;
     37  do {
     38    int j = width;
     39    const uint8_t *src_ptr = src;
     40    while (j >= 16) {
     41      uint8x16_t s = vld1q_u8(src_ptr);
     42      avg_u32 = vdotq_u32(avg_u32, s, ones);
     43 
     44      j -= 16;
     45      src_ptr += 16;
     46    }
     47    uint8x16_t s_end = svget_neonq_u8(svld1_u8(pattern, src_ptr));
     48    avg_u32 = vdotq_u32(avg_u32, s_end, ones);
     49 
     50    src += src_stride;
     51  } while (--h != 0);
     52  return (uint8_t)(vaddlvq_u32(avg_u32) / (width * height));
     53 }
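
        // Roughly equivalent scalar form of find_average_sve, for reference:
        //   uint32_t sum = 0;
        //   for (int i = 0; i < height; i++)
        //     for (int j = 0; j < width; j++) sum += src[i * src_stride + j];
        //   return (uint8_t)(sum / (width * height));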
     54 
     55 static inline void compute_sub_avg(const uint8_t *buf, int buf_stride, int avg,
     56                                   int16_t *buf_avg, int buf_avg_stride,
     57                                   int width, int height,
     58                                   int downsample_factor) {
     59  uint8x8_t avg_u8 = vdup_n_u8(avg);
     60 
     61  // Use a predicate to compute the last columns.
     62  svbool_t pattern = svwhilelt_b8_u32(0, width % 8);
     63 
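         // In the tail, inactive lanes of both the data and the average are
         // zero, so the subtraction below writes zeros into the padding past
         // `width`; the caller rounds the destination stride up to allow this.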
     64  uint8x8_t avg_end = vget_low_u8(svget_neonq_u8(svdup_n_u8_z(pattern, avg)));
     65 
     66  do {
     67    int j = width;
     68    const uint8_t *buf_ptr = buf;
     69    int16_t *buf_avg_ptr = buf_avg;
     70    while (j >= 8) {
     71      uint8x8_t d = vld1_u8(buf_ptr);
     72      vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubl_u8(d, avg_u8)));
     73 
     74      j -= 8;
     75      buf_ptr += 8;
     76      buf_avg_ptr += 8;
     77    }
     78    uint8x8_t d_end = vget_low_u8(svget_neonq_u8(svld1_u8(pattern, buf_ptr)));
     79    vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubl_u8(d_end, avg_end)));
     80 
     81    buf += buf_stride;
     82    buf_avg += buf_avg_stride;
     83    height -= downsample_factor;
     84  } while (height > 0);
     85 }
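
        // Roughly equivalent scalar form, ignoring the zero padding written past
        // `width` in each row:
        //   for (int i = 0; i * downsample_factor < height; i++)
        //     for (int j = 0; j < width; j++)
        //       buf_avg[i * buf_avg_stride + j] =
        //           (int16_t)(buf[i * buf_stride + j] - avg);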
     86 
     87 static inline void copy_upper_triangle(int64_t *H, int64_t *H_tmp,
     88                                       const int wiener_win2, const int scale) {
     89  for (int i = 0; i < wiener_win2 - 2; i = i + 2) {
     90    // Transpose the first 2x2 square. It needs a special case because the
     91    // bottom-left element lies on the diagonal.
     92    int64x2_t row0 = vld1q_s64(H_tmp + i * wiener_win2 + i + 1);
     93    int64x2_t row1 = vld1q_s64(H_tmp + (i + 1) * wiener_win2 + i + 1);
     94 
     95    int64x2_t tr_row = aom_vtrn2q_s64(row0, row1);
     96 
     97    vst1_s64(H_tmp + (i + 1) * wiener_win2 + i, vget_low_s64(row0));
     98    vst1q_s64(H_tmp + (i + 2) * wiener_win2 + i, tr_row);
     99 
    100    // Transpose and store all the remaining 2x2 squares of the line.
    101    for (int j = i + 3; j < wiener_win2; j = j + 2) {
    102      row0 = vld1q_s64(H_tmp + i * wiener_win2 + j);
    103      row1 = vld1q_s64(H_tmp + (i + 1) * wiener_win2 + j);
    104 
    105      int64x2_t tr_row0 = aom_vtrn1q_s64(row0, row1);
    106      int64x2_t tr_row1 = aom_vtrn2q_s64(row0, row1);
    107 
    108      vst1q_s64(H_tmp + j * wiener_win2 + i, tr_row0);
    109      vst1q_s64(H_tmp + (j + 1) * wiener_win2 + i, tr_row1);
    110    }
    111  }
    112  for (int i = 0; i < wiener_win2 * wiener_win2; i++) {
    113    H[i] += H_tmp[i] * scale;
    114  }
    115 }
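
        // Scalar view of the above: for every j > i,
        //   H_tmp[j * wiener_win2 + i] = H_tmp[i * wiener_win2 + j];
        // and then H[k] += H_tmp[k] * scale for every element k.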
    116 
    117 // Transpose the matrix that has just been computed and accumulate it in M.
    118 static inline void acc_transpose_M(int64_t *M, const int64_t *M_trn,
    119                                   const int wiener_win, int scale) {
    120  for (int i = 0; i < wiener_win; ++i) {
    121    for (int j = 0; j < wiener_win; ++j) {
    122      int tr_idx = j * wiener_win + i;
    123      *M++ += (int64_t)(M_trn[tr_idx] * scale);
    124    }
    125  }
    126 }
    127 
    128 // This function computes two matrices: the cross-correlation between the src
    129 // buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H).
    130 //
    131 // M is of size 7 * 7. It needs to be filled such that multiplying one element
    132 // from src with each element of a row of the wiener window will fill one
    133 // column of M. However this is not very convenient in terms of memory
    134 // accesses, as it means we do contiguous loads of dgd but strided stores to M.
    135 // As a result, we use an intermediate matrix M_trn which is instead filled
    136 // such that one row of the wiener window gives one row of M_trn. Once fully
    137 // computed, M_trn is then transposed to return M.
    138 //
    139 // H is of size 49 * 49. It is filled by multiplying every pair of elements of
    140 // the wiener window together. Since it is a symmetric matrix, we only compute
    141 // the upper triangle, and then copy it down to the lower one. Here we fill it
    142 // by taking each different pair of columns, and multiplying all the elements of
    143 // the first one with all the elements of the second one, with a special case
    144 // when multiplying a column by itself.
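        // With wiener_win == 7 the window has 49 taps, so M holds 49 values and H
        // holds 49 * 49 = 2401, of which the 49 * 50 / 2 = 1225 entries on or
        // above the diagonal are computed directly.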
    145 static inline void compute_stats_win7_downsampled_sve(
    146    int16_t *dgd_avg, int dgd_avg_stride, int16_t *src_avg, int src_avg_stride,
    147    int width, int height, int64_t *M, int64_t *H, int downsample_factor) {
    148  const int wiener_win = 7;
    149  const int wiener_win2 = wiener_win * wiener_win;
    150 
    151  // Use a predicate to compute the last columns of the block for H.
    152  svbool_t pattern = svwhilelt_b16_u32(0, width % 8);
    153 
    154  // Use intermediate matrices for H and M to perform the computation, they
    155  // will be accumulated into the original H and M at the end.
    156  int64_t M_trn[49];
    157  memset(M_trn, 0, sizeof(M_trn));
    158 
    159  int64_t H_tmp[49 * 49];
    160  memset(H_tmp, 0, sizeof(H_tmp));
    161 
    162  assert(height > 0);
    163  do {
    164    // Cross-correlation (M).
    165    for (int row = 0; row < wiener_win; row++) {
    166      int j = 0;
    167      while (j < width) {
    168        int16x8_t dgd[7];
    169        load_s16_8x7(dgd_avg + row * dgd_avg_stride + j, 1, &dgd[0], &dgd[1],
    170                     &dgd[2], &dgd[3], &dgd[4], &dgd[5], &dgd[6]);
    171        int16x8_t s = vld1q_s16(src_avg + j);
    172 
    173        // Compute all the elements of one row of M.
    174        compute_M_one_row_win7(s, dgd, M_trn, row);
    175 
    176        j += 8;
    177      }
    178    }
    179 
    180    // Auto-covariance (H).
    181    int j = 0;
    182    while (j <= width - 8) {
    183      for (int col0 = 0; col0 < wiener_win; col0++) {
    184        int16x8_t dgd0[7];
    185        load_s16_8x7(dgd_avg + j + col0, dgd_avg_stride, &dgd0[0], &dgd0[1],
    186                     &dgd0[2], &dgd0[3], &dgd0[4], &dgd0[5], &dgd0[6]);
    187 
    188        // Perform computation of the first column with itself (28 elements).
    189        // For the first column this will fill the upper triangle of the 7x7
    190        // matrix at the top left of the H matrix. For the next columns this
    191        // will fill the upper triangle of the other 7x7 matrices around H's
    192        // diagonal.
    193        compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);
    194 
    195        // All computation next to the matrix diagonal has already been done.
    196        for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
    197          // Load second column.
    198          int16x8_t dgd1[7];
    199          load_s16_8x7(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
    200                       &dgd1[2], &dgd1[3], &dgd1[4], &dgd1[5], &dgd1[6]);
    201 
    202          // Compute all elements from the combination of both columns (49
    203          // elements).
    204          compute_H_two_rows_win7(dgd0, dgd1, col0, col1, H_tmp);
    205        }
    206      }
    207      j += 8;
    208    }
    209 
    210    if (j < width) {
    211      // Process remaining columns using a predicate to discard excess elements.
    212      for (int col0 = 0; col0 < wiener_win; col0++) {
    213        // Load first column.
    214        int16x8_t dgd0[7];
    215        dgd0[0] = svget_neonq_s16(
    216            svld1_s16(pattern, dgd_avg + 0 * dgd_avg_stride + j + col0));
    217        dgd0[1] = svget_neonq_s16(
    218            svld1_s16(pattern, dgd_avg + 1 * dgd_avg_stride + j + col0));
    219        dgd0[2] = svget_neonq_s16(
    220            svld1_s16(pattern, dgd_avg + 2 * dgd_avg_stride + j + col0));
    221        dgd0[3] = svget_neonq_s16(
    222            svld1_s16(pattern, dgd_avg + 3 * dgd_avg_stride + j + col0));
    223        dgd0[4] = svget_neonq_s16(
    224            svld1_s16(pattern, dgd_avg + 4 * dgd_avg_stride + j + col0));
    225        dgd0[5] = svget_neonq_s16(
    226            svld1_s16(pattern, dgd_avg + 5 * dgd_avg_stride + j + col0));
    227        dgd0[6] = svget_neonq_s16(
    228            svld1_s16(pattern, dgd_avg + 6 * dgd_avg_stride + j + col0));
    229 
    230        // Perform computation of the first column with itself (28 elements).
    231        // For the first column this will fill the upper triangle of the 7x7
    232        // matrix at the top left of the H matrix. For the next columns this
    233        // will fill the upper triangle of the other 7x7 matrices around H's
    234        // diagonal.
    235        compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);
    236 
    237        // All computation next to the matrix diagonal has already been done.
    238        for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
    239          // Load second column.
    240          int16x8_t dgd1[7];
    241          load_s16_8x7(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
    242                       &dgd1[2], &dgd1[3], &dgd1[4], &dgd1[5], &dgd1[6]);
    243 
    244          // Compute all elements from the combination of both columns (49
    245          // elements).
    246          compute_H_two_rows_win7(dgd0, dgd1, col0, col1, H_tmp);
    247        }
    248      }
    249    }
    250    dgd_avg += downsample_factor * dgd_avg_stride;
    251    src_avg += src_avg_stride;
    252  } while (--height != 0);
    253 
    254  // Transpose M_trn and accumulate it into M, scaled by downsample_factor.
    255  acc_transpose_M(M, M_trn, 7, downsample_factor);
    256 
    257  // Mirror the upper triangle of H_tmp into its lower one, then accumulate into H.
    258  copy_upper_triangle(H, H_tmp, wiener_win2, downsample_factor);
    259 }
    260 
    261 // This function computes two matrices: the cross-correlation between the src
    262 // buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H).
    263 //
    264 // M is of size 5 * 5. It needs to be filled such that multiplying one element
    265 // from src with each element of a row of the wiener window will fill one
    266 // column of M. However this is not very convenient in terms of memory
    267 // accesses, as it means we do contiguous loads of dgd but strided stores to M.
    268 // As a result, we use an intermediate matrix M_trn which is instead filled
    269 // such that one row of the wiener window gives one row of M_trn. Once fully
    270 // computed, M_trn is then transposed to return M.
    271 //
    272 // H is of size 25 * 25. It is filled by multiplying every pair of elements of
    273 // the wiener window together. Since it is a symmetric matrix, we only compute
    274 // the upper triangle, and then copy it down to the lower one. Here we fill it
    275 // by taking each different pair of columns, and multiplying all the elements of
    276 // the first one with all the elements of the second one, with a special case
    277 // when multiplying a column by itself.
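        // With wiener_win == 5 the window has 25 taps, so M holds 25 values and H
        // holds 25 * 25 = 625, of which the 25 * 26 / 2 = 325 entries on or above
        // the diagonal are computed directly.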
    278 static inline void compute_stats_win5_downsampled_sve(
    279    int16_t *dgd_avg, int dgd_avg_stride, int16_t *src_avg, int src_avg_stride,
    280    int width, int height, int64_t *M, int64_t *H, int downsample_factor) {
    281  const int wiener_win = 5;
    282  const int wiener_win2 = wiener_win * wiener_win;
    283 
    284  // Use a predicate to compute the last columns of the block for H.
    285  svbool_t pattern = svwhilelt_b16_u32(0, width % 8);
    286 
    287  // Use intermediate matrices for H and M to perform the computation, they
    288  // will be accumulated into the original H and M at the end.
    289  int64_t M_trn[25];
    290  memset(M_trn, 0, sizeof(M_trn));
    291 
    292  int64_t H_tmp[25 * 25];
    293  memset(H_tmp, 0, sizeof(H_tmp));
    294 
    295  assert(height > 0);
    296  do {
    297    // Cross-correlation (M).
    298    for (int row = 0; row < wiener_win; row++) {
    299      int j = 0;
    300      while (j < width) {
    301        int16x8_t dgd[5];
    302        load_s16_8x5(dgd_avg + row * dgd_avg_stride + j, 1, &dgd[0], &dgd[1],
    303                     &dgd[2], &dgd[3], &dgd[4]);
    304        int16x8_t s = vld1q_s16(src_avg + j);
    305 
    306        // Compute all the elements of one row of M.
    307        compute_M_one_row_win5(s, dgd, M_trn, row);
    308 
    309        j += 8;
    310      }
    311    }
    312 
    313    // Auto-covariance (H).
    314    int j = 0;
    315    while (j <= width - 8) {
    316      for (int col0 = 0; col0 < wiener_win; col0++) {
    317        // Load first column.
    318        int16x8_t dgd0[5];
    319        load_s16_8x5(dgd_avg + j + col0, dgd_avg_stride, &dgd0[0], &dgd0[1],
    320                     &dgd0[2], &dgd0[3], &dgd0[4]);
    321 
    322        // Perform computation of the first column with itself (15 elements).
    323        // For the first column this will fill the upper triangle of the 5x5
    324        // matrix at the top left of the H matrix. For the next columns this
    325        // will fill the upper triangle of the other 5x5 matrices around H's
    326        // diagonal.
    327        compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);
    328 
    329        // All computation next to the matrix diagonal has already been done.
    330        for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
    331          // Load second column.
    332          int16x8_t dgd1[5];
    333          load_s16_8x5(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
    334                       &dgd1[2], &dgd1[3], &dgd1[4]);
    335 
    336          // Compute all elements from the combination of both columns (25
    337          // elements).
    338          compute_H_two_rows_win5(dgd0, dgd1, col0, col1, H_tmp);
    339        }
    340      }
    341      j += 8;
    342    }
    343 
    344    // Process remaining columns using a predicate to discard excess elements.
    345    if (j < width) {
    346      for (int col0 = 0; col0 < wiener_win; col0++) {
    347        int16x8_t dgd0[5];
    348        dgd0[0] = svget_neonq_s16(
    349            svld1_s16(pattern, dgd_avg + 0 * dgd_avg_stride + j + col0));
    350        dgd0[1] = svget_neonq_s16(
    351            svld1_s16(pattern, dgd_avg + 1 * dgd_avg_stride + j + col0));
    352        dgd0[2] = svget_neonq_s16(
    353            svld1_s16(pattern, dgd_avg + 2 * dgd_avg_stride + j + col0));
    354        dgd0[3] = svget_neonq_s16(
    355            svld1_s16(pattern, dgd_avg + 3 * dgd_avg_stride + j + col0));
    356        dgd0[4] = svget_neonq_s16(
    357            svld1_s16(pattern, dgd_avg + 4 * dgd_avg_stride + j + col0));
    358 
    359        // Perform computation of the first column with itself (15 elements).
    360        // For the first column this will fill the upper triangle of the 5x5
    361        // matrix at the top left of the H matrix. For the next columns this
    362        // will fill the upper triangle of the other 5x5 matrices around H's
    363        // diagonal.
    364        compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);
    365 
    366        // All computation next to the matrix diagonal has already been done.
    367        for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
    368          // Load second column.
    369          int16x8_t dgd1[5];
    370          load_s16_8x5(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
    371                       &dgd1[2], &dgd1[3], &dgd1[4]);
    372 
    373          // Compute all elements from the combination of both columns (25
    374          // elements).
    375          compute_H_two_rows_win5(dgd0, dgd1, col0, col1, H_tmp);
    376        }
    377      }
    378    }
    379    dgd_avg += downsample_factor * dgd_avg_stride;
    380    src_avg += src_avg_stride;
    381  } while (--height != 0);
    382 
    383  // Transpose M_trn and accumulate it into M, scaled by downsample_factor.
    384  acc_transpose_M(M, M_trn, 5, downsample_factor);
    385 
    386  // Mirror the upper triangle of H_tmp into its lower one, then accumulate into H.
    387  copy_upper_triangle(H, H_tmp, wiener_win2, downsample_factor);
    388 }
    389 
    390 static inline void av1_compute_stats_downsampled_sve(
    391    int wiener_win, const uint8_t *dgd, const uint8_t *src, int16_t *dgd_avg,
    392    int16_t *src_avg, int h_start, int h_end, int v_start, int v_end,
    393    int dgd_stride, int src_stride, int64_t *M, int64_t *H) {
    394  assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA);
    395 
    396  const int wiener_win2 = wiener_win * wiener_win;
    397  const int wiener_halfwin = wiener_win >> 1;
    398  const int32_t width = h_end - h_start;
    399  const int32_t height = v_end - v_start;
    400  const uint8_t *dgd_start = &dgd[v_start * dgd_stride + h_start];
    401  memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
    402  memset(M, 0, sizeof(*M) * wiener_win * wiener_win);
    403 
    404  const uint8_t avg = find_average_sve(dgd_start, dgd_stride, width, height);
    405  const int downsample_factor = WIENER_STATS_DOWNSAMPLE_FACTOR;
    406 
    407  // dgd_avg and src_avg have been memset to zero before calling this
    408  // function, so round up the stride to the next multiple of 8 so that we
    409  // don't have to worry about a tail loop when computing M.
    410  const int dgd_avg_stride = ((width + 2 * wiener_halfwin) & ~7) + 8;
    411  const int src_avg_stride = (width & ~7) + 8;
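         // For example, width == 64 with wiener_win == 7 (wiener_halfwin == 3)
         // gives dgd_avg_stride == ((64 + 6) & ~7) + 8 == 72 and
         // src_avg_stride == (64 & ~7) + 8 == 72.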
    412 
    413  // Compute (dgd - avg) and store it in dgd_avg.
    414  // The wiener window will slide along the dgd frame, centered on each pixel.
    415  // For the top left pixel and all the pixels on the side of the frame this
    416  // means half of the window will be outside of the frame. As such the actual
    417  // buffer that we need to subtract the avg from will be 2 * wiener_halfwin
    418  // wider and 2 * wiener_halfwin higher than the original dgd buffer.
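         // For wiener_win == 7 (wiener_halfwin == 3) this is a
         // (width + 6) x (height + 6) region starting three rows above and three
         // columns to the left of dgd_start.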
    419  const int vert_offset = v_start - wiener_halfwin;
    420  const int horiz_offset = h_start - wiener_halfwin;
    421  const uint8_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride;
    422  compute_sub_avg(dgd_win, dgd_stride, avg, dgd_avg, dgd_avg_stride,
    423                  width + 2 * wiener_halfwin, height + 2 * wiener_halfwin, 1);
    424 
    425  // Compute (src - avg), downsample, and store in src_avg.
    426  const uint8_t *src_start = src + h_start + v_start * src_stride;
    427  compute_sub_avg(src_start, src_stride * downsample_factor, avg, src_avg,
    428                  src_avg_stride, width, height, downsample_factor);
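         // Passing src_stride * downsample_factor makes compute_sub_avg advance
         // by downsample_factor source rows per output row, so only every
         // downsample_factor-th row of src contributes to src_avg.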
    429 
    430  const int downsample_height = height / downsample_factor;
    431 
    432  // Since the height is not necessarily a multiple of the downsample factor,
    433  // the last line of src will be scaled according to how many rows remain.
    434  const int downsample_remainder = height % downsample_factor;
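         // For example, height == 21 with a downsample factor of 4 gives
         // downsample_height == 5 and downsample_remainder == 1, so one extra
         // row of statistics is accumulated below with a scale of 1.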
    435 
    436  if (downsample_height > 0) {
    437    if (wiener_win == WIENER_WIN) {
    438      compute_stats_win7_downsampled_sve(
    439          dgd_avg, dgd_avg_stride, src_avg, src_avg_stride, width,
    440          downsample_height, M, H, downsample_factor);
    441    } else {
    442      compute_stats_win5_downsampled_sve(
    443          dgd_avg, dgd_avg_stride, src_avg, src_avg_stride, width,
    444          downsample_height, M, H, downsample_factor);
    445    }
    446  }
    447 
    448  if (downsample_remainder > 0) {
    449    const int remainder_offset = height - downsample_remainder;
    450    if (wiener_win == WIENER_WIN) {
    451      compute_stats_win7_downsampled_sve(
    452          dgd_avg + remainder_offset * dgd_avg_stride, dgd_avg_stride,
    453          src_avg + downsample_height * src_avg_stride, src_avg_stride, width,
    454          1, M, H, downsample_remainder);
    455    } else {
    456      compute_stats_win5_downsampled_sve(
    457          dgd_avg + remainder_offset * dgd_avg_stride, dgd_avg_stride,
    458          src_avg + downsample_height * src_avg_stride, src_avg_stride, width,
    459          1, M, H, downsample_remainder);
    460    }
    461  }
    462 }
    463 
    464 void av1_compute_stats_sve(int wiener_win, const uint8_t *dgd,
    465                           const uint8_t *src, int16_t *dgd_avg,
    466                           int16_t *src_avg, int h_start, int h_end,
    467                           int v_start, int v_end, int dgd_stride,
    468                           int src_stride, int64_t *M, int64_t *H,
    469                           int use_downsampled_wiener_stats) {
    470  assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA);
    471 
    472  if (use_downsampled_wiener_stats) {
    473    av1_compute_stats_downsampled_sve(wiener_win, dgd, src, dgd_avg, src_avg,
    474                                      h_start, h_end, v_start, v_end,
    475                                      dgd_stride, src_stride, M, H);
    476    return;
    477  }
    478 
    479  const int wiener_win2 = wiener_win * wiener_win;
    480  const int wiener_halfwin = wiener_win >> 1;
    481  const int32_t width = h_end - h_start;
    482  const int32_t height = v_end - v_start;
    483  const uint8_t *dgd_start = &dgd[v_start * dgd_stride + h_start];
    484  memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
    485  memset(M, 0, sizeof(*M) * wiener_win * wiener_win);
    486 
    487  const uint8_t avg = find_average_sve(dgd_start, dgd_stride, width, height);
    488 
    489  // dgd_avg and src_avg have been memset to zero before calling this
    490  // function, so round up the stride to the next multiple of 8 so that we
    491  // don't have to worry about a tail loop when computing M.
    492  const int dgd_avg_stride = ((width + 2 * wiener_halfwin) & ~7) + 8;
    493  const int src_avg_stride = (width & ~7) + 8;
    494 
    495  // Compute (dgd - avg) and store it in dgd_avg.
    496  // The wiener window will slide along the dgd frame, centered on each pixel.
    497  // For the top left pixel and all the pixels on the side of the frame this
    498  // means half of the window will be outside of the frame. As such the actual
    499  // buffer that we need to subtract the avg from will be 2 * wiener_halfwin
    500  // wider and 2 * wiener_halfwin higher than the original dgd buffer.
    501  const int vert_offset = v_start - wiener_halfwin;
    502  const int horiz_offset = h_start - wiener_halfwin;
    503  const uint8_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride;
    504  compute_sub_avg(dgd_win, dgd_stride, avg, dgd_avg, dgd_avg_stride,
    505                  width + 2 * wiener_halfwin, height + 2 * wiener_halfwin, 1);
    506 
    507  // Compute (src - avg) and store in src_avg.
    508  const uint8_t *src_start = src + h_start + v_start * src_stride;
    509  compute_sub_avg(src_start, src_stride, avg, src_avg, src_avg_stride, width,
    510                  height, 1);
    511 
    512  if (wiener_win == WIENER_WIN) {
    513    compute_stats_win7_sve(dgd_avg, dgd_avg_stride, src_avg, src_avg_stride,
    514                           width, height, M, H);
    515  } else {
    516    compute_stats_win5_sve(dgd_avg, dgd_avg_stride, src_avg, src_avg_stride,
    517                           width, height, M, H);
    518  }
    519 
    520  // H is a symmetric matrix, so we only need to fill out the upper triangle.
    521  // We can copy it down to the lower triangle outside the (i, j) loops.
    522  diagonal_copy_stats_neon(wiener_win2, H);
    523 }