pickrst_sve.c (22284B)
/*
 * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <arm_sve.h>
#include <assert.h>
#include <string.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "aom_dsp/arm/aom_neon_sve_bridge.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "av1/common/restoration.h"
#include "av1/encoder/pickrst.h"
#include "av1/encoder/arm/pickrst_sve.h"

// Return the (truncated) average pixel value of a width x height block of
// src. The sum is accumulated with unsigned 8-bit dot products against a
// vector of ones, 16 pixels at a time.
static inline uint8_t find_average_sve(const uint8_t *src, int src_stride,
                                       int width, int height) {
  uint32x4_t avg_u32 = vdupq_n_u32(0);
  uint8x16_t ones = vdupq_n_u8(1);

  // Use a predicate to compute the last columns. When width is a multiple of
  // 16 the predicate is all-false, so the extra predicated load below yields
  // zeros and contributes nothing to the sum.
  svbool_t pattern = svwhilelt_b8_u32(0, width % 16);

  int h = height;
  do {
    int j = width;
    const uint8_t *src_ptr = src;
    while (j >= 16) {
      uint8x16_t s = vld1q_u8(src_ptr);
      avg_u32 = vdotq_u32(avg_u32, s, ones);

      j -= 16;
      src_ptr += 16;
    }
    uint8x16_t s_end = svget_neonq_u8(svld1_u8(pattern, src_ptr));
    avg_u32 = vdotq_u32(avg_u32, s_end, ones);

    src += src_stride;
  } while (--h != 0);
  // Widening reduction of the four 32-bit lanes into 64 bits before dividing
  // by the pixel count.
  return (uint8_t)(vaddlvq_u32(avg_u32) / (width * height));
}

// Compute (buf[i] - avg) widened to int16 for a width x height block and
// store it row by row into buf_avg. The predicate handles the width % 8 tail
// (excess output lanes are written as zeros, which fits within the padded
// buf_avg_stride chosen by the callers). height is decremented by
// downsample_factor per stored row while buf advances by one buf_stride, so
// callers implement row skipping by pre-scaling buf_stride (see the
// downsampled caller below, which passes src_stride * downsample_factor).
static inline void compute_sub_avg(const uint8_t *buf, int buf_stride, int avg,
                                   int16_t *buf_avg, int buf_avg_stride,
                                   int width, int height,
                                   int downsample_factor) {
  uint8x8_t avg_u8 = vdup_n_u8(avg);

  // Use a predicate to compute the last columns.
  svbool_t pattern = svwhilelt_b8_u32(0, width % 8);

  // avg replicated only in the active lanes, so inactive lanes compute
  // 0 - 0 = 0 in the tail subtraction below.
  uint8x8_t avg_end = vget_low_u8(svget_neonq_u8(svdup_n_u8_z(pattern, avg)));

  do {
    int j = width;
    const uint8_t *buf_ptr = buf;
    int16_t *buf_avg_ptr = buf_avg;
    while (j >= 8) {
      uint8x8_t d = vld1_u8(buf_ptr);
      vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubl_u8(d, avg_u8)));

      j -= 8;
      buf_ptr += 8;
      buf_avg_ptr += 8;
    }
    uint8x8_t d_end = vget_low_u8(svget_neonq_u8(svld1_u8(pattern, buf_ptr)));
    vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubl_u8(d_end, avg_end)));

    buf += buf_stride;
    buf_avg += buf_avg_stride;
    height -= downsample_factor;
  } while (height > 0);
}

// Mirror the upper triangle of the wiener_win2 x wiener_win2 matrix H_tmp
// into its lower triangle by transposing 2x2 tiles, then accumulate
// H_tmp * scale into H element-wise.
static inline void copy_upper_triangle(int64_t *H, int64_t *H_tmp,
                                       const int wiener_win2, const int scale) {
  for (int i = 0; i < wiener_win2 - 2; i = i + 2) {
    // Transpose the first 2x2 square. It needs a special case as the element
    // of the bottom left is on the diagonal.
    int64x2_t row0 = vld1q_s64(H_tmp + i * wiener_win2 + i + 1);
    int64x2_t row1 = vld1q_s64(H_tmp + (i + 1) * wiener_win2 + i + 1);

    int64x2_t tr_row = aom_vtrn2q_s64(row0, row1);

    vst1_s64(H_tmp + (i + 1) * wiener_win2 + i, vget_low_s64(row0));
    vst1q_s64(H_tmp + (i + 2) * wiener_win2 + i, tr_row);

    // Transpose and store all the remaining 2x2 squares of the line.
    for (int j = i + 3; j < wiener_win2; j = j + 2) {
      row0 = vld1q_s64(H_tmp + i * wiener_win2 + j);
      row1 = vld1q_s64(H_tmp + (i + 1) * wiener_win2 + j);

      int64x2_t tr_row0 = aom_vtrn1q_s64(row0, row1);
      int64x2_t tr_row1 = aom_vtrn2q_s64(row0, row1);

      vst1q_s64(H_tmp + j * wiener_win2 + i, tr_row0);
      vst1q_s64(H_tmp + (j + 1) * wiener_win2 + i, tr_row1);
    }
  }
  // Accumulate the now-symmetric temporary matrix into H, scaled by the
  // downsampling factor.
  for (int i = 0; i < wiener_win2 * wiener_win2; i++) {
    H[i] += H_tmp[i] * scale;
  }
}

// Transpose the matrix that has just been computed and accumulate it in M:
// M[i][j] += M_trn[j][i] * scale, for a wiener_win x wiener_win matrix.
static inline void acc_transpose_M(int64_t *M, const int64_t *M_trn,
                                   const int wiener_win, int scale) {
  for (int i = 0; i < wiener_win; ++i) {
    for (int j = 0; j < wiener_win; ++j) {
      int tr_idx = j * wiener_win + i;
      *M++ += (int64_t)(M_trn[tr_idx] * scale);
    }
  }
}

// This function computes two matrices: the cross-correlation between the src
// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H).
//
// M is of size 7 * 7. It needs to be filled such that multiplying one element
// from src with each element of a row of the wiener window will fill one
// column of M. However this is not very convenient in terms of memory
// accesses, as it means we do contiguous loads of dgd but strided stores to M.
// As a result, we use an intermediate matrix M_trn which is instead filled
// such that one row of the wiener window gives one row of M_trn. Once fully
// computed, M_trn is then transposed to return M.
//
// H is of size 49 * 49. It is filled by multiplying every pair of elements of
// the wiener window together. Since it is a symmetric matrix, we only compute
// the upper triangle, and then copy it down to the lower one. Here we fill it
// by taking each different pair of columns, and multiplying all the elements of
// the first one with all the elements of the second one, with a special case
// when multiplying a column by itself.
static inline void compute_stats_win7_downsampled_sve(
    int16_t *dgd_avg, int dgd_avg_stride, int16_t *src_avg, int src_avg_stride,
    int width, int height, int64_t *M, int64_t *H, int downsample_factor) {
  const int wiener_win = 7;
  const int wiener_win2 = wiener_win * wiener_win;

  // Use a predicate to compute the last columns of the block for H.
  svbool_t pattern = svwhilelt_b16_u32(0, width % 8);

  // Use intermediate matrices for H and M to perform the computation, they
  // will be accumulated into the original H and M at the end.
  int64_t M_trn[49];
  memset(M_trn, 0, sizeof(M_trn));

  int64_t H_tmp[49 * 49];
  memset(H_tmp, 0, sizeof(H_tmp));

  assert(height > 0);
  do {
    // Cross-correlation (M).
    for (int row = 0; row < wiener_win; row++) {
      int j = 0;
      while (j < width) {
        int16x8_t dgd[7];
        // No tail loop needed here: the caller pads the strides to a multiple
        // of 8, so reading past width stays inside the zero-filled padding.
        load_s16_8x7(dgd_avg + row * dgd_avg_stride + j, 1, &dgd[0], &dgd[1],
                     &dgd[2], &dgd[3], &dgd[4], &dgd[5], &dgd[6]);
        int16x8_t s = vld1q_s16(src_avg + j);

        // Compute all the elements of one row of M.
        compute_M_one_row_win7(s, dgd, M_trn, row);

        j += 8;
      }
    }

    // Auto-covariance (H).
    int j = 0;
    while (j <= width - 8) {
      for (int col0 = 0; col0 < wiener_win; col0++) {
        int16x8_t dgd0[7];
        load_s16_8x7(dgd_avg + j + col0, dgd_avg_stride, &dgd0[0], &dgd0[1],
                     &dgd0[2], &dgd0[3], &dgd0[4], &dgd0[5], &dgd0[6]);

        // Perform computation of the first column with itself (28 elements).
        // For the first column this will fill the upper triangle of the 7x7
        // matrix at the top left of the H matrix. For the next columns this
        // will fill the upper triangle of the other 7x7 matrices around H's
        // diagonal.
        compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);

        // All computation next to the matrix diagonal has already been done.
        for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
          // Load second column and scale based on downsampling factor.
          int16x8_t dgd1[7];
          load_s16_8x7(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
                       &dgd1[2], &dgd1[3], &dgd1[4], &dgd1[5], &dgd1[6]);

          // Compute all elements from the combination of both columns (49
          // elements).
          compute_H_two_rows_win7(dgd0, dgd1, col0, col1, H_tmp);
        }
      }
      j += 8;
    }

    if (j < width) {
      // Process remaining columns using a predicate to discard excess elements.
      for (int col0 = 0; col0 < wiener_win; col0++) {
        // Load first column.
        int16x8_t dgd0[7];
        dgd0[0] = svget_neonq_s16(
            svld1_s16(pattern, dgd_avg + 0 * dgd_avg_stride + j + col0));
        dgd0[1] = svget_neonq_s16(
            svld1_s16(pattern, dgd_avg + 1 * dgd_avg_stride + j + col0));
        dgd0[2] = svget_neonq_s16(
            svld1_s16(pattern, dgd_avg + 2 * dgd_avg_stride + j + col0));
        dgd0[3] = svget_neonq_s16(
            svld1_s16(pattern, dgd_avg + 3 * dgd_avg_stride + j + col0));
        dgd0[4] = svget_neonq_s16(
            svld1_s16(pattern, dgd_avg + 4 * dgd_avg_stride + j + col0));
        dgd0[5] = svget_neonq_s16(
            svld1_s16(pattern, dgd_avg + 5 * dgd_avg_stride + j + col0));
        dgd0[6] = svget_neonq_s16(
            svld1_s16(pattern, dgd_avg + 6 * dgd_avg_stride + j + col0));

        // Perform computation of the first column with itself (28 elements).
        // For the first column this will fill the upper triangle of the 7x7
        // matrix at the top left of the H matrix. For the next columns this
        // will fill the upper triangle of the other 7x7 matrices around H's
        // diagonal.
        compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);

        // All computation next to the matrix diagonal has already been done.
        for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
          // Load second column and scale based on downsampling factor.
          // NOTE(review): dgd1 is loaded without a predicate here; presumably
          // the excess lanes are harmless because the matching dgd0 lanes are
          // zero — confirm against compute_H_two_rows_win7's definition.
          int16x8_t dgd1[7];
          load_s16_8x7(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
                       &dgd1[2], &dgd1[3], &dgd1[4], &dgd1[5], &dgd1[6]);

          // Compute all elements from the combination of both columns (49
          // elements).
          compute_H_two_rows_win7(dgd0, dgd1, col0, col1, H_tmp);
        }
      }
    }
    dgd_avg += downsample_factor * dgd_avg_stride;
    src_avg += src_avg_stride;
  } while (--height != 0);

  // Transpose M_trn.
  acc_transpose_M(M, M_trn, 7, downsample_factor);

  // Copy upper triangle of H in the lower one.
  copy_upper_triangle(H, H_tmp, wiener_win2, downsample_factor);
}

// This function computes two matrices: the cross-correlation between the src
// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H).
//
// M is of size 5 * 5. It needs to be filled such that multiplying one element
// from src with each element of a row of the wiener window will fill one
// column of M. However this is not very convenient in terms of memory
// accesses, as it means we do contiguous loads of dgd but strided stores to M.
// As a result, we use an intermediate matrix M_trn which is instead filled
// such that one row of the wiener window gives one row of M_trn. Once fully
// computed, M_trn is then transposed to return M.
//
// H is of size 25 * 25. It is filled by multiplying every pair of elements of
// the wiener window together. Since it is a symmetric matrix, we only compute
// the upper triangle, and then copy it down to the lower one. Here we fill it
// by taking each different pair of columns, and multiplying all the elements of
// the first one with all the elements of the second one, with a special case
// when multiplying a column by itself.
static inline void compute_stats_win5_downsampled_sve(
    int16_t *dgd_avg, int dgd_avg_stride, int16_t *src_avg, int src_avg_stride,
    int width, int height, int64_t *M, int64_t *H, int downsample_factor) {
  const int wiener_win = 5;
  const int wiener_win2 = wiener_win * wiener_win;

  // Use a predicate to compute the last columns of the block for H.
  svbool_t pattern = svwhilelt_b16_u32(0, width % 8);

  // Use intermediate matrices for H and M to perform the computation, they
  // will be accumulated into the original H and M at the end.
  int64_t M_trn[25];
  memset(M_trn, 0, sizeof(M_trn));

  int64_t H_tmp[25 * 25];
  memset(H_tmp, 0, sizeof(H_tmp));

  assert(height > 0);
  do {
    // Cross-correlation (M).
    for (int row = 0; row < wiener_win; row++) {
      int j = 0;
      while (j < width) {
        int16x8_t dgd[5];
        load_s16_8x5(dgd_avg + row * dgd_avg_stride + j, 1, &dgd[0], &dgd[1],
                     &dgd[2], &dgd[3], &dgd[4]);
        int16x8_t s = vld1q_s16(src_avg + j);

        // Compute all the elements of one row of M.
        compute_M_one_row_win5(s, dgd, M_trn, row);

        j += 8;
      }
    }

    // Auto-covariance (H).
    int j = 0;
    while (j <= width - 8) {
      for (int col0 = 0; col0 < wiener_win; col0++) {
        // Load first column.
        int16x8_t dgd0[5];
        load_s16_8x5(dgd_avg + j + col0, dgd_avg_stride, &dgd0[0], &dgd0[1],
                     &dgd0[2], &dgd0[3], &dgd0[4]);

        // Perform computation of the first column with itself (15 elements).
        // For the first column this will fill the upper triangle of the 5x5
        // matrix at the top left of the H matrix. For the next columns this
        // will fill the upper triangle of the other 5x5 matrices around H's
        // diagonal.
        compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);

        // All computation next to the matrix diagonal has already been done.
        for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
          // Load second column and scale based on downsampling factor.
          int16x8_t dgd1[5];
          load_s16_8x5(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
                       &dgd1[2], &dgd1[3], &dgd1[4]);

          // Compute all elements from the combination of both columns (25
          // elements).
          compute_H_two_rows_win5(dgd0, dgd1, col0, col1, H_tmp);
        }
      }
      j += 8;
    }

    // Process remaining columns using a predicate to discard excess elements.
    if (j < width) {
      for (int col0 = 0; col0 < wiener_win; col0++) {
        int16x8_t dgd0[5];
        dgd0[0] = svget_neonq_s16(
            svld1_s16(pattern, dgd_avg + 0 * dgd_avg_stride + j + col0));
        dgd0[1] = svget_neonq_s16(
            svld1_s16(pattern, dgd_avg + 1 * dgd_avg_stride + j + col0));
        dgd0[2] = svget_neonq_s16(
            svld1_s16(pattern, dgd_avg + 2 * dgd_avg_stride + j + col0));
        dgd0[3] = svget_neonq_s16(
            svld1_s16(pattern, dgd_avg + 3 * dgd_avg_stride + j + col0));
        dgd0[4] = svget_neonq_s16(
            svld1_s16(pattern, dgd_avg + 4 * dgd_avg_stride + j + col0));

        // Perform computation of the first column with itself (15 elements).
        // For the first column this will fill the upper triangle of the 5x5
        // matrix at the top left of the H matrix. For the next columns this
        // will fill the upper triangle of the other 5x5 matrices around H's
        // diagonal.
        compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);

        // All computation next to the matrix diagonal has already been done.
        for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
          // Load second column and scale based on downsampling factor.
          // NOTE(review): dgd1 is loaded without a predicate here; presumably
          // the excess lanes are harmless because the matching dgd0 lanes are
          // zero — confirm against compute_H_two_rows_win5's definition.
          int16x8_t dgd1[5];
          load_s16_8x5(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
                       &dgd1[2], &dgd1[3], &dgd1[4]);

          // Compute all elements from the combination of both columns (25
          // elements).
          compute_H_two_rows_win5(dgd0, dgd1, col0, col1, H_tmp);
        }
      }
    }
    dgd_avg += downsample_factor * dgd_avg_stride;
    src_avg += src_avg_stride;
  } while (--height != 0);

  // Transpose M_trn.
  acc_transpose_M(M, M_trn, 5, downsample_factor);

  // Copy upper triangle of H in the lower one.
  copy_upper_triangle(H, H_tmp, wiener_win2, downsample_factor);
}

// Downsampled variant of av1_compute_stats_sve: only every
// WIENER_STATS_DOWNSAMPLE_FACTOR-th source row contributes statistics, each
// scaled up by the factor; any leftover rows at the bottom are handled by a
// final single-row pass scaled by the remainder count.
static inline void av1_compute_stats_downsampled_sve(
    int wiener_win, const uint8_t *dgd, const uint8_t *src, int16_t *dgd_avg,
    int16_t *src_avg, int h_start, int h_end, int v_start, int v_end,
    int dgd_stride, int src_stride, int64_t *M, int64_t *H) {
  assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA);

  const int wiener_win2 = wiener_win * wiener_win;
  const int wiener_halfwin = wiener_win >> 1;
  const int32_t width = h_end - h_start;
  const int32_t height = v_end - v_start;
  const uint8_t *dgd_start = &dgd[v_start * dgd_stride + h_start];
  memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
  memset(M, 0, sizeof(*M) * wiener_win * wiener_win);

  const uint8_t avg = find_average_sve(dgd_start, dgd_stride, width, height);
  const int downsample_factor = WIENER_STATS_DOWNSAMPLE_FACTOR;

  // dgd_avg and src_avg have been memset to zero before calling this
  // function, so round up the stride to the next multiple of 8 so that we
  // don't have to worry about a tail loop when computing M.
  const int dgd_avg_stride = ((width + 2 * wiener_halfwin) & ~7) + 8;
  const int src_avg_stride = (width & ~7) + 8;

  // Compute (dgd - avg) and store it in dgd_avg.
  // The wiener window will slide along the dgd frame, centered on each pixel.
  // For the top left pixel and all the pixels on the side of the frame this
  // means half of the window will be outside of the frame. As such the actual
  // buffer that we need to subtract the avg from will be 2 * wiener_halfwin
  // wider and 2 * wiener_halfwin higher than the original dgd buffer.
  const int vert_offset = v_start - wiener_halfwin;
  const int horiz_offset = h_start - wiener_halfwin;
  const uint8_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride;
  compute_sub_avg(dgd_win, dgd_stride, avg, dgd_avg, dgd_avg_stride,
                  width + 2 * wiener_halfwin, height + 2 * wiener_halfwin, 1);

  // Compute (src - avg), downsample and store in src-avg. Downsampling is
  // achieved by scaling the stride, so compute_sub_avg only ever reads the
  // rows that are kept.
  const uint8_t *src_start = src + h_start + v_start * src_stride;
  compute_sub_avg(src_start, src_stride * downsample_factor, avg, src_avg,
                  src_avg_stride, width, height, downsample_factor);

  const int downsample_height = height / downsample_factor;

  // Since the height is not necessarily a multiple of the downsample factor,
  // the last line of src will be scaled according to how many rows remain.
  const int downsample_remainder = height % downsample_factor;

  if (downsample_height > 0) {
    if (wiener_win == WIENER_WIN) {
      compute_stats_win7_downsampled_sve(
          dgd_avg, dgd_avg_stride, src_avg, src_avg_stride, width,
          downsample_height, M, H, downsample_factor);
    } else {
      compute_stats_win5_downsampled_sve(
          dgd_avg, dgd_avg_stride, src_avg, src_avg_stride, width,
          downsample_height, M, H, downsample_factor);
    }
  }

  if (downsample_remainder > 0) {
    const int remainder_offset = height - downsample_remainder;
    if (wiener_win == WIENER_WIN) {
      compute_stats_win7_downsampled_sve(
          dgd_avg + remainder_offset * dgd_avg_stride, dgd_avg_stride,
          src_avg + downsample_height * src_avg_stride, src_avg_stride, width,
          1, M, H, downsample_remainder);
    } else {
      compute_stats_win5_downsampled_sve(
          dgd_avg + remainder_offset * dgd_avg_stride, dgd_avg_stride,
          src_avg + downsample_height * src_avg_stride, src_avg_stride, width,
          1, M, H, downsample_remainder);
    }
  }
}

// SVE entry point for Wiener filter statistics collection: fills the
// cross-correlation matrix M (wiener_win^2 elements) and the symmetric
// auto-covariance matrix H (wiener_win^2 x wiener_win^2 elements) for the
// restoration unit delimited by [h_start, h_end) x [v_start, v_end).
// dgd_avg and src_avg are caller-provided zero-initialized scratch buffers.
void av1_compute_stats_sve(int wiener_win, const uint8_t *dgd,
                           const uint8_t *src, int16_t *dgd_avg,
                           int16_t *src_avg, int h_start, int h_end,
                           int v_start, int v_end, int dgd_stride,
                           int src_stride, int64_t *M, int64_t *H,
                           int use_downsampled_wiener_stats) {
  assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA);

  if (use_downsampled_wiener_stats) {
    av1_compute_stats_downsampled_sve(wiener_win, dgd, src, dgd_avg, src_avg,
                                      h_start, h_end, v_start, v_end,
                                      dgd_stride, src_stride, M, H);
    return;
  }

  const int wiener_win2 = wiener_win * wiener_win;
  const int wiener_halfwin = wiener_win >> 1;
  const int32_t width = h_end - h_start;
  const int32_t height = v_end - v_start;
  const uint8_t *dgd_start = &dgd[v_start * dgd_stride + h_start];
  memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
  memset(M, 0, sizeof(*M) * wiener_win * wiener_win);

  const uint8_t avg = find_average_sve(dgd_start, dgd_stride, width, height);

  // dgd_avg and src_avg have been memset to zero before calling this
  // function, so round up the stride to the next multiple of 8 so that we
  // don't have to worry about a tail loop when computing M.
  const int dgd_avg_stride = ((width + 2 * wiener_halfwin) & ~7) + 8;
  const int src_avg_stride = (width & ~7) + 8;

  // Compute (dgd - avg) and store it in dgd_avg.
  // The wiener window will slide along the dgd frame, centered on each pixel.
  // For the top left pixel and all the pixels on the side of the frame this
  // means half of the window will be outside of the frame. As such the actual
  // buffer that we need to subtract the avg from will be 2 * wiener_halfwin
  // wider and 2 * wiener_halfwin higher than the original dgd buffer.
  const int vert_offset = v_start - wiener_halfwin;
  const int horiz_offset = h_start - wiener_halfwin;
  const uint8_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride;
  compute_sub_avg(dgd_win, dgd_stride, avg, dgd_avg, dgd_avg_stride,
                  width + 2 * wiener_halfwin, height + 2 * wiener_halfwin, 1);

  // Compute (src - avg), and store in src-avg.
  const uint8_t *src_start = src + h_start + v_start * src_stride;
  compute_sub_avg(src_start, src_stride, avg, src_avg, src_avg_stride, width,
                  height, 1);

  if (wiener_win == WIENER_WIN) {
    compute_stats_win7_sve(dgd_avg, dgd_avg_stride, src_avg, src_avg_stride,
                           width, height, M, H);
  } else {
    compute_stats_win5_sve(dgd_avg, dgd_avg_stride, src_avg, src_avg_stride,
                           width, height, M, H);
  }

  // H is a symmetric matrix, so we only need to fill out the upper triangle.
  // We can copy it down to the lower triangle outside the (i, j) loops.
  diagonal_copy_stats_neon(wiener_win2, H);
}