tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

pickrst_sve.h (90704B)


      1 /*
      2 * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #ifndef AOM_AV1_ENCODER_ARM_PICKRST_SVE_H_
     13 #define AOM_AV1_ENCODER_ARM_PICKRST_SVE_H_
     14 
     15 #include <arm_neon.h>
     16 #include <arm_sve.h>
     17 
     18 #include "aom_dsp/arm/aom_neon_sve_bridge.h"
     19 #include "av1/encoder/arm/pickrst_neon.h"
     20 
     21 // Swap each half of the dgd vectors so that we can accumulate the result of
     22 // the dot-products directly in the destination matrix.
     23 static inline int16x8x2_t transpose_dgd(int16x8_t dgd0, int16x8_t dgd1) {
     24  int16x8_t dgd_trn0 = vreinterpretq_s16_s64(
     25      vzip1q_s64(vreinterpretq_s64_s16(dgd0), vreinterpretq_s64_s16(dgd1)));
     26  int16x8_t dgd_trn1 = vreinterpretq_s16_s64(
     27      vzip2q_s64(vreinterpretq_s64_s16(dgd0), vreinterpretq_s64_s16(dgd1)));
     28 
     29  return (struct int16x8x2_t){ dgd_trn0, dgd_trn1 };
     30 }
     31 
     32 static inline void compute_M_one_row_win5(int16x8_t src, int16x8_t dgd[5],
     33                                          int64_t *M, int row) {
     34  const int wiener_win = 5;
     35 
     36  int64x2_t m01 = vld1q_s64(M + row * wiener_win + 0);
     37  int16x8x2_t dgd01 = transpose_dgd(dgd[0], dgd[1]);
     38 
     39  int64x2_t cross_corr01 = aom_svdot_lane_s16(m01, dgd01.val[0], src, 0);
     40  cross_corr01 = aom_svdot_lane_s16(cross_corr01, dgd01.val[1], src, 1);
     41  vst1q_s64(M + row * wiener_win + 0, cross_corr01);
     42 
     43  int64x2_t m23 = vld1q_s64(M + row * wiener_win + 2);
     44  int16x8x2_t dgd23 = transpose_dgd(dgd[2], dgd[3]);
     45 
     46  int64x2_t cross_corr23 = aom_svdot_lane_s16(m23, dgd23.val[0], src, 0);
     47  cross_corr23 = aom_svdot_lane_s16(cross_corr23, dgd23.val[1], src, 1);
     48  vst1q_s64(M + row * wiener_win + 2, cross_corr23);
     49 
     50  int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), src, dgd[4]);
     51  M[row * wiener_win + 4] += vaddvq_s64(m4);
     52 }
     53 
     54 static inline void compute_M_one_row_win7(int16x8_t src, int16x8_t dgd[7],
     55                                          int64_t *M, int row) {
     56  const int wiener_win = 7;
     57 
     58  int64x2_t m01 = vld1q_s64(M + row * wiener_win + 0);
     59  int16x8x2_t dgd01 = transpose_dgd(dgd[0], dgd[1]);
     60 
     61  int64x2_t cross_corr01 = aom_svdot_lane_s16(m01, dgd01.val[0], src, 0);
     62  cross_corr01 = aom_svdot_lane_s16(cross_corr01, dgd01.val[1], src, 1);
     63  vst1q_s64(M + row * wiener_win + 0, cross_corr01);
     64 
     65  int64x2_t m23 = vld1q_s64(M + row * wiener_win + 2);
     66  int16x8x2_t dgd23 = transpose_dgd(dgd[2], dgd[3]);
     67 
     68  int64x2_t cross_corr23 = aom_svdot_lane_s16(m23, dgd23.val[0], src, 0);
     69  cross_corr23 = aom_svdot_lane_s16(cross_corr23, dgd23.val[1], src, 1);
     70  vst1q_s64(M + row * wiener_win + 2, cross_corr23);
     71 
     72  int64x2_t m45 = vld1q_s64(M + row * wiener_win + 4);
     73  int16x8x2_t dgd45 = transpose_dgd(dgd[4], dgd[5]);
     74 
     75  int64x2_t cross_corr45 = aom_svdot_lane_s16(m45, dgd45.val[0], src, 0);
     76  cross_corr45 = aom_svdot_lane_s16(cross_corr45, dgd45.val[1], src, 1);
     77  vst1q_s64(M + row * wiener_win + 4, cross_corr45);
     78 
     79  int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), src, dgd[6]);
     80  M[row * wiener_win + 6] += vaddvq_s64(m6);
     81 }
     82 
     83 static inline void compute_H_one_col(int16x8_t *dgd, int col, int64_t *H,
     84                                     const int wiener_win,
     85                                     const int wiener_win2) {
     86  for (int row0 = 0; row0 < wiener_win; row0++) {
     87    for (int row1 = row0; row1 < wiener_win; row1++) {
     88      int auto_cov_idx =
     89          (col * wiener_win + row0) * wiener_win2 + (col * wiener_win) + row1;
     90 
     91      int64x2_t auto_cov = aom_sdotq_s16(vdupq_n_s64(0), dgd[row0], dgd[row1]);
     92      H[auto_cov_idx] += vaddvq_s64(auto_cov);
     93    }
     94  }
     95 }
     96 
     97 static inline void compute_H_two_rows_win5(int16x8_t *dgd0, int16x8_t *dgd1,
     98                                           int row0, int row1, int64_t *H) {
     99  for (int col0 = 0; col0 < 5; col0++) {
    100    int auto_cov_idx = (row0 * 5 + col0) * 25 + (row1 * 5);
    101 
    102    int64x2_t h01 = vld1q_s64(H + auto_cov_idx);
    103    int16x8x2_t dgd01 = transpose_dgd(dgd1[0], dgd1[1]);
    104 
    105    int64x2_t auto_cov01 = aom_svdot_lane_s16(h01, dgd01.val[0], dgd0[col0], 0);
    106    auto_cov01 = aom_svdot_lane_s16(auto_cov01, dgd01.val[1], dgd0[col0], 1);
    107    vst1q_s64(H + auto_cov_idx, auto_cov01);
    108 
    109    int64x2_t h23 = vld1q_s64(H + auto_cov_idx + 2);
    110    int16x8x2_t dgd23 = transpose_dgd(dgd1[2], dgd1[3]);
    111 
    112    int64x2_t auto_cov23 = aom_svdot_lane_s16(h23, dgd23.val[0], dgd0[col0], 0);
    113    auto_cov23 = aom_svdot_lane_s16(auto_cov23, dgd23.val[1], dgd0[col0], 1);
    114    vst1q_s64(H + auto_cov_idx + 2, auto_cov23);
    115 
    116    int64x2_t auto_cov4 = aom_sdotq_s16(vdupq_n_s64(0), dgd0[col0], dgd1[4]);
    117    H[auto_cov_idx + 4] += vaddvq_s64(auto_cov4);
    118  }
    119 }
    120 
    121 static inline void compute_H_two_rows_win7(int16x8_t *dgd0, int16x8_t *dgd1,
    122                                           int row0, int row1, int64_t *H) {
    123  for (int col0 = 0; col0 < 7; col0++) {
    124    int auto_cov_idx = (row0 * 7 + col0) * 49 + (row1 * 7);
    125 
    126    int64x2_t h01 = vld1q_s64(H + auto_cov_idx);
    127    int16x8x2_t dgd01 = transpose_dgd(dgd1[0], dgd1[1]);
    128 
    129    int64x2_t auto_cov01 = aom_svdot_lane_s16(h01, dgd01.val[0], dgd0[col0], 0);
    130    auto_cov01 = aom_svdot_lane_s16(auto_cov01, dgd01.val[1], dgd0[col0], 1);
    131    vst1q_s64(H + auto_cov_idx, auto_cov01);
    132 
    133    int64x2_t h23 = vld1q_s64(H + auto_cov_idx + 2);
    134    int16x8x2_t dgd23 = transpose_dgd(dgd1[2], dgd1[3]);
    135 
    136    int64x2_t auto_cov23 = aom_svdot_lane_s16(h23, dgd23.val[0], dgd0[col0], 0);
    137    auto_cov23 = aom_svdot_lane_s16(auto_cov23, dgd23.val[1], dgd0[col0], 1);
    138    vst1q_s64(H + auto_cov_idx + 2, auto_cov23);
    139 
    140    int64x2_t h45 = vld1q_s64(H + auto_cov_idx + 4);
    141    int16x8x2_t dgd45 = transpose_dgd(dgd1[4], dgd1[5]);
    142 
    143    int64x2_t auto_cov45 = aom_svdot_lane_s16(h45, dgd45.val[0], dgd0[col0], 0);
    144    auto_cov45 = aom_svdot_lane_s16(auto_cov45, dgd45.val[1], dgd0[col0], 1);
    145    vst1q_s64(H + auto_cov_idx + 4, auto_cov45);
    146 
    147    int64x2_t auto_cov6 = aom_sdotq_s16(vdupq_n_s64(0), dgd0[col0], dgd1[6]);
    148    H[auto_cov_idx + 6] += vaddvq_s64(auto_cov6);
    149  }
    150 }
    151 
    152 static inline void stats_top_win5_sve(const int16x8_t src[2],
    153                                      const int16x8_t dgd[2],
    154                                      const int16_t *const d,
    155                                      const int32_t d_stride, int64x2_t *sum_m,
    156                                      int64x2_t *sum_h) {
    157  int16x8_t dgds[WIENER_WIN_CHROMA * 2];
    158 
    159  load_s16_8x5(d + 0, d_stride, &dgds[0], &dgds[2], &dgds[4], &dgds[6],
    160               &dgds[8]);
    161  load_s16_8x5(d + 8, d_stride, &dgds[1], &dgds[3], &dgds[5], &dgds[7],
    162               &dgds[9]);
    163 
    164  sum_m[0] = aom_sdotq_s16(sum_m[0], src[0], dgds[0]);
    165  sum_m[0] = aom_sdotq_s16(sum_m[0], src[1], dgds[1]);
    166  sum_m[1] = aom_sdotq_s16(sum_m[1], src[0], dgds[2]);
    167  sum_m[1] = aom_sdotq_s16(sum_m[1], src[1], dgds[3]);
    168  sum_m[2] = aom_sdotq_s16(sum_m[2], src[0], dgds[4]);
    169  sum_m[2] = aom_sdotq_s16(sum_m[2], src[1], dgds[5]);
    170  sum_m[3] = aom_sdotq_s16(sum_m[3], src[0], dgds[6]);
    171  sum_m[3] = aom_sdotq_s16(sum_m[3], src[1], dgds[7]);
    172  sum_m[4] = aom_sdotq_s16(sum_m[4], src[0], dgds[8]);
    173  sum_m[4] = aom_sdotq_s16(sum_m[4], src[1], dgds[9]);
    174 
    175  sum_h[0] = aom_sdotq_s16(sum_h[0], dgd[0], dgds[0]);
    176  sum_h[0] = aom_sdotq_s16(sum_h[0], dgd[1], dgds[1]);
    177  sum_h[1] = aom_sdotq_s16(sum_h[1], dgd[0], dgds[2]);
    178  sum_h[1] = aom_sdotq_s16(sum_h[1], dgd[1], dgds[3]);
    179  sum_h[2] = aom_sdotq_s16(sum_h[2], dgd[0], dgds[4]);
    180  sum_h[2] = aom_sdotq_s16(sum_h[2], dgd[1], dgds[5]);
    181  sum_h[3] = aom_sdotq_s16(sum_h[3], dgd[0], dgds[6]);
    182  sum_h[3] = aom_sdotq_s16(sum_h[3], dgd[1], dgds[7]);
    183  sum_h[4] = aom_sdotq_s16(sum_h[4], dgd[0], dgds[8]);
    184  sum_h[4] = aom_sdotq_s16(sum_h[4], dgd[1], dgds[9]);
    185 }
    186 
    187 static inline void stats_left_win5_sve(const int16x8_t src[2], const int16_t *d,
    188                                       const int32_t d_stride, int64x2_t *sum) {
    189  int16x8_t dgds[WIN_CHROMA];
    190 
    191  load_s16_8x4(d + d_stride + 0, d_stride, &dgds[0], &dgds[2], &dgds[4],
    192               &dgds[6]);
    193  load_s16_8x4(d + d_stride + 8, d_stride, &dgds[1], &dgds[3], &dgds[5],
    194               &dgds[7]);
    195 
    196  sum[0] = aom_sdotq_s16(sum[0], src[0], dgds[0]);
    197  sum[0] = aom_sdotq_s16(sum[0], src[1], dgds[1]);
    198  sum[1] = aom_sdotq_s16(sum[1], src[0], dgds[2]);
    199  sum[1] = aom_sdotq_s16(sum[1], src[1], dgds[3]);
    200  sum[2] = aom_sdotq_s16(sum[2], src[0], dgds[4]);
    201  sum[2] = aom_sdotq_s16(sum[2], src[1], dgds[5]);
    202  sum[3] = aom_sdotq_s16(sum[3], src[0], dgds[6]);
    203  sum[3] = aom_sdotq_s16(sum[3], src[1], dgds[7]);
    204 }
    205 
    206 static inline void sub_deltas_step4_sve(int16x8_t *A, int16x8_t *B,
    207                                        int64x2_t *deltas) {
    208  deltas[0] = aom_sdotq_s16(deltas[0], vnegq_s16(A[0]), B[0]);
    209  deltas[1] = aom_sdotq_s16(deltas[1], vnegq_s16(A[0]), B[1]);
    210  deltas[2] = aom_sdotq_s16(deltas[2], vnegq_s16(A[0]), B[2]);
    211  deltas[3] = aom_sdotq_s16(deltas[3], vnegq_s16(A[0]), B[3]);
    212  deltas[4] = aom_sdotq_s16(deltas[4], vnegq_s16(A[0]), B[4]);
    213  deltas[5] = aom_sdotq_s16(deltas[5], vnegq_s16(A[1]), B[0]);
    214  deltas[6] = aom_sdotq_s16(deltas[6], vnegq_s16(A[2]), B[0]);
    215  deltas[7] = aom_sdotq_s16(deltas[7], vnegq_s16(A[3]), B[0]);
    216  deltas[8] = aom_sdotq_s16(deltas[8], vnegq_s16(A[4]), B[0]);
    217 }
    218 
    219 static inline void add_deltas_step4_sve(int16x8_t *A, int16x8_t *B,
    220                                        int64x2_t *deltas) {
    221  deltas[0] = aom_sdotq_s16(deltas[0], A[0], B[0]);
    222  deltas[1] = aom_sdotq_s16(deltas[1], A[0], B[1]);
    223  deltas[2] = aom_sdotq_s16(deltas[2], A[0], B[2]);
    224  deltas[3] = aom_sdotq_s16(deltas[3], A[0], B[3]);
    225  deltas[4] = aom_sdotq_s16(deltas[4], A[0], B[4]);
    226  deltas[5] = aom_sdotq_s16(deltas[5], A[1], B[0]);
    227  deltas[6] = aom_sdotq_s16(deltas[6], A[2], B[0]);
    228  deltas[7] = aom_sdotq_s16(deltas[7], A[3], B[0]);
    229  deltas[8] = aom_sdotq_s16(deltas[8], A[4], B[0]);
    230 }
    231 
    232 static inline void load_square_win5_sve(
    233    const int16_t *const di, const int16_t *const dj, const int32_t d_stride,
    234    const int32_t height, int16x8_t *d_is, int16x8_t *d_ie, int16x8_t *d_js,
    235    int16x8_t *d_je, svbool_t p0, svbool_t p1) {
    236  d_is[0] = svget_neonq_s16(svld1_s16(p0, di + 0 * d_stride + 0));
    237  d_is[1] = svget_neonq_s16(svld1_s16(p1, di + 0 * d_stride + 8));
    238  d_is[2] = svget_neonq_s16(svld1_s16(p0, di + 1 * d_stride + 0));
    239  d_is[3] = svget_neonq_s16(svld1_s16(p1, di + 1 * d_stride + 8));
    240  d_is[4] = svget_neonq_s16(svld1_s16(p0, di + 2 * d_stride + 0));
    241  d_is[5] = svget_neonq_s16(svld1_s16(p1, di + 2 * d_stride + 8));
    242  d_is[6] = svget_neonq_s16(svld1_s16(p0, di + 3 * d_stride + 0));
    243  d_is[7] = svget_neonq_s16(svld1_s16(p1, di + 3 * d_stride + 8));
    244 
    245  d_ie[0] = svget_neonq_s16(svld1_s16(p0, di + (height + 0) * d_stride + 0));
    246  d_ie[1] = svget_neonq_s16(svld1_s16(p1, di + (height + 0) * d_stride + 8));
    247  d_ie[2] = svget_neonq_s16(svld1_s16(p0, di + (height + 1) * d_stride + 0));
    248  d_ie[3] = svget_neonq_s16(svld1_s16(p1, di + (height + 1) * d_stride + 8));
    249  d_ie[4] = svget_neonq_s16(svld1_s16(p0, di + (height + 2) * d_stride + 0));
    250  d_ie[5] = svget_neonq_s16(svld1_s16(p1, di + (height + 2) * d_stride + 8));
    251  d_ie[6] = svget_neonq_s16(svld1_s16(p0, di + (height + 3) * d_stride + 0));
    252  d_ie[7] = svget_neonq_s16(svld1_s16(p1, di + (height + 3) * d_stride + 8));
    253 
    254  load_s16_8x4(dj + 0, d_stride, &d_js[0], &d_js[2], &d_js[4], &d_js[6]);
    255  load_s16_8x4(dj + 8, d_stride, &d_js[1], &d_js[3], &d_js[5], &d_js[7]);
    256  load_s16_8x4(dj + height * d_stride + 0, d_stride, &d_je[0], &d_je[2],
    257               &d_je[4], &d_je[6]);
    258  load_s16_8x4(dj + height * d_stride + 8, d_stride, &d_je[1], &d_je[3],
    259               &d_je[5], &d_je[7]);
    260 }
    261 
    262 static inline void update_4_stats_sve(const int64_t *const src,
    263                                      const int64x2_t *delta,
    264                                      int64_t *const dst) {
    265  const int64x2_t s1 = vld1q_s64(src);
    266  const int64x2_t s2 = vld1q_s64(src + 2);
    267 
    268  vst1q_s64(dst + 0, vaddq_s64(s1, delta[0]));
    269  vst1q_s64(dst + 2, vaddq_s64(s2, delta[1]));
    270 }
    271 
    272 static inline void derive_square_win5_sve(
    273    int16x8_t *d_is, const int16x8_t *d_ie, const int16x8_t *d_js,
    274    const int16x8_t *d_je,
    275    int64x2_t deltas[WIENER_WIN_CHROMA - 1][WIENER_WIN_CHROMA - 1]) {
    276  d_is[0] = vnegq_s16(d_is[0]);
    277  d_is[1] = vnegq_s16(d_is[1]);
    278  d_is[2] = vnegq_s16(d_is[2]);
    279  d_is[3] = vnegq_s16(d_is[3]);
    280  d_is[4] = vnegq_s16(d_is[4]);
    281  d_is[5] = vnegq_s16(d_is[5]);
    282  d_is[6] = vnegq_s16(d_is[6]);
    283  d_is[7] = vnegq_s16(d_is[7]);
    284 
    285  deltas[0][0] = aom_sdotq_s16(deltas[0][0], d_is[0], d_js[0]);
    286  deltas[0][0] = aom_sdotq_s16(deltas[0][0], d_is[1], d_js[1]);
    287  deltas[0][1] = aom_sdotq_s16(deltas[0][1], d_is[0], d_js[2]);
    288  deltas[0][1] = aom_sdotq_s16(deltas[0][1], d_is[1], d_js[3]);
    289  deltas[0][2] = aom_sdotq_s16(deltas[0][2], d_is[0], d_js[4]);
    290  deltas[0][2] = aom_sdotq_s16(deltas[0][2], d_is[1], d_js[5]);
    291  deltas[0][3] = aom_sdotq_s16(deltas[0][3], d_is[0], d_js[6]);
    292  deltas[0][3] = aom_sdotq_s16(deltas[0][3], d_is[1], d_js[7]);
    293 
    294  deltas[1][0] = aom_sdotq_s16(deltas[1][0], d_is[2], d_js[0]);
    295  deltas[1][0] = aom_sdotq_s16(deltas[1][0], d_is[3], d_js[1]);
    296  deltas[1][1] = aom_sdotq_s16(deltas[1][1], d_is[2], d_js[2]);
    297  deltas[1][1] = aom_sdotq_s16(deltas[1][1], d_is[3], d_js[3]);
    298  deltas[1][2] = aom_sdotq_s16(deltas[1][2], d_is[2], d_js[4]);
    299  deltas[1][2] = aom_sdotq_s16(deltas[1][2], d_is[3], d_js[5]);
    300  deltas[1][3] = aom_sdotq_s16(deltas[1][3], d_is[2], d_js[6]);
    301  deltas[1][3] = aom_sdotq_s16(deltas[1][3], d_is[3], d_js[7]);
    302 
    303  deltas[2][0] = aom_sdotq_s16(deltas[2][0], d_is[4], d_js[0]);
    304  deltas[2][0] = aom_sdotq_s16(deltas[2][0], d_is[5], d_js[1]);
    305  deltas[2][1] = aom_sdotq_s16(deltas[2][1], d_is[4], d_js[2]);
    306  deltas[2][1] = aom_sdotq_s16(deltas[2][1], d_is[5], d_js[3]);
    307  deltas[2][2] = aom_sdotq_s16(deltas[2][2], d_is[4], d_js[4]);
    308  deltas[2][2] = aom_sdotq_s16(deltas[2][2], d_is[5], d_js[5]);
    309  deltas[2][3] = aom_sdotq_s16(deltas[2][3], d_is[4], d_js[6]);
    310  deltas[2][3] = aom_sdotq_s16(deltas[2][3], d_is[5], d_js[7]);
    311 
    312  deltas[3][0] = aom_sdotq_s16(deltas[3][0], d_is[6], d_js[0]);
    313  deltas[3][0] = aom_sdotq_s16(deltas[3][0], d_is[7], d_js[1]);
    314  deltas[3][1] = aom_sdotq_s16(deltas[3][1], d_is[6], d_js[2]);
    315  deltas[3][1] = aom_sdotq_s16(deltas[3][1], d_is[7], d_js[3]);
    316  deltas[3][2] = aom_sdotq_s16(deltas[3][2], d_is[6], d_js[4]);
    317  deltas[3][2] = aom_sdotq_s16(deltas[3][2], d_is[7], d_js[5]);
    318  deltas[3][3] = aom_sdotq_s16(deltas[3][3], d_is[6], d_js[6]);
    319  deltas[3][3] = aom_sdotq_s16(deltas[3][3], d_is[7], d_js[7]);
    320 
    321  deltas[0][0] = aom_sdotq_s16(deltas[0][0], d_ie[0], d_je[0]);
    322  deltas[0][0] = aom_sdotq_s16(deltas[0][0], d_ie[1], d_je[1]);
    323  deltas[0][1] = aom_sdotq_s16(deltas[0][1], d_ie[0], d_je[2]);
    324  deltas[0][1] = aom_sdotq_s16(deltas[0][1], d_ie[1], d_je[3]);
    325  deltas[0][2] = aom_sdotq_s16(deltas[0][2], d_ie[0], d_je[4]);
    326  deltas[0][2] = aom_sdotq_s16(deltas[0][2], d_ie[1], d_je[5]);
    327  deltas[0][3] = aom_sdotq_s16(deltas[0][3], d_ie[0], d_je[6]);
    328  deltas[0][3] = aom_sdotq_s16(deltas[0][3], d_ie[1], d_je[7]);
    329 
    330  deltas[1][0] = aom_sdotq_s16(deltas[1][0], d_ie[2], d_je[0]);
    331  deltas[1][0] = aom_sdotq_s16(deltas[1][0], d_ie[3], d_je[1]);
    332  deltas[1][1] = aom_sdotq_s16(deltas[1][1], d_ie[2], d_je[2]);
    333  deltas[1][1] = aom_sdotq_s16(deltas[1][1], d_ie[3], d_je[3]);
    334  deltas[1][2] = aom_sdotq_s16(deltas[1][2], d_ie[2], d_je[4]);
    335  deltas[1][2] = aom_sdotq_s16(deltas[1][2], d_ie[3], d_je[5]);
    336  deltas[1][3] = aom_sdotq_s16(deltas[1][3], d_ie[2], d_je[6]);
    337  deltas[1][3] = aom_sdotq_s16(deltas[1][3], d_ie[3], d_je[7]);
    338 
    339  deltas[2][0] = aom_sdotq_s16(deltas[2][0], d_ie[4], d_je[0]);
    340  deltas[2][0] = aom_sdotq_s16(deltas[2][0], d_ie[5], d_je[1]);
    341  deltas[2][1] = aom_sdotq_s16(deltas[2][1], d_ie[4], d_je[2]);
    342  deltas[2][1] = aom_sdotq_s16(deltas[2][1], d_ie[5], d_je[3]);
    343  deltas[2][2] = aom_sdotq_s16(deltas[2][2], d_ie[4], d_je[4]);
    344  deltas[2][2] = aom_sdotq_s16(deltas[2][2], d_ie[5], d_je[5]);
    345  deltas[2][3] = aom_sdotq_s16(deltas[2][3], d_ie[4], d_je[6]);
    346  deltas[2][3] = aom_sdotq_s16(deltas[2][3], d_ie[5], d_je[7]);
    347 
    348  deltas[3][0] = aom_sdotq_s16(deltas[3][0], d_ie[6], d_je[0]);
    349  deltas[3][0] = aom_sdotq_s16(deltas[3][0], d_ie[7], d_je[1]);
    350  deltas[3][1] = aom_sdotq_s16(deltas[3][1], d_ie[6], d_je[2]);
    351  deltas[3][1] = aom_sdotq_s16(deltas[3][1], d_ie[7], d_je[3]);
    352  deltas[3][2] = aom_sdotq_s16(deltas[3][2], d_ie[6], d_je[4]);
    353  deltas[3][2] = aom_sdotq_s16(deltas[3][2], d_ie[7], d_je[5]);
    354  deltas[3][3] = aom_sdotq_s16(deltas[3][3], d_ie[6], d_je[6]);
    355  deltas[3][3] = aom_sdotq_s16(deltas[3][3], d_ie[7], d_je[7]);
    356 }
    357 
    358 static inline void hadd_update_4_stats_sve(const int64_t *const src,
    359                                           const int64x2_t *deltas,
    360                                           int64_t *const dst) {
    361  int64x2_t src0 = vld1q_s64(src);
    362  int64x2_t src1 = vld1q_s64(src + 2);
    363  vst1q_s64(dst + 0, vaddq_s64(src0, vpaddq_s64(deltas[0], deltas[1])));
    364  vst1q_s64(dst + 2, vaddq_s64(src1, vpaddq_s64(deltas[2], deltas[3])));
    365 }
    366 
    367 static inline void load_triangle_win5_sve(const int16_t *const di,
    368                                          const int32_t d_stride,
    369                                          const int32_t height, int16x8_t *d_is,
    370                                          int16x8_t *d_ie, svbool_t p0,
    371                                          svbool_t p1) {
    372  d_is[0] = svget_neonq_s16(svld1_s16(p0, di + 0 * d_stride + 0));
    373  d_is[1] = svget_neonq_s16(svld1_s16(p1, di + 0 * d_stride + 8));
    374  d_is[2] = svget_neonq_s16(svld1_s16(p0, di + 1 * d_stride + 0));
    375  d_is[3] = svget_neonq_s16(svld1_s16(p1, di + 1 * d_stride + 8));
    376  d_is[4] = svget_neonq_s16(svld1_s16(p0, di + 2 * d_stride + 0));
    377  d_is[5] = svget_neonq_s16(svld1_s16(p1, di + 2 * d_stride + 8));
    378  d_is[6] = svget_neonq_s16(svld1_s16(p0, di + 3 * d_stride + 0));
    379  d_is[7] = svget_neonq_s16(svld1_s16(p1, di + 3 * d_stride + 8));
    380  d_ie[0] = svget_neonq_s16(svld1_s16(p0, di + (height + 0) * d_stride + 0));
    381  d_ie[1] = svget_neonq_s16(svld1_s16(p1, di + (height + 0) * d_stride + 8));
    382  d_ie[2] = svget_neonq_s16(svld1_s16(p0, di + (height + 1) * d_stride + 0));
    383  d_ie[3] = svget_neonq_s16(svld1_s16(p1, di + (height + 1) * d_stride + 8));
    384  d_ie[4] = svget_neonq_s16(svld1_s16(p0, di + (height + 2) * d_stride + 0));
    385  d_ie[5] = svget_neonq_s16(svld1_s16(p1, di + (height + 2) * d_stride + 8));
    386  d_ie[6] = svget_neonq_s16(svld1_s16(p0, di + (height + 3) * d_stride + 0));
    387  d_ie[7] = svget_neonq_s16(svld1_s16(p1, di + (height + 3) * d_stride + 8));
    388 }
    389 
    390 static inline void derive_triangle_win5_sve(const int16x8_t *d_is,
    391                                            const int16x8_t *d_ie,
    392                                            int64x2_t *deltas) {
    393  deltas[0] = aom_sdotq_s16(deltas[0], vnegq_s16(d_is[0]), d_is[0]);
    394  deltas[0] = aom_sdotq_s16(deltas[0], vnegq_s16(d_is[1]), d_is[1]);
    395  deltas[1] = aom_sdotq_s16(deltas[1], vnegq_s16(d_is[0]), d_is[2]);
    396  deltas[1] = aom_sdotq_s16(deltas[1], vnegq_s16(d_is[1]), d_is[3]);
    397  deltas[2] = aom_sdotq_s16(deltas[2], vnegq_s16(d_is[0]), d_is[4]);
    398  deltas[2] = aom_sdotq_s16(deltas[2], vnegq_s16(d_is[1]), d_is[5]);
    399  deltas[3] = aom_sdotq_s16(deltas[3], vnegq_s16(d_is[0]), d_is[6]);
    400  deltas[3] = aom_sdotq_s16(deltas[3], vnegq_s16(d_is[1]), d_is[7]);
    401  deltas[4] = aom_sdotq_s16(deltas[4], vnegq_s16(d_is[2]), d_is[2]);
    402  deltas[4] = aom_sdotq_s16(deltas[4], vnegq_s16(d_is[3]), d_is[3]);
    403  deltas[5] = aom_sdotq_s16(deltas[5], vnegq_s16(d_is[2]), d_is[4]);
    404  deltas[5] = aom_sdotq_s16(deltas[5], vnegq_s16(d_is[3]), d_is[5]);
    405  deltas[6] = aom_sdotq_s16(deltas[6], vnegq_s16(d_is[2]), d_is[6]);
    406  deltas[6] = aom_sdotq_s16(deltas[6], vnegq_s16(d_is[3]), d_is[7]);
    407  deltas[7] = aom_sdotq_s16(deltas[7], vnegq_s16(d_is[4]), d_is[4]);
    408  deltas[7] = aom_sdotq_s16(deltas[7], vnegq_s16(d_is[5]), d_is[5]);
    409  deltas[8] = aom_sdotq_s16(deltas[8], vnegq_s16(d_is[4]), d_is[6]);
    410  deltas[8] = aom_sdotq_s16(deltas[8], vnegq_s16(d_is[5]), d_is[7]);
    411  deltas[9] = aom_sdotq_s16(deltas[9], vnegq_s16(d_is[6]), d_is[6]);
    412  deltas[9] = aom_sdotq_s16(deltas[9], vnegq_s16(d_is[7]), d_is[7]);
    413 
    414  deltas[0] = aom_sdotq_s16(deltas[0], d_ie[0], d_ie[0]);
    415  deltas[0] = aom_sdotq_s16(deltas[0], d_ie[1], d_ie[1]);
    416  deltas[1] = aom_sdotq_s16(deltas[1], d_ie[0], d_ie[2]);
    417  deltas[1] = aom_sdotq_s16(deltas[1], d_ie[1], d_ie[3]);
    418  deltas[2] = aom_sdotq_s16(deltas[2], d_ie[0], d_ie[4]);
    419  deltas[2] = aom_sdotq_s16(deltas[2], d_ie[1], d_ie[5]);
    420  deltas[3] = aom_sdotq_s16(deltas[3], d_ie[0], d_ie[6]);
    421  deltas[3] = aom_sdotq_s16(deltas[3], d_ie[1], d_ie[7]);
    422  deltas[4] = aom_sdotq_s16(deltas[4], d_ie[2], d_ie[2]);
    423  deltas[4] = aom_sdotq_s16(deltas[4], d_ie[3], d_ie[3]);
    424  deltas[5] = aom_sdotq_s16(deltas[5], d_ie[2], d_ie[4]);
    425  deltas[5] = aom_sdotq_s16(deltas[5], d_ie[3], d_ie[5]);
    426  deltas[6] = aom_sdotq_s16(deltas[6], d_ie[2], d_ie[6]);
    427  deltas[6] = aom_sdotq_s16(deltas[6], d_ie[3], d_ie[7]);
    428  deltas[7] = aom_sdotq_s16(deltas[7], d_ie[4], d_ie[4]);
    429  deltas[7] = aom_sdotq_s16(deltas[7], d_ie[5], d_ie[5]);
    430  deltas[8] = aom_sdotq_s16(deltas[8], d_ie[4], d_ie[6]);
    431  deltas[8] = aom_sdotq_s16(deltas[8], d_ie[5], d_ie[7]);
    432  deltas[9] = aom_sdotq_s16(deltas[9], d_ie[6], d_ie[6]);
    433  deltas[9] = aom_sdotq_s16(deltas[9], d_ie[7], d_ie[7]);
    434 }
    435 
    436 static inline void compute_stats_win5_sve(
    437    const int16_t *const d, const int32_t d_stride, const int16_t *const s,
    438    const int32_t s_stride, const int32_t width, const int32_t height,
    439    int64_t *const M, int64_t *const H) {
    440  const int32_t wiener_win = WIENER_WIN_CHROMA;
    441  const int32_t wiener_win2 = wiener_win * wiener_win;
    442  const int32_t h8 = height & ~7;
    443  int32_t i, j, x, y;
    444 
    445  // Use a predicate to compute the last columns.
    446  svbool_t p0 = svwhilelt_b16_u32(0, width % 16 == 0 ? 16 : width % 16);
    447  svbool_t p1 = svwhilelt_b16_u32(8, width % 16 == 0 ? 16 : width % 16);
    448 
    449  // Step 1: Calculate the top edge of the whole matrix, i.e., the top
    450  // edge of each triangle and square on the top row.
    451  j = 0;
    452  do {
    453    const int16_t *s_t = s;
    454    const int16_t *d_t = d;
    455    int64x2_t sum_m[WIENER_WIN_CHROMA] = { vdupq_n_s64(0) };
    456    int64x2_t sum_h[WIENER_WIN_CHROMA] = { vdupq_n_s64(0) };
    457    int16x8_t src[2], dgd[2];
    458 
    459    y = height;
    460    do {
    461      x = 0;
    462      while (x < width - 16) {
    463        src[0] = vld1q_s16(s_t + x + 0);
    464        src[1] = vld1q_s16(s_t + x + 8);
    465        dgd[0] = vld1q_s16(d_t + x + 0);
    466        dgd[1] = vld1q_s16(d_t + x + 8);
    467        stats_top_win5_sve(src, dgd, d_t + j + x, d_stride, sum_m, sum_h);
    468        x += 16;
    469      }
    470 
    471      src[0] = svget_neonq_s16(svld1_s16(p0, s_t + x + 0));
    472      src[1] = svget_neonq_s16(svld1_s16(p1, s_t + x + 8));
    473      dgd[0] = svget_neonq_s16(svld1_s16(p0, d_t + x + 0));
    474      dgd[1] = svget_neonq_s16(svld1_s16(p1, d_t + x + 8));
    475 
    476      stats_top_win5_sve(src, dgd, d_t + j + x, d_stride, sum_m, sum_h);
    477 
    478      s_t += s_stride;
    479      d_t += d_stride;
    480    } while (--y);
    481 
    482    vst1q_s64(&M[wiener_win * j + 0], vpaddq_s64(sum_m[0], sum_m[1]));
    483    vst1q_s64(&M[wiener_win * j + 2], vpaddq_s64(sum_m[2], sum_m[3]));
    484    M[wiener_win * j + 4] = vaddvq_s64(sum_m[4]);
    485 
    486    vst1q_s64(&H[wiener_win * j + 0], vpaddq_s64(sum_h[0], sum_h[1]));
    487    vst1q_s64(&H[wiener_win * j + 2], vpaddq_s64(sum_h[2], sum_h[3]));
    488    H[wiener_win * j + 4] = vaddvq_s64(sum_h[4]);
    489  } while (++j < wiener_win);
    490 
    491  // Step 2: Calculate the left edge of each square on the top row.
    492  j = 1;
    493  do {
    494    const int16_t *d_t = d;
    495    int64x2_t sum_h[WIENER_WIN_CHROMA - 1] = { vdupq_n_s64(0) };
    496    int16x8_t dgd[2];
    497 
    498    y = height;
    499    do {
    500      x = 0;
    501      while (x < width - 16) {
    502        dgd[0] = vld1q_s16(d_t + j + x + 0);
    503        dgd[1] = vld1q_s16(d_t + j + x + 8);
    504        stats_left_win5_sve(dgd, d_t + x, d_stride, sum_h);
    505        x += 16;
    506      }
    507 
    508      dgd[0] = svget_neonq_s16(svld1_s16(p0, d_t + j + x + 0));
    509      dgd[1] = svget_neonq_s16(svld1_s16(p1, d_t + j + x + 8));
    510 
    511      stats_left_win5_sve(dgd, d_t + x, d_stride, sum_h);
    512 
    513      d_t += d_stride;
    514    } while (--y);
    515 
    516    int64x2_t sum_h01 = vpaddq_s64(sum_h[0], sum_h[1]);
    517    int64x2_t sum_h23 = vpaddq_s64(sum_h[2], sum_h[3]);
    518    vst1_s64(&H[1 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h01));
    519    vst1_s64(&H[2 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h01));
    520    vst1_s64(&H[3 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h23));
    521    vst1_s64(&H[4 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h23));
    522 
    523  } while (++j < wiener_win);
    524 
    525  // Step 3: Derive the top edge of each triangle along the diagonal. No
    526  // triangle in top row.
    527  {
    528    const int16_t *d_t = d;
    529 
    530    if (height % 2) {
    531      int32x4_t deltas[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
    532      int32x4_t deltas_tr[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
    533      int16x8_t ds[WIENER_WIN * 2];
    534 
    535      load_s16_8x4(d_t, d_stride, &ds[0], &ds[2], &ds[4], &ds[6]);
    536      load_s16_8x4(d_t + width, d_stride, &ds[1], &ds[3], &ds[5], &ds[7]);
    537      d_t += 4 * d_stride;
    538 
    539      step3_win5_oneline_neon(&d_t, d_stride, width, height, ds, deltas);
    540      transpose_arrays_s32_8x8(deltas, deltas_tr);
    541 
    542      update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
    543                          deltas_tr[0], vgetq_lane_s32(deltas_tr[4], 0),
    544                          H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
    545 
    546      update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
    547                          deltas_tr[1], vgetq_lane_s32(deltas_tr[5], 0),
    548                          H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
    549 
    550      update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
    551                          deltas_tr[2], vgetq_lane_s32(deltas_tr[6], 0),
    552                          H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
    553 
    554      update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
    555                          deltas_tr[3], vgetq_lane_s32(deltas_tr[7], 0),
    556                          H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
    557 
    558    } else {
    559      int32x4_t deltas[WIENER_WIN_CHROMA * 2] = { vdupq_n_s32(0) };
    560      int16x8_t ds[WIENER_WIN_CHROMA * 2];
    561 
    562      ds[0] = load_unaligned_s16_4x2(d_t + 0 * d_stride, width);
    563      ds[1] = load_unaligned_s16_4x2(d_t + 1 * d_stride, width);
    564      ds[2] = load_unaligned_s16_4x2(d_t + 2 * d_stride, width);
    565      ds[3] = load_unaligned_s16_4x2(d_t + 3 * d_stride, width);
    566 
    567      step3_win5_neon(d_t + 4 * d_stride, d_stride, width, height, ds, deltas);
    568 
    569      transpose_elems_inplace_s32_4x4(&deltas[0], &deltas[1], &deltas[2],
    570                                      &deltas[3]);
    571 
    572      update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
    573                          deltas[0], vgetq_lane_s32(deltas[4], 0),
    574                          H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
    575 
    576      update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
    577                          deltas[1], vgetq_lane_s32(deltas[4], 1),
    578                          H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
    579 
    580      update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
    581                          deltas[2], vgetq_lane_s32(deltas[4], 2),
    582                          H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
    583 
    584      update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
    585                          deltas[3], vgetq_lane_s32(deltas[4], 3),
    586                          H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
    587    }
    588  }
    589 
    590  // Step 4: Derive the top and left edge of each square. No square in top and
    591  // bottom row.
    592  {
    593    y = h8;
    594 
    595    int16x4_t d_s[12];
    596    int16x4_t d_e[12];
    597    const int16_t *d_t = d;
    598    int16x4_t zeros = vdup_n_s16(0);
    599    load_s16_4x4(d_t, d_stride, &d_s[0], &d_s[1], &d_s[2], &d_s[3]);
    600    load_s16_4x4(d_t + width, d_stride, &d_e[0], &d_e[1], &d_e[2], &d_e[3]);
    601    int64x2_t deltas[6][18] = { { vdupq_n_s64(0) }, { vdupq_n_s64(0) } };
    602 
    603    while (y >= 8) {
    604      load_s16_4x8(d_t + 4 * d_stride, d_stride, &d_s[4], &d_s[5], &d_s[6],
    605                   &d_s[7], &d_s[8], &d_s[9], &d_s[10], &d_s[11]);
    606      load_s16_4x8(d_t + width + 4 * d_stride, d_stride, &d_e[4], &d_e[5],
    607                   &d_e[6], &d_e[7], &d_e[8], &d_e[9], &d_e[10], &d_e[11]);
    608 
    609      int16x8_t s_tr[8], e_tr[8];
    610      transpose_elems_s16_4x8(d_s[0], d_s[1], d_s[2], d_s[3], d_s[4], d_s[5],
    611                              d_s[6], d_s[7], &s_tr[0], &s_tr[1], &s_tr[2],
    612                              &s_tr[3]);
    613      transpose_elems_s16_4x8(d_s[8], d_s[9], d_s[10], d_s[11], zeros, zeros,
    614                              zeros, zeros, &s_tr[4], &s_tr[5], &s_tr[6],
    615                              &s_tr[7]);
    616 
    617      transpose_elems_s16_4x8(d_e[0], d_e[1], d_e[2], d_e[3], d_e[4], d_e[5],
    618                              d_e[6], d_e[7], &e_tr[0], &e_tr[1], &e_tr[2],
    619                              &e_tr[3]);
    620      transpose_elems_s16_4x8(d_e[8], d_e[9], d_e[10], d_e[11], zeros, zeros,
    621                              zeros, zeros, &e_tr[4], &e_tr[5], &e_tr[6],
    622                              &e_tr[7]);
    623 
    624      int16x8_t start_col0[5], start_col1[5], start_col2[5], start_col3[5];
    625      start_col0[0] = s_tr[0];
    626      start_col0[1] = vextq_s16(s_tr[0], s_tr[4], 1);
    627      start_col0[2] = vextq_s16(s_tr[0], s_tr[4], 2);
    628      start_col0[3] = vextq_s16(s_tr[0], s_tr[4], 3);
    629      start_col0[4] = vextq_s16(s_tr[0], s_tr[4], 4);
    630 
    631      start_col1[0] = s_tr[1];
    632      start_col1[1] = vextq_s16(s_tr[1], s_tr[5], 1);
    633      start_col1[2] = vextq_s16(s_tr[1], s_tr[5], 2);
    634      start_col1[3] = vextq_s16(s_tr[1], s_tr[5], 3);
    635      start_col1[4] = vextq_s16(s_tr[1], s_tr[5], 4);
    636 
    637      start_col2[0] = s_tr[2];
    638      start_col2[1] = vextq_s16(s_tr[2], s_tr[6], 1);
    639      start_col2[2] = vextq_s16(s_tr[2], s_tr[6], 2);
    640      start_col2[3] = vextq_s16(s_tr[2], s_tr[6], 3);
    641      start_col2[4] = vextq_s16(s_tr[2], s_tr[6], 4);
    642 
    643      start_col3[0] = s_tr[3];
    644      start_col3[1] = vextq_s16(s_tr[3], s_tr[7], 1);
    645      start_col3[2] = vextq_s16(s_tr[3], s_tr[7], 2);
    646      start_col3[3] = vextq_s16(s_tr[3], s_tr[7], 3);
    647      start_col3[4] = vextq_s16(s_tr[3], s_tr[7], 4);
    648 
    649      // i = 1, j = 2;
    650      sub_deltas_step4_sve(start_col0, start_col1, deltas[0]);
    651 
    652      // i = 1, j = 3;
    653      sub_deltas_step4_sve(start_col0, start_col2, deltas[1]);
    654 
    655      // i = 1, j = 4
    656      sub_deltas_step4_sve(start_col0, start_col3, deltas[2]);
    657 
    658      // i = 2, j =3
    659      sub_deltas_step4_sve(start_col1, start_col2, deltas[3]);
    660 
    661      // i = 2, j = 4
    662      sub_deltas_step4_sve(start_col1, start_col3, deltas[4]);
    663 
    664      // i = 3, j = 4
    665      sub_deltas_step4_sve(start_col2, start_col3, deltas[5]);
    666 
    667      int16x8_t end_col0[5], end_col1[5], end_col2[5], end_col3[5];
    668      end_col0[0] = e_tr[0];
    669      end_col0[1] = vextq_s16(e_tr[0], e_tr[4], 1);
    670      end_col0[2] = vextq_s16(e_tr[0], e_tr[4], 2);
    671      end_col0[3] = vextq_s16(e_tr[0], e_tr[4], 3);
    672      end_col0[4] = vextq_s16(e_tr[0], e_tr[4], 4);
    673 
    674      end_col1[0] = e_tr[1];
    675      end_col1[1] = vextq_s16(e_tr[1], e_tr[5], 1);
    676      end_col1[2] = vextq_s16(e_tr[1], e_tr[5], 2);
    677      end_col1[3] = vextq_s16(e_tr[1], e_tr[5], 3);
    678      end_col1[4] = vextq_s16(e_tr[1], e_tr[5], 4);
    679 
    680      end_col2[0] = e_tr[2];
    681      end_col2[1] = vextq_s16(e_tr[2], e_tr[6], 1);
    682      end_col2[2] = vextq_s16(e_tr[2], e_tr[6], 2);
    683      end_col2[3] = vextq_s16(e_tr[2], e_tr[6], 3);
    684      end_col2[4] = vextq_s16(e_tr[2], e_tr[6], 4);
    685 
    686      end_col3[0] = e_tr[3];
    687      end_col3[1] = vextq_s16(e_tr[3], e_tr[7], 1);
    688      end_col3[2] = vextq_s16(e_tr[3], e_tr[7], 2);
    689      end_col3[3] = vextq_s16(e_tr[3], e_tr[7], 3);
    690      end_col3[4] = vextq_s16(e_tr[3], e_tr[7], 4);
    691 
    692      // i = 1, j = 2;
    693      add_deltas_step4_sve(end_col0, end_col1, deltas[0]);
    694 
    695      // i = 1, j = 3;
    696      add_deltas_step4_sve(end_col0, end_col2, deltas[1]);
    697 
    698      // i = 1, j = 4
    699      add_deltas_step4_sve(end_col0, end_col3, deltas[2]);
    700 
    701      // i = 2, j =3
    702      add_deltas_step4_sve(end_col1, end_col2, deltas[3]);
    703 
    704      // i = 2, j = 4
    705      add_deltas_step4_sve(end_col1, end_col3, deltas[4]);
    706 
    707      // i = 3, j = 4
    708      add_deltas_step4_sve(end_col2, end_col3, deltas[5]);
    709 
    710      d_s[0] = d_s[8];
    711      d_s[1] = d_s[9];
    712      d_s[2] = d_s[10];
    713      d_s[3] = d_s[11];
    714      d_e[0] = d_e[8];
    715      d_e[1] = d_e[9];
    716      d_e[2] = d_e[10];
    717      d_e[3] = d_e[11];
    718 
    719      d_t += 8 * d_stride;
    720      y -= 8;
    721    }
    722 
    723    if (h8 != height) {
    724      const int16x8_t mask_h = vld1q_s16(&mask_16bit[16] - (height % 8));
    725 
    726      load_s16_4x8(d_t + 4 * d_stride, d_stride, &d_s[4], &d_s[5], &d_s[6],
    727                   &d_s[7], &d_s[8], &d_s[9], &d_s[10], &d_s[11]);
    728      load_s16_4x8(d_t + width + 4 * d_stride, d_stride, &d_e[4], &d_e[5],
    729                   &d_e[6], &d_e[7], &d_e[8], &d_e[9], &d_e[10], &d_e[11]);
    730      int16x8_t s_tr[8], e_tr[8];
    731      transpose_elems_s16_4x8(d_s[0], d_s[1], d_s[2], d_s[3], d_s[4], d_s[5],
    732                              d_s[6], d_s[7], &s_tr[0], &s_tr[1], &s_tr[2],
    733                              &s_tr[3]);
    734      transpose_elems_s16_4x8(d_s[8], d_s[9], d_s[10], d_s[11], zeros, zeros,
    735                              zeros, zeros, &s_tr[4], &s_tr[5], &s_tr[6],
    736                              &s_tr[7]);
    737      transpose_elems_s16_4x8(d_e[0], d_e[1], d_e[2], d_e[3], d_e[4], d_e[5],
    738                              d_e[6], d_e[7], &e_tr[0], &e_tr[1], &e_tr[2],
    739                              &e_tr[3]);
    740      transpose_elems_s16_4x8(d_e[8], d_e[9], d_e[10], d_e[11], zeros, zeros,
    741                              zeros, zeros, &e_tr[4], &e_tr[5], &e_tr[6],
    742                              &e_tr[7]);
    743 
    744      int16x8_t start_col0[5], start_col1[5], start_col2[5], start_col3[5];
    745      start_col0[0] = vandq_s16(s_tr[0], mask_h);
    746      start_col0[1] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 1), mask_h);
    747      start_col0[2] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 2), mask_h);
    748      start_col0[3] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 3), mask_h);
    749      start_col0[4] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 4), mask_h);
    750 
    751      start_col1[0] = vandq_s16(s_tr[1], mask_h);
    752      start_col1[1] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 1), mask_h);
    753      start_col1[2] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 2), mask_h);
    754      start_col1[3] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 3), mask_h);
    755      start_col1[4] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 4), mask_h);
    756 
    757      start_col2[0] = vandq_s16(s_tr[2], mask_h);
    758      start_col2[1] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 1), mask_h);
    759      start_col2[2] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 2), mask_h);
    760      start_col2[3] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 3), mask_h);
    761      start_col2[4] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 4), mask_h);
    762 
    763      start_col3[0] = vandq_s16(s_tr[3], mask_h);
    764      start_col3[1] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 1), mask_h);
    765      start_col3[2] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 2), mask_h);
    766      start_col3[3] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 3), mask_h);
    767      start_col3[4] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 4), mask_h);
    768 
    769      // i = 1, j = 2;
    770      sub_deltas_step4_sve(start_col0, start_col1, deltas[0]);
    771 
    772      // i = 1, j = 3;
    773      sub_deltas_step4_sve(start_col0, start_col2, deltas[1]);
    774 
    775      // i = 1, j = 4
    776      sub_deltas_step4_sve(start_col0, start_col3, deltas[2]);
    777 
    778      // i = 2, j = 3
    779      sub_deltas_step4_sve(start_col1, start_col2, deltas[3]);
    780 
    781      // i = 2, j = 4
    782      sub_deltas_step4_sve(start_col1, start_col3, deltas[4]);
    783 
    784      // i = 3, j = 4
    785      sub_deltas_step4_sve(start_col2, start_col3, deltas[5]);
    786 
    787      int16x8_t end_col0[5], end_col1[5], end_col2[5], end_col3[5];
    788      end_col0[0] = vandq_s16(e_tr[0], mask_h);
    789      end_col0[1] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 1), mask_h);
    790      end_col0[2] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 2), mask_h);
    791      end_col0[3] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 3), mask_h);
    792      end_col0[4] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 4), mask_h);
    793 
    794      end_col1[0] = vandq_s16(e_tr[1], mask_h);
    795      end_col1[1] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 1), mask_h);
    796      end_col1[2] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 2), mask_h);
    797      end_col1[3] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 3), mask_h);
    798      end_col1[4] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 4), mask_h);
    799 
    800      end_col2[0] = vandq_s16(e_tr[2], mask_h);
    801      end_col2[1] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 1), mask_h);
    802      end_col2[2] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 2), mask_h);
    803      end_col2[3] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 3), mask_h);
    804      end_col2[4] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 4), mask_h);
    805 
    806      end_col3[0] = vandq_s16(e_tr[3], mask_h);
    807      end_col3[1] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 1), mask_h);
    808      end_col3[2] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 2), mask_h);
    809      end_col3[3] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 3), mask_h);
    810      end_col3[4] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 4), mask_h);
    811 
    812      // i = 1, j = 2;
    813      add_deltas_step4_sve(end_col0, end_col1, deltas[0]);
    814 
    815      // i = 1, j = 3;
    816      add_deltas_step4_sve(end_col0, end_col2, deltas[1]);
    817 
    818      // i = 1, j = 4
    819      add_deltas_step4_sve(end_col0, end_col3, deltas[2]);
    820 
    821      // i = 2, j =3
    822      add_deltas_step4_sve(end_col1, end_col2, deltas[3]);
    823 
    824      // i = 2, j = 4
    825      add_deltas_step4_sve(end_col1, end_col3, deltas[4]);
    826 
    827      // i = 3, j = 4
    828      add_deltas_step4_sve(end_col2, end_col3, deltas[5]);
    829    }
    830 
    831    int64_t single_delta[6];
    832 
    833    deltas[0][0] = vpaddq_s64(deltas[0][0], deltas[0][1]);
    834    deltas[0][1] = vpaddq_s64(deltas[0][2], deltas[0][3]);
    835    deltas[1][0] = vpaddq_s64(deltas[1][0], deltas[1][1]);
    836    deltas[1][1] = vpaddq_s64(deltas[1][2], deltas[1][3]);
    837    deltas[2][0] = vpaddq_s64(deltas[2][0], deltas[2][1]);
    838    deltas[2][1] = vpaddq_s64(deltas[2][2], deltas[2][3]);
    839    deltas[3][0] = vpaddq_s64(deltas[3][0], deltas[3][1]);
    840    deltas[3][1] = vpaddq_s64(deltas[3][2], deltas[3][3]);
    841    deltas[4][0] = vpaddq_s64(deltas[4][0], deltas[4][1]);
    842    deltas[4][1] = vpaddq_s64(deltas[4][2], deltas[4][3]);
    843    deltas[5][0] = vpaddq_s64(deltas[5][0], deltas[5][1]);
    844    deltas[5][1] = vpaddq_s64(deltas[5][2], deltas[5][3]);
    845 
    846    deltas[0][5] = vpaddq_s64(deltas[0][5], deltas[0][6]);
    847    deltas[0][7] = vpaddq_s64(deltas[0][7], deltas[0][8]);
    848    deltas[1][5] = vpaddq_s64(deltas[1][5], deltas[1][6]);
    849    deltas[1][7] = vpaddq_s64(deltas[1][7], deltas[1][8]);
    850    deltas[2][5] = vpaddq_s64(deltas[2][5], deltas[2][6]);
    851    deltas[2][7] = vpaddq_s64(deltas[2][7], deltas[2][8]);
    852    deltas[3][5] = vpaddq_s64(deltas[3][5], deltas[3][6]);
    853    deltas[3][7] = vpaddq_s64(deltas[3][7], deltas[3][8]);
    854    deltas[4][5] = vpaddq_s64(deltas[4][5], deltas[4][6]);
    855    deltas[4][7] = vpaddq_s64(deltas[4][7], deltas[4][8]);
    856    deltas[5][5] = vpaddq_s64(deltas[5][5], deltas[5][6]);
    857    deltas[5][7] = vpaddq_s64(deltas[5][7], deltas[5][8]);
    858 
    859    vst1q_s64(single_delta + 0, vpaddq_s64(deltas[0][4], deltas[1][4]));
    860    vst1q_s64(single_delta + 2, vpaddq_s64(deltas[2][4], deltas[3][4]));
    861    vst1q_s64(single_delta + 4, vpaddq_s64(deltas[4][4], deltas[5][4]));
    862 
    863    int idx = 0;
    864    for (i = 1; i < wiener_win - 1; i++) {
    865      for (j = i + 1; j < wiener_win; j++) {
    866        update_4_stats_sve(
    867            H + (i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win,
    868            deltas[idx], H + i * wiener_win * wiener_win2 + j * wiener_win);
    869        H[i * wiener_win * wiener_win2 + j * wiener_win + 4] =
    870            H[(i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win + 4] +
    871            single_delta[idx];
    872 
    873        H[(i * wiener_win + 1) * wiener_win2 + j * wiener_win] =
    874            H[((i - 1) * wiener_win + 1) * wiener_win2 + (j - 1) * wiener_win] +
    875            vgetq_lane_s64(deltas[idx][5], 0);
    876        H[(i * wiener_win + 2) * wiener_win2 + j * wiener_win] =
    877            H[((i - 1) * wiener_win + 2) * wiener_win2 + (j - 1) * wiener_win] +
    878            vgetq_lane_s64(deltas[idx][5], 1);
    879        H[(i * wiener_win + 3) * wiener_win2 + j * wiener_win] =
    880            H[((i - 1) * wiener_win + 3) * wiener_win2 + (j - 1) * wiener_win] +
    881            vgetq_lane_s64(deltas[idx][7], 0);
    882        H[(i * wiener_win + 4) * wiener_win2 + j * wiener_win] =
    883            H[((i - 1) * wiener_win + 4) * wiener_win2 + (j - 1) * wiener_win] +
    884            vgetq_lane_s64(deltas[idx][7], 1);
    885 
    886        idx++;
    887      }
    888    }
    889  }
    890 
    891  // Step 5: Derive other points of each square. No square in bottom row.
    892  i = 0;
    893  do {
    894    const int16_t *const di = d + i;
    895 
    896    j = i + 1;
    897    do {
    898      const int16_t *const dj = d + j;
    899      int64x2_t deltas[WIENER_WIN_CHROMA - 1][WIENER_WIN_CHROMA - 1] = {
    900        { vdupq_n_s64(0) }, { vdupq_n_s64(0) }
    901      };
    902      int16x8_t d_is[WIN_CHROMA], d_ie[WIN_CHROMA];
    903      int16x8_t d_js[WIN_CHROMA], d_je[WIN_CHROMA];
    904 
    905      x = 0;
    906      while (x < width - 16) {
    907        load_square_win5_neon(di + x, dj + x, d_stride, height, d_is, d_ie,
    908                              d_js, d_je);
    909        derive_square_win5_sve(d_is, d_ie, d_js, d_je, deltas);
    910        x += 16;
    911      }
    912 
    913      load_square_win5_sve(di + x, dj + x, d_stride, height, d_is, d_ie, d_js,
    914                           d_je, p0, p1);
    915      derive_square_win5_sve(d_is, d_ie, d_js, d_je, deltas);
    916 
    917      hadd_update_4_stats_sve(
    918          H + (i * wiener_win + 0) * wiener_win2 + j * wiener_win, deltas[0],
    919          H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win + 1);
    920      hadd_update_4_stats_sve(
    921          H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win, deltas[1],
    922          H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win + 1);
    923      hadd_update_4_stats_sve(
    924          H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win, deltas[2],
    925          H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win + 1);
    926      hadd_update_4_stats_sve(
    927          H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win, deltas[3],
    928          H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win + 1);
    929    } while (++j < wiener_win);
    930  } while (++i < wiener_win - 1);
    931 
    932  // Step 6: Derive other points of each upper triangle along the diagonal.
    933  i = 0;
    934  do {
    935    const int16_t *const di = d + i;
    936    int64x2_t deltas[WIENER_WIN_CHROMA * 2 + 1] = { vdupq_n_s64(0) };
    937    int16x8_t d_is[WIN_CHROMA], d_ie[WIN_CHROMA];
    938 
    939    x = 0;
    940    while (x < width - 16) {
    941      load_triangle_win5_neon(di + x, d_stride, height, d_is, d_ie);
    942      derive_triangle_win5_sve(d_is, d_ie, deltas);
    943      x += 16;
    944    }
    945 
    946    load_triangle_win5_sve(di + x, d_stride, height, d_is, d_ie, p0, p1);
    947    derive_triangle_win5_sve(d_is, d_ie, deltas);
    948 
    949    // Row 1: 4 points
    950    hadd_update_4_stats_sve(
    951        H + (i * wiener_win + 0) * wiener_win2 + i * wiener_win, deltas,
    952        H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1);
    953 
    954    // Row 2: 3 points
    955    int64x2_t src0 =
    956        vld1q_s64(H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1);
    957    vst1q_s64(H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2,
    958              vaddq_s64(src0, vpaddq_s64(deltas[4], deltas[5])));
    959 
    960    int64x2_t deltas69 = vpaddq_s64(deltas[6], deltas[9]);
    961 
    962    H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 4] =
    963        H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 3] +
    964        vgetq_lane_s64(deltas69, 0);
    965 
    966    // Row 3: 2 points
    967    int64x2_t src1 =
    968        vld1q_s64(H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2);
    969    vst1q_s64(H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3,
    970              vaddq_s64(src1, vpaddq_s64(deltas[7], deltas[8])));
    971 
    972    // Row 4: 1 point
    973    H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4] =
    974        H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3] +
    975        vgetq_lane_s64(deltas69, 1);
    976  } while (++i < wiener_win);
    977 }
    978 
    979 static inline void stats_top_win7_sve(const int16x8_t src[2],
    980                                      const int16x8_t dgd[2],
    981                                      const int16_t *const d,
    982                                      const int32_t d_stride, int64x2_t *sum_m,
    983                                      int64x2_t *sum_h) {
    984  int16x8_t dgds[WIENER_WIN * 2];
    985 
    986  load_s16_8x7(d + 0, d_stride, &dgds[0], &dgds[2], &dgds[4], &dgds[6],
    987               &dgds[8], &dgds[10], &dgds[12]);
    988  load_s16_8x7(d + 8, d_stride, &dgds[1], &dgds[3], &dgds[5], &dgds[7],
    989               &dgds[9], &dgds[11], &dgds[13]);
    990 
    991  sum_m[0] = aom_sdotq_s16(sum_m[0], src[0], dgds[0]);
    992  sum_m[0] = aom_sdotq_s16(sum_m[0], src[1], dgds[1]);
    993  sum_m[1] = aom_sdotq_s16(sum_m[1], src[0], dgds[2]);
    994  sum_m[1] = aom_sdotq_s16(sum_m[1], src[1], dgds[3]);
    995  sum_m[2] = aom_sdotq_s16(sum_m[2], src[0], dgds[4]);
    996  sum_m[2] = aom_sdotq_s16(sum_m[2], src[1], dgds[5]);
    997  sum_m[3] = aom_sdotq_s16(sum_m[3], src[0], dgds[6]);
    998  sum_m[3] = aom_sdotq_s16(sum_m[3], src[1], dgds[7]);
    999  sum_m[4] = aom_sdotq_s16(sum_m[4], src[0], dgds[8]);
   1000  sum_m[4] = aom_sdotq_s16(sum_m[4], src[1], dgds[9]);
   1001  sum_m[5] = aom_sdotq_s16(sum_m[5], src[0], dgds[10]);
   1002  sum_m[5] = aom_sdotq_s16(sum_m[5], src[1], dgds[11]);
   1003  sum_m[6] = aom_sdotq_s16(sum_m[6], src[0], dgds[12]);
   1004  sum_m[6] = aom_sdotq_s16(sum_m[6], src[1], dgds[13]);
   1005 
   1006  sum_h[0] = aom_sdotq_s16(sum_h[0], dgd[0], dgds[0]);
   1007  sum_h[0] = aom_sdotq_s16(sum_h[0], dgd[1], dgds[1]);
   1008  sum_h[1] = aom_sdotq_s16(sum_h[1], dgd[0], dgds[2]);
   1009  sum_h[1] = aom_sdotq_s16(sum_h[1], dgd[1], dgds[3]);
   1010  sum_h[2] = aom_sdotq_s16(sum_h[2], dgd[0], dgds[4]);
   1011  sum_h[2] = aom_sdotq_s16(sum_h[2], dgd[1], dgds[5]);
   1012  sum_h[3] = aom_sdotq_s16(sum_h[3], dgd[0], dgds[6]);
   1013  sum_h[3] = aom_sdotq_s16(sum_h[3], dgd[1], dgds[7]);
   1014  sum_h[4] = aom_sdotq_s16(sum_h[4], dgd[0], dgds[8]);
   1015  sum_h[4] = aom_sdotq_s16(sum_h[4], dgd[1], dgds[9]);
   1016  sum_h[5] = aom_sdotq_s16(sum_h[5], dgd[0], dgds[10]);
   1017  sum_h[5] = aom_sdotq_s16(sum_h[5], dgd[1], dgds[11]);
   1018  sum_h[6] = aom_sdotq_s16(sum_h[6], dgd[0], dgds[12]);
   1019  sum_h[6] = aom_sdotq_s16(sum_h[6], dgd[1], dgds[13]);
   1020 }
   1021 
   1022 static inline void stats_left_win7_sve(const int16x8_t src[2], const int16_t *d,
   1023                                       const int32_t d_stride, int64x2_t *sum) {
   1024  int16x8_t dgds[WIN_7];
   1025 
   1026  load_s16_8x6(d + d_stride + 0, d_stride, &dgds[0], &dgds[2], &dgds[4],
   1027               &dgds[6], &dgds[8], &dgds[10]);
   1028  load_s16_8x6(d + d_stride + 8, d_stride, &dgds[1], &dgds[3], &dgds[5],
   1029               &dgds[7], &dgds[9], &dgds[11]);
   1030 
   1031  sum[0] = aom_sdotq_s16(sum[0], src[0], dgds[0]);
   1032  sum[0] = aom_sdotq_s16(sum[0], src[1], dgds[1]);
   1033  sum[1] = aom_sdotq_s16(sum[1], src[0], dgds[2]);
   1034  sum[1] = aom_sdotq_s16(sum[1], src[1], dgds[3]);
   1035  sum[2] = aom_sdotq_s16(sum[2], src[0], dgds[4]);
   1036  sum[2] = aom_sdotq_s16(sum[2], src[1], dgds[5]);
   1037  sum[3] = aom_sdotq_s16(sum[3], src[0], dgds[6]);
   1038  sum[3] = aom_sdotq_s16(sum[3], src[1], dgds[7]);
   1039  sum[4] = aom_sdotq_s16(sum[4], src[0], dgds[8]);
   1040  sum[4] = aom_sdotq_s16(sum[4], src[1], dgds[9]);
   1041  sum[5] = aom_sdotq_s16(sum[5], src[0], dgds[10]);
   1042  sum[5] = aom_sdotq_s16(sum[5], src[1], dgds[11]);
   1043 }
   1044 
   1045 static inline void load_square_win7_sve(
   1046    const int16_t *const di, const int16_t *const dj, const int32_t d_stride,
   1047    const int32_t height, int16x8_t *d_is, int16x8_t *d_ie, int16x8_t *d_js,
   1048    int16x8_t *d_je, svbool_t p0, svbool_t p1) {
   1049  d_is[0] = svget_neonq_s16(svld1_s16(p0, di + 0 * d_stride + 0));
   1050  d_is[1] = svget_neonq_s16(svld1_s16(p1, di + 0 * d_stride + 8));
   1051  d_is[2] = svget_neonq_s16(svld1_s16(p0, di + 1 * d_stride + 0));
   1052  d_is[3] = svget_neonq_s16(svld1_s16(p1, di + 1 * d_stride + 8));
   1053  d_is[4] = svget_neonq_s16(svld1_s16(p0, di + 2 * d_stride + 0));
   1054  d_is[5] = svget_neonq_s16(svld1_s16(p1, di + 2 * d_stride + 8));
   1055  d_is[6] = svget_neonq_s16(svld1_s16(p0, di + 3 * d_stride + 0));
   1056  d_is[7] = svget_neonq_s16(svld1_s16(p1, di + 3 * d_stride + 8));
   1057  d_is[8] = svget_neonq_s16(svld1_s16(p0, di + 4 * d_stride + 0));
   1058  d_is[9] = svget_neonq_s16(svld1_s16(p1, di + 4 * d_stride + 8));
   1059  d_is[10] = svget_neonq_s16(svld1_s16(p0, di + 5 * d_stride + 0));
   1060  d_is[11] = svget_neonq_s16(svld1_s16(p1, di + 5 * d_stride + 8));
   1061 
   1062  d_ie[0] = svget_neonq_s16(svld1_s16(p0, di + (height + 0) * d_stride + 0));
   1063  d_ie[1] = svget_neonq_s16(svld1_s16(p1, di + (height + 0) * d_stride + 8));
   1064  d_ie[2] = svget_neonq_s16(svld1_s16(p0, di + (height + 1) * d_stride + 0));
   1065  d_ie[3] = svget_neonq_s16(svld1_s16(p1, di + (height + 1) * d_stride + 8));
   1066  d_ie[4] = svget_neonq_s16(svld1_s16(p0, di + (height + 2) * d_stride + 0));
   1067  d_ie[5] = svget_neonq_s16(svld1_s16(p1, di + (height + 2) * d_stride + 8));
   1068  d_ie[6] = svget_neonq_s16(svld1_s16(p0, di + (height + 3) * d_stride + 0));
   1069  d_ie[7] = svget_neonq_s16(svld1_s16(p1, di + (height + 3) * d_stride + 8));
   1070  d_ie[8] = svget_neonq_s16(svld1_s16(p0, di + (height + 4) * d_stride + 0));
   1071  d_ie[9] = svget_neonq_s16(svld1_s16(p1, di + (height + 4) * d_stride + 8));
   1072  d_ie[10] = svget_neonq_s16(svld1_s16(p0, di + (height + 5) * d_stride + 0));
   1073  d_ie[11] = svget_neonq_s16(svld1_s16(p1, di + (height + 5) * d_stride + 8));
   1074 
   1075  load_s16_8x6(dj + 0, d_stride, &d_js[0], &d_js[2], &d_js[4], &d_js[6],
   1076               &d_js[8], &d_js[10]);
   1077  load_s16_8x6(dj + 8, d_stride, &d_js[1], &d_js[3], &d_js[5], &d_js[7],
   1078               &d_js[9], &d_js[11]);
   1079  load_s16_8x6(dj + height * d_stride + 0, d_stride, &d_je[0], &d_je[2],
   1080               &d_je[4], &d_je[6], &d_je[8], &d_je[10]);
   1081  load_s16_8x6(dj + height * d_stride + 8, d_stride, &d_je[1], &d_je[3],
   1082               &d_je[5], &d_je[7], &d_je[9], &d_je[11]);
   1083 }
   1084 
   1085 static inline void derive_square_win7_sve(int16x8_t *d_is,
   1086                                          const int16x8_t *d_ie,
   1087                                          const int16x8_t *d_js,
   1088                                          const int16x8_t *d_je,
   1089                                          int64x2_t deltas[][WIN_7]) {
   1090  d_is[0] = vnegq_s16(d_is[0]);
   1091  d_is[1] = vnegq_s16(d_is[1]);
   1092  d_is[2] = vnegq_s16(d_is[2]);
   1093  d_is[3] = vnegq_s16(d_is[3]);
   1094  d_is[4] = vnegq_s16(d_is[4]);
   1095  d_is[5] = vnegq_s16(d_is[5]);
   1096  d_is[6] = vnegq_s16(d_is[6]);
   1097  d_is[7] = vnegq_s16(d_is[7]);
   1098  d_is[8] = vnegq_s16(d_is[8]);
   1099  d_is[9] = vnegq_s16(d_is[9]);
   1100  d_is[10] = vnegq_s16(d_is[10]);
   1101  d_is[11] = vnegq_s16(d_is[11]);
   1102 
   1103  deltas[0][0] = aom_sdotq_s16(deltas[0][0], d_is[0], d_js[0]);
   1104  deltas[0][0] = aom_sdotq_s16(deltas[0][0], d_is[1], d_js[1]);
   1105  deltas[0][1] = aom_sdotq_s16(deltas[0][1], d_is[0], d_js[2]);
   1106  deltas[0][1] = aom_sdotq_s16(deltas[0][1], d_is[1], d_js[3]);
   1107  deltas[0][2] = aom_sdotq_s16(deltas[0][2], d_is[0], d_js[4]);
   1108  deltas[0][2] = aom_sdotq_s16(deltas[0][2], d_is[1], d_js[5]);
   1109  deltas[0][3] = aom_sdotq_s16(deltas[0][3], d_is[0], d_js[6]);
   1110  deltas[0][3] = aom_sdotq_s16(deltas[0][3], d_is[1], d_js[7]);
   1111  deltas[0][4] = aom_sdotq_s16(deltas[0][4], d_is[0], d_js[8]);
   1112  deltas[0][4] = aom_sdotq_s16(deltas[0][4], d_is[1], d_js[9]);
   1113  deltas[0][5] = aom_sdotq_s16(deltas[0][5], d_is[0], d_js[10]);
   1114  deltas[0][5] = aom_sdotq_s16(deltas[0][5], d_is[1], d_js[11]);
   1115 
   1116  deltas[1][0] = aom_sdotq_s16(deltas[1][0], d_is[2], d_js[0]);
   1117  deltas[1][0] = aom_sdotq_s16(deltas[1][0], d_is[3], d_js[1]);
   1118  deltas[1][1] = aom_sdotq_s16(deltas[1][1], d_is[2], d_js[2]);
   1119  deltas[1][1] = aom_sdotq_s16(deltas[1][1], d_is[3], d_js[3]);
   1120  deltas[1][2] = aom_sdotq_s16(deltas[1][2], d_is[2], d_js[4]);
   1121  deltas[1][2] = aom_sdotq_s16(deltas[1][2], d_is[3], d_js[5]);
   1122  deltas[1][3] = aom_sdotq_s16(deltas[1][3], d_is[2], d_js[6]);
   1123  deltas[1][3] = aom_sdotq_s16(deltas[1][3], d_is[3], d_js[7]);
   1124  deltas[1][4] = aom_sdotq_s16(deltas[1][4], d_is[2], d_js[8]);
   1125  deltas[1][4] = aom_sdotq_s16(deltas[1][4], d_is[3], d_js[9]);
   1126  deltas[1][5] = aom_sdotq_s16(deltas[1][5], d_is[2], d_js[10]);
   1127  deltas[1][5] = aom_sdotq_s16(deltas[1][5], d_is[3], d_js[11]);
   1128 
   1129  deltas[2][0] = aom_sdotq_s16(deltas[2][0], d_is[4], d_js[0]);
   1130  deltas[2][0] = aom_sdotq_s16(deltas[2][0], d_is[5], d_js[1]);
   1131  deltas[2][1] = aom_sdotq_s16(deltas[2][1], d_is[4], d_js[2]);
   1132  deltas[2][1] = aom_sdotq_s16(deltas[2][1], d_is[5], d_js[3]);
   1133  deltas[2][2] = aom_sdotq_s16(deltas[2][2], d_is[4], d_js[4]);
   1134  deltas[2][2] = aom_sdotq_s16(deltas[2][2], d_is[5], d_js[5]);
   1135  deltas[2][3] = aom_sdotq_s16(deltas[2][3], d_is[4], d_js[6]);
   1136  deltas[2][3] = aom_sdotq_s16(deltas[2][3], d_is[5], d_js[7]);
   1137  deltas[2][4] = aom_sdotq_s16(deltas[2][4], d_is[4], d_js[8]);
   1138  deltas[2][4] = aom_sdotq_s16(deltas[2][4], d_is[5], d_js[9]);
   1139  deltas[2][5] = aom_sdotq_s16(deltas[2][5], d_is[4], d_js[10]);
   1140  deltas[2][5] = aom_sdotq_s16(deltas[2][5], d_is[5], d_js[11]);
   1141 
   1142  deltas[3][0] = aom_sdotq_s16(deltas[3][0], d_is[6], d_js[0]);
   1143  deltas[3][0] = aom_sdotq_s16(deltas[3][0], d_is[7], d_js[1]);
   1144  deltas[3][1] = aom_sdotq_s16(deltas[3][1], d_is[6], d_js[2]);
   1145  deltas[3][1] = aom_sdotq_s16(deltas[3][1], d_is[7], d_js[3]);
   1146  deltas[3][2] = aom_sdotq_s16(deltas[3][2], d_is[6], d_js[4]);
   1147  deltas[3][2] = aom_sdotq_s16(deltas[3][2], d_is[7], d_js[5]);
   1148  deltas[3][3] = aom_sdotq_s16(deltas[3][3], d_is[6], d_js[6]);
   1149  deltas[3][3] = aom_sdotq_s16(deltas[3][3], d_is[7], d_js[7]);
   1150  deltas[3][4] = aom_sdotq_s16(deltas[3][4], d_is[6], d_js[8]);
   1151  deltas[3][4] = aom_sdotq_s16(deltas[3][4], d_is[7], d_js[9]);
   1152  deltas[3][5] = aom_sdotq_s16(deltas[3][5], d_is[6], d_js[10]);
   1153  deltas[3][5] = aom_sdotq_s16(deltas[3][5], d_is[7], d_js[11]);
   1154 
   1155  deltas[4][0] = aom_sdotq_s16(deltas[4][0], d_is[8], d_js[0]);
   1156  deltas[4][0] = aom_sdotq_s16(deltas[4][0], d_is[9], d_js[1]);
   1157  deltas[4][1] = aom_sdotq_s16(deltas[4][1], d_is[8], d_js[2]);
   1158  deltas[4][1] = aom_sdotq_s16(deltas[4][1], d_is[9], d_js[3]);
   1159  deltas[4][2] = aom_sdotq_s16(deltas[4][2], d_is[8], d_js[4]);
   1160  deltas[4][2] = aom_sdotq_s16(deltas[4][2], d_is[9], d_js[5]);
   1161  deltas[4][3] = aom_sdotq_s16(deltas[4][3], d_is[8], d_js[6]);
   1162  deltas[4][3] = aom_sdotq_s16(deltas[4][3], d_is[9], d_js[7]);
   1163  deltas[4][4] = aom_sdotq_s16(deltas[4][4], d_is[8], d_js[8]);
   1164  deltas[4][4] = aom_sdotq_s16(deltas[4][4], d_is[9], d_js[9]);
   1165  deltas[4][5] = aom_sdotq_s16(deltas[4][5], d_is[8], d_js[10]);
   1166  deltas[4][5] = aom_sdotq_s16(deltas[4][5], d_is[9], d_js[11]);
   1167 
   1168  deltas[5][0] = aom_sdotq_s16(deltas[5][0], d_is[10], d_js[0]);
   1169  deltas[5][0] = aom_sdotq_s16(deltas[5][0], d_is[11], d_js[1]);
   1170  deltas[5][1] = aom_sdotq_s16(deltas[5][1], d_is[10], d_js[2]);
   1171  deltas[5][1] = aom_sdotq_s16(deltas[5][1], d_is[11], d_js[3]);
   1172  deltas[5][2] = aom_sdotq_s16(deltas[5][2], d_is[10], d_js[4]);
   1173  deltas[5][2] = aom_sdotq_s16(deltas[5][2], d_is[11], d_js[5]);
   1174  deltas[5][3] = aom_sdotq_s16(deltas[5][3], d_is[10], d_js[6]);
   1175  deltas[5][3] = aom_sdotq_s16(deltas[5][3], d_is[11], d_js[7]);
   1176  deltas[5][4] = aom_sdotq_s16(deltas[5][4], d_is[10], d_js[8]);
   1177  deltas[5][4] = aom_sdotq_s16(deltas[5][4], d_is[11], d_js[9]);
   1178  deltas[5][5] = aom_sdotq_s16(deltas[5][5], d_is[10], d_js[10]);
   1179  deltas[5][5] = aom_sdotq_s16(deltas[5][5], d_is[11], d_js[11]);
   1180 
   1181  deltas[0][0] = aom_sdotq_s16(deltas[0][0], d_ie[0], d_je[0]);
   1182  deltas[0][0] = aom_sdotq_s16(deltas[0][0], d_ie[1], d_je[1]);
   1183  deltas[0][1] = aom_sdotq_s16(deltas[0][1], d_ie[0], d_je[2]);
   1184  deltas[0][1] = aom_sdotq_s16(deltas[0][1], d_ie[1], d_je[3]);
   1185  deltas[0][2] = aom_sdotq_s16(deltas[0][2], d_ie[0], d_je[4]);
   1186  deltas[0][2] = aom_sdotq_s16(deltas[0][2], d_ie[1], d_je[5]);
   1187  deltas[0][3] = aom_sdotq_s16(deltas[0][3], d_ie[0], d_je[6]);
   1188  deltas[0][3] = aom_sdotq_s16(deltas[0][3], d_ie[1], d_je[7]);
   1189  deltas[0][4] = aom_sdotq_s16(deltas[0][4], d_ie[0], d_je[8]);
   1190  deltas[0][4] = aom_sdotq_s16(deltas[0][4], d_ie[1], d_je[9]);
   1191  deltas[0][5] = aom_sdotq_s16(deltas[0][5], d_ie[0], d_je[10]);
   1192  deltas[0][5] = aom_sdotq_s16(deltas[0][5], d_ie[1], d_je[11]);
   1193 
   1194  deltas[1][0] = aom_sdotq_s16(deltas[1][0], d_ie[2], d_je[0]);
   1195  deltas[1][0] = aom_sdotq_s16(deltas[1][0], d_ie[3], d_je[1]);
   1196  deltas[1][1] = aom_sdotq_s16(deltas[1][1], d_ie[2], d_je[2]);
   1197  deltas[1][1] = aom_sdotq_s16(deltas[1][1], d_ie[3], d_je[3]);
   1198  deltas[1][2] = aom_sdotq_s16(deltas[1][2], d_ie[2], d_je[4]);
   1199  deltas[1][2] = aom_sdotq_s16(deltas[1][2], d_ie[3], d_je[5]);
   1200  deltas[1][3] = aom_sdotq_s16(deltas[1][3], d_ie[2], d_je[6]);
   1201  deltas[1][3] = aom_sdotq_s16(deltas[1][3], d_ie[3], d_je[7]);
   1202  deltas[1][4] = aom_sdotq_s16(deltas[1][4], d_ie[2], d_je[8]);
   1203  deltas[1][4] = aom_sdotq_s16(deltas[1][4], d_ie[3], d_je[9]);
   1204  deltas[1][5] = aom_sdotq_s16(deltas[1][5], d_ie[2], d_je[10]);
   1205  deltas[1][5] = aom_sdotq_s16(deltas[1][5], d_ie[3], d_je[11]);
   1206 
   1207  deltas[2][0] = aom_sdotq_s16(deltas[2][0], d_ie[4], d_je[0]);
   1208  deltas[2][0] = aom_sdotq_s16(deltas[2][0], d_ie[5], d_je[1]);
   1209  deltas[2][1] = aom_sdotq_s16(deltas[2][1], d_ie[4], d_je[2]);
   1210  deltas[2][1] = aom_sdotq_s16(deltas[2][1], d_ie[5], d_je[3]);
   1211  deltas[2][2] = aom_sdotq_s16(deltas[2][2], d_ie[4], d_je[4]);
   1212  deltas[2][2] = aom_sdotq_s16(deltas[2][2], d_ie[5], d_je[5]);
   1213  deltas[2][3] = aom_sdotq_s16(deltas[2][3], d_ie[4], d_je[6]);
   1214  deltas[2][3] = aom_sdotq_s16(deltas[2][3], d_ie[5], d_je[7]);
   1215  deltas[2][4] = aom_sdotq_s16(deltas[2][4], d_ie[4], d_je[8]);
   1216  deltas[2][4] = aom_sdotq_s16(deltas[2][4], d_ie[5], d_je[9]);
   1217  deltas[2][5] = aom_sdotq_s16(deltas[2][5], d_ie[4], d_je[10]);
   1218  deltas[2][5] = aom_sdotq_s16(deltas[2][5], d_ie[5], d_je[11]);
   1219 
   1220  deltas[3][0] = aom_sdotq_s16(deltas[3][0], d_ie[6], d_je[0]);
   1221  deltas[3][0] = aom_sdotq_s16(deltas[3][0], d_ie[7], d_je[1]);
   1222  deltas[3][1] = aom_sdotq_s16(deltas[3][1], d_ie[6], d_je[2]);
   1223  deltas[3][1] = aom_sdotq_s16(deltas[3][1], d_ie[7], d_je[3]);
   1224  deltas[3][2] = aom_sdotq_s16(deltas[3][2], d_ie[6], d_je[4]);
   1225  deltas[3][2] = aom_sdotq_s16(deltas[3][2], d_ie[7], d_je[5]);
   1226  deltas[3][3] = aom_sdotq_s16(deltas[3][3], d_ie[6], d_je[6]);
   1227  deltas[3][3] = aom_sdotq_s16(deltas[3][3], d_ie[7], d_je[7]);
   1228  deltas[3][4] = aom_sdotq_s16(deltas[3][4], d_ie[6], d_je[8]);
   1229  deltas[3][4] = aom_sdotq_s16(deltas[3][4], d_ie[7], d_je[9]);
   1230  deltas[3][5] = aom_sdotq_s16(deltas[3][5], d_ie[6], d_je[10]);
   1231  deltas[3][5] = aom_sdotq_s16(deltas[3][5], d_ie[7], d_je[11]);
   1232 
   1233  deltas[4][0] = aom_sdotq_s16(deltas[4][0], d_ie[8], d_je[0]);
   1234  deltas[4][0] = aom_sdotq_s16(deltas[4][0], d_ie[9], d_je[1]);
   1235  deltas[4][1] = aom_sdotq_s16(deltas[4][1], d_ie[8], d_je[2]);
   1236  deltas[4][1] = aom_sdotq_s16(deltas[4][1], d_ie[9], d_je[3]);
   1237  deltas[4][2] = aom_sdotq_s16(deltas[4][2], d_ie[8], d_je[4]);
   1238  deltas[4][2] = aom_sdotq_s16(deltas[4][2], d_ie[9], d_je[5]);
   1239  deltas[4][3] = aom_sdotq_s16(deltas[4][3], d_ie[8], d_je[6]);
   1240  deltas[4][3] = aom_sdotq_s16(deltas[4][3], d_ie[9], d_je[7]);
   1241  deltas[4][4] = aom_sdotq_s16(deltas[4][4], d_ie[8], d_je[8]);
   1242  deltas[4][4] = aom_sdotq_s16(deltas[4][4], d_ie[9], d_je[9]);
   1243  deltas[4][5] = aom_sdotq_s16(deltas[4][5], d_ie[8], d_je[10]);
   1244  deltas[4][5] = aom_sdotq_s16(deltas[4][5], d_ie[9], d_je[11]);
   1245 
   1246  deltas[5][0] = aom_sdotq_s16(deltas[5][0], d_ie[10], d_je[0]);
   1247  deltas[5][0] = aom_sdotq_s16(deltas[5][0], d_ie[11], d_je[1]);
   1248  deltas[5][1] = aom_sdotq_s16(deltas[5][1], d_ie[10], d_je[2]);
   1249  deltas[5][1] = aom_sdotq_s16(deltas[5][1], d_ie[11], d_je[3]);
   1250  deltas[5][2] = aom_sdotq_s16(deltas[5][2], d_ie[10], d_je[4]);
   1251  deltas[5][2] = aom_sdotq_s16(deltas[5][2], d_ie[11], d_je[5]);
   1252  deltas[5][3] = aom_sdotq_s16(deltas[5][3], d_ie[10], d_je[6]);
   1253  deltas[5][3] = aom_sdotq_s16(deltas[5][3], d_ie[11], d_je[7]);
   1254  deltas[5][4] = aom_sdotq_s16(deltas[5][4], d_ie[10], d_je[8]);
   1255  deltas[5][4] = aom_sdotq_s16(deltas[5][4], d_ie[11], d_je[9]);
   1256  deltas[5][5] = aom_sdotq_s16(deltas[5][5], d_ie[10], d_je[10]);
   1257  deltas[5][5] = aom_sdotq_s16(deltas[5][5], d_ie[11], d_je[11]);
   1258 }
   1259 
   1260 static inline void hadd_update_6_stats_sve(const int64_t *const src,
   1261                                           const int64x2_t *deltas,
   1262                                           int64_t *const dst) {
   1263  int64x2_t src0 = vld1q_s64(src + 0);
   1264  int64x2_t src1 = vld1q_s64(src + 2);
   1265  int64x2_t src2 = vld1q_s64(src + 4);
   1266 
   1267  int64x2_t deltas01 = vpaddq_s64(deltas[0], deltas[1]);
   1268  int64x2_t deltas23 = vpaddq_s64(deltas[2], deltas[3]);
   1269  int64x2_t deltas45 = vpaddq_s64(deltas[4], deltas[5]);
   1270 
   1271  vst1q_s64(dst + 0, vaddq_s64(src0, deltas01));
   1272  vst1q_s64(dst + 2, vaddq_s64(src1, deltas23));
   1273  vst1q_s64(dst + 4, vaddq_s64(src2, deltas45));
   1274 }
   1275 
   1276 static inline void load_triangle_win7_sve(const int16_t *const di,
   1277                                          const int32_t d_stride,
   1278                                          const int32_t height, int16x8_t *d_is,
   1279                                          int16x8_t *d_ie, svbool_t p0,
   1280                                          svbool_t p1) {
   1281  d_is[0] = svget_neonq_s16(svld1_s16(p0, di + 0 * d_stride + 0));
   1282  d_is[1] = svget_neonq_s16(svld1_s16(p1, di + 0 * d_stride + 8));
   1283  d_is[2] = svget_neonq_s16(svld1_s16(p0, di + 1 * d_stride + 0));
   1284  d_is[3] = svget_neonq_s16(svld1_s16(p1, di + 1 * d_stride + 8));
   1285  d_is[4] = svget_neonq_s16(svld1_s16(p0, di + 2 * d_stride + 0));
   1286  d_is[5] = svget_neonq_s16(svld1_s16(p1, di + 2 * d_stride + 8));
   1287  d_is[6] = svget_neonq_s16(svld1_s16(p0, di + 3 * d_stride + 0));
   1288  d_is[7] = svget_neonq_s16(svld1_s16(p1, di + 3 * d_stride + 8));
   1289  d_is[8] = svget_neonq_s16(svld1_s16(p0, di + 4 * d_stride + 0));
   1290  d_is[9] = svget_neonq_s16(svld1_s16(p1, di + 4 * d_stride + 8));
   1291  d_is[10] = svget_neonq_s16(svld1_s16(p0, di + 5 * d_stride + 0));
   1292  d_is[11] = svget_neonq_s16(svld1_s16(p1, di + 5 * d_stride + 8));
   1293 
   1294  d_ie[0] = svget_neonq_s16(svld1_s16(p0, di + (height + 0) * d_stride + 0));
   1295  d_ie[1] = svget_neonq_s16(svld1_s16(p1, di + (height + 0) * d_stride + 8));
   1296  d_ie[2] = svget_neonq_s16(svld1_s16(p0, di + (height + 1) * d_stride + 0));
   1297  d_ie[3] = svget_neonq_s16(svld1_s16(p1, di + (height + 1) * d_stride + 8));
   1298  d_ie[4] = svget_neonq_s16(svld1_s16(p0, di + (height + 2) * d_stride + 0));
   1299  d_ie[5] = svget_neonq_s16(svld1_s16(p1, di + (height + 2) * d_stride + 8));
   1300  d_ie[6] = svget_neonq_s16(svld1_s16(p0, di + (height + 3) * d_stride + 0));
   1301  d_ie[7] = svget_neonq_s16(svld1_s16(p1, di + (height + 3) * d_stride + 8));
   1302  d_ie[8] = svget_neonq_s16(svld1_s16(p0, di + (height + 4) * d_stride + 0));
   1303  d_ie[9] = svget_neonq_s16(svld1_s16(p1, di + (height + 4) * d_stride + 8));
   1304  d_ie[10] = svget_neonq_s16(svld1_s16(p0, di + (height + 5) * d_stride + 0));
   1305  d_ie[11] = svget_neonq_s16(svld1_s16(p1, di + (height + 5) * d_stride + 8));
   1306 }
   1307 
   1308 static inline void derive_triangle_win7_sve(const int16x8_t *d_is,
   1309                                            const int16x8_t *d_ie,
   1310                                            int64x2_t *deltas) {
   1311  deltas[0] = aom_sdotq_s16(deltas[0], vnegq_s16(d_is[0]), d_is[0]);
   1312  deltas[0] = aom_sdotq_s16(deltas[0], vnegq_s16(d_is[1]), d_is[1]);
   1313  deltas[1] = aom_sdotq_s16(deltas[1], vnegq_s16(d_is[0]), d_is[2]);
   1314  deltas[1] = aom_sdotq_s16(deltas[1], vnegq_s16(d_is[1]), d_is[3]);
   1315  deltas[2] = aom_sdotq_s16(deltas[2], vnegq_s16(d_is[0]), d_is[4]);
   1316  deltas[2] = aom_sdotq_s16(deltas[2], vnegq_s16(d_is[1]), d_is[5]);
   1317  deltas[3] = aom_sdotq_s16(deltas[3], vnegq_s16(d_is[0]), d_is[6]);
   1318  deltas[3] = aom_sdotq_s16(deltas[3], vnegq_s16(d_is[1]), d_is[7]);
   1319  deltas[4] = aom_sdotq_s16(deltas[4], vnegq_s16(d_is[0]), d_is[8]);
   1320  deltas[4] = aom_sdotq_s16(deltas[4], vnegq_s16(d_is[1]), d_is[9]);
   1321  deltas[5] = aom_sdotq_s16(deltas[5], vnegq_s16(d_is[0]), d_is[10]);
   1322  deltas[5] = aom_sdotq_s16(deltas[5], vnegq_s16(d_is[1]), d_is[11]);
   1323 
   1324  deltas[6] = aom_sdotq_s16(deltas[6], vnegq_s16(d_is[2]), d_is[2]);
   1325  deltas[6] = aom_sdotq_s16(deltas[6], vnegq_s16(d_is[3]), d_is[3]);
   1326  deltas[7] = aom_sdotq_s16(deltas[7], vnegq_s16(d_is[2]), d_is[4]);
   1327  deltas[7] = aom_sdotq_s16(deltas[7], vnegq_s16(d_is[3]), d_is[5]);
   1328  deltas[8] = aom_sdotq_s16(deltas[8], vnegq_s16(d_is[2]), d_is[6]);
   1329  deltas[8] = aom_sdotq_s16(deltas[8], vnegq_s16(d_is[3]), d_is[7]);
   1330  deltas[9] = aom_sdotq_s16(deltas[9], vnegq_s16(d_is[2]), d_is[8]);
   1331  deltas[9] = aom_sdotq_s16(deltas[9], vnegq_s16(d_is[3]), d_is[9]);
   1332  deltas[10] = aom_sdotq_s16(deltas[10], vnegq_s16(d_is[2]), d_is[10]);
   1333  deltas[10] = aom_sdotq_s16(deltas[10], vnegq_s16(d_is[3]), d_is[11]);
   1334 
   1335  deltas[11] = aom_sdotq_s16(deltas[11], vnegq_s16(d_is[4]), d_is[4]);
   1336  deltas[11] = aom_sdotq_s16(deltas[11], vnegq_s16(d_is[5]), d_is[5]);
   1337  deltas[12] = aom_sdotq_s16(deltas[12], vnegq_s16(d_is[4]), d_is[6]);
   1338  deltas[12] = aom_sdotq_s16(deltas[12], vnegq_s16(d_is[5]), d_is[7]);
   1339  deltas[13] = aom_sdotq_s16(deltas[13], vnegq_s16(d_is[4]), d_is[8]);
   1340  deltas[13] = aom_sdotq_s16(deltas[13], vnegq_s16(d_is[5]), d_is[9]);
   1341  deltas[14] = aom_sdotq_s16(deltas[14], vnegq_s16(d_is[4]), d_is[10]);
   1342  deltas[14] = aom_sdotq_s16(deltas[14], vnegq_s16(d_is[5]), d_is[11]);
   1343 
   1344  deltas[15] = aom_sdotq_s16(deltas[15], vnegq_s16(d_is[6]), d_is[6]);
   1345  deltas[15] = aom_sdotq_s16(deltas[15], vnegq_s16(d_is[7]), d_is[7]);
   1346  deltas[16] = aom_sdotq_s16(deltas[16], vnegq_s16(d_is[6]), d_is[8]);
   1347  deltas[16] = aom_sdotq_s16(deltas[16], vnegq_s16(d_is[7]), d_is[9]);
   1348  deltas[17] = aom_sdotq_s16(deltas[17], vnegq_s16(d_is[6]), d_is[10]);
   1349  deltas[17] = aom_sdotq_s16(deltas[17], vnegq_s16(d_is[7]), d_is[11]);
   1350 
   1351  deltas[18] = aom_sdotq_s16(deltas[18], vnegq_s16(d_is[8]), d_is[8]);
   1352  deltas[18] = aom_sdotq_s16(deltas[18], vnegq_s16(d_is[9]), d_is[9]);
   1353  deltas[19] = aom_sdotq_s16(deltas[19], vnegq_s16(d_is[8]), d_is[10]);
   1354  deltas[19] = aom_sdotq_s16(deltas[19], vnegq_s16(d_is[9]), d_is[11]);
   1355 
   1356  deltas[20] = aom_sdotq_s16(deltas[20], vnegq_s16(d_is[10]), d_is[10]);
   1357  deltas[20] = aom_sdotq_s16(deltas[20], vnegq_s16(d_is[11]), d_is[11]);
   1358 
   1359  deltas[0] = aom_sdotq_s16(deltas[0], d_ie[0], d_ie[0]);
   1360  deltas[0] = aom_sdotq_s16(deltas[0], d_ie[1], d_ie[1]);
   1361  deltas[1] = aom_sdotq_s16(deltas[1], d_ie[0], d_ie[2]);
   1362  deltas[1] = aom_sdotq_s16(deltas[1], d_ie[1], d_ie[3]);
   1363  deltas[2] = aom_sdotq_s16(deltas[2], d_ie[0], d_ie[4]);
   1364  deltas[2] = aom_sdotq_s16(deltas[2], d_ie[1], d_ie[5]);
   1365  deltas[3] = aom_sdotq_s16(deltas[3], d_ie[0], d_ie[6]);
   1366  deltas[3] = aom_sdotq_s16(deltas[3], d_ie[1], d_ie[7]);
   1367  deltas[4] = aom_sdotq_s16(deltas[4], d_ie[0], d_ie[8]);
   1368  deltas[4] = aom_sdotq_s16(deltas[4], d_ie[1], d_ie[9]);
   1369  deltas[5] = aom_sdotq_s16(deltas[5], d_ie[0], d_ie[10]);
   1370  deltas[5] = aom_sdotq_s16(deltas[5], d_ie[1], d_ie[11]);
   1371 
   1372  deltas[6] = aom_sdotq_s16(deltas[6], d_ie[2], d_ie[2]);
   1373  deltas[6] = aom_sdotq_s16(deltas[6], d_ie[3], d_ie[3]);
   1374  deltas[7] = aom_sdotq_s16(deltas[7], d_ie[2], d_ie[4]);
   1375  deltas[7] = aom_sdotq_s16(deltas[7], d_ie[3], d_ie[5]);
   1376  deltas[8] = aom_sdotq_s16(deltas[8], d_ie[2], d_ie[6]);
   1377  deltas[8] = aom_sdotq_s16(deltas[8], d_ie[3], d_ie[7]);
   1378  deltas[9] = aom_sdotq_s16(deltas[9], d_ie[2], d_ie[8]);
   1379  deltas[9] = aom_sdotq_s16(deltas[9], d_ie[3], d_ie[9]);
   1380  deltas[10] = aom_sdotq_s16(deltas[10], d_ie[2], d_ie[10]);
   1381  deltas[10] = aom_sdotq_s16(deltas[10], d_ie[3], d_ie[11]);
   1382 
   1383  deltas[11] = aom_sdotq_s16(deltas[11], d_ie[4], d_ie[4]);
   1384  deltas[11] = aom_sdotq_s16(deltas[11], d_ie[5], d_ie[5]);
   1385  deltas[12] = aom_sdotq_s16(deltas[12], d_ie[4], d_ie[6]);
   1386  deltas[12] = aom_sdotq_s16(deltas[12], d_ie[5], d_ie[7]);
   1387  deltas[13] = aom_sdotq_s16(deltas[13], d_ie[4], d_ie[8]);
   1388  deltas[13] = aom_sdotq_s16(deltas[13], d_ie[5], d_ie[9]);
   1389  deltas[14] = aom_sdotq_s16(deltas[14], d_ie[4], d_ie[10]);
   1390  deltas[14] = aom_sdotq_s16(deltas[14], d_ie[5], d_ie[11]);
   1391 
   1392  deltas[15] = aom_sdotq_s16(deltas[15], d_ie[6], d_ie[6]);
   1393  deltas[15] = aom_sdotq_s16(deltas[15], d_ie[7], d_ie[7]);
   1394  deltas[16] = aom_sdotq_s16(deltas[16], d_ie[6], d_ie[8]);
   1395  deltas[16] = aom_sdotq_s16(deltas[16], d_ie[7], d_ie[9]);
   1396  deltas[17] = aom_sdotq_s16(deltas[17], d_ie[6], d_ie[10]);
   1397  deltas[17] = aom_sdotq_s16(deltas[17], d_ie[7], d_ie[11]);
   1398 
   1399  deltas[18] = aom_sdotq_s16(deltas[18], d_ie[8], d_ie[8]);
   1400  deltas[18] = aom_sdotq_s16(deltas[18], d_ie[9], d_ie[9]);
   1401  deltas[19] = aom_sdotq_s16(deltas[19], d_ie[8], d_ie[10]);
   1402  deltas[19] = aom_sdotq_s16(deltas[19], d_ie[9], d_ie[11]);
   1403 
   1404  deltas[20] = aom_sdotq_s16(deltas[20], d_ie[10], d_ie[10]);
   1405  deltas[20] = aom_sdotq_s16(deltas[20], d_ie[11], d_ie[11]);
   1406 }
   1407 
   1408 static inline void compute_stats_win7_sve(
   1409    const int16_t *const d, const int32_t d_stride, const int16_t *const s,
   1410    const int32_t s_stride, const int32_t width, const int32_t height,
   1411    int64_t *const M, int64_t *const H) {
   1412  const int32_t wiener_win = WIENER_WIN;
   1413  const int32_t wiener_win2 = wiener_win * wiener_win;
   1414  const int32_t h8 = height & ~7;
   1415  int32_t i, j, x, y;
   1416 
   1417  // Use a predicate to compute the last columns.
   1418  svbool_t p0 = svwhilelt_b16_u32(0, width % 16 == 0 ? 16 : width % 16);
   1419  svbool_t p1 = svwhilelt_b16_u32(8, width % 16 == 0 ? 16 : width % 16);
   1420 
   1421  // Step 1: Calculate the top edge of the whole matrix, i.e., the top
   1422  // edge of each triangle and square on the top row.
   1423  j = 0;
   1424  do {
   1425    const int16_t *s_t = s;
   1426    const int16_t *d_t = d;
   1427    int64x2_t sum_m[WIENER_WIN] = { vdupq_n_s64(0) };
   1428    int64x2_t sum_h[WIENER_WIN] = { vdupq_n_s64(0) };
   1429    int16x8_t src[2], dgd[2];
   1430 
   1431    y = height;
   1432    do {
   1433      x = 0;
   1434      while (x < width - 16) {
   1435        src[0] = vld1q_s16(s_t + x + 0);
   1436        src[1] = vld1q_s16(s_t + x + 8);
   1437        dgd[0] = vld1q_s16(d_t + x + 0);
   1438        dgd[1] = vld1q_s16(d_t + x + 8);
   1439        stats_top_win7_sve(src, dgd, d_t + j + x, d_stride, sum_m, sum_h);
   1440        x += 16;
   1441      }
   1442 
   1443      src[0] = svget_neonq_s16(svld1_s16(p0, s_t + x + 0));
   1444      src[1] = svget_neonq_s16(svld1_s16(p1, s_t + x + 8));
   1445      dgd[0] = svget_neonq_s16(svld1_s16(p0, d_t + x + 0));
   1446      dgd[1] = svget_neonq_s16(svld1_s16(p1, d_t + x + 8));
   1447      stats_top_win7_sve(src, dgd, d_t + j + x, d_stride, sum_m, sum_h);
   1448 
   1449      s_t += s_stride;
   1450      d_t += d_stride;
   1451    } while (--y);
   1452 
   1453    vst1q_s64(M + wiener_win * j + 0, vpaddq_s64(sum_m[0], sum_m[1]));
   1454    vst1q_s64(M + wiener_win * j + 2, vpaddq_s64(sum_m[2], sum_m[3]));
   1455    vst1q_s64(M + wiener_win * j + 4, vpaddq_s64(sum_m[4], sum_m[5]));
   1456    M[wiener_win * j + 6] = vaddvq_s64(sum_m[6]);
   1457 
   1458    vst1q_s64(H + wiener_win * j + 0, vpaddq_s64(sum_h[0], sum_h[1]));
   1459    vst1q_s64(H + wiener_win * j + 2, vpaddq_s64(sum_h[2], sum_h[3]));
   1460    vst1q_s64(H + wiener_win * j + 4, vpaddq_s64(sum_h[4], sum_h[5]));
   1461    H[wiener_win * j + 6] = vaddvq_s64(sum_h[6]);
   1462  } while (++j < wiener_win);
   1463 
   1464  // Step 2: Calculate the left edge of each square on the top row.
   1465  j = 1;
   1466  do {
   1467    const int16_t *d_t = d;
   1468    int64x2_t sum_h[WIENER_WIN - 1] = { vdupq_n_s64(0) };
   1469    int16x8_t dgd[2];
   1470 
   1471    y = height;
   1472    do {
   1473      x = 0;
   1474      while (x < width - 16) {
   1475        dgd[0] = vld1q_s16(d_t + j + x + 0);
   1476        dgd[1] = vld1q_s16(d_t + j + x + 8);
   1477        stats_left_win7_sve(dgd, d_t + x, d_stride, sum_h);
   1478        x += 16;
   1479      }
   1480 
   1481      dgd[0] = svget_neonq_s16(svld1_s16(p0, d_t + j + x + 0));
   1482      dgd[1] = svget_neonq_s16(svld1_s16(p1, d_t + j + x + 8));
   1483      stats_left_win7_sve(dgd, d_t + x, d_stride, sum_h);
   1484 
   1485      d_t += d_stride;
   1486    } while (--y);
   1487 
   1488    int64x2_t sum_h01 = vpaddq_s64(sum_h[0], sum_h[1]);
   1489    int64x2_t sum_h23 = vpaddq_s64(sum_h[2], sum_h[3]);
   1490    int64x2_t sum_h45 = vpaddq_s64(sum_h[4], sum_h[5]);
   1491    vst1_s64(&H[1 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h01));
   1492    vst1_s64(&H[2 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h01));
   1493    vst1_s64(&H[3 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h23));
   1494    vst1_s64(&H[4 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h23));
   1495    vst1_s64(&H[5 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h45));
   1496    vst1_s64(&H[6 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h45));
   1497  } while (++j < wiener_win);
   1498 
   1499  // Step 3: Derive the top edge of each triangle along the diagonal. No
   1500  // triangle in top row.
   1501  {
   1502    const int16_t *d_t = d;
   1503    // Pad to call transpose function.
   1504    int32x4_t deltas[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
   1505    int32x4_t deltas_tr[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
   1506    int16x8_t ds[WIENER_WIN * 2];
   1507 
   1508    load_s16_8x6(d_t, d_stride, &ds[0], &ds[2], &ds[4], &ds[6], &ds[8],
   1509                 &ds[10]);
   1510    load_s16_8x6(d_t + width, d_stride, &ds[1], &ds[3], &ds[5], &ds[7], &ds[9],
   1511                 &ds[11]);
   1512 
   1513    d_t += 6 * d_stride;
   1514 
   1515    step3_win7_neon(d_t, d_stride, width, height, ds, deltas);
   1516    transpose_arrays_s32_8x8(deltas, deltas_tr);
   1517 
   1518    update_8_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
   1519                        deltas_tr[0], deltas_tr[4],
   1520                        H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
   1521    update_8_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
   1522                        deltas_tr[1], deltas_tr[5],
   1523                        H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
   1524    update_8_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
   1525                        deltas_tr[2], deltas_tr[6],
   1526                        H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
   1527    update_8_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
   1528                        deltas_tr[3], deltas_tr[7],
   1529                        H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
   1530    update_8_stats_neon(H + 4 * wiener_win * wiener_win2 + 4 * wiener_win,
   1531                        deltas_tr[8], deltas_tr[12],
   1532                        H + 5 * wiener_win * wiener_win2 + 5 * wiener_win);
   1533    update_8_stats_neon(H + 5 * wiener_win * wiener_win2 + 5 * wiener_win,
   1534                        deltas_tr[9], deltas_tr[13],
   1535                        H + 6 * wiener_win * wiener_win2 + 6 * wiener_win);
   1536  }
   1537 
   1538  // Step 4: Derive the top and left edge of each square. No square in top and
   1539  // bottom row.
   1540 
   1541  i = 1;
   1542  do {
   1543    j = i + 1;
   1544    do {
   1545      const int16_t *di = d + i - 1;
   1546      const int16_t *dj = d + j - 1;
   1547      int64x2_t deltas[(2 * WIENER_WIN - 1) * 2] = { vdupq_n_s64(0) };
   1548      int16x8_t dd[WIENER_WIN * 2], ds[WIENER_WIN * 2];
   1549 
   1550      dd[5] = vdupq_n_s16(0);  // Initialize to avoid warning.
   1551      const int16_t dd0_values[] = { di[0 * d_stride],
   1552                                     di[1 * d_stride],
   1553                                     di[2 * d_stride],
   1554                                     di[3 * d_stride],
   1555                                     di[4 * d_stride],
   1556                                     di[5 * d_stride],
   1557                                     0,
   1558                                     0 };
   1559      dd[0] = vld1q_s16(dd0_values);
   1560      const int16_t dd1_values[] = { di[0 * d_stride + width],
   1561                                     di[1 * d_stride + width],
   1562                                     di[2 * d_stride + width],
   1563                                     di[3 * d_stride + width],
   1564                                     di[4 * d_stride + width],
   1565                                     di[5 * d_stride + width],
   1566                                     0,
   1567                                     0 };
   1568      dd[1] = vld1q_s16(dd1_values);
   1569      const int16_t ds0_values[] = { dj[0 * d_stride],
   1570                                     dj[1 * d_stride],
   1571                                     dj[2 * d_stride],
   1572                                     dj[3 * d_stride],
   1573                                     dj[4 * d_stride],
   1574                                     dj[5 * d_stride],
   1575                                     0,
   1576                                     0 };
   1577      ds[0] = vld1q_s16(ds0_values);
   1578      int16_t ds1_values[] = { dj[0 * d_stride + width],
   1579                               dj[1 * d_stride + width],
   1580                               dj[2 * d_stride + width],
   1581                               dj[3 * d_stride + width],
   1582                               dj[4 * d_stride + width],
   1583                               dj[5 * d_stride + width],
   1584                               0,
   1585                               0 };
   1586      ds[1] = vld1q_s16(ds1_values);
   1587 
   1588      y = 0;
   1589      while (y < h8) {
   1590        // 00s 10s 20s 30s 40s 50s 60s 70s  00e 10e 20e 30e 40e 50e 60e 70e
   1591        dd[0] = vsetq_lane_s16(di[6 * d_stride], dd[0], 6);
   1592        dd[0] = vsetq_lane_s16(di[7 * d_stride], dd[0], 7);
   1593        dd[1] = vsetq_lane_s16(di[6 * d_stride + width], dd[1], 6);
   1594        dd[1] = vsetq_lane_s16(di[7 * d_stride + width], dd[1], 7);
   1595 
   1596        // 00s 10s 20s 30s 40s 50s 60s 70s  00e 10e 20e 30e 40e 50e 60e 70e
   1597        // 01s 11s 21s 31s 41s 51s 61s 71s  01e 11e 21e 31e 41e 51e 61e 71e
   1598        ds[0] = vsetq_lane_s16(dj[6 * d_stride], ds[0], 6);
   1599        ds[0] = vsetq_lane_s16(dj[7 * d_stride], ds[0], 7);
   1600        ds[1] = vsetq_lane_s16(dj[6 * d_stride + width], ds[1], 6);
   1601        ds[1] = vsetq_lane_s16(dj[7 * d_stride + width], ds[1], 7);
   1602 
   1603        load_more_16_neon(di + 8 * d_stride, width, &dd[0], &dd[2]);
   1604        load_more_16_neon(dj + 8 * d_stride, width, &ds[0], &ds[2]);
   1605        load_more_16_neon(di + 9 * d_stride, width, &dd[2], &dd[4]);
   1606        load_more_16_neon(dj + 9 * d_stride, width, &ds[2], &ds[4]);
   1607        load_more_16_neon(di + 10 * d_stride, width, &dd[4], &dd[6]);
   1608        load_more_16_neon(dj + 10 * d_stride, width, &ds[4], &ds[6]);
   1609        load_more_16_neon(di + 11 * d_stride, width, &dd[6], &dd[8]);
   1610        load_more_16_neon(dj + 11 * d_stride, width, &ds[6], &ds[8]);
   1611        load_more_16_neon(di + 12 * d_stride, width, &dd[8], &dd[10]);
   1612        load_more_16_neon(dj + 12 * d_stride, width, &ds[8], &ds[10]);
   1613        load_more_16_neon(di + 13 * d_stride, width, &dd[10], &dd[12]);
   1614        load_more_16_neon(dj + 13 * d_stride, width, &ds[10], &ds[12]);
   1615 
   1616        deltas[0] = aom_sdotq_s16(deltas[0], dd[0], ds[0]);
   1617        deltas[1] = aom_sdotq_s16(deltas[1], dd[1], ds[1]);
   1618        deltas[2] = aom_sdotq_s16(deltas[2], dd[0], ds[2]);
   1619        deltas[3] = aom_sdotq_s16(deltas[3], dd[1], ds[3]);
   1620        deltas[4] = aom_sdotq_s16(deltas[4], dd[0], ds[4]);
   1621        deltas[5] = aom_sdotq_s16(deltas[5], dd[1], ds[5]);
   1622        deltas[6] = aom_sdotq_s16(deltas[6], dd[0], ds[6]);
   1623        deltas[7] = aom_sdotq_s16(deltas[7], dd[1], ds[7]);
   1624        deltas[8] = aom_sdotq_s16(deltas[8], dd[0], ds[8]);
   1625        deltas[9] = aom_sdotq_s16(deltas[9], dd[1], ds[9]);
   1626        deltas[10] = aom_sdotq_s16(deltas[10], dd[0], ds[10]);
   1627        deltas[11] = aom_sdotq_s16(deltas[11], dd[1], ds[11]);
   1628        deltas[12] = aom_sdotq_s16(deltas[12], dd[0], ds[12]);
   1629        deltas[13] = aom_sdotq_s16(deltas[13], dd[1], ds[13]);
   1630        deltas[14] = aom_sdotq_s16(deltas[14], dd[2], ds[0]);
   1631        deltas[15] = aom_sdotq_s16(deltas[15], dd[3], ds[1]);
   1632        deltas[16] = aom_sdotq_s16(deltas[16], dd[4], ds[0]);
   1633        deltas[17] = aom_sdotq_s16(deltas[17], dd[5], ds[1]);
   1634        deltas[18] = aom_sdotq_s16(deltas[18], dd[6], ds[0]);
   1635        deltas[19] = aom_sdotq_s16(deltas[19], dd[7], ds[1]);
   1636        deltas[20] = aom_sdotq_s16(deltas[20], dd[8], ds[0]);
   1637        deltas[21] = aom_sdotq_s16(deltas[21], dd[9], ds[1]);
   1638        deltas[22] = aom_sdotq_s16(deltas[22], dd[10], ds[0]);
   1639        deltas[23] = aom_sdotq_s16(deltas[23], dd[11], ds[1]);
   1640        deltas[24] = aom_sdotq_s16(deltas[24], dd[12], ds[0]);
   1641        deltas[25] = aom_sdotq_s16(deltas[25], dd[13], ds[1]);
   1642 
   1643        dd[0] = vextq_s16(dd[12], vdupq_n_s16(0), 2);
   1644        dd[1] = vextq_s16(dd[13], vdupq_n_s16(0), 2);
   1645        ds[0] = vextq_s16(ds[12], vdupq_n_s16(0), 2);
   1646        ds[1] = vextq_s16(ds[13], vdupq_n_s16(0), 2);
   1647 
   1648        di += 8 * d_stride;
   1649        dj += 8 * d_stride;
   1650        y += 8;
   1651      }
   1652 
   1653      int64x2_t deltas02 = vpaddq_s64(deltas[0], deltas[2]);
   1654      int64x2_t deltas13 = vpaddq_s64(deltas[1], deltas[3]);
   1655      int64x2_t deltas46 = vpaddq_s64(deltas[4], deltas[6]);
   1656      int64x2_t deltas57 = vpaddq_s64(deltas[5], deltas[7]);
   1657      int64x2_t deltas810 = vpaddq_s64(deltas[8], deltas[10]);
   1658      int64x2_t deltas911 = vpaddq_s64(deltas[9], deltas[11]);
   1659      int64x2_t deltas1212 = vpaddq_s64(deltas[12], deltas[12]);
   1660      int64x2_t deltas1313 = vpaddq_s64(deltas[13], deltas[13]);
   1661      int64x2_t deltas1416 = vpaddq_s64(deltas[14], deltas[16]);
   1662      int64x2_t deltas1820 = vpaddq_s64(deltas[18], deltas[20]);
   1663      int64x2_t deltas1517 = vpaddq_s64(deltas[15], deltas[17]);
   1664      int64x2_t deltas1921 = vpaddq_s64(deltas[19], deltas[21]);
   1665      int64x2_t deltas2224 = vpaddq_s64(deltas[22], deltas[24]);
   1666      int64x2_t deltas2325 = vpaddq_s64(deltas[23], deltas[25]);
   1667      deltas02 = vsubq_s64(deltas13, deltas02);
   1668      deltas46 = vsubq_s64(deltas57, deltas46);
   1669      deltas810 = vsubq_s64(deltas911, deltas810);
   1670      deltas1212 = vsubq_s64(deltas1313, deltas1212);
   1671      deltas1416 = vsubq_s64(deltas1517, deltas1416);
   1672      deltas1820 = vsubq_s64(deltas1921, deltas1820);
   1673      deltas2224 = vsubq_s64(deltas2325, deltas2224);
   1674 
   1675      if (h8 != height) {
   1676        const int16_t ds0_vals[] = {
   1677          dj[0 * d_stride], dj[0 * d_stride + width],
   1678          dj[1 * d_stride], dj[1 * d_stride + width],
   1679          dj[2 * d_stride], dj[2 * d_stride + width],
   1680          dj[3 * d_stride], dj[3 * d_stride + width]
   1681        };
   1682        ds[0] = vld1q_s16(ds0_vals);
   1683 
   1684        ds[1] = vsetq_lane_s16(dj[4 * d_stride], ds[1], 0);
   1685        ds[1] = vsetq_lane_s16(dj[4 * d_stride + width], ds[1], 1);
   1686        ds[1] = vsetq_lane_s16(dj[5 * d_stride], ds[1], 2);
   1687        ds[1] = vsetq_lane_s16(dj[5 * d_stride + width], ds[1], 3);
   1688        const int16_t dd4_vals[] = {
   1689          -di[1 * d_stride], di[1 * d_stride + width],
   1690          -di[2 * d_stride], di[2 * d_stride + width],
   1691          -di[3 * d_stride], di[3 * d_stride + width],
   1692          -di[4 * d_stride], di[4 * d_stride + width]
   1693        };
   1694        dd[4] = vld1q_s16(dd4_vals);
   1695 
   1696        dd[5] = vsetq_lane_s16(-di[5 * d_stride], dd[5], 0);
   1697        dd[5] = vsetq_lane_s16(di[5 * d_stride + width], dd[5], 1);
   1698        do {
   1699          dd[0] = vdupq_n_s16(-di[0 * d_stride]);
   1700          dd[2] = dd[3] = vdupq_n_s16(di[0 * d_stride + width]);
   1701          dd[0] = dd[1] = vzip1q_s16(dd[0], dd[2]);
   1702 
   1703          ds[4] = vdupq_n_s16(dj[0 * d_stride]);
   1704          ds[6] = ds[7] = vdupq_n_s16(dj[0 * d_stride + width]);
   1705          ds[4] = ds[5] = vzip1q_s16(ds[4], ds[6]);
   1706 
   1707          dd[5] = vsetq_lane_s16(-di[6 * d_stride], dd[5], 2);
   1708          dd[5] = vsetq_lane_s16(di[6 * d_stride + width], dd[5], 3);
   1709          ds[1] = vsetq_lane_s16(dj[6 * d_stride], ds[1], 4);
   1710          ds[1] = vsetq_lane_s16(dj[6 * d_stride + width], ds[1], 5);
   1711 
   1712          const int32x4_t res0 =
   1713              vpaddq_s32(vmull_s16(vget_low_s16(dd[0]), vget_low_s16(ds[0])),
   1714                         vmull_s16(vget_high_s16(dd[0]), vget_high_s16(ds[0])));
   1715          deltas02 = vaddw_s32(deltas02, vget_low_s32(res0));
   1716          deltas46 = vaddw_s32(deltas46, vget_high_s32(res0));
   1717          const int32x4_t res1 =
   1718              vpaddq_s32(vmull_s16(vget_low_s16(dd[1]), vget_low_s16(ds[1])),
   1719                         vmull_s16(vget_high_s16(dd[1]), vget_high_s16(ds[1])));
   1720          deltas810 = vaddw_s32(deltas810, vget_low_s32(res1));
   1721          deltas1212 = vaddw_s32(deltas1212, vget_high_s32(res1));
   1722          const int32x4_t res2 =
   1723              vpaddq_s32(vmull_s16(vget_low_s16(dd[4]), vget_low_s16(ds[4])),
   1724                         vmull_s16(vget_high_s16(dd[4]), vget_high_s16(ds[4])));
   1725          deltas1416 = vaddw_s32(deltas1416, vget_low_s32(res2));
   1726          deltas1820 = vaddw_s32(deltas1820, vget_high_s32(res2));
   1727          const int32x4_t res3 =
   1728              vpaddq_s32(vmull_s16(vget_low_s16(dd[5]), vget_low_s16(ds[5])),
   1729                         vmull_s16(vget_high_s16(dd[5]), vget_high_s16(ds[5])));
   1730          deltas2224 = vaddw_s32(deltas2224, vget_low_s32(res3));
   1731 
   1732          int32_t tmp0 = vgetq_lane_s32(vreinterpretq_s32_s16(ds[0]), 0);
   1733          ds[0] = vextq_s16(ds[0], ds[1], 2);
   1734          ds[1] = vextq_s16(ds[1], ds[0], 2);
   1735          ds[1] = vreinterpretq_s16_s32(
   1736              vsetq_lane_s32(tmp0, vreinterpretq_s32_s16(ds[1]), 3));
   1737          int32_t tmp1 = vgetq_lane_s32(vreinterpretq_s32_s16(dd[4]), 0);
   1738          dd[4] = vextq_s16(dd[4], dd[5], 2);
   1739          dd[5] = vextq_s16(dd[5], dd[4], 2);
   1740          dd[5] = vreinterpretq_s16_s32(
   1741              vsetq_lane_s32(tmp1, vreinterpretq_s32_s16(dd[5]), 3));
   1742          di += d_stride;
   1743          dj += d_stride;
   1744        } while (++y < height);
   1745      }
   1746 
   1747      // Writing one more element on the top edge of a square falls to
   1748      // the next square in the same row or the first element in the next
   1749      // row, which will just be overwritten later.
   1750      int64x2_t s0 = vld1q_s64(H + (i - 1) * wiener_win * wiener_win2 +
   1751                               (j - 1) * wiener_win + 0);
   1752      int64x2_t s1 = vld1q_s64(H + (i - 1) * wiener_win * wiener_win2 +
   1753                               (j - 1) * wiener_win + 2);
   1754      int64x2_t s2 = vld1q_s64(H + (i - 1) * wiener_win * wiener_win2 +
   1755                               (j - 1) * wiener_win + 4);
   1756      int64x2_t s3 = vld1q_s64(H + (i - 1) * wiener_win * wiener_win2 +
   1757                               (j - 1) * wiener_win + 6);
   1758 
   1759      vst1q_s64(H + i * wiener_win * wiener_win2 + j * wiener_win + 0,
   1760                vaddq_s64(s0, deltas02));
   1761      vst1q_s64(H + i * wiener_win * wiener_win2 + j * wiener_win + 2,
   1762                vaddq_s64(s1, deltas46));
   1763      vst1q_s64(H + i * wiener_win * wiener_win2 + j * wiener_win + 4,
   1764                vaddq_s64(s2, deltas810));
   1765      vst1q_s64(H + i * wiener_win * wiener_win2 + j * wiener_win + 6,
   1766                vaddq_s64(s3, deltas1212));
   1767 
   1768      H[(i * wiener_win + 1) * wiener_win2 + j * wiener_win] =
   1769          H[((i - 1) * wiener_win + 1) * wiener_win2 + (j - 1) * wiener_win] +
   1770          vgetq_lane_s64(deltas1416, 0);
   1771      H[(i * wiener_win + 2) * wiener_win2 + j * wiener_win] =
   1772          H[((i - 1) * wiener_win + 2) * wiener_win2 + (j - 1) * wiener_win] +
   1773          vgetq_lane_s64(deltas1416, 1);
   1774      H[(i * wiener_win + 3) * wiener_win2 + j * wiener_win] =
   1775          H[((i - 1) * wiener_win + 3) * wiener_win2 + (j - 1) * wiener_win] +
   1776          vgetq_lane_s64(deltas1820, 0);
   1777      H[(i * wiener_win + 4) * wiener_win2 + j * wiener_win] =
   1778          H[((i - 1) * wiener_win + 4) * wiener_win2 + (j - 1) * wiener_win] +
   1779          vgetq_lane_s64(deltas1820, 1);
   1780      H[(i * wiener_win + 5) * wiener_win2 + j * wiener_win] =
   1781          H[((i - 1) * wiener_win + 5) * wiener_win2 + (j - 1) * wiener_win] +
   1782          vgetq_lane_s64(deltas2224, 0);
   1783      H[(i * wiener_win + 6) * wiener_win2 + j * wiener_win] =
   1784          H[((i - 1) * wiener_win + 6) * wiener_win2 + (j - 1) * wiener_win] +
   1785          vgetq_lane_s64(deltas2224, 1);
   1786    } while (++j < wiener_win);
   1787  } while (++i < wiener_win - 1);
   1788 
   1789  // Step 5: Derive other points of each square. No square in bottom row.
   1790  i = 0;
   1791  do {
   1792    const int16_t *const di = d + i;
   1793 
   1794    j = i + 1;
   1795    do {
   1796      const int16_t *const dj = d + j;
   1797      int64x2_t deltas[WIENER_WIN - 1][WIN_7] = { { vdupq_n_s64(0) },
   1798                                                  { vdupq_n_s64(0) } };
   1799      int16x8_t d_is[WIN_7];
   1800      int16x8_t d_ie[WIN_7];
   1801      int16x8_t d_js[WIN_7];
   1802      int16x8_t d_je[WIN_7];
   1803 
   1804      x = 0;
   1805      while (x < width - 16) {
   1806        load_square_win7_neon(di + x, dj + x, d_stride, height, d_is, d_ie,
   1807                              d_js, d_je);
   1808        derive_square_win7_sve(d_is, d_ie, d_js, d_je, deltas);
   1809        x += 16;
   1810      }
   1811 
   1812      load_square_win7_sve(di + x, dj + x, d_stride, height, d_is, d_ie, d_js,
   1813                           d_je, p0, p1);
   1814      derive_square_win7_sve(d_is, d_ie, d_js, d_je, deltas);
   1815 
   1816      hadd_update_6_stats_sve(
   1817          H + (i * wiener_win + 0) * wiener_win2 + j * wiener_win, deltas[0],
   1818          H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win + 1);
   1819      hadd_update_6_stats_sve(
   1820          H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win, deltas[1],
   1821          H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win + 1);
   1822      hadd_update_6_stats_sve(
   1823          H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win, deltas[2],
   1824          H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win + 1);
   1825      hadd_update_6_stats_sve(
   1826          H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win, deltas[3],
   1827          H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win + 1);
   1828      hadd_update_6_stats_sve(
   1829          H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win, deltas[4],
   1830          H + (i * wiener_win + 5) * wiener_win2 + j * wiener_win + 1);
   1831      hadd_update_6_stats_sve(
   1832          H + (i * wiener_win + 5) * wiener_win2 + j * wiener_win, deltas[5],
   1833          H + (i * wiener_win + 6) * wiener_win2 + j * wiener_win + 1);
   1834    } while (++j < wiener_win);
   1835  } while (++i < wiener_win - 1);
   1836 
   1837  // Step 6: Derive other points of each upper triangle along the diagonal.
   1838  i = 0;
   1839  do {
   1840    const int16_t *const di = d + i;
   1841    int64x2_t deltas[3 * WIENER_WIN] = { vdupq_n_s64(0) };
   1842    int16x8_t d_is[WIN_7], d_ie[WIN_7];
   1843 
   1844    x = 0;
   1845    while (x < width - 16) {
   1846      load_triangle_win7_neon(di + x, d_stride, height, d_is, d_ie);
   1847      derive_triangle_win7_sve(d_is, d_ie, deltas);
   1848      x += 16;
   1849    }
   1850 
   1851    load_triangle_win7_sve(di + x, d_stride, height, d_is, d_ie, p0, p1);
   1852    derive_triangle_win7_sve(d_is, d_ie, deltas);
   1853 
   1854    // Row 1: 6 points
   1855    hadd_update_6_stats_sve(
   1856        H + (i * wiener_win + 0) * wiener_win2 + i * wiener_win, deltas,
   1857        H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1);
   1858 
   1859    int64x2_t deltas1017 = vpaddq_s64(deltas[10], deltas[17]);
   1860 
   1861    // Row 2: 5 points
   1862    hadd_update_4_stats_sve(
   1863        H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1, deltas + 6,
   1864        H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2);
   1865    H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 6] =
   1866        H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 5] +
   1867        vgetq_lane_s64(deltas1017, 0);
   1868 
   1869    // Row 3: 4 points
   1870    hadd_update_4_stats_sve(
   1871        H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2,
   1872        deltas + 11,
   1873        H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3);
   1874 
   1875    // Row 4: 3 points
   1876    int64x2_t h0 =
   1877        vld1q_s64(H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3);
   1878    vst1q_s64(H + (i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4,
   1879              vaddq_s64(h0, vpaddq_s64(deltas[15], deltas[16])));
   1880    H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 6] =
   1881        H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 5] +
   1882        vgetq_lane_s64(deltas1017, 1);
   1883 
   1884    // Row 5: 2 points
   1885    int64x2_t h1 =
   1886        vld1q_s64(H + (i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4);
   1887    vst1q_s64(H + (i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5,
   1888              vaddq_s64(h1, vpaddq_s64(deltas[18], deltas[19])));
   1889 
   1890    // Row 6: 1 point
   1891    H[(i * wiener_win + 6) * wiener_win2 + i * wiener_win + 6] =
   1892        H[(i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5] +
   1893        vaddvq_s64(deltas[20]);
   1894  } while (++i < wiener_win);
   1895 }
   1896 
   1897 #endif  // AOM_AV1_ENCODER_ARM_PICKRST_SVE_H_