pickrst_neon.c

/*
 * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "av1/common/restoration.h"
#include "av1/encoder/arm/pickrst_neon.h"
#include "av1/encoder/pickrst.h"

int64_t av1_lowbd_pixel_proj_error_neon(
    const uint8_t *src, int width, int height, int src_stride,
    const uint8_t *dat, int dat_stride, int32_t *flt0, int flt0_stride,
    int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
  int64_t sse = 0;
  int64x2_t sse_s64 = vdupq_n_s64(0);

  if (params->r[0] > 0 && params->r[1] > 0) {
    int32x2_t xq_v = vld1_s32(xq);
    int32x2_t xq_sum_v = vshl_n_s32(vpadd_s32(xq_v, xq_v), SGRPROJ_RST_BITS);

    do {
      int j = 0;
      int32x4_t sse_s32 = vdupq_n_s32(0);

      do {
        const uint8x8_t d = vld1_u8(&dat[j]);
        const uint8x8_t s = vld1_u8(&src[j]);
        int32x4_t flt0_0 = vld1q_s32(&flt0[j]);
        int32x4_t flt0_1 = vld1q_s32(&flt0[j + 4]);
        int32x4_t flt1_0 = vld1q_s32(&flt1[j]);
        int32x4_t flt1_1 = vld1q_s32(&flt1[j + 4]);

        int32x4_t offset =
            vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1));
        int32x4_t v0 = vmlaq_lane_s32(offset, flt0_0, xq_v, 0);
        int32x4_t v1 = vmlaq_lane_s32(offset, flt0_1, xq_v, 0);

        v0 = vmlaq_lane_s32(v0, flt1_0, xq_v, 1);
        v1 = vmlaq_lane_s32(v1, flt1_1, xq_v, 1);

        int16x8_t d_s16 = vreinterpretq_s16_u16(vmovl_u8(d));
        v0 = vmlsl_lane_s16(v0, vget_low_s16(d_s16),
                            vreinterpret_s16_s32(xq_sum_v), 0);
        v1 = vmlsl_lane_s16(v1, vget_high_s16(d_s16),
                            vreinterpret_s16_s32(xq_sum_v), 0);

        int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
        int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);

        int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(d, s));
        int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1), diff);
        int16x4_t e_lo = vget_low_s16(e);
        int16x4_t e_hi = vget_high_s16(e);

        sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo);
        sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi);

        j += 8;
      } while (j <= width - 8);

      for (int k = j; k < width; ++k) {
        int32_t u = (dat[k] << SGRPROJ_RST_BITS);
        int32_t v = (1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)) +
                    xq[0] * flt0[k] + xq[1] * flt1[k] - u * (xq[0] + xq[1]);
        int32_t e =
            (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[k] - src[k];
        sse += e * e;
      }

      sse_s64 = vpadalq_s32(sse_s64, sse_s32);

      dat += dat_stride;
      src += src_stride;
      flt0 += flt0_stride;
      flt1 += flt1_stride;
    } while (--height != 0);
  } else if (params->r[0] > 0 || params->r[1] > 0) {
    int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
    int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
    int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
    int32x2_t xq_v = vdup_n_s32(xq_active);

    do {
      int32x4_t sse_s32 = vdupq_n_s32(0);
      int j = 0;

      do {
        const uint8x8_t d = vld1_u8(&dat[j]);
        const uint8x8_t s = vld1_u8(&src[j]);
        int32x4_t flt_0 = vld1q_s32(&flt[j]);
        int32x4_t flt_1 = vld1q_s32(&flt[j + 4]);
        int16x8_t d_s16 =
            vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS));

        int32x4_t sub_0 = vsubw_s16(flt_0, vget_low_s16(d_s16));
        int32x4_t sub_1 = vsubw_s16(flt_1, vget_high_s16(d_s16));

        int32x4_t offset =
            vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1));
        int32x4_t v0 = vmlaq_lane_s32(offset, sub_0, xq_v, 0);
        int32x4_t v1 = vmlaq_lane_s32(offset, sub_1, xq_v, 0);

        int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
        int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);

        int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(d, s));
        int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1), diff);
        int16x4_t e_lo = vget_low_s16(e);
        int16x4_t e_hi = vget_high_s16(e);

        sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo);
        sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi);

        j += 8;
      } while (j <= width - 8);

      for (int k = j; k < width; ++k) {
        int32_t u = dat[k] << SGRPROJ_RST_BITS;
        int32_t v = xq_active * (flt[k] - u);
        int32_t e = ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) +
                    dat[k] - src[k];
        sse += e * e;
      }

      sse_s64 = vpadalq_s32(sse_s64, sse_s32);

      dat += dat_stride;
      src += src_stride;
      flt += flt_stride;
    } while (--height != 0);
  } else {
    uint32x4_t sse_s32 = vdupq_n_u32(0);

    do {
      int j = 0;

      do {
        const uint8x16_t d = vld1q_u8(&dat[j]);
        const uint8x16_t s = vld1q_u8(&src[j]);

        uint8x16_t diff = vabdq_u8(d, s);
        uint8x8_t diff_lo = vget_low_u8(diff);
        uint8x8_t diff_hi = vget_high_u8(diff);

        sse_s32 = vpadalq_u16(sse_s32, vmull_u8(diff_lo, diff_lo));
        sse_s32 = vpadalq_u16(sse_s32, vmull_u8(diff_hi, diff_hi));

        j += 16;
      } while (j <= width - 16);

      for (int k = j; k < width; ++k) {
        int32_t e = dat[k] - src[k];
        sse += e * e;
      }

      dat += dat_stride;
      src += src_stride;
    } while (--height != 0);

    sse_s64 = vreinterpretq_s64_u64(vpaddlq_u32(sse_s32));
  }

  sse += horizontal_add_s64x2(sse_s64);
  return sse;
}
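
// For reference, a minimal scalar model of the projected-error computation
// above, mirroring the remainder loop of the case where both self-guided
// filters are enabled (params->r[0] > 0 and params->r[1] > 0). This is an
// editorial sketch with a hypothetical name, not part of the AV1 API.
static inline int64_t lowbd_pixel_proj_error_scalar_sketch(
    const uint8_t *src, int width, int height, int src_stride,
    const uint8_t *dat, int dat_stride, const int32_t *flt0, int flt0_stride,
    const int32_t *flt1, int flt1_stride, const int xq[2]) {
  int64_t sse = 0;
  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < width; ++j) {
      // The degraded pixel upscaled to the filter precision.
      const int32_t u = dat[j] << SGRPROJ_RST_BITS;
      // Blend the two filter outputs with the projection coefficients xq,
      // including the rounding offset, then scale back down.
      const int32_t v = (1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)) +
                        xq[0] * (flt0[j] - u) + xq[1] * (flt1[j] - u);
      const int32_t e =
          (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[j] - src[j];
      sse += e * e;
    }
    src += src_stride;
    dat += dat_stride;
    flt0 += flt0_stride;
    flt1 += flt1_stride;
  }
  return sse;
}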

// We can accumulate up to 32768 8-bit multiplication results in a signed
// 32-bit integer. We process 2 pixels at a time, so the accumulator limit is
// halved to 16384 for the compute-stats functions.
#define STAT_ACCUMULATOR_MAX 16384
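
// Headroom check: each accumulated product has magnitude at most 255 * 255,
// and 2 * 16384 * 255 * 255 = 2130739200 < 2^31 - 1, so 16384 two-pixel
// iterations fit in an int32_t lane.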

static inline uint8x8_t tbl2(uint8x16_t a, uint8x16_t b, uint8x8_t idx) {
#if AOM_ARCH_AARCH64
  uint8x16x2_t table = { { a, b } };
  return vqtbl2_u8(table, idx);
#else
  uint8x8x4_t table = { { vget_low_u8(a), vget_high_u8(a), vget_low_u8(b),
                          vget_high_u8(b) } };
  return vtbl4_u8(table, idx);
#endif
}

static inline uint8x16_t tbl2q(uint8x16_t a, uint8x16_t b, uint8x16_t idx) {
#if AOM_ARCH_AARCH64
  uint8x16x2_t table = { { a, b } };
  return vqtbl2q_u8(table, idx);
#else
  uint8x8x4_t table = { { vget_low_u8(a), vget_high_u8(a), vget_low_u8(b),
                          vget_high_u8(b) } };
  return vcombine_u8(vtbl4_u8(table, vget_low_u8(idx)),
                     vtbl4_u8(table, vget_high_u8(idx)));
#endif
}
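
// Note: tbl2/tbl2q emulate the AArch64 two-register TBL instruction. On
// AArch32 the same 32-byte table is assembled from 64-bit halves so that
// vtbl4_u8 returns the same result as vqtbl2(q)_u8 for indices in [0, 31].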

// The M matrix is accumulated in STAT_ACCUMULATOR_MAX steps to speed up the
// computation. This function computes the final M from the accumulated
// (src_s64) and residual (src_s32) parts. It also transposes the result, as
// the output needs to be column-major.
static inline void acc_transpose_M(int64_t *dst, const int64_t *src_s64,
                                   const int32_t *src_s32, const int wiener_win,
                                   int scale) {
  for (int i = 0; i < wiener_win; ++i) {
    for (int j = 0; j < wiener_win; ++j) {
      int tr_idx = j * wiener_win + i;
      *dst++ += (int64_t)(src_s64[tr_idx] + src_s32[tr_idx]) * scale;
    }
  }
}
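
// For a 3x3 window (wiener_win == 3) the destination is written sequentially
// while the source is read at indices 0, 3, 6, 1, 4, 7, 2, 5, 8: the
// row-major intermediate is emitted in column-major order.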

// The resulting H is a column-major matrix accumulated from the transposed
// (column-major) samples of the filter kernel (5x5 or 7x7) viewed as a single
// vector. For the 7x7 filter case: H(49x49) = [49 x 1] x [1 x 49]. This
// function transforms back to the originally expected format (double
// transpose). The H matrix is accumulated in STAT_ACCUMULATOR_MAX steps to
// speed up the computation. This function computes the final H from the
// accumulated (src_s64) and residual (src_s32) parts. The computed H is only
// an upper-triangle matrix; this function also fills in the lower triangle of
// the resulting matrix.
static void update_H(int64_t *dst, const int64_t *src_s64,
                     const int32_t *src_s32, const int wiener_win, int stride,
                     int scale) {
  // For a simplified theoretical 3x3 case where `wiener_win` is 3 and
  // `wiener_win2` is 9, the M matrix is 3x3:
  // 0, 3, 6
  // 1, 4, 7
  // 2, 5, 8
  //
  // This is viewed as a vector to compute H (9x9) by vector outer product:
  // 0, 3, 6, 1, 4, 7, 2, 5, 8
  //
  // Double transpose and upper triangle remapping for the 3x3 -> 9x9 case:
  // 0,    3,    6,    1,    4,    7,    2,    5,    8,
  // 3,   30,   33,   12,   31,   34,   21,   32,   35,
  // 6,   33,   60,   15,   42,   61,   24,   51,   62,
  // 1,   12,   15,   10,   13,   16,   11,   14,   17,
  // 4,   31,   42,   13,   40,   43,   22,   41,   44,
  // 7,   34,   61,   16,   43,   70,   25,   52,   71,
  // 2,   21,   24,   11,   22,   25,   20,   23,   26,
  // 5,   32,   51,   14,   41,   52,   23,   50,   53,
  // 8,   35,   62,   17,   44,   71,   26,   53,   80,
  const int wiener_win2 = wiener_win * wiener_win;

  // Loop through the indices according to the remapping above, along the
  // columns:
  // 0, wiener_win, 2 * wiener_win, ..., 1, 1 + wiener_win,
  // 1 + 2 * wiener_win, ..., wiener_win - 1, wiener_win - 1 + wiener_win, ...
  // For the 3x3 case `j` will be: 0, 3, 6, 1, 4, 7, 2, 5, 8.
  for (int i = 0; i < wiener_win; ++i) {
    for (int j = i; j < wiener_win2; j += wiener_win) {
      // These two inner loops are the same as the two outer loops, but running
      // along rows instead of columns. For the 3x3 case `l` will be:
      // 0, 3, 6, 1, 4, 7, 2, 5, 8.
      for (int k = 0; k < wiener_win; ++k) {
        for (int l = k; l < wiener_win2; l += wiener_win) {
          // The nominal double transpose indexing would be:
          // int idx = stride * j + l;
          // However, we need the upper-triangle indices; this is easy with
          // some min/max operations.
          int tr_idx = stride * AOMMIN(j, l) + AOMMAX(j, l);

          // The resulting matrix is filled by combining the 64-bit and the
          // residual 32-bit matrices together with scaling.
          *dst++ += (int64_t)(src_s64[tr_idx] + src_s32[tr_idx]) * scale;
        }
      }
    }
  }
}

// Load a 7x7 matrix into 3 and a half 128-bit vectors from consecutive rows;
// the last load address is offset to prevent out-of-bounds access.
static inline void load_and_pack_u8_8x7(uint8x16_t dst[4], const uint8_t *src,
                                        ptrdiff_t stride) {
  dst[0] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride));
  src += 2 * stride;
  dst[1] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride));
  src += 2 * stride;
  dst[2] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride));
  src += 2 * stride;
  dst[3] = vcombine_u8(vld1_u8(src - 1), vdup_n_u8(0));
}
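
// Note on the final row: an 8-byte load at `src` could read past the end of
// the buffer for the rightmost pixels of a row, so the load is shifted left by
// one byte. The shuffle look-up tables below compensate by indexing the last
// row from byte 17 rather than 16, and the 49th window element is handled
// separately as a scalar.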

static inline void compute_stats_win7_downsampled_neon(
    const uint8_t *dgd, const uint8_t *src, int width, int height,
    int dgd_stride, int src_stride, int avg, int64_t *M, int64_t *H,
    int downsample_factor) {
  // Matrix names are capitalized to help readability.
  DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_ALIGN3]);
  DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_ALIGN3]);
  DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_ALIGN3]);
  DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_ALIGN3]);
  DECLARE_ALIGNED(64, int32_t, H_s32[WIENER_WIN2 * WIENER_WIN2_ALIGN2]);
  DECLARE_ALIGNED(64, int64_t, H_s64[WIENER_WIN2 * WIENER_WIN2_ALIGN2]);

  memset(M_s32, 0, sizeof(M_s32));
  memset(M_s64, 0, sizeof(M_s64));
  memset(H_s32, 0, sizeof(H_s32));
  memset(H_s64, 0, sizeof(H_s64));

  // Look-up tables to create an 8x6 matrix with consecutive elements from two
  // 7x7 matrices.
  // clang-format off
  DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats7[96]) = {
    0,  1,  2,  3,  4,  5,  6,  8,  9, 10, 11, 12, 13, 14, 16, 17,
    2,  3,  4,  5,  6,  8,  9, 10, 11, 12, 13, 14, 16, 17, 18, 19,
    4,  5,  6,  8,  9, 10, 11, 12, 13, 14, 17, 18, 19, 20, 21, 22,
    1,  2,  3,  4,  5,  6,  7,  9, 10, 11, 12, 13, 14, 15, 17, 18,
    3,  4,  5,  6,  7,  9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20,
    5,  6,  7,  9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23,
  };
  // clang-format on

  const uint8x16_t lut0 = vld1q_u8(shuffle_stats7 + 0);
  const uint8x16_t lut1 = vld1q_u8(shuffle_stats7 + 16);
  const uint8x16_t lut2 = vld1q_u8(shuffle_stats7 + 32);
  const uint8x16_t lut3 = vld1q_u8(shuffle_stats7 + 48);
  const uint8x16_t lut4 = vld1q_u8(shuffle_stats7 + 64);
  const uint8x16_t lut5 = vld1q_u8(shuffle_stats7 + 80);

  int acc_cnt = STAT_ACCUMULATOR_MAX;
  const int src_next = downsample_factor * src_stride - width;
  const int dgd_next = downsample_factor * dgd_stride - width;
  const uint8x8_t avg_u8 = vdup_n_u8(avg);

  do {
    int j = width;
    while (j >= 2) {
      // Load two adjacent, overlapping 7x7 matrices: an 8x7 matrix with the
      // middle 6x7 elements being shared.
      uint8x16_t dgd_rows[4];
      load_and_pack_u8_8x7(dgd_rows, dgd, dgd_stride);

      const uint8_t *dgd_ptr = dgd + dgd_stride * 6;
      dgd += 2;

      // Re-arrange (and widen) the combined 8x7 matrix to have the 2 whole 7x7
      // matrices (1 for each of the 2 pixels) separated into distinct
      // int16x8_t[6] arrays. These arrays contain 48 of the 49 (7x7) elements.
      // Compute `dgd - avg` for both buffers. Each DGD_AVG buffer contains 49
      // consecutive elements.
      int16x8_t dgd_avg0[6];
      int16x8_t dgd_avg1[6];
      uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0);
      uint8x16_t dgd_shuf3 = tbl2q(dgd_rows[0], dgd_rows[1], lut3);

      dgd_avg0[0] =
          vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8));
      dgd_avg0[1] =
          vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8));
      dgd_avg1[0] =
          vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf3), avg_u8));
      dgd_avg1[1] =
          vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf3), avg_u8));

      vst1q_s16(DGD_AVG0, dgd_avg0[0]);
      vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
      vst1q_s16(DGD_AVG1, dgd_avg1[0]);
      vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]);

      uint8x16_t dgd_shuf1 = tbl2q(dgd_rows[1], dgd_rows[2], lut1);
      uint8x16_t dgd_shuf4 = tbl2q(dgd_rows[1], dgd_rows[2], lut4);

      dgd_avg0[2] =
          vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf1), avg_u8));
      dgd_avg0[3] =
          vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf1), avg_u8));
      dgd_avg1[2] =
          vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf4), avg_u8));
      dgd_avg1[3] =
          vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf4), avg_u8));

      vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
      vst1q_s16(DGD_AVG0 + 24, dgd_avg0[3]);
      vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]);
      vst1q_s16(DGD_AVG1 + 24, dgd_avg1[3]);

      uint8x16_t dgd_shuf2 = tbl2q(dgd_rows[2], dgd_rows[3], lut2);
      uint8x16_t dgd_shuf5 = tbl2q(dgd_rows[2], dgd_rows[3], lut5);

      dgd_avg0[4] =
          vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf2), avg_u8));
      dgd_avg0[5] =
          vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf2), avg_u8));
      dgd_avg1[4] =
          vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf5), avg_u8));
      dgd_avg1[5] =
          vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf5), avg_u8));

      vst1q_s16(DGD_AVG0 + 32, dgd_avg0[4]);
      vst1q_s16(DGD_AVG0 + 40, dgd_avg0[5]);
      vst1q_s16(DGD_AVG1 + 32, dgd_avg1[4]);
      vst1q_s16(DGD_AVG1 + 40, dgd_avg1[5]);

      // The last (49th) elements of `dgd - avg`.
      DGD_AVG0[48] = dgd_ptr[6] - avg;
      DGD_AVG1[48] = dgd_ptr[7] - avg;

      // Accumulate into a row-major variant of matrix M (cross-correlation)
      // for 2 output pixels at a time. M is of size 7 * 7. It needs to be
      // filled such that multiplying one element from src with each element of
      // a row of the wiener window will fill one column of M. However, this is
      // not very convenient in terms of memory access, as it means we do
      // contiguous loads of dgd but strided stores to M. As a result, we use
      // an intermediate matrix M_s32 which is instead filled such that one row
      // of the wiener window gives one row of M_s32. Once fully computed,
      // M_s32 is then transposed to return M.
      int src_avg0 = *src++ - avg;
      int src_avg1 = *src++ - avg;
      int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
      int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1);
      update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0],
                       dgd_avg1[0]);
      update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1],
                       dgd_avg1[1]);
      update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2],
                       dgd_avg1[2]);
      update_M_2pixels(M_s32 + 24, src_avg0_s16, src_avg1_s16, dgd_avg0[3],
                       dgd_avg1[3]);
      update_M_2pixels(M_s32 + 32, src_avg0_s16, src_avg1_s16, dgd_avg0[4],
                       dgd_avg1[4]);
      update_M_2pixels(M_s32 + 40, src_avg0_s16, src_avg1_s16, dgd_avg0[5],
                       dgd_avg1[5]);

      // The last (49th) element of M_s32 can be computed more efficiently as a
      // scalar for the 2 output pixels.
      M_s32[48] += DGD_AVG0[48] * src_avg0 + DGD_AVG1[48] * src_avg1;

      // Start accumulating into a row-major version of matrix H
      // (auto-covariance); it expects the DGD_AVG[01] matrices to also be
      // row-major. H is of size 49 * 49. It is filled by multiplying every
      // pair of elements of the wiener window together (vector outer product).
      // Since it is a symmetric matrix, we only compute the upper-right
      // triangle, and then copy it down to the lower-left later. The upper
      // triangle is covered by 4x4 tiles. The original algorithm assumes the M
      // matrix is column-major and the resulting H matrix is also expected to
      // be column-major. It is not efficient to work with column-major
      // matrices, so we accumulate into a row-major matrix H_s32. At the end
      // of the algorithm a double transpose transformation will convert H_s32
      // back to the expected output layout.
      update_H_7x7_2pixels(H_s32, DGD_AVG0, DGD_AVG1);

      // The last element of the triangle of the H_s32 matrix can be computed
      // more efficiently as a scalar.
      H_s32[48 * WIENER_WIN2_ALIGN2 + 48] +=
          DGD_AVG0[48] * DGD_AVG0[48] + DGD_AVG1[48] * DGD_AVG1[48];

      // Accumulate into 64-bit after STAT_ACCUMULATOR_MAX iterations to
      // prevent overflow.
      if (--acc_cnt == 0) {
        acc_cnt = STAT_ACCUMULATOR_MAX;

        accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_ALIGN2);

        // The widening accumulation is only needed for the upper triangle part
        // of the matrix.
        int64_t *lh = H_s64;
        int32_t *lh32 = H_s32;
        for (int k = 0; k < WIENER_WIN2; ++k) {
          // The widening accumulation is only run for the relevant part of
          // each row (the upper-right triangle), aligned down to a multiple of
          // 4 elements.
          int k4 = k / 4 * 4;
          accumulate_and_clear(lh + k4, lh32 + k4, 48 - k4);

          // The last element of the row is computed separately.
          lh[48] += lh32[48];
          lh32[48] = 0;

          lh += WIENER_WIN2_ALIGN2;
          lh32 += WIENER_WIN2_ALIGN2;
        }
      }

      j -= 2;
    }

    // Computations for the odd pixel in the row.
    if (width & 1) {
      // Load two adjacent, overlapping 7x7 matrices: an 8x7 matrix with the
      // middle 6x7 elements being shared.
      uint8x16_t dgd_rows[4];
      load_and_pack_u8_8x7(dgd_rows, dgd, dgd_stride);

      const uint8_t *dgd_ptr = dgd + dgd_stride * 6;
      ++dgd;

      // Re-arrange (and widen) the combined 8x7 matrix to have a whole 7x7
      // matrix tightly packed into an int16x8_t[6] array. This array contains
      // 48 of the 49 (7x7) elements. Compute `dgd - avg` for the whole buffer.
      // The DGD_AVG buffer contains 49 consecutive elements.
      int16x8_t dgd_avg0[6];
      uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0);
      dgd_avg0[0] =
          vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8));
      dgd_avg0[1] =
          vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8));
      vst1q_s16(DGD_AVG0, dgd_avg0[0]);
      vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);

      uint8x16_t dgd_shuf1 = tbl2q(dgd_rows[1], dgd_rows[2], lut1);
      dgd_avg0[2] =
          vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf1), avg_u8));
      dgd_avg0[3] =
          vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf1), avg_u8));
      vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
      vst1q_s16(DGD_AVG0 + 24, dgd_avg0[3]);

      uint8x16_t dgd_shuf2 = tbl2q(dgd_rows[2], dgd_rows[3], lut2);
      dgd_avg0[4] =
          vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf2), avg_u8));
      dgd_avg0[5] =
          vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf2), avg_u8));
      vst1q_s16(DGD_AVG0 + 32, dgd_avg0[4]);
      vst1q_s16(DGD_AVG0 + 40, dgd_avg0[5]);

      // The last (49th) element of `dgd - avg`.
      DGD_AVG0[48] = dgd_ptr[6] - avg;

      // Accumulate into a row-major variant of matrix M (cross-correlation)
      // for 1 output pixel at a time. M is of size 7 * 7. It needs to be
      // filled such that multiplying one element from src with each element of
      // a row of the wiener window will fill one column of M. However, this is
      // not very convenient in terms of memory access, as it means we do
      // contiguous loads of dgd but strided stores to M. As a result, we use
      // an intermediate matrix M_s32 which is instead filled such that one row
      // of the wiener window gives one row of M_s32. Once fully computed,
      // M_s32 is then transposed to return M.
      int src_avg0 = *src++ - avg;
      int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
      update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]);
      update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]);
      update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]);
      update_M_1pixel(M_s32 + 24, src_avg0_s16, dgd_avg0[3]);
      update_M_1pixel(M_s32 + 32, src_avg0_s16, dgd_avg0[4]);
      update_M_1pixel(M_s32 + 40, src_avg0_s16, dgd_avg0[5]);

      // The last (49th) element of M_s32 can be computed more efficiently as a
      // scalar for 1 output pixel.
      M_s32[48] += DGD_AVG0[48] * src_avg0;

      // Start accumulating into a row-major version of matrix H
      // (auto-covariance); it expects the DGD_AVG0 matrix to also be
      // row-major. H is of size 49 * 49. It is filled by multiplying every
      // pair of elements of the wiener window together (vector outer product).
      // Since it is a symmetric matrix, we only compute the upper-right
      // triangle, and then copy it down to the lower-left later. The upper
      // triangle is covered by 4x4 tiles. The original algorithm assumes the M
      // matrix is column-major and the resulting H matrix is also expected to
      // be column-major. It is not efficient to work with column-major
      // matrices, so we accumulate into a row-major matrix H_s32. At the end
      // of the algorithm a double transpose transformation will convert H_s32
      // back to the expected output layout.
      update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_ALIGN2, 48);

      // The last element of the triangle of the H_s32 matrix can be computed
      // more efficiently as a scalar.
      H_s32[48 * WIENER_WIN2_ALIGN2 + 48] += DGD_AVG0[48] * DGD_AVG0[48];
    }

    src += src_next;
    dgd += dgd_next;
  } while (--height != 0);

  acc_transpose_M(M, M_s64, M_s32, WIENER_WIN, downsample_factor);

  update_H(H, H_s64, H_s32, WIENER_WIN, WIENER_WIN2_ALIGN2, downsample_factor);
}
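
// For reference, a minimal scalar model of the statistics accumulated above
// (an editorial sketch with a hypothetical name, not part of the library).
// Each processed pixel contributes its 7x7 window of `dgd - avg`, anchored at
// the top-left as in the NEON code and flattened column-major into a
// 49-element vector w, as:
//   M += (src - avg) * w    (cross-correlation, 49 entries)
//   H += w * w^T            (auto-covariance, 49x49, symmetric)
// both scaled by downsample_factor, with H assumed dense with stride 49.
static inline void compute_stats_win7_scalar_sketch(
    const uint8_t *dgd, const uint8_t *src, int width, int height,
    int dgd_stride, int src_stride, int avg, int64_t *M, int64_t *H,
    int downsample_factor) {
  const int win = WIENER_WIN;  // 7
  const int win2 = win * win;  // 49
  for (int i = 0; i < height; ++i) {
    const uint8_t *dgd_row = dgd + i * downsample_factor * dgd_stride;
    const uint8_t *src_row = src + i * downsample_factor * src_stride;
    for (int j = 0; j < width; ++j) {
      // Column-major flattening of the window rooted at (row, col) = (0, j).
      int32_t w[WIENER_WIN2];
      for (int c = 0; c < win; ++c) {
        for (int r = 0; r < win; ++r) {
          w[c * win + r] = dgd_row[r * dgd_stride + j + c] - avg;
        }
      }
      const int32_t s = src_row[j] - avg;
      for (int a = 0; a < win2; ++a) {
        M[a] += (int64_t)downsample_factor * s * w[a];
        for (int b = 0; b < win2; ++b) {
          H[a * win2 + b] += (int64_t)downsample_factor * w[a] * w[b];
        }
      }
    }
  }
}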

// Load a 5x5 matrix into 2 and a half 128-bit vectors from consecutive rows;
// the last load address is offset to prevent out-of-bounds access.
static inline void load_and_pack_u8_6x5(uint8x16_t dst[3], const uint8_t *src,
                                        ptrdiff_t stride) {
  dst[0] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride));
  src += 2 * stride;
  dst[1] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride));
  src += 2 * stride;
  dst[2] = vcombine_u8(vld1_u8(src - 3), vdup_n_u8(0));
}
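
// As in the 7x7 case, the final row is loaded 3 bytes early to keep the
// 8-byte load in bounds near the right edge; the shuffle look-up tables index
// the last row from byte 19 onwards to compensate, and the 25th window
// element is handled separately as a scalar.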

static inline void compute_stats_win5_downsampled_neon(
    const uint8_t *dgd, const uint8_t *src, int width, int height,
    int dgd_stride, int src_stride, int avg, int64_t *M, int64_t *H,
    int downsample_factor) {
  // Matrix names are capitalized to help readability.
  DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_REDUCED_ALIGN3]);
  DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_REDUCED_ALIGN3]);
  DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_REDUCED_ALIGN3]);
  DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_REDUCED_ALIGN3]);
  DECLARE_ALIGNED(64, int32_t,
                  H_s32[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]);
  DECLARE_ALIGNED(64, int64_t,
                  H_s64[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]);

  memset(M_s32, 0, sizeof(M_s32));
  memset(M_s64, 0, sizeof(M_s64));
  memset(H_s32, 0, sizeof(H_s32));
  memset(H_s64, 0, sizeof(H_s64));

  // Look-up tables to create an 8x3 matrix with consecutive elements from two
  // 5x5 matrices.
  // clang-format off
  DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats5[48]) = {
    0,  1,  2,  3,  4,  8,  9, 10, 11, 12, 16, 17, 18, 19, 20, 24,
    1,  2,  3,  4,  5,  9, 10, 11, 12, 13, 17, 18, 19, 20, 21, 25,
    9, 10, 11, 12, 19, 20, 21, 22, 10, 11, 12, 13, 20, 21, 22, 23,
  };
  // clang-format on

  const uint8x16_t lut0 = vld1q_u8(shuffle_stats5 + 0);
  const uint8x16_t lut1 = vld1q_u8(shuffle_stats5 + 16);
  const uint8x16_t lut2 = vld1q_u8(shuffle_stats5 + 32);

  int acc_cnt = STAT_ACCUMULATOR_MAX;
  const int src_next = downsample_factor * src_stride - width;
  const int dgd_next = downsample_factor * dgd_stride - width;
  const uint8x8_t avg_u8 = vdup_n_u8(avg);

  do {
    int j = width;
    while (j >= 2) {
      // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the
      // middle 4x5 elements being shared.
      uint8x16_t dgd_rows[3];
      load_and_pack_u8_6x5(dgd_rows, dgd, dgd_stride);

      const uint8_t *dgd_ptr = dgd + dgd_stride * 4;
      dgd += 2;

      // Re-arrange (and widen) the combined 6x5 matrix to have the 2 whole 5x5
      // matrices (1 for each of the 2 pixels) separated into distinct
      // int16x8_t[3] arrays. These arrays contain 24 of the 25 (5x5) elements.
      // Compute `dgd - avg` for both buffers. Each DGD_AVG buffer contains 25
      // consecutive elements.
      int16x8_t dgd_avg0[3];
      int16x8_t dgd_avg1[3];
      uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0);
      uint8x16_t dgd_shuf1 = tbl2q(dgd_rows[0], dgd_rows[1], lut1);
      uint8x16_t dgd_shuf2 = tbl2q(dgd_rows[1], dgd_rows[2], lut2);

      dgd_avg0[0] =
          vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8));
      dgd_avg0[1] =
          vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8));
      dgd_avg0[2] =
          vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf2), avg_u8));
      dgd_avg1[0] =
          vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf1), avg_u8));
      dgd_avg1[1] =
          vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf1), avg_u8));
      dgd_avg1[2] =
          vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf2), avg_u8));

      vst1q_s16(DGD_AVG0 + 0, dgd_avg0[0]);
      vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
      vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
      vst1q_s16(DGD_AVG1 + 0, dgd_avg1[0]);
      vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]);
      vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]);

      // The last (25th) elements of `dgd - avg`.
      DGD_AVG0[24] = dgd_ptr[4] - avg;
      DGD_AVG1[24] = dgd_ptr[5] - avg;

      // Accumulate into a row-major variant of matrix M (cross-correlation)
      // for 2 output pixels at a time. M is of size 5 * 5. It needs to be
      // filled such that multiplying one element from src with each element of
      // a row of the wiener window will fill one column of M. However, this is
      // not very convenient in terms of memory access, as it means we do
      // contiguous loads of dgd but strided stores to M. As a result, we use
      // an intermediate matrix M_s32 which is instead filled such that one row
      // of the wiener window gives one row of M_s32. Once fully computed,
      // M_s32 is then transposed to return M.
      int src_avg0 = *src++ - avg;
      int src_avg1 = *src++ - avg;
      int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
      int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1);
      update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0],
                       dgd_avg1[0]);
      update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1],
                       dgd_avg1[1]);
      update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2],
                       dgd_avg1[2]);

      // The last (25th) element of M_s32 can be computed more efficiently as a
      // scalar for the 2 output pixels.
      M_s32[24] += DGD_AVG0[24] * src_avg0 + DGD_AVG1[24] * src_avg1;

      // Start accumulating into a row-major version of matrix H
      // (auto-covariance); it expects the DGD_AVG[01] matrices to also be
      // row-major. H is of size 25 * 25. It is filled by multiplying every
      // pair of elements of the wiener window together (vector outer product).
      // Since it is a symmetric matrix, we only compute the upper-right
      // triangle, and then copy it down to the lower-left later. The upper
      // triangle is covered by 4x4 tiles. The original algorithm assumes the M
      // matrix is column-major and the resulting H matrix is also expected to
      // be column-major. It is not efficient to work with column-major
      // matrices, so we accumulate into a row-major matrix H_s32. At the end
      // of the algorithm a double transpose transformation will convert H_s32
      // back to the expected output layout.
      update_H_5x5_2pixels(H_s32, DGD_AVG0, DGD_AVG1);

      // The last element of the triangle of the H_s32 matrix can be computed
      // more efficiently as a scalar.
      H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] +=
          DGD_AVG0[24] * DGD_AVG0[24] + DGD_AVG1[24] * DGD_AVG1[24];

      // Accumulate into 64-bit after STAT_ACCUMULATOR_MAX iterations to
      // prevent overflow.
      if (--acc_cnt == 0) {
        acc_cnt = STAT_ACCUMULATOR_MAX;

        accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_REDUCED_ALIGN2);

        // The widening accumulation is only needed for the upper triangle part
        // of the matrix.
        int64_t *lh = H_s64;
        int32_t *lh32 = H_s32;
        for (int k = 0; k < WIENER_WIN2_REDUCED; ++k) {
          // The widening accumulation is only run for the relevant part of
          // each row (the upper-right triangle), aligned down to a multiple of
          // 4 elements.
          int k4 = k / 4 * 4;
          accumulate_and_clear(lh + k4, lh32 + k4, 24 - k4);

          // The last element of the row is computed separately.
          lh[24] += lh32[24];
          lh32[24] = 0;

          lh += WIENER_WIN2_REDUCED_ALIGN2;
          lh32 += WIENER_WIN2_REDUCED_ALIGN2;
        }
      }

      j -= 2;
    }

    // Computations for the odd pixel in the row.
    if (width & 1) {
      // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the
      // middle 4x5 elements being shared.
      uint8x16_t dgd_rows[3];
      load_and_pack_u8_6x5(dgd_rows, dgd, dgd_stride);

      const uint8_t *dgd_ptr = dgd + dgd_stride * 4;
      ++dgd;

      // Re-arrange (and widen) the combined 6x5 matrix to have a whole 5x5
      // matrix tightly packed into an int16x8_t[3] array. This array contains
      // 24 of the 25 (5x5) elements. Compute `dgd - avg` for the whole buffer.
      // The DGD_AVG buffer contains 25 consecutive elements.
      int16x8_t dgd_avg0[3];
      uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0);
      uint8x8_t dgd_shuf1 = tbl2(dgd_rows[1], dgd_rows[2], vget_low_u8(lut2));

      dgd_avg0[0] =
          vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8));
      dgd_avg0[1] =
          vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8));
      dgd_avg0[2] = vreinterpretq_s16_u16(vsubl_u8(dgd_shuf1, avg_u8));

      vst1q_s16(DGD_AVG0 + 0, dgd_avg0[0]);
      vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
      vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);

      // The last (25th) element of `dgd - avg`.
      DGD_AVG0[24] = dgd_ptr[4] - avg;

      // Accumulate into a row-major variant of matrix M (cross-correlation)
      // for 1 output pixel at a time. M is of size 5 * 5. It needs to be
      // filled such that multiplying one element from src with each element of
      // a row of the wiener window will fill one column of M. However, this is
      // not very convenient in terms of memory access, as it means we do
      // contiguous loads of dgd but strided stores to M. As a result, we use
      // an intermediate matrix M_s32 which is instead filled such that one row
      // of the wiener window gives one row of M_s32. Once fully computed,
      // M_s32 is then transposed to return M.
      int src_avg0 = *src++ - avg;
      int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
      update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]);
      update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]);
      update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]);

      // The last (25th) element of M_s32 can be computed more efficiently as a
      // scalar for 1 output pixel.
      M_s32[24] += DGD_AVG0[24] * src_avg0;

      // Start accumulating into a row-major version of matrix H
      // (auto-covariance); it expects the DGD_AVG0 matrix to also be
      // row-major. H is of size 25 * 25. It is filled by multiplying every
      // pair of elements of the wiener window together (vector outer product).
      // Since it is a symmetric matrix, we only compute the upper-right
      // triangle, and then copy it down to the lower-left later. The upper
      // triangle is covered by 4x4 tiles. The original algorithm assumes the M
      // matrix is column-major and the resulting H matrix is also expected to
      // be column-major. It is not efficient to work with column-major
      // matrices, so we accumulate into a row-major matrix H_s32. At the end
      // of the algorithm a double transpose transformation will convert H_s32
      // back to the expected output layout.
      update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_REDUCED_ALIGN2, 24);

      // The last element of the triangle of the H_s32 matrix can be computed
      // more efficiently as a scalar.
      H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] +=
          DGD_AVG0[24] * DGD_AVG0[24];
    }

    src += src_next;
    dgd += dgd_next;
  } while (--height != 0);

  acc_transpose_M(M, M_s64, M_s32, WIENER_WIN_REDUCED, downsample_factor);

  update_H(H, H_s64, H_s32, WIENER_WIN_REDUCED, WIENER_WIN2_REDUCED_ALIGN2,
           downsample_factor);
}
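
// The helpers below horizontally reduce each group of per-lane 32-bit deltas,
// widen them to 64 bits and add them to consecutive 64-bit stats, i.e.
// dst[i] = src[i] + sum(deltas[i]) for the 6 (respectively 4) stats.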

static inline void hadd_update_6_stats_neon(const int64_t *const src,
                                            const int32x4_t *deltas,
                                            int64_t *const dst) {
  int32x4_t delta01 = horizontal_add_2d_s32(deltas[0], deltas[1]);
  int32x4_t delta23 = horizontal_add_2d_s32(deltas[2], deltas[3]);
  int32x4_t delta45 = horizontal_add_2d_s32(deltas[4], deltas[5]);

  int64x2_t delta01_s64 = vpaddlq_s32(delta01);
  int64x2_t delta23_s64 = vpaddlq_s32(delta23);
  int64x2_t delta45_s64 = vpaddlq_s32(delta45);

  int64x2_t src0 = vld1q_s64(src);
  int64x2_t src1 = vld1q_s64(src + 2);
  int64x2_t src2 = vld1q_s64(src + 4);

  vst1q_s64(dst, vaddq_s64(src0, delta01_s64));
  vst1q_s64(dst + 2, vaddq_s64(src1, delta23_s64));
  vst1q_s64(dst + 4, vaddq_s64(src2, delta45_s64));
}

static inline void hadd_update_4_stats_neon(const int64_t *const src,
                                            const int32x4_t *deltas,
                                            int64_t *const dst) {
  int32x4_t delta01 = horizontal_add_2d_s32(deltas[0], deltas[1]);
  int32x4_t delta23 = horizontal_add_2d_s32(deltas[2], deltas[3]);
  int64x2_t delta01_s64 = vpaddlq_s32(delta01);
  int64x2_t delta23_s64 = vpaddlq_s32(delta23);

  int64x2_t src0 = vld1q_s64(src);
  int64x2_t src1 = vld1q_s64(src + 2);
  vst1q_s64(dst, vaddq_s64(src0, delta01_s64));
  vst1q_s64(dst + 2, vaddq_s64(src1, delta23_s64));
}

static inline void compute_stats_win5_neon(
    const int16_t *const d, const int32_t d_stride, const int16_t *const s,
    const int32_t s_stride, const int32_t width, const int32_t height,
    int64_t *const M, int64_t *const H) {
  const int32_t wiener_win = WIENER_WIN_CHROMA;
  const int32_t wiener_win2 = wiener_win * wiener_win;
  const int32_t w16 = width & ~15;
  const int32_t h8 = height & ~7;
  int16x8_t mask[2];
  mask[0] = vld1q_s16(&(mask_16bit[16]) - width % 16);
  mask[1] = vld1q_s16(&(mask_16bit[16]) - width % 16 + 8);
  const int bit_depth = 8;
  int32_t i, j, x, y;

  const int32_t num_bit_left =
      32 - 1 /* sign */ - 2 * bit_depth /* energy */ + 2 /* SIMD */;
  const int32_t h_allowed =
      (1 << num_bit_left) / (w16 + ((w16 != width) ? 16 : 0));
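
  // h_allowed caps how many rows can be accumulated in 32-bit lanes before
  // widening to 64 bits: one bit is reserved for the sign, each product of two
  // bit_depth-bit samples needs up to 2 * bit_depth bits, and spreading the
  // row sums across 4 SIMD lanes buys 2 extra bits of headroom.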
    881 
    882  // Step 1: Calculate the top edge of the whole matrix, i.e., the top
    883  // edge of each triangle and square on the top row.
    884  j = 0;
    885  do {
    886    const int16_t *s_t = s;
    887    const int16_t *d_t = d;
    888    int32_t height_t = 0;
    889    int64x2_t sum_m[WIENER_WIN_CHROMA] = { vdupq_n_s64(0) };
    890    int64x2_t sum_h[WIENER_WIN_CHROMA] = { vdupq_n_s64(0) };
    891    int16x8_t src[2], dgd[2];
    892 
    893    do {
    894      const int32_t h_t =
    895          ((height - height_t) < h_allowed) ? (height - height_t) : h_allowed;
    896      int32x4_t row_m[WIENER_WIN_CHROMA] = { vdupq_n_s32(0) };
    897      int32x4_t row_h[WIENER_WIN_CHROMA] = { vdupq_n_s32(0) };
    898 
    899      y = h_t;
    900      do {
    901        x = 0;
    902        while (x < w16) {
    903          src[0] = vld1q_s16(s_t + x + 0);
    904          src[1] = vld1q_s16(s_t + x + 8);
    905          dgd[0] = vld1q_s16(d_t + x + 0);
    906          dgd[1] = vld1q_s16(d_t + x + 8);
    907          stats_top_win5_neon(src, dgd, d_t + j + x, d_stride, row_m, row_h);
    908          x += 16;
    909        }
    910 
    911        if (w16 != width) {
    912          src[0] = vld1q_s16(s_t + w16 + 0);
    913          src[1] = vld1q_s16(s_t + w16 + 8);
    914          dgd[0] = vld1q_s16(d_t + w16 + 0);
    915          dgd[1] = vld1q_s16(d_t + w16 + 8);
    916          src[0] = vandq_s16(src[0], mask[0]);
    917          src[1] = vandq_s16(src[1], mask[1]);
    918          dgd[0] = vandq_s16(dgd[0], mask[0]);
    919          dgd[1] = vandq_s16(dgd[1], mask[1]);
    920          stats_top_win5_neon(src, dgd, d_t + j + w16, d_stride, row_m, row_h);
    921        }
    922 
    923        s_t += s_stride;
    924        d_t += d_stride;
    925      } while (--y);
    926 
    927      sum_m[0] = vpadalq_s32(sum_m[0], row_m[0]);
    928      sum_m[1] = vpadalq_s32(sum_m[1], row_m[1]);
    929      sum_m[2] = vpadalq_s32(sum_m[2], row_m[2]);
    930      sum_m[3] = vpadalq_s32(sum_m[3], row_m[3]);
    931      sum_m[4] = vpadalq_s32(sum_m[4], row_m[4]);
    932      sum_h[0] = vpadalq_s32(sum_h[0], row_h[0]);
    933      sum_h[1] = vpadalq_s32(sum_h[1], row_h[1]);
    934      sum_h[2] = vpadalq_s32(sum_h[2], row_h[2]);
    935      sum_h[3] = vpadalq_s32(sum_h[3], row_h[3]);
    936      sum_h[4] = vpadalq_s32(sum_h[4], row_h[4]);
    937 
    938      height_t += h_t;
    939    } while (height_t < height);
    940 
    941 #if AOM_ARCH_AARCH64
    942    int64x2_t sum_m0 = vpaddq_s64(sum_m[0], sum_m[1]);
    943    int64x2_t sum_m2 = vpaddq_s64(sum_m[2], sum_m[3]);
    944    vst1q_s64(&M[wiener_win * j + 0], sum_m0);
    945    vst1q_s64(&M[wiener_win * j + 2], sum_m2);
    946    M[wiener_win * j + 4] = vaddvq_s64(sum_m[4]);
    947 
    948    int64x2_t sum_h0 = vpaddq_s64(sum_h[0], sum_h[1]);
    949    int64x2_t sum_h2 = vpaddq_s64(sum_h[2], sum_h[3]);
    950    vst1q_s64(&H[wiener_win * j + 0], sum_h0);
    951    vst1q_s64(&H[wiener_win * j + 2], sum_h2);
    952    H[wiener_win * j + 4] = vaddvq_s64(sum_h[4]);
    953 #else
    954    M[wiener_win * j + 0] = horizontal_add_s64x2(sum_m[0]);
    955    M[wiener_win * j + 1] = horizontal_add_s64x2(sum_m[1]);
    956    M[wiener_win * j + 2] = horizontal_add_s64x2(sum_m[2]);
    957    M[wiener_win * j + 3] = horizontal_add_s64x2(sum_m[3]);
    958    M[wiener_win * j + 4] = horizontal_add_s64x2(sum_m[4]);
    959 
    960    H[wiener_win * j + 0] = horizontal_add_s64x2(sum_h[0]);
    961    H[wiener_win * j + 1] = horizontal_add_s64x2(sum_h[1]);
    962    H[wiener_win * j + 2] = horizontal_add_s64x2(sum_h[2]);
    963    H[wiener_win * j + 3] = horizontal_add_s64x2(sum_h[3]);
    964    H[wiener_win * j + 4] = horizontal_add_s64x2(sum_h[4]);
    965 #endif  // AOM_ARCH_AARCH64
    966  } while (++j < wiener_win);
    967 
    968  // Step 2: Calculate the left edge of each square on the top row.
    969  j = 1;
    970  do {
    971    const int16_t *d_t = d;
    972    int32_t height_t = 0;
    973    int64x2_t sum_h[WIENER_WIN_CHROMA - 1] = { vdupq_n_s64(0) };
    974    int16x8_t dgd[2];
    975 
    976    do {
    977      const int32_t h_t =
    978          ((height - height_t) < h_allowed) ? (height - height_t) : h_allowed;
    979      int32x4_t row_h[WIENER_WIN_CHROMA - 1] = { vdupq_n_s32(0) };
    980 
    981      y = h_t;
    982      do {
    983        x = 0;
    984        while (x < w16) {
    985          dgd[0] = vld1q_s16(d_t + j + x + 0);
    986          dgd[1] = vld1q_s16(d_t + j + x + 8);
    987          stats_left_win5_neon(dgd, d_t + x, d_stride, row_h);
    988          x += 16;
    989        }
    990 
    991        if (w16 != width) {
    992          dgd[0] = vld1q_s16(d_t + j + x + 0);
    993          dgd[1] = vld1q_s16(d_t + j + x + 8);
    994          dgd[0] = vandq_s16(dgd[0], mask[0]);
    995          dgd[1] = vandq_s16(dgd[1], mask[1]);
    996          stats_left_win5_neon(dgd, d_t + x, d_stride, row_h);
    997        }
    998 
    999        d_t += d_stride;
   1000      } while (--y);
   1001 
   1002      sum_h[0] = vpadalq_s32(sum_h[0], row_h[0]);
   1003      sum_h[1] = vpadalq_s32(sum_h[1], row_h[1]);
   1004      sum_h[2] = vpadalq_s32(sum_h[2], row_h[2]);
   1005      sum_h[3] = vpadalq_s32(sum_h[3], row_h[3]);
   1006 
   1007      height_t += h_t;
   1008    } while (height_t < height);
   1009 
   1010 #if AOM_ARCH_AARCH64
   1011    int64x2_t sum_h0 = vpaddq_s64(sum_h[0], sum_h[1]);
   1012    int64x2_t sum_h1 = vpaddq_s64(sum_h[2], sum_h[3]);
   1013    vst1_s64(&H[1 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h0));
   1014    vst1_s64(&H[2 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h0));
   1015    vst1_s64(&H[3 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h1));
   1016    vst1_s64(&H[4 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h1));
   1017 #else
   1018    H[1 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[0]);
   1019    H[2 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[1]);
   1020    H[3 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[2]);
   1021    H[4 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[3]);
   1022 #endif  // AOM_ARCH_AARCH64
   1023  } while (++j < wiener_win);
   1024 
   1025  // Step 3: Derive the top edge of each triangle along the diagonal. No
   1026  // triangle in top row.
   1027  {
   1028    const int16_t *d_t = d;
   1029 
   1030    if (height % 2) {
   1031      int32x4_t deltas[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
   1032      int32x4_t deltas_tr[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
   1033      int16x8_t ds[WIENER_WIN * 2];
   1034 
   1035      load_s16_8x4(d_t, d_stride, &ds[0], &ds[2], &ds[4], &ds[6]);
   1036      load_s16_8x4(d_t + width, d_stride, &ds[1], &ds[3], &ds[5], &ds[7]);
   1037      d_t += 4 * d_stride;
   1038 
   1039      step3_win5_oneline_neon(&d_t, d_stride, width, height, ds, deltas);
   1040      transpose_arrays_s32_8x8(deltas, deltas_tr);
   1041 
   1042      update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
   1043                          deltas_tr[0], vgetq_lane_s32(deltas_tr[4], 0),
   1044                          H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
   1045 
   1046      update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
   1047                          deltas_tr[1], vgetq_lane_s32(deltas_tr[5], 0),
   1048                          H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
   1049 
   1050      update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
   1051                          deltas_tr[2], vgetq_lane_s32(deltas_tr[6], 0),
   1052                          H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
   1053 
   1054      update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
   1055                          deltas_tr[3], vgetq_lane_s32(deltas_tr[7], 0),
   1056                          H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
   1057 
   1058    } else {
   1059      int32x4_t deltas[WIENER_WIN_CHROMA * 2] = { vdupq_n_s32(0) };
   1060      int16x8_t ds[WIENER_WIN_CHROMA * 2];
   1061 
   1062      ds[0] = load_unaligned_s16_4x2(d_t + 0 * d_stride, width);
   1063      ds[1] = load_unaligned_s16_4x2(d_t + 1 * d_stride, width);
   1064      ds[2] = load_unaligned_s16_4x2(d_t + 2 * d_stride, width);
   1065      ds[3] = load_unaligned_s16_4x2(d_t + 3 * d_stride, width);
   1066 
   1067      step3_win5_neon(d_t + 4 * d_stride, d_stride, width, height, ds, deltas);
   1068 
   1069      transpose_elems_inplace_s32_4x4(&deltas[0], &deltas[1], &deltas[2],
   1070                                      &deltas[3]);
   1071 
   1072      update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
   1073                          deltas[0], vgetq_lane_s32(deltas[4], 0),
   1074                          H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
   1075 
   1076      update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
   1077                          deltas[1], vgetq_lane_s32(deltas[4], 1),
   1078                          H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
   1079 
   1080      update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
   1081                          deltas[2], vgetq_lane_s32(deltas[4], 2),
   1082                          H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
   1083 
   1084      update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
   1085                          deltas[3], vgetq_lane_s32(deltas[4], 3),
   1086                          H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
   1087    }
   1088  }
   1089 
   1090  // Step 4: Derive the top and left edge of each square. No square in top and
   1091  // bottom row.
   1092 
   1093  {
   1094    y = h8;
   1095 
   1096    int16x4_t d_s[12];
   1097    int16x4_t d_e[12];
   1098    const int16_t *d_t = d;
   1099    int16x4_t zeros = vdup_n_s16(0);
   1100    load_s16_4x4(d_t, d_stride, &d_s[0], &d_s[1], &d_s[2], &d_s[3]);
   1101    load_s16_4x4(d_t + width, d_stride, &d_e[0], &d_e[1], &d_e[2], &d_e[3]);
   1102    int32x4_t deltas[6][18] = { { vdupq_n_s32(0) }, { vdupq_n_s32(0) } };
   1103 
   1104    while (y >= 8) {
   1105      load_s16_4x8(d_t + 4 * d_stride, d_stride, &d_s[4], &d_s[5], &d_s[6],
   1106                   &d_s[7], &d_s[8], &d_s[9], &d_s[10], &d_s[11]);
   1107      load_s16_4x8(d_t + width + 4 * d_stride, d_stride, &d_e[4], &d_e[5],
   1108                   &d_e[6], &d_e[7], &d_e[8], &d_e[9], &d_e[10], &d_e[11]);
   1109 
   1110      int16x8_t s_tr[8], e_tr[8];
   1111      transpose_elems_s16_4x8(d_s[0], d_s[1], d_s[2], d_s[3], d_s[4], d_s[5],
   1112                              d_s[6], d_s[7], &s_tr[0], &s_tr[1], &s_tr[2],
   1113                              &s_tr[3]);
   1114      transpose_elems_s16_4x8(d_s[8], d_s[9], d_s[10], d_s[11], zeros, zeros,
   1115                              zeros, zeros, &s_tr[4], &s_tr[5], &s_tr[6],
   1116                              &s_tr[7]);
   1117 
   1118      transpose_elems_s16_4x8(d_e[0], d_e[1], d_e[2], d_e[3], d_e[4], d_e[5],
   1119                              d_e[6], d_e[7], &e_tr[0], &e_tr[1], &e_tr[2],
   1120                              &e_tr[3]);
   1121      transpose_elems_s16_4x8(d_e[8], d_e[9], d_e[10], d_e[11], zeros, zeros,
   1122                              zeros, zeros, &e_tr[4], &e_tr[5], &e_tr[6],
   1123                              &e_tr[7]);
   1124 
   1125      int16x8_t start_col0[5], start_col1[5], start_col2[5], start_col3[5];
   1126      start_col0[0] = s_tr[0];
   1127      start_col0[1] = vextq_s16(s_tr[0], s_tr[4], 1);
   1128      start_col0[2] = vextq_s16(s_tr[0], s_tr[4], 2);
   1129      start_col0[3] = vextq_s16(s_tr[0], s_tr[4], 3);
   1130      start_col0[4] = vextq_s16(s_tr[0], s_tr[4], 4);
   1131 
   1132      start_col1[0] = s_tr[1];
   1133      start_col1[1] = vextq_s16(s_tr[1], s_tr[5], 1);
   1134      start_col1[2] = vextq_s16(s_tr[1], s_tr[5], 2);
   1135      start_col1[3] = vextq_s16(s_tr[1], s_tr[5], 3);
   1136      start_col1[4] = vextq_s16(s_tr[1], s_tr[5], 4);
   1137 
   1138      start_col2[0] = s_tr[2];
   1139      start_col2[1] = vextq_s16(s_tr[2], s_tr[6], 1);
   1140      start_col2[2] = vextq_s16(s_tr[2], s_tr[6], 2);
   1141      start_col2[3] = vextq_s16(s_tr[2], s_tr[6], 3);
   1142      start_col2[4] = vextq_s16(s_tr[2], s_tr[6], 4);
   1143 
   1144      start_col3[0] = s_tr[3];
   1145      start_col3[1] = vextq_s16(s_tr[3], s_tr[7], 1);
   1146      start_col3[2] = vextq_s16(s_tr[3], s_tr[7], 2);
   1147      start_col3[3] = vextq_s16(s_tr[3], s_tr[7], 3);
   1148      start_col3[4] = vextq_s16(s_tr[3], s_tr[7], 4);
   1149 
   1150      // i = 1, j = 2;
   1151      sub_deltas_step4(start_col0, start_col1, deltas[0]);
   1152 
   1153      // i = 1, j = 3;
   1154      sub_deltas_step4(start_col0, start_col2, deltas[1]);
   1155 
   1156      // i = 1, j = 4
   1157      sub_deltas_step4(start_col0, start_col3, deltas[2]);
   1158 
    1159      // i = 2, j = 3
   1160      sub_deltas_step4(start_col1, start_col2, deltas[3]);
   1161 
   1162      // i = 2, j = 4
   1163      sub_deltas_step4(start_col1, start_col3, deltas[4]);
   1164 
   1165      // i = 3, j = 4
   1166      sub_deltas_step4(start_col2, start_col3, deltas[5]);
   1167 
   1168      int16x8_t end_col0[5], end_col1[5], end_col2[5], end_col3[5];
   1169      end_col0[0] = e_tr[0];
   1170      end_col0[1] = vextq_s16(e_tr[0], e_tr[4], 1);
   1171      end_col0[2] = vextq_s16(e_tr[0], e_tr[4], 2);
   1172      end_col0[3] = vextq_s16(e_tr[0], e_tr[4], 3);
   1173      end_col0[4] = vextq_s16(e_tr[0], e_tr[4], 4);
   1174 
   1175      end_col1[0] = e_tr[1];
   1176      end_col1[1] = vextq_s16(e_tr[1], e_tr[5], 1);
   1177      end_col1[2] = vextq_s16(e_tr[1], e_tr[5], 2);
   1178      end_col1[3] = vextq_s16(e_tr[1], e_tr[5], 3);
   1179      end_col1[4] = vextq_s16(e_tr[1], e_tr[5], 4);
   1180 
   1181      end_col2[0] = e_tr[2];
   1182      end_col2[1] = vextq_s16(e_tr[2], e_tr[6], 1);
   1183      end_col2[2] = vextq_s16(e_tr[2], e_tr[6], 2);
   1184      end_col2[3] = vextq_s16(e_tr[2], e_tr[6], 3);
   1185      end_col2[4] = vextq_s16(e_tr[2], e_tr[6], 4);
   1186 
   1187      end_col3[0] = e_tr[3];
   1188      end_col3[1] = vextq_s16(e_tr[3], e_tr[7], 1);
   1189      end_col3[2] = vextq_s16(e_tr[3], e_tr[7], 2);
   1190      end_col3[3] = vextq_s16(e_tr[3], e_tr[7], 3);
   1191      end_col3[4] = vextq_s16(e_tr[3], e_tr[7], 4);
   1192 
   1193      // i = 1, j = 2;
   1194      add_deltas_step4(end_col0, end_col1, deltas[0]);
   1195 
   1196      // i = 1, j = 3;
   1197      add_deltas_step4(end_col0, end_col2, deltas[1]);
   1198 
   1199      // i = 1, j = 4
   1200      add_deltas_step4(end_col0, end_col3, deltas[2]);
   1201 
    1202      // i = 2, j = 3
   1203      add_deltas_step4(end_col1, end_col2, deltas[3]);
   1204 
   1205      // i = 2, j = 4
   1206      add_deltas_step4(end_col1, end_col3, deltas[4]);
   1207 
   1208      // i = 3, j = 4
   1209      add_deltas_step4(end_col2, end_col3, deltas[5]);
   1210 
   1211      d_s[0] = d_s[8];
   1212      d_s[1] = d_s[9];
   1213      d_s[2] = d_s[10];
   1214      d_s[3] = d_s[11];
   1215      d_e[0] = d_e[8];
   1216      d_e[1] = d_e[9];
   1217      d_e[2] = d_e[10];
   1218      d_e[3] = d_e[11];
   1219 
   1220      d_t += 8 * d_stride;
   1221      y -= 8;
   1222    }
   1223 
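            // Handle the remaining height % 8 rows, zeroing lanes beyond the
            // valid height with mask_h so they contribute nothing to the
            // deltas.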
   1224    if (h8 != height) {
   1225      const int16x8_t mask_h = vld1q_s16(&mask_16bit[16] - (height % 8));
   1226 
   1227      load_s16_4x8(d_t + 4 * d_stride, d_stride, &d_s[4], &d_s[5], &d_s[6],
   1228                   &d_s[7], &d_s[8], &d_s[9], &d_s[10], &d_s[11]);
   1229      load_s16_4x8(d_t + width + 4 * d_stride, d_stride, &d_e[4], &d_e[5],
   1230                   &d_e[6], &d_e[7], &d_e[8], &d_e[9], &d_e[10], &d_e[11]);
   1231      int16x8_t s_tr[8], e_tr[8];
   1232      transpose_elems_s16_4x8(d_s[0], d_s[1], d_s[2], d_s[3], d_s[4], d_s[5],
   1233                              d_s[6], d_s[7], &s_tr[0], &s_tr[1], &s_tr[2],
   1234                              &s_tr[3]);
   1235      transpose_elems_s16_4x8(d_s[8], d_s[9], d_s[10], d_s[11], zeros, zeros,
   1236                              zeros, zeros, &s_tr[4], &s_tr[5], &s_tr[6],
   1237                              &s_tr[7]);
   1238      transpose_elems_s16_4x8(d_e[0], d_e[1], d_e[2], d_e[3], d_e[4], d_e[5],
   1239                              d_e[6], d_e[7], &e_tr[0], &e_tr[1], &e_tr[2],
   1240                              &e_tr[3]);
   1241      transpose_elems_s16_4x8(d_e[8], d_e[9], d_e[10], d_e[11], zeros, zeros,
   1242                              zeros, zeros, &e_tr[4], &e_tr[5], &e_tr[6],
   1243                              &e_tr[7]);
   1244 
   1245      int16x8_t start_col0[5], start_col1[5], start_col2[5], start_col3[5];
   1246      start_col0[0] = vandq_s16(s_tr[0], mask_h);
   1247      start_col0[1] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 1), mask_h);
   1248      start_col0[2] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 2), mask_h);
   1249      start_col0[3] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 3), mask_h);
   1250      start_col0[4] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 4), mask_h);
   1251 
   1252      start_col1[0] = vandq_s16(s_tr[1], mask_h);
   1253      start_col1[1] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 1), mask_h);
   1254      start_col1[2] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 2), mask_h);
   1255      start_col1[3] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 3), mask_h);
   1256      start_col1[4] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 4), mask_h);
   1257 
   1258      start_col2[0] = vandq_s16(s_tr[2], mask_h);
   1259      start_col2[1] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 1), mask_h);
   1260      start_col2[2] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 2), mask_h);
   1261      start_col2[3] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 3), mask_h);
   1262      start_col2[4] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 4), mask_h);
   1263 
   1264      start_col3[0] = vandq_s16(s_tr[3], mask_h);
   1265      start_col3[1] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 1), mask_h);
   1266      start_col3[2] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 2), mask_h);
   1267      start_col3[3] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 3), mask_h);
   1268      start_col3[4] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 4), mask_h);
   1269 
   1270      // i = 1, j = 2;
   1271      sub_deltas_step4(start_col0, start_col1, deltas[0]);
   1272 
   1273      // i = 1, j = 3;
   1274      sub_deltas_step4(start_col0, start_col2, deltas[1]);
   1275 
   1276      // i = 1, j = 4
   1277      sub_deltas_step4(start_col0, start_col3, deltas[2]);
   1278 
   1279      // i = 2, j = 3
   1280      sub_deltas_step4(start_col1, start_col2, deltas[3]);
   1281 
   1282      // i = 2, j = 4
   1283      sub_deltas_step4(start_col1, start_col3, deltas[4]);
   1284 
   1285      // i = 3, j = 4
   1286      sub_deltas_step4(start_col2, start_col3, deltas[5]);
   1287 
   1288      int16x8_t end_col0[5], end_col1[5], end_col2[5], end_col3[5];
   1289      end_col0[0] = vandq_s16(e_tr[0], mask_h);
   1290      end_col0[1] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 1), mask_h);
   1291      end_col0[2] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 2), mask_h);
   1292      end_col0[3] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 3), mask_h);
   1293      end_col0[4] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 4), mask_h);
   1294 
   1295      end_col1[0] = vandq_s16(e_tr[1], mask_h);
   1296      end_col1[1] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 1), mask_h);
   1297      end_col1[2] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 2), mask_h);
   1298      end_col1[3] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 3), mask_h);
   1299      end_col1[4] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 4), mask_h);
   1300 
   1301      end_col2[0] = vandq_s16(e_tr[2], mask_h);
   1302      end_col2[1] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 1), mask_h);
   1303      end_col2[2] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 2), mask_h);
   1304      end_col2[3] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 3), mask_h);
   1305      end_col2[4] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 4), mask_h);
   1306 
   1307      end_col3[0] = vandq_s16(e_tr[3], mask_h);
   1308      end_col3[1] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 1), mask_h);
   1309      end_col3[2] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 2), mask_h);
   1310      end_col3[3] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 3), mask_h);
   1311      end_col3[4] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 4), mask_h);
   1312 
   1313      // i = 1, j = 2;
   1314      add_deltas_step4(end_col0, end_col1, deltas[0]);
   1315 
   1316      // i = 1, j = 3;
   1317      add_deltas_step4(end_col0, end_col2, deltas[1]);
   1318 
   1319      // i = 1, j = 4
   1320      add_deltas_step4(end_col0, end_col3, deltas[2]);
   1321 
    1322      // i = 2, j = 3
   1323      add_deltas_step4(end_col1, end_col2, deltas[3]);
   1324 
   1325      // i = 2, j = 4
   1326      add_deltas_step4(end_col1, end_col3, deltas[4]);
   1327 
   1328      // i = 3, j = 4
   1329      add_deltas_step4(end_col2, end_col3, deltas[5]);
   1330    }
   1331 
   1332    int32x4_t delta[6][2];
   1333    int32_t single_delta[6];
   1334 
   1335    delta[0][0] = horizontal_add_4d_s32x4(&deltas[0][0]);
   1336    delta[1][0] = horizontal_add_4d_s32x4(&deltas[1][0]);
   1337    delta[2][0] = horizontal_add_4d_s32x4(&deltas[2][0]);
   1338    delta[3][0] = horizontal_add_4d_s32x4(&deltas[3][0]);
   1339    delta[4][0] = horizontal_add_4d_s32x4(&deltas[4][0]);
   1340    delta[5][0] = horizontal_add_4d_s32x4(&deltas[5][0]);
   1341 
   1342    delta[0][1] = horizontal_add_4d_s32x4(&deltas[0][5]);
   1343    delta[1][1] = horizontal_add_4d_s32x4(&deltas[1][5]);
   1344    delta[2][1] = horizontal_add_4d_s32x4(&deltas[2][5]);
   1345    delta[3][1] = horizontal_add_4d_s32x4(&deltas[3][5]);
   1346    delta[4][1] = horizontal_add_4d_s32x4(&deltas[4][5]);
   1347    delta[5][1] = horizontal_add_4d_s32x4(&deltas[5][5]);
   1348 
   1349    single_delta[0] = horizontal_add_s32x4(deltas[0][4]);
   1350    single_delta[1] = horizontal_add_s32x4(deltas[1][4]);
   1351    single_delta[2] = horizontal_add_s32x4(deltas[2][4]);
   1352    single_delta[3] = horizontal_add_s32x4(deltas[3][4]);
   1353    single_delta[4] = horizontal_add_s32x4(deltas[4][4]);
   1354    single_delta[5] = horizontal_add_s32x4(deltas[5][4]);
   1355 
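            // Fold the reduced deltas into H: each (i, j) square entry
            // inherits the (i - 1, j - 1) entry plus the corresponding
            // accumulated delta.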
   1356    int idx = 0;
   1357    for (i = 1; i < wiener_win - 1; i++) {
   1358      for (j = i + 1; j < wiener_win; j++) {
   1359        update_4_stats_neon(
   1360            H + (i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win,
   1361            delta[idx][0], H + i * wiener_win * wiener_win2 + j * wiener_win);
   1362        H[i * wiener_win * wiener_win2 + j * wiener_win + 4] =
   1363            H[(i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win + 4] +
   1364            single_delta[idx];
   1365 
   1366        H[(i * wiener_win + 1) * wiener_win2 + j * wiener_win] =
   1367            H[((i - 1) * wiener_win + 1) * wiener_win2 + (j - 1) * wiener_win] +
   1368            vgetq_lane_s32(delta[idx][1], 0);
   1369        H[(i * wiener_win + 2) * wiener_win2 + j * wiener_win] =
   1370            H[((i - 1) * wiener_win + 2) * wiener_win2 + (j - 1) * wiener_win] +
   1371            vgetq_lane_s32(delta[idx][1], 1);
   1372        H[(i * wiener_win + 3) * wiener_win2 + j * wiener_win] =
   1373            H[((i - 1) * wiener_win + 3) * wiener_win2 + (j - 1) * wiener_win] +
   1374            vgetq_lane_s32(delta[idx][1], 2);
   1375        H[(i * wiener_win + 4) * wiener_win2 + j * wiener_win] =
   1376            H[((i - 1) * wiener_win + 4) * wiener_win2 + (j - 1) * wiener_win] +
   1377            vgetq_lane_s32(delta[idx][1], 3);
   1378 
   1379        idx++;
   1380      }
   1381    }
   1382  }
   1383 
   1384  // Step 5: Derive other points of each square. No square in bottom row.
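          // Each point differs from the one above-left only by the rows
          // entering and leaving the window, so only the first (d_is/d_js)
          // and last (d_ie/d_je) rows of each column pair are multiplied.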
   1385  i = 0;
   1386  do {
   1387    const int16_t *const di = d + i;
   1388 
   1389    j = i + 1;
   1390    do {
   1391      const int16_t *const dj = d + j;
   1392      int32x4_t deltas[WIENER_WIN_CHROMA - 1][WIENER_WIN_CHROMA - 1] = {
   1393        { vdupq_n_s32(0) }, { vdupq_n_s32(0) }
   1394      };
   1395      int16x8_t d_is[WIN_CHROMA], d_ie[WIN_CHROMA];
   1396      int16x8_t d_js[WIN_CHROMA], d_je[WIN_CHROMA];
   1397 
   1398      x = 0;
   1399      while (x < w16) {
   1400        load_square_win5_neon(di + x, dj + x, d_stride, height, d_is, d_ie,
   1401                              d_js, d_je);
   1402        derive_square_win5_neon(d_is, d_ie, d_js, d_je, deltas);
   1403        x += 16;
   1404      }
   1405 
   1406      if (w16 != width) {
   1407        load_square_win5_neon(di + x, dj + x, d_stride, height, d_is, d_ie,
   1408                              d_js, d_je);
   1409        d_is[0] = vandq_s16(d_is[0], mask[0]);
   1410        d_is[1] = vandq_s16(d_is[1], mask[1]);
   1411        d_is[2] = vandq_s16(d_is[2], mask[0]);
   1412        d_is[3] = vandq_s16(d_is[3], mask[1]);
   1413        d_is[4] = vandq_s16(d_is[4], mask[0]);
   1414        d_is[5] = vandq_s16(d_is[5], mask[1]);
   1415        d_is[6] = vandq_s16(d_is[6], mask[0]);
   1416        d_is[7] = vandq_s16(d_is[7], mask[1]);
   1417        d_ie[0] = vandq_s16(d_ie[0], mask[0]);
   1418        d_ie[1] = vandq_s16(d_ie[1], mask[1]);
   1419        d_ie[2] = vandq_s16(d_ie[2], mask[0]);
   1420        d_ie[3] = vandq_s16(d_ie[3], mask[1]);
   1421        d_ie[4] = vandq_s16(d_ie[4], mask[0]);
   1422        d_ie[5] = vandq_s16(d_ie[5], mask[1]);
   1423        d_ie[6] = vandq_s16(d_ie[6], mask[0]);
   1424        d_ie[7] = vandq_s16(d_ie[7], mask[1]);
   1425        derive_square_win5_neon(d_is, d_ie, d_js, d_je, deltas);
   1426      }
   1427 
   1428      hadd_update_4_stats_neon(
   1429          H + (i * wiener_win + 0) * wiener_win2 + j * wiener_win, deltas[0],
   1430          H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win + 1);
   1431      hadd_update_4_stats_neon(
   1432          H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win, deltas[1],
   1433          H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win + 1);
   1434      hadd_update_4_stats_neon(
   1435          H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win, deltas[2],
   1436          H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win + 1);
   1437      hadd_update_4_stats_neon(
   1438          H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win, deltas[3],
   1439          H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win + 1);
   1440    } while (++j < wiener_win);
   1441  } while (++i < wiener_win - 1);
   1442 
   1443  // Step 6: Derive other points of each upper triangle along the diagonal.
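          // Same start/end-row delta scheme as step 5, applied to the upper
          // triangle of each diagonal block.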
   1444  i = 0;
   1445  do {
   1446    const int16_t *const di = d + i;
   1447    int32x4_t deltas[WIENER_WIN_CHROMA * 2 + 1] = { vdupq_n_s32(0) };
   1448    int16x8_t d_is[WIN_CHROMA], d_ie[WIN_CHROMA];
   1449 
   1450    x = 0;
   1451    while (x < w16) {
   1452      load_triangle_win5_neon(di + x, d_stride, height, d_is, d_ie);
   1453      derive_triangle_win5_neon(d_is, d_ie, deltas);
   1454      x += 16;
   1455    }
   1456 
   1457    if (w16 != width) {
   1458      load_triangle_win5_neon(di + x, d_stride, height, d_is, d_ie);
   1459      d_is[0] = vandq_s16(d_is[0], mask[0]);
   1460      d_is[1] = vandq_s16(d_is[1], mask[1]);
   1461      d_is[2] = vandq_s16(d_is[2], mask[0]);
   1462      d_is[3] = vandq_s16(d_is[3], mask[1]);
   1463      d_is[4] = vandq_s16(d_is[4], mask[0]);
   1464      d_is[5] = vandq_s16(d_is[5], mask[1]);
   1465      d_is[6] = vandq_s16(d_is[6], mask[0]);
   1466      d_is[7] = vandq_s16(d_is[7], mask[1]);
   1467      d_ie[0] = vandq_s16(d_ie[0], mask[0]);
   1468      d_ie[1] = vandq_s16(d_ie[1], mask[1]);
   1469      d_ie[2] = vandq_s16(d_ie[2], mask[0]);
   1470      d_ie[3] = vandq_s16(d_ie[3], mask[1]);
   1471      d_ie[4] = vandq_s16(d_ie[4], mask[0]);
   1472      d_ie[5] = vandq_s16(d_ie[5], mask[1]);
   1473      d_ie[6] = vandq_s16(d_ie[6], mask[0]);
   1474      d_ie[7] = vandq_s16(d_ie[7], mask[1]);
   1475      derive_triangle_win5_neon(d_is, d_ie, deltas);
   1476    }
   1477 
   1478    // Row 1: 4 points
   1479    hadd_update_4_stats_neon(
   1480        H + (i * wiener_win + 0) * wiener_win2 + i * wiener_win, deltas,
   1481        H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1);
   1482 
   1483    // Row 2: 3 points
   1484    int32x4_t deltas45 = horizontal_add_2d_s32(deltas[4], deltas[5]);
   1485    int32x4_t deltas78 = horizontal_add_2d_s32(deltas[7], deltas[8]);
   1486 
   1487    int64x2_t deltas45_s64 = vpaddlq_s32(deltas45);
   1488    int64x2_t deltas78_s64 = vpaddlq_s32(deltas78);
   1489 
   1490    int64x2_t src =
   1491        vld1q_s64(H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1);
   1492    int64x2_t dst = vaddq_s64(src, deltas45_s64);
   1493    vst1q_s64(H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2, dst);
   1494 
   1495    int32x4_t delta69 = horizontal_add_2d_s32(deltas[6], deltas[9]);
   1496    int64x2_t delta69_s64 = vpaddlq_s32(delta69);
   1497    H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 4] =
   1498        H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 3] +
   1499        vgetq_lane_s64(delta69_s64, 0);
   1500 
   1501    // Row 3: 2 points
   1502    vst1q_s64(H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3,
   1503              vaddq_s64(dst, deltas78_s64));
   1504 
   1505    // Row 4: 1 point
   1506    H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4] =
   1507        H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3] +
   1508        vgetq_lane_s64(delta69_s64, 1);
   1509  } while (++i < wiener_win);
   1510 }
   1511 
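         // Compute the 7x7-window Wiener statistics (cross-correlation M and
         // auto-correlation H), following the same six-step scheme as the
         // win5 variant above.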
   1512 static inline void compute_stats_win7_neon(
   1513    const int16_t *const d, const int32_t d_stride, const int16_t *const s,
   1514    const int32_t s_stride, const int32_t width, const int32_t height,
   1515    int64_t *const M, int64_t *const H) {
   1516  const int32_t wiener_win = WIENER_WIN;
   1517  const int32_t wiener_win2 = wiener_win * wiener_win;
   1518  const int32_t w16 = width & ~15;
   1519  const int32_t h8 = height & ~7;
   1520  int16x8_t mask[2];
   1521  mask[0] = vld1q_s16(&(mask_16bit[16]) - width % 16);
   1522  mask[1] = vld1q_s16(&(mask_16bit[16]) - width % 16 + 8);
   1523  const int bit_depth = 8;
   1524  int32_t i, j, x, y;
   1525 
   1526  const int32_t num_bit_left =
   1527      32 - 1 /* sign */ - 2 * bit_depth /* energy */ + 2 /* SIMD */;
   1528  const int32_t h_allowed =
   1529      (1 << num_bit_left) / (w16 + ((w16 != width) ? 16 : 0));
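          // For example, with 8-bit input num_bit_left == 32 - 1 - 16 + 2 ==
          // 17, so for a 256-wide tile h_allowed == (1 << 17) / 256 == 512
          // rows may be accumulated in 32-bit lanes before widening to the
          // 64-bit sums below.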
   1530 
   1531  // Step 1: Calculate the top edge of the whole matrix, i.e., the top
   1532  // edge of each triangle and square on the top row.
   1533  j = 0;
   1534  do {
   1535    const int16_t *s_t = s;
   1536    const int16_t *d_t = d;
   1537    int32_t height_t = 0;
   1538    int64x2_t sum_m[WIENER_WIN] = { vdupq_n_s64(0) };
   1539    int64x2_t sum_h[WIENER_WIN] = { vdupq_n_s64(0) };
   1540    int16x8_t src[2], dgd[2];
   1541 
   1542    do {
   1543      const int32_t h_t =
   1544          ((height - height_t) < h_allowed) ? (height - height_t) : h_allowed;
   1545      int32x4_t row_m[WIENER_WIN * 2] = { vdupq_n_s32(0) };
   1546      int32x4_t row_h[WIENER_WIN * 2] = { vdupq_n_s32(0) };
   1547 
   1548      y = h_t;
   1549      do {
   1550        x = 0;
   1551        while (x < w16) {
   1552          src[0] = vld1q_s16(s_t + x);
   1553          src[1] = vld1q_s16(s_t + x + 8);
   1554          dgd[0] = vld1q_s16(d_t + x);
   1555          dgd[1] = vld1q_s16(d_t + x + 8);
   1556          stats_top_win7_neon(src, dgd, d_t + j + x, d_stride, row_m, row_h);
   1557          x += 16;
   1558        }
   1559 
   1560        if (w16 != width) {
   1561          src[0] = vld1q_s16(s_t + w16);
   1562          src[1] = vld1q_s16(s_t + w16 + 8);
   1563          dgd[0] = vld1q_s16(d_t + w16);
   1564          dgd[1] = vld1q_s16(d_t + w16 + 8);
   1565          src[0] = vandq_s16(src[0], mask[0]);
   1566          src[1] = vandq_s16(src[1], mask[1]);
   1567          dgd[0] = vandq_s16(dgd[0], mask[0]);
   1568          dgd[1] = vandq_s16(dgd[1], mask[1]);
   1569          stats_top_win7_neon(src, dgd, d_t + j + w16, d_stride, row_m, row_h);
   1570        }
   1571 
   1572        s_t += s_stride;
   1573        d_t += d_stride;
   1574      } while (--y);
   1575 
   1576      sum_m[0] = vpadalq_s32(sum_m[0], row_m[0]);
   1577      sum_m[1] = vpadalq_s32(sum_m[1], row_m[1]);
   1578      sum_m[2] = vpadalq_s32(sum_m[2], row_m[2]);
   1579      sum_m[3] = vpadalq_s32(sum_m[3], row_m[3]);
   1580      sum_m[4] = vpadalq_s32(sum_m[4], row_m[4]);
   1581      sum_m[5] = vpadalq_s32(sum_m[5], row_m[5]);
   1582      sum_m[6] = vpadalq_s32(sum_m[6], row_m[6]);
   1583 
   1584      sum_h[0] = vpadalq_s32(sum_h[0], row_h[0]);
   1585      sum_h[1] = vpadalq_s32(sum_h[1], row_h[1]);
   1586      sum_h[2] = vpadalq_s32(sum_h[2], row_h[2]);
   1587      sum_h[3] = vpadalq_s32(sum_h[3], row_h[3]);
   1588      sum_h[4] = vpadalq_s32(sum_h[4], row_h[4]);
   1589      sum_h[5] = vpadalq_s32(sum_h[5], row_h[5]);
   1590      sum_h[6] = vpadalq_s32(sum_h[6], row_h[6]);
   1591 
   1592      height_t += h_t;
   1593    } while (height_t < height);
   1594 
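            // On AArch64, vpaddq_s64/vaddvq_s64 reduce pairs of 64-bit
            // accumulators in one step; the fallback reduces each accumulator
            // to a scalar separately.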
   1595 #if AOM_ARCH_AARCH64
   1596    vst1q_s64(M + wiener_win * j + 0, vpaddq_s64(sum_m[0], sum_m[1]));
   1597    vst1q_s64(M + wiener_win * j + 2, vpaddq_s64(sum_m[2], sum_m[3]));
   1598    vst1q_s64(M + wiener_win * j + 4, vpaddq_s64(sum_m[4], sum_m[5]));
   1599    M[wiener_win * j + 6] = vaddvq_s64(sum_m[6]);
   1600 
   1601    vst1q_s64(H + wiener_win * j + 0, vpaddq_s64(sum_h[0], sum_h[1]));
   1602    vst1q_s64(H + wiener_win * j + 2, vpaddq_s64(sum_h[2], sum_h[3]));
   1603    vst1q_s64(H + wiener_win * j + 4, vpaddq_s64(sum_h[4], sum_h[5]));
   1604    H[wiener_win * j + 6] = vaddvq_s64(sum_h[6]);
   1605 #else
   1606    M[wiener_win * j + 0] = horizontal_add_s64x2(sum_m[0]);
   1607    M[wiener_win * j + 1] = horizontal_add_s64x2(sum_m[1]);
   1608    M[wiener_win * j + 2] = horizontal_add_s64x2(sum_m[2]);
   1609    M[wiener_win * j + 3] = horizontal_add_s64x2(sum_m[3]);
   1610    M[wiener_win * j + 4] = horizontal_add_s64x2(sum_m[4]);
   1611    M[wiener_win * j + 5] = horizontal_add_s64x2(sum_m[5]);
   1612    M[wiener_win * j + 6] = horizontal_add_s64x2(sum_m[6]);
   1613 
   1614    H[wiener_win * j + 0] = horizontal_add_s64x2(sum_h[0]);
   1615    H[wiener_win * j + 1] = horizontal_add_s64x2(sum_h[1]);
   1616    H[wiener_win * j + 2] = horizontal_add_s64x2(sum_h[2]);
   1617    H[wiener_win * j + 3] = horizontal_add_s64x2(sum_h[3]);
   1618    H[wiener_win * j + 4] = horizontal_add_s64x2(sum_h[4]);
   1619    H[wiener_win * j + 5] = horizontal_add_s64x2(sum_h[5]);
   1620    H[wiener_win * j + 6] = horizontal_add_s64x2(sum_h[6]);
   1621 #endif  // AOM_ARCH_AARCH64
   1622  } while (++j < wiener_win);
   1623 
   1624  // Step 2: Calculate the left edge of each square on the top row.
   1625  j = 1;
   1626  do {
   1627    const int16_t *d_t = d;
   1628    int32_t height_t = 0;
   1629    int64x2_t sum_h[WIENER_WIN - 1] = { vdupq_n_s64(0) };
   1630    int16x8_t dgd[2];
   1631 
   1632    do {
   1633      const int32_t h_t =
   1634          ((height - height_t) < h_allowed) ? (height - height_t) : h_allowed;
   1635      int32x4_t row_h[WIENER_WIN - 1] = { vdupq_n_s32(0) };
   1636 
   1637      y = h_t;
   1638      do {
   1639        x = 0;
   1640        while (x < w16) {
   1641          dgd[0] = vld1q_s16(d_t + j + x + 0);
   1642          dgd[1] = vld1q_s16(d_t + j + x + 8);
   1643          stats_left_win7_neon(dgd, d_t + x, d_stride, row_h);
   1644          x += 16;
   1645        }
   1646 
   1647        if (w16 != width) {
   1648          dgd[0] = vld1q_s16(d_t + j + x + 0);
   1649          dgd[1] = vld1q_s16(d_t + j + x + 8);
   1650          dgd[0] = vandq_s16(dgd[0], mask[0]);
   1651          dgd[1] = vandq_s16(dgd[1], mask[1]);
   1652          stats_left_win7_neon(dgd, d_t + x, d_stride, row_h);
   1653        }
   1654 
   1655        d_t += d_stride;
   1656      } while (--y);
   1657 
   1658      sum_h[0] = vpadalq_s32(sum_h[0], row_h[0]);
   1659      sum_h[1] = vpadalq_s32(sum_h[1], row_h[1]);
   1660      sum_h[2] = vpadalq_s32(sum_h[2], row_h[2]);
   1661      sum_h[3] = vpadalq_s32(sum_h[3], row_h[3]);
   1662      sum_h[4] = vpadalq_s32(sum_h[4], row_h[4]);
   1663      sum_h[5] = vpadalq_s32(sum_h[5], row_h[5]);
   1664 
   1665      height_t += h_t;
   1666    } while (height_t < height);
   1667 
   1668 #if AOM_ARCH_AARCH64
   1669    int64x2_t sum_h0 = vpaddq_s64(sum_h[0], sum_h[1]);
   1670    int64x2_t sum_h2 = vpaddq_s64(sum_h[2], sum_h[3]);
   1671    int64x2_t sum_h4 = vpaddq_s64(sum_h[4], sum_h[5]);
   1672    vst1_s64(&H[1 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h0));
   1673    vst1_s64(&H[2 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h0));
   1674    vst1_s64(&H[3 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h2));
   1675    vst1_s64(&H[4 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h2));
   1676    vst1_s64(&H[5 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h4));
   1677    vst1_s64(&H[6 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h4));
   1678 #else
   1679    H[1 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[0]);
   1680    H[2 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[1]);
   1681    H[3 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[2]);
   1682    H[4 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[3]);
   1683    H[5 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[4]);
   1684    H[6 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[5]);
   1685 #endif  // AOM_ARCH_AARCH64
   1686  } while (++j < wiener_win);
   1687 
   1688  // Step 3: Derive the top edge of each triangle along the diagonal. No
   1689  // triangle in top row.
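          // Even ds[] entries hold rows from the window-start column and odd
          // entries the matching rows offset by width (the window-end
          // column); the 8x8 transpose then regroups the accumulated deltas
          // per diagonal for update_8_stats_neon.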
   1690  {
   1691    const int16_t *d_t = d;
   1692    // Pad to call transpose function.
   1693    int32x4_t deltas[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
   1694    int32x4_t deltas_tr[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
   1695    int16x8_t ds[WIENER_WIN * 2];
   1696 
   1697    load_s16_8x6(d_t, d_stride, &ds[0], &ds[2], &ds[4], &ds[6], &ds[8],
   1698                 &ds[10]);
   1699    load_s16_8x6(d_t + width, d_stride, &ds[1], &ds[3], &ds[5], &ds[7], &ds[9],
   1700                 &ds[11]);
   1701 
   1702    d_t += 6 * d_stride;
   1703 
   1704    step3_win7_neon(d_t, d_stride, width, height, ds, deltas);
   1705    transpose_arrays_s32_8x8(deltas, deltas_tr);
   1706 
   1707    update_8_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
   1708                        deltas_tr[0], deltas_tr[4],
   1709                        H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
   1710    update_8_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
   1711                        deltas_tr[1], deltas_tr[5],
   1712                        H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
   1713    update_8_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
   1714                        deltas_tr[2], deltas_tr[6],
   1715                        H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
   1716    update_8_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
   1717                        deltas_tr[3], deltas_tr[7],
   1718                        H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
   1719    update_8_stats_neon(H + 4 * wiener_win * wiener_win2 + 4 * wiener_win,
   1720                        deltas_tr[8], deltas_tr[12],
   1721                        H + 5 * wiener_win * wiener_win2 + 5 * wiener_win);
   1722    update_8_stats_neon(H + 5 * wiener_win * wiener_win2 + 5 * wiener_win,
   1723                        deltas_tr[9], deltas_tr[13],
   1724                        H + 6 * wiener_win * wiener_win2 + 6 * wiener_win);
   1725  }
   1726 
   1727  // Step 4: Derive the top and left edge of each square. No square in top and
   1728  // bottom row.
   1729 
   1730  i = 1;
   1731  do {
   1732    j = i + 1;
   1733    do {
   1734      const int16_t *di = d + i - 1;
   1735      const int16_t *dj = d + j - 1;
   1736      int32x4_t deltas[(2 * WIENER_WIN - 1) * 2] = { vdupq_n_s32(0) };
   1737      int16x8_t dd[WIENER_WIN * 2], ds[WIENER_WIN * 2];
   1738 
   1739      dd[5] = vdupq_n_s16(0);  // Initialize to avoid warning.
   1740      const int16_t dd0_values[] = { di[0 * d_stride],
   1741                                     di[1 * d_stride],
   1742                                     di[2 * d_stride],
   1743                                     di[3 * d_stride],
   1744                                     di[4 * d_stride],
   1745                                     di[5 * d_stride],
   1746                                     0,
   1747                                     0 };
   1748      dd[0] = vld1q_s16(dd0_values);
   1749      const int16_t dd1_values[] = { di[0 * d_stride + width],
   1750                                     di[1 * d_stride + width],
   1751                                     di[2 * d_stride + width],
   1752                                     di[3 * d_stride + width],
   1753                                     di[4 * d_stride + width],
   1754                                     di[5 * d_stride + width],
   1755                                     0,
   1756                                     0 };
   1757      dd[1] = vld1q_s16(dd1_values);
   1758      const int16_t ds0_values[] = { dj[0 * d_stride],
   1759                                     dj[1 * d_stride],
   1760                                     dj[2 * d_stride],
   1761                                     dj[3 * d_stride],
   1762                                     dj[4 * d_stride],
   1763                                     dj[5 * d_stride],
   1764                                     0,
   1765                                     0 };
   1766      ds[0] = vld1q_s16(ds0_values);
   1767      int16_t ds1_values[] = { dj[0 * d_stride + width],
   1768                               dj[1 * d_stride + width],
   1769                               dj[2 * d_stride + width],
   1770                               dj[3 * d_stride + width],
   1771                               dj[4 * d_stride + width],
   1772                               dj[5 * d_stride + width],
   1773                               0,
   1774                               0 };
   1775      ds[1] = vld1q_s16(ds1_values);
   1776 
   1777      y = 0;
   1778      while (y < h8) {
   1779        // 00s 10s 20s 30s 40s 50s 60s 70s  00e 10e 20e 30e 40e 50e 60e 70e
   1780        dd[0] = vsetq_lane_s16(di[6 * d_stride], dd[0], 6);
   1781        dd[0] = vsetq_lane_s16(di[7 * d_stride], dd[0], 7);
   1782        dd[1] = vsetq_lane_s16(di[6 * d_stride + width], dd[1], 6);
   1783        dd[1] = vsetq_lane_s16(di[7 * d_stride + width], dd[1], 7);
   1784 
   1785        // 00s 10s 20s 30s 40s 50s 60s 70s  00e 10e 20e 30e 40e 50e 60e 70e
   1786        // 01s 11s 21s 31s 41s 51s 61s 71s  01e 11e 21e 31e 41e 51e 61e 71e
   1787        ds[0] = vsetq_lane_s16(dj[6 * d_stride], ds[0], 6);
   1788        ds[0] = vsetq_lane_s16(dj[7 * d_stride], ds[0], 7);
   1789        ds[1] = vsetq_lane_s16(dj[6 * d_stride + width], ds[1], 6);
   1790        ds[1] = vsetq_lane_s16(dj[7 * d_stride + width], ds[1], 7);
   1791 
   1792        load_more_16_neon(di + 8 * d_stride, width, &dd[0], &dd[2]);
   1793        load_more_16_neon(dj + 8 * d_stride, width, &ds[0], &ds[2]);
   1794        load_more_16_neon(di + 9 * d_stride, width, &dd[2], &dd[4]);
   1795        load_more_16_neon(dj + 9 * d_stride, width, &ds[2], &ds[4]);
   1796        load_more_16_neon(di + 10 * d_stride, width, &dd[4], &dd[6]);
   1797        load_more_16_neon(dj + 10 * d_stride, width, &ds[4], &ds[6]);
   1798        load_more_16_neon(di + 11 * d_stride, width, &dd[6], &dd[8]);
   1799        load_more_16_neon(dj + 11 * d_stride, width, &ds[6], &ds[8]);
   1800        load_more_16_neon(di + 12 * d_stride, width, &dd[8], &dd[10]);
   1801        load_more_16_neon(dj + 12 * d_stride, width, &ds[8], &ds[10]);
   1802        load_more_16_neon(di + 13 * d_stride, width, &dd[10], &dd[12]);
   1803        load_more_16_neon(dj + 13 * d_stride, width, &ds[10], &ds[12]);
   1804 
   1805        madd_neon(&deltas[0], dd[0], ds[0]);
   1806        madd_neon(&deltas[1], dd[1], ds[1]);
   1807        madd_neon(&deltas[2], dd[0], ds[2]);
   1808        madd_neon(&deltas[3], dd[1], ds[3]);
   1809        madd_neon(&deltas[4], dd[0], ds[4]);
   1810        madd_neon(&deltas[5], dd[1], ds[5]);
   1811        madd_neon(&deltas[6], dd[0], ds[6]);
   1812        madd_neon(&deltas[7], dd[1], ds[7]);
   1813        madd_neon(&deltas[8], dd[0], ds[8]);
   1814        madd_neon(&deltas[9], dd[1], ds[9]);
   1815        madd_neon(&deltas[10], dd[0], ds[10]);
   1816        madd_neon(&deltas[11], dd[1], ds[11]);
   1817        madd_neon(&deltas[12], dd[0], ds[12]);
   1818        madd_neon(&deltas[13], dd[1], ds[13]);
   1819        madd_neon(&deltas[14], dd[2], ds[0]);
   1820        madd_neon(&deltas[15], dd[3], ds[1]);
   1821        madd_neon(&deltas[16], dd[4], ds[0]);
   1822        madd_neon(&deltas[17], dd[5], ds[1]);
   1823        madd_neon(&deltas[18], dd[6], ds[0]);
   1824        madd_neon(&deltas[19], dd[7], ds[1]);
   1825        madd_neon(&deltas[20], dd[8], ds[0]);
   1826        madd_neon(&deltas[21], dd[9], ds[1]);
   1827        madd_neon(&deltas[22], dd[10], ds[0]);
   1828        madd_neon(&deltas[23], dd[11], ds[1]);
   1829        madd_neon(&deltas[24], dd[12], ds[0]);
   1830        madd_neon(&deltas[25], dd[13], ds[1]);
   1831 
   1832        dd[0] = vextq_s16(dd[12], vdupq_n_s16(0), 2);
   1833        dd[1] = vextq_s16(dd[13], vdupq_n_s16(0), 2);
   1834        ds[0] = vextq_s16(ds[12], vdupq_n_s16(0), 2);
   1835        ds[1] = vextq_s16(ds[13], vdupq_n_s16(0), 2);
   1836 
   1837        di += 8 * d_stride;
   1838        dj += 8 * d_stride;
   1839        y += 8;
   1840      }
   1841 
   1842      deltas[0] = hadd_four_32_neon(deltas[0], deltas[2], deltas[4], deltas[6]);
   1843      deltas[1] = hadd_four_32_neon(deltas[1], deltas[3], deltas[5], deltas[7]);
   1844      deltas[2] =
   1845          hadd_four_32_neon(deltas[8], deltas[10], deltas[12], deltas[12]);
   1846      deltas[3] =
   1847          hadd_four_32_neon(deltas[9], deltas[11], deltas[13], deltas[13]);
   1848      deltas[4] =
   1849          hadd_four_32_neon(deltas[14], deltas[16], deltas[18], deltas[20]);
   1850      deltas[5] =
   1851          hadd_four_32_neon(deltas[15], deltas[17], deltas[19], deltas[21]);
   1852      deltas[6] =
   1853          hadd_four_32_neon(deltas[22], deltas[24], deltas[22], deltas[24]);
   1854      deltas[7] =
   1855          hadd_four_32_neon(deltas[23], deltas[25], deltas[23], deltas[25]);
   1856      deltas[0] = vsubq_s32(deltas[1], deltas[0]);
   1857      deltas[1] = vsubq_s32(deltas[3], deltas[2]);
   1858      deltas[2] = vsubq_s32(deltas[5], deltas[4]);
   1859      deltas[3] = vsubq_s32(deltas[7], deltas[6]);
   1860 
   1861      if (h8 != height) {
   1862        const int16_t ds0_vals[] = {
   1863          dj[0 * d_stride], dj[0 * d_stride + width],
   1864          dj[1 * d_stride], dj[1 * d_stride + width],
   1865          dj[2 * d_stride], dj[2 * d_stride + width],
   1866          dj[3 * d_stride], dj[3 * d_stride + width]
   1867        };
   1868        ds[0] = vld1q_s16(ds0_vals);
   1869 
   1870        ds[1] = vsetq_lane_s16(dj[4 * d_stride], ds[1], 0);
   1871        ds[1] = vsetq_lane_s16(dj[4 * d_stride + width], ds[1], 1);
   1872        ds[1] = vsetq_lane_s16(dj[5 * d_stride], ds[1], 2);
   1873        ds[1] = vsetq_lane_s16(dj[5 * d_stride + width], ds[1], 3);
   1874        const int16_t dd4_vals[] = {
   1875          -di[1 * d_stride], di[1 * d_stride + width],
   1876          -di[2 * d_stride], di[2 * d_stride + width],
   1877          -di[3 * d_stride], di[3 * d_stride + width],
   1878          -di[4 * d_stride], di[4 * d_stride + width]
   1879        };
   1880        dd[4] = vld1q_s16(dd4_vals);
   1881 
   1882        dd[5] = vsetq_lane_s16(-di[5 * d_stride], dd[5], 0);
   1883        dd[5] = vsetq_lane_s16(di[5 * d_stride + width], dd[5], 1);
   1884        do {
   1885          dd[0] = vdupq_n_s16(-di[0 * d_stride]);
   1886          dd[2] = dd[3] = vdupq_n_s16(di[0 * d_stride + width]);
   1887          dd[0] = dd[1] = vzipq_s16(dd[0], dd[2]).val[0];
   1888 
   1889          ds[4] = vdupq_n_s16(dj[0 * d_stride]);
   1890          ds[6] = ds[7] = vdupq_n_s16(dj[0 * d_stride + width]);
   1891          ds[4] = ds[5] = vzipq_s16(ds[4], ds[6]).val[0];
   1892 
   1893          dd[5] = vsetq_lane_s16(-di[6 * d_stride], dd[5], 2);
   1894          dd[5] = vsetq_lane_s16(di[6 * d_stride + width], dd[5], 3);
   1895          ds[1] = vsetq_lane_s16(dj[6 * d_stride], ds[1], 4);
   1896          ds[1] = vsetq_lane_s16(dj[6 * d_stride + width], ds[1], 5);
   1897 
   1898          madd_neon_pairwise(&deltas[0], dd[0], ds[0]);
   1899          madd_neon_pairwise(&deltas[1], dd[1], ds[1]);
   1900          madd_neon_pairwise(&deltas[2], dd[4], ds[4]);
   1901          madd_neon_pairwise(&deltas[3], dd[5], ds[5]);
   1902 
   1903          int32_t tmp0 = vgetq_lane_s32(vreinterpretq_s32_s16(ds[0]), 0);
   1904          ds[0] = vextq_s16(ds[0], ds[1], 2);
   1905          ds[1] = vextq_s16(ds[1], ds[0], 2);
   1906          ds[1] = vreinterpretq_s16_s32(
   1907              vsetq_lane_s32(tmp0, vreinterpretq_s32_s16(ds[1]), 3));
   1908          int32_t tmp1 = vgetq_lane_s32(vreinterpretq_s32_s16(dd[4]), 0);
   1909          dd[4] = vextq_s16(dd[4], dd[5], 2);
   1910          dd[5] = vextq_s16(dd[5], dd[4], 2);
   1911          dd[5] = vreinterpretq_s16_s32(
   1912              vsetq_lane_s32(tmp1, vreinterpretq_s32_s16(dd[5]), 3));
   1913          di += d_stride;
   1914          dj += d_stride;
   1915        } while (++y < height);
   1916      }
   1917 
    1918      // Writing one extra element past the top edge of a square lands in
    1919      // the next square of the same row (or the first element of the next
    1920      // row), which is overwritten later anyway.
   1921      update_8_stats_neon(
   1922          H + (i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win,
   1923          deltas[0], deltas[1],
   1924          H + i * wiener_win * wiener_win2 + j * wiener_win);
   1925 
   1926      H[(i * wiener_win + 1) * wiener_win2 + j * wiener_win] =
   1927          H[((i - 1) * wiener_win + 1) * wiener_win2 + (j - 1) * wiener_win] +
   1928          vgetq_lane_s32(deltas[2], 0);
   1929      H[(i * wiener_win + 2) * wiener_win2 + j * wiener_win] =
   1930          H[((i - 1) * wiener_win + 2) * wiener_win2 + (j - 1) * wiener_win] +
   1931          vgetq_lane_s32(deltas[2], 1);
   1932      H[(i * wiener_win + 3) * wiener_win2 + j * wiener_win] =
   1933          H[((i - 1) * wiener_win + 3) * wiener_win2 + (j - 1) * wiener_win] +
   1934          vgetq_lane_s32(deltas[2], 2);
   1935      H[(i * wiener_win + 4) * wiener_win2 + j * wiener_win] =
   1936          H[((i - 1) * wiener_win + 4) * wiener_win2 + (j - 1) * wiener_win] +
   1937          vgetq_lane_s32(deltas[2], 3);
   1938      H[(i * wiener_win + 5) * wiener_win2 + j * wiener_win] =
   1939          H[((i - 1) * wiener_win + 5) * wiener_win2 + (j - 1) * wiener_win] +
   1940          vgetq_lane_s32(deltas[3], 0);
   1941      H[(i * wiener_win + 6) * wiener_win2 + j * wiener_win] =
   1942          H[((i - 1) * wiener_win + 6) * wiener_win2 + (j - 1) * wiener_win] +
   1943          vgetq_lane_s32(deltas[3], 1);
   1944    } while (++j < wiener_win);
   1945  } while (++i < wiener_win - 1);
   1946 
   1947  // Step 5: Derive other points of each square. No square in bottom row.
   1948  i = 0;
   1949  do {
   1950    const int16_t *const di = d + i;
   1951 
   1952    j = i + 1;
   1953    do {
   1954      const int16_t *const dj = d + j;
   1955      int32x4_t deltas[WIENER_WIN - 1][WIN_7] = { { vdupq_n_s32(0) },
   1956                                                  { vdupq_n_s32(0) } };
   1957      int16x8_t d_is[WIN_7];
   1958      int16x8_t d_ie[WIN_7];
   1959      int16x8_t d_js[WIN_7];
   1960      int16x8_t d_je[WIN_7];
   1961 
   1962      x = 0;
   1963      while (x < w16) {
   1964        load_square_win7_neon(di + x, dj + x, d_stride, height, d_is, d_ie,
   1965                              d_js, d_je);
   1966        derive_square_win7_neon(d_is, d_ie, d_js, d_je, deltas);
   1967        x += 16;
   1968      }
   1969 
   1970      if (w16 != width) {
   1971        load_square_win7_neon(di + x, dj + x, d_stride, height, d_is, d_ie,
   1972                              d_js, d_je);
   1973        d_is[0] = vandq_s16(d_is[0], mask[0]);
   1974        d_is[1] = vandq_s16(d_is[1], mask[1]);
   1975        d_is[2] = vandq_s16(d_is[2], mask[0]);
   1976        d_is[3] = vandq_s16(d_is[3], mask[1]);
   1977        d_is[4] = vandq_s16(d_is[4], mask[0]);
   1978        d_is[5] = vandq_s16(d_is[5], mask[1]);
   1979        d_is[6] = vandq_s16(d_is[6], mask[0]);
   1980        d_is[7] = vandq_s16(d_is[7], mask[1]);
   1981        d_is[8] = vandq_s16(d_is[8], mask[0]);
   1982        d_is[9] = vandq_s16(d_is[9], mask[1]);
   1983        d_is[10] = vandq_s16(d_is[10], mask[0]);
   1984        d_is[11] = vandq_s16(d_is[11], mask[1]);
   1985        d_ie[0] = vandq_s16(d_ie[0], mask[0]);
   1986        d_ie[1] = vandq_s16(d_ie[1], mask[1]);
   1987        d_ie[2] = vandq_s16(d_ie[2], mask[0]);
   1988        d_ie[3] = vandq_s16(d_ie[3], mask[1]);
   1989        d_ie[4] = vandq_s16(d_ie[4], mask[0]);
   1990        d_ie[5] = vandq_s16(d_ie[5], mask[1]);
   1991        d_ie[6] = vandq_s16(d_ie[6], mask[0]);
   1992        d_ie[7] = vandq_s16(d_ie[7], mask[1]);
   1993        d_ie[8] = vandq_s16(d_ie[8], mask[0]);
   1994        d_ie[9] = vandq_s16(d_ie[9], mask[1]);
   1995        d_ie[10] = vandq_s16(d_ie[10], mask[0]);
   1996        d_ie[11] = vandq_s16(d_ie[11], mask[1]);
   1997        derive_square_win7_neon(d_is, d_ie, d_js, d_je, deltas);
   1998      }
   1999 
   2000      hadd_update_6_stats_neon(
   2001          H + (i * wiener_win + 0) * wiener_win2 + j * wiener_win, deltas[0],
   2002          H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win + 1);
   2003      hadd_update_6_stats_neon(
   2004          H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win, deltas[1],
   2005          H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win + 1);
   2006      hadd_update_6_stats_neon(
   2007          H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win, deltas[2],
   2008          H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win + 1);
   2009      hadd_update_6_stats_neon(
   2010          H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win, deltas[3],
   2011          H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win + 1);
   2012      hadd_update_6_stats_neon(
   2013          H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win, deltas[4],
   2014          H + (i * wiener_win + 5) * wiener_win2 + j * wiener_win + 1);
   2015      hadd_update_6_stats_neon(
   2016          H + (i * wiener_win + 5) * wiener_win2 + j * wiener_win, deltas[5],
   2017          H + (i * wiener_win + 6) * wiener_win2 + j * wiener_win + 1);
   2018    } while (++j < wiener_win);
   2019  } while (++i < wiener_win - 1);
   2020 
   2021  // Step 6: Derive other points of each upper triangle along the diagonal.
   2022  i = 0;
   2023  do {
   2024    const int16_t *const di = d + i;
   2025    int32x4_t deltas[WIENER_WIN * (WIENER_WIN - 1)] = { vdupq_n_s32(0) };
   2026    int16x8_t d_is[WIN_7], d_ie[WIN_7];
   2027 
   2028    x = 0;
   2029    while (x < w16) {
   2030      load_triangle_win7_neon(di + x, d_stride, height, d_is, d_ie);
   2031      derive_triangle_win7_neon(d_is, d_ie, deltas);
   2032      x += 16;
   2033    }
   2034 
   2035    if (w16 != width) {
   2036      load_triangle_win7_neon(di + x, d_stride, height, d_is, d_ie);
   2037      d_is[0] = vandq_s16(d_is[0], mask[0]);
   2038      d_is[1] = vandq_s16(d_is[1], mask[1]);
   2039      d_is[2] = vandq_s16(d_is[2], mask[0]);
   2040      d_is[3] = vandq_s16(d_is[3], mask[1]);
   2041      d_is[4] = vandq_s16(d_is[4], mask[0]);
   2042      d_is[5] = vandq_s16(d_is[5], mask[1]);
   2043      d_is[6] = vandq_s16(d_is[6], mask[0]);
   2044      d_is[7] = vandq_s16(d_is[7], mask[1]);
   2045      d_is[8] = vandq_s16(d_is[8], mask[0]);
   2046      d_is[9] = vandq_s16(d_is[9], mask[1]);
   2047      d_is[10] = vandq_s16(d_is[10], mask[0]);
   2048      d_is[11] = vandq_s16(d_is[11], mask[1]);
   2049      d_ie[0] = vandq_s16(d_ie[0], mask[0]);
   2050      d_ie[1] = vandq_s16(d_ie[1], mask[1]);
   2051      d_ie[2] = vandq_s16(d_ie[2], mask[0]);
   2052      d_ie[3] = vandq_s16(d_ie[3], mask[1]);
   2053      d_ie[4] = vandq_s16(d_ie[4], mask[0]);
   2054      d_ie[5] = vandq_s16(d_ie[5], mask[1]);
   2055      d_ie[6] = vandq_s16(d_ie[6], mask[0]);
   2056      d_ie[7] = vandq_s16(d_ie[7], mask[1]);
   2057      d_ie[8] = vandq_s16(d_ie[8], mask[0]);
   2058      d_ie[9] = vandq_s16(d_ie[9], mask[1]);
   2059      d_ie[10] = vandq_s16(d_ie[10], mask[0]);
   2060      d_ie[11] = vandq_s16(d_ie[11], mask[1]);
   2061      derive_triangle_win7_neon(d_is, d_ie, deltas);
   2062    }
   2063 
   2064    // Row 1: 6 points
   2065    hadd_update_6_stats_neon(
   2066        H + (i * wiener_win + 0) * wiener_win2 + i * wiener_win, deltas,
   2067        H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1);
   2068 
   2069    int32x4_t delta1710 = horizontal_add_2d_s32(deltas[17], deltas[10]);
   2070    int32x4_t delta1516 = horizontal_add_2d_s32(deltas[15], deltas[16]);
   2071 
   2072    int64x2_t delta1710_s64 = vpaddlq_s32(delta1710);
   2073    int64x2_t delta1516_s64 = vpaddlq_s32(delta1516);
   2074 
   2075    // Row 2: 5 points
   2076    hadd_update_4_stats_neon(
   2077        H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1, deltas + 6,
   2078        H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2);
   2079    H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 6] =
   2080        H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 5] +
   2081        vgetq_lane_s64(delta1710_s64, 1);
   2082 
   2083    // Row 3: 4 points
   2084    hadd_update_4_stats_neon(
   2085        H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2,
   2086        deltas + 11,
   2087        H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3);
   2088 
   2089    // Row 4: 3 points
   2090    int64x2_t h0 =
   2091        vld1q_s64(H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3);
   2092    vst1q_s64(H + (i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4,
   2093              vaddq_s64(h0, delta1516_s64));
   2094    H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 6] =
   2095        H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 5] +
   2096        vgetq_lane_s64(delta1710_s64, 0);
   2097 
   2098    int32x4_t delta1819 = horizontal_add_2d_s32(deltas[18], deltas[19]);
   2099    int64x2_t delta1819_s64 = vpaddlq_s32(delta1819);
   2100 
   2101    // Row 5: 2 points
   2102    int64x2_t h1 =
   2103        vld1q_s64(H + (i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4);
   2104    vst1q_s64(H + (i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5,
   2105              vaddq_s64(h1, delta1819_s64));
   2106 
    2107    // Row 6: 1 point
   2108    H[(i * wiener_win + 6) * wiener_win2 + i * wiener_win + 6] =
   2109        H[(i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5] +
   2110        horizontal_long_add_s32x4(deltas[20]);
   2111  } while (++i < wiener_win);
   2112 }
   2113 
   2114 static inline uint8_t find_average_neon(const uint8_t *src, int src_stride,
   2115                                        int width, int height) {
   2116  uint64_t sum = 0;
   2117 
   2118  if (width >= 16) {
   2119    int h = 0;
    2120    // We can accumulate up to 257 8-bit values in a 16-bit element.
    2121    // Since each 16-bit vector holds 8 such elements, we can process up
    2122    // to int(257*8/width) rows before we need to widen to 32-bit vector
    2123    // elements.
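            // e.g. width == 32 gives h_overflow == 257 * 8 / 32 == 64 rows per
            // 16-bit accumulation chunk.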
   2124    int h_overflow = 257 * 8 / width;
   2125    int h_limit = height > h_overflow ? h_overflow : height;
   2126    uint32x4_t avg_u32 = vdupq_n_u32(0);
   2127    do {
   2128      uint16x8_t avg_u16 = vdupq_n_u16(0);
   2129      do {
   2130        int j = width;
   2131        const uint8_t *src_ptr = src;
   2132        do {
   2133          uint8x16_t s = vld1q_u8(src_ptr);
   2134          avg_u16 = vpadalq_u8(avg_u16, s);
   2135          j -= 16;
   2136          src_ptr += 16;
   2137        } while (j >= 16);
   2138        if (j >= 8) {
   2139          uint8x8_t s = vld1_u8(src_ptr);
   2140          avg_u16 = vaddw_u8(avg_u16, s);
   2141          j -= 8;
   2142          src_ptr += 8;
   2143        }
   2144        // Scalar tail case.
   2145        while (j > 0) {
   2146          sum += src[width - j];
   2147          j--;
   2148        }
   2149        src += src_stride;
   2150      } while (++h < h_limit);
   2151      avg_u32 = vpadalq_u16(avg_u32, avg_u16);
   2152 
   2153      h_limit += h_overflow;
    2154      h_limit = height > h_limit ? h_limit : height;
   2155    } while (h < height);
   2156    return (uint8_t)((horizontal_long_add_u32x4(avg_u32) + sum) /
   2157                     (width * height));
   2158  }
   2159  if (width >= 8) {
   2160    int h = 0;
    2161    // We can accumulate up to 257 8-bit values in a 16-bit element.
    2162    // Since each 16-bit vector holds 4 such elements, we can process up
    2163    // to int(257*4/width) rows before we need to widen to 32-bit vector
    2164    // elements.
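            // e.g. width == 8 gives h_overflow == 257 * 4 / 8 == 128 rows per
            // 16-bit accumulation chunk.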
   2165    int h_overflow = 257 * 4 / width;
   2166    int h_limit = height > h_overflow ? h_overflow : height;
   2167    uint32x2_t avg_u32 = vdup_n_u32(0);
   2168    do {
   2169      uint16x4_t avg_u16 = vdup_n_u16(0);
   2170      do {
   2171        int j = width;
   2172        const uint8_t *src_ptr = src;
   2173        uint8x8_t s = vld1_u8(src_ptr);
   2174        avg_u16 = vpadal_u8(avg_u16, s);
   2175        j -= 8;
   2176        src_ptr += 8;
   2177        // Scalar tail case.
   2178        while (j > 0) {
   2179          sum += src[width - j];
   2180          j--;
   2181        }
   2182        src += src_stride;
   2183      } while (++h < h_limit);
   2184      avg_u32 = vpadal_u16(avg_u32, avg_u16);
   2185 
   2186      h_limit += h_overflow;
    2187      h_limit = height > h_limit ? h_limit : height;
   2188    } while (h < height);
   2189    return (uint8_t)((horizontal_long_add_u32x2(avg_u32) + sum) /
   2190                     (width * height));
   2191  }
   2192  int i = height;
   2193  do {
   2194    int j = 0;
   2195    do {
   2196      sum += src[j];
   2197    } while (++j < width);
   2198    src += src_stride;
   2199  } while (--i != 0);
   2200  return (uint8_t)(sum / (width * height));
   2201 }
   2202 
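         // Subtract the frame average from each sample and widen u8 -> s16,
         // stepping the source rows by downsample_factor when downsampled
         // statistics are in use.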
   2203 static inline void compute_sub_avg(const uint8_t *buf, int buf_stride, int avg,
   2204                                   int16_t *buf_avg, int buf_avg_stride,
   2205                                   int width, int height,
   2206                                   int downsample_factor) {
   2207  uint8x8_t avg_u8 = vdup_n_u8(avg);
   2208 
   2209  if (width > 8) {
   2210    int i = 0;
   2211    do {
   2212      int j = width;
   2213      const uint8_t *buf_ptr = buf;
   2214      int16_t *buf_avg_ptr = buf_avg;
   2215      do {
   2216        uint8x8_t d = vld1_u8(buf_ptr);
   2217        vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubl_u8(d, avg_u8)));
   2218 
   2219        j -= 8;
   2220        buf_ptr += 8;
   2221        buf_avg_ptr += 8;
   2222      } while (j >= 8);
   2223      while (j > 0) {
   2224        *buf_avg_ptr = (int16_t)buf[width - j] - (int16_t)avg;
   2225        buf_avg_ptr++;
   2226        j--;
   2227      }
   2228      buf += buf_stride;
   2229      buf_avg += buf_avg_stride;
   2230      i += downsample_factor;
   2231    } while (i < height);
   2232  } else {
   2233    // For width < 8, don't use Neon.
   2234    for (int i = 0; i < height; i = i + downsample_factor) {
   2235      for (int j = 0; j < width; j++) {
   2236        buf_avg[j] = (int16_t)buf[j] - (int16_t)avg;
   2237      }
   2238      buf += buf_stride;
   2239      buf_avg += buf_avg_stride;
   2240    }
   2241  }
   2242 }
   2243 
   2244 static inline void av1_compute_stats_downsampled_neon(
   2245    int wiener_win, const uint8_t *dgd, const uint8_t *src, int16_t *dgd_avg,
   2246    int16_t *src_avg, int h_start, int h_end, int v_start, int v_end,
   2247    int dgd_stride, int src_stride, int64_t *M, int64_t *H,
   2248    int use_downsampled_wiener_stats) {
   2249  assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA);
   2250  assert(WIENER_STATS_DOWNSAMPLE_FACTOR == 4);
   2251  (void)dgd_avg;
   2252  (void)src_avg;
   2253 
   2254  const int wiener_win2 = wiener_win * wiener_win;
   2255  const int wiener_halfwin = wiener_win >> 1;
   2256  const int width = h_end - h_start;
   2257  const int height = v_end - v_start;
   2258 
   2259  const uint8_t *dgd_start = dgd + h_start + v_start * dgd_stride;
   2260  const uint8_t *src_start = src + h_start + v_start * src_stride;
   2261 
   2262  // The wiener window will slide along the dgd frame, centered on each pixel.
   2263  // For the top left pixel and all the pixels on the side of the frame this
   2264  // means half of the window will be outside of the frame. As such the actual
   2265  // buffer that we need to subtract the avg from will be 2 * wiener_halfwin
   2266  // wider and 2 * wiener_halfwin higher than the original dgd buffer.
   2267  const int vert_offset = v_start - wiener_halfwin;
   2268  const int horiz_offset = h_start - wiener_halfwin;
   2269  const uint8_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride;
   2270 
   2271  uint8_t avg = find_average_neon(dgd_start, dgd_stride, width, height);
   2272 
   2273  // Since the height is not necessarily a multiple of the downsample factor,
   2274  // the last line of src will be scaled according to how many rows remain.
   2275  int downsample_factor =
   2276      use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
   2277 
   2278  int downsampled_height = height / downsample_factor;
   2279  int downsample_remainder = height % downsample_factor;
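          // e.g. height == 10 with factor 4: two downsampled rows are covered
          // in the main pass and the remaining 10 % 4 == 2 rows are folded
          // into one pass below, weighted by downsample_remainder.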
   2280 
   2281  memset(M, 0, wiener_win2 * sizeof(*M));
   2282  memset(H, 0, wiener_win2 * wiener_win2 * sizeof(*H));
   2283 
   2284  // Calculate the M and H matrices for the normal and downsampled cases.
   2285  if (downsampled_height > 0) {
   2286    if (wiener_win == WIENER_WIN) {
   2287      compute_stats_win7_downsampled_neon(
   2288          dgd_win, src_start, width, downsampled_height, dgd_stride, src_stride,
   2289          avg, M, H, downsample_factor);
   2290    } else {
   2291      compute_stats_win5_downsampled_neon(
   2292          dgd_win, src_start, width, downsampled_height, dgd_stride, src_stride,
   2293          avg, M, H, downsample_factor);
   2294    }
   2295  }
   2296 
   2297  // Accumulate the remaining last rows in the downsampled case.
   2298  if (downsample_remainder > 0) {
   2299    int remainder_offset = height - downsample_remainder;
   2300    if (wiener_win == WIENER_WIN) {
   2301      compute_stats_win7_downsampled_neon(
   2302          dgd_win + remainder_offset * dgd_stride,
   2303          src_start + remainder_offset * src_stride, width, 1, dgd_stride,
   2304          src_stride, avg, M, H, downsample_remainder);
   2305    } else {
   2306      compute_stats_win5_downsampled_neon(
   2307          dgd_win + remainder_offset * dgd_stride,
   2308          src_start + remainder_offset * src_stride, width, 1, dgd_stride,
   2309          src_stride, avg, M, H, downsample_remainder);
   2310    }
   2311  }
   2312 }
   2313 
   2314 void av1_compute_stats_neon(int32_t wiener_win, const uint8_t *dgd,
   2315                            const uint8_t *src, int16_t *dgd_avg,
   2316                            int16_t *src_avg, int32_t h_start, int32_t h_end,
   2317                            int32_t v_start, int32_t v_end, int32_t dgd_stride,
   2318                            int32_t src_stride, int64_t *M, int64_t *H,
   2319                            int use_downsampled_wiener_stats) {
   2320  assert(WIENER_STATS_DOWNSAMPLE_FACTOR == 4);
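         // The downsampled path computes its statistics directly from the pixel
         // buffers and never uses the dgd_avg/src_avg scratch buffers, so
         // dispatch to it before they are filled in.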
   2321  if (use_downsampled_wiener_stats) {
   2322    av1_compute_stats_downsampled_neon(
   2323        wiener_win, dgd, src, dgd_avg, src_avg, h_start, h_end, v_start, v_end,
   2324        dgd_stride, src_stride, M, H, use_downsampled_wiener_stats);
   2325    return;
   2326  }
   2327 
   2328  const int32_t wiener_win2 = wiener_win * wiener_win;
   2329  const int32_t wiener_halfwin = (wiener_win >> 1);
   2330  const int32_t width = h_end - h_start;
   2331  const int32_t height = v_end - v_start;
   2332  const uint8_t *dgd_start = dgd + h_start + v_start * dgd_stride;
   2333  const uint8_t avg = find_average_neon(dgd_start, dgd_stride, width, height);
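         // Round the scratch-buffer strides up to a multiple of 16 so each row
         // of dgd_avg and src_avg can be filled and consumed in whole vectors
         // without per-row tail handling.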
   2334  const int32_t d_stride = (width + 2 * wiener_halfwin + 15) & ~15;
   2335  const int32_t s_stride = (width + 15) & ~15;
   2336 
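         // Precompute (pixel - avg) as int16 for both buffers; dgd_avg is padded
         // by wiener_halfwin on every side so the window never reads outside it.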
   2337  compute_sub_avg(src + v_start * src_stride + h_start, src_stride, avg,
   2338                  src_avg, s_stride, width, height, 1);
   2339  compute_sub_avg(
   2340      dgd + (v_start - wiener_halfwin) * dgd_stride + h_start - wiener_halfwin,
   2341      dgd_stride, avg, dgd_avg, d_stride, width + 2 * wiener_halfwin,
   2342      height + 2 * wiener_halfwin, 1);
   2343 
   2344  if (wiener_win == WIENER_WIN) {
   2345    compute_stats_win7_neon(dgd_avg, d_stride, src_avg, s_stride, width, height,
   2346                            M, H);
   2347  } else if (wiener_win == WIENER_WIN_CHROMA) {
   2348    compute_stats_win5_neon(dgd_avg, d_stride, src_avg, s_stride, width, height,
   2349                            M, H);
   2350  }
   2351 
   2352  // H is a symmetric matrix, so the kernels above only fill out its upper
   2353  // triangle; mirror it down to the lower triangle here.
   2354  diagonal_copy_stats_neon(wiener_win2, H);
   2355 }
   2356 
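        // Accumulate the normal equations for the self-guided projection search:
        // H[i][j] = sum((flt_i - u) * (flt_j - u)) / size and
        // C[i] = sum((flt_i - u) * (s - u)) / size, where u and s are the
        // degraded and source pixels scaled up by SGRPROJ_RST_BITS. Both radii
        // are non-zero here, so every element of H and C is computed.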
   2357 static inline void calc_proj_params_r0_r1_neon(
   2358    const uint8_t *src8, int width, int height, int src_stride,
   2359    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
   2360    int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
   2361  assert(width % 8 == 0);
   2362  const int size = width * height;
   2363 
   2364  int64x2_t h00_lo = vdupq_n_s64(0);
   2365  int64x2_t h00_hi = vdupq_n_s64(0);
   2366  int64x2_t h11_lo = vdupq_n_s64(0);
   2367  int64x2_t h11_hi = vdupq_n_s64(0);
   2368  int64x2_t h01_lo = vdupq_n_s64(0);
   2369  int64x2_t h01_hi = vdupq_n_s64(0);
   2370  int64x2_t c0_lo = vdupq_n_s64(0);
   2371  int64x2_t c0_hi = vdupq_n_s64(0);
   2372  int64x2_t c1_lo = vdupq_n_s64(0);
   2373  int64x2_t c1_hi = vdupq_n_s64(0);
   2374 
   2375  do {
   2376    const uint8_t *src_ptr = src8;
   2377    const uint8_t *dat_ptr = dat8;
   2378    int32_t *flt0_ptr = flt0;
   2379    int32_t *flt1_ptr = flt1;
   2380    int w = width;
   2381 
   2382    do {
   2383      uint8x8_t s = vld1_u8(src_ptr);
   2384      uint8x8_t d = vld1_u8(dat_ptr);
   2385      int32x4_t f0_lo = vld1q_s32(flt0_ptr);
   2386      int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4);
   2387      int32x4_t f1_lo = vld1q_s32(flt1_ptr);
   2388      int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4);
   2389 
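             // Scale the u8 pixels up by SGRPROJ_RST_BITS so they match the
             // precision of the filtered values, then form the (flt - u) and
             // (s - u) differences that feed the accumulators below.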
   2390      int16x8_t u = vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS));
   2391      int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, SGRPROJ_RST_BITS));
   2392 
   2393      int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), vget_low_s16(u));
   2394      int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), vget_high_s16(u));
   2395      f0_lo = vsubw_s16(f0_lo, vget_low_s16(u));
   2396      f0_hi = vsubw_s16(f0_hi, vget_high_s16(u));
   2397      f1_lo = vsubw_s16(f1_lo, vget_low_s16(u));
   2398      f1_hi = vsubw_s16(f1_hi, vget_high_s16(u));
   2399 
   2400      h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo));
   2401      h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo));
   2402      h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi));
   2403      h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi));
   2404 
   2405      h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo));
   2406      h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo));
   2407      h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi));
   2408      h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi));
   2409 
   2410      h01_lo = vmlal_s32(h01_lo, vget_low_s32(f0_lo), vget_low_s32(f1_lo));
   2411      h01_lo = vmlal_s32(h01_lo, vget_high_s32(f0_lo), vget_high_s32(f1_lo));
   2412      h01_hi = vmlal_s32(h01_hi, vget_low_s32(f0_hi), vget_low_s32(f1_hi));
   2413      h01_hi = vmlal_s32(h01_hi, vget_high_s32(f0_hi), vget_high_s32(f1_hi));
   2414 
   2415      c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo));
   2416      c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo));
   2417      c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi));
   2418      c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi));
   2419 
   2420      c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo));
   2421      c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo));
   2422      c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi));
   2423      c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi));
   2424 
   2425      src_ptr += 8;
   2426      dat_ptr += 8;
   2427      flt0_ptr += 8;
   2428      flt1_ptr += 8;
   2429      w -= 8;
   2430    } while (w != 0);
   2431 
   2432    src8 += src_stride;
   2433    dat8 += dat_stride;
   2434    flt0 += flt0_stride;
   2435    flt1 += flt1_stride;
   2436  } while (--height != 0);
   2437 
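         // Reduce each pair of 64-bit accumulators and normalise by the pixel
         // count. H is symmetric, so H[1][0] simply mirrors H[0][1].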
   2438  H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size;
   2439  H[0][1] = horizontal_add_s64x2(vaddq_s64(h01_lo, h01_hi)) / size;
   2440  H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size;
   2441  H[1][0] = H[0][1];
   2442  C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size;
   2443  C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size;
   2444 }
   2445 
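        // As above, but with only r[0] > 0, so just H[0][0] and C[0] are
        // computed.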
   2446 static inline void calc_proj_params_r0_neon(const uint8_t *src8, int width,
   2447                                            int height, int src_stride,
   2448                                            const uint8_t *dat8, int dat_stride,
   2449                                            int32_t *flt0, int flt0_stride,
   2450                                            int64_t H[2][2], int64_t C[2]) {
   2451  assert(width % 8 == 0);
   2452  const int size = width * height;
   2453 
   2454  int64x2_t h00_lo = vdupq_n_s64(0);
   2455  int64x2_t h00_hi = vdupq_n_s64(0);
   2456  int64x2_t c0_lo = vdupq_n_s64(0);
   2457  int64x2_t c0_hi = vdupq_n_s64(0);
   2458 
   2459  do {
   2460    const uint8_t *src_ptr = src8;
   2461    const uint8_t *dat_ptr = dat8;
   2462    int32_t *flt0_ptr = flt0;
   2463    int w = width;
   2464 
   2465    do {
   2466      uint8x8_t s = vld1_u8(src_ptr);
   2467      uint8x8_t d = vld1_u8(dat_ptr);
   2468      int32x4_t f0_lo = vld1q_s32(flt0_ptr);
   2469      int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4);
   2470 
   2471      int16x8_t u = vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS));
   2472      int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, SGRPROJ_RST_BITS));
   2473 
   2474      int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), vget_low_s16(u));
   2475      int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), vget_high_s16(u));
   2476      f0_lo = vsubw_s16(f0_lo, vget_low_s16(u));
   2477      f0_hi = vsubw_s16(f0_hi, vget_high_s16(u));
   2478 
   2479      h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo));
   2480      h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo));
   2481      h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi));
   2482      h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi));
   2483 
   2484      c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo));
   2485      c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo));
   2486      c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi));
   2487      c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi));
   2488 
   2489      src_ptr += 8;
   2490      dat_ptr += 8;
   2491      flt0_ptr += 8;
   2492      w -= 8;
   2493    } while (w != 0);
   2494 
   2495    src8 += src_stride;
   2496    dat8 += dat_stride;
   2497    flt0 += flt0_stride;
   2498  } while (--height != 0);
   2499 
   2500  H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size;
   2501  C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size;
   2502 }
   2503 
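        // As above, but with only r[1] > 0, so just H[1][1] and C[1] are
        // computed.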
   2504 static inline void calc_proj_params_r1_neon(const uint8_t *src8, int width,
   2505                                            int height, int src_stride,
   2506                                            const uint8_t *dat8, int dat_stride,
   2507                                            int32_t *flt1, int flt1_stride,
   2508                                            int64_t H[2][2], int64_t C[2]) {
   2509  assert(width % 8 == 0);
   2510  const int size = width * height;
   2511 
   2512  int64x2_t h11_lo = vdupq_n_s64(0);
   2513  int64x2_t h11_hi = vdupq_n_s64(0);
   2514  int64x2_t c1_lo = vdupq_n_s64(0);
   2515  int64x2_t c1_hi = vdupq_n_s64(0);
   2516 
   2517  do {
   2518    const uint8_t *src_ptr = src8;
   2519    const uint8_t *dat_ptr = dat8;
   2520    int32_t *flt1_ptr = flt1;
   2521    int w = width;
   2522 
   2523    do {
   2524      uint8x8_t s = vld1_u8(src_ptr);
   2525      uint8x8_t d = vld1_u8(dat_ptr);
   2526      int32x4_t f1_lo = vld1q_s32(flt1_ptr);
   2527      int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4);
   2528 
   2529      int16x8_t u = vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS));
   2530      int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, SGRPROJ_RST_BITS));
   2531 
   2532      int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), vget_low_s16(u));
   2533      int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), vget_high_s16(u));
   2534      f1_lo = vsubw_s16(f1_lo, vget_low_s16(u));
   2535      f1_hi = vsubw_s16(f1_hi, vget_high_s16(u));
   2536 
   2537      h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo));
   2538      h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo));
   2539      h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi));
   2540      h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi));
   2541 
   2542      c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo));
   2543      c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo));
   2544      c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi));
   2545      c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi));
   2546 
   2547      src_ptr += 8;
   2548      dat_ptr += 8;
   2549      flt1_ptr += 8;
   2550      w -= 8;
   2551    } while (w != 0);
   2552 
   2553    src8 += src_stride;
   2554    dat8 += dat_stride;
   2555    flt1 += flt1_stride;
   2556  } while (--height != 0);
   2557 
   2558  H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size;
   2559  C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size;
   2560 }
   2561 
   2562 // The function calls one of 3 subfunctions, covering the following cases:
   2563 // 1) When params->r[0] > 0 and params->r[1] > 0. In this case all elements
   2564 //    of C and H need to be computed.
   2565 // 2) When only params->r[0] > 0. In this case only H[0][0] and C[0] are
   2566 //    non-zero and need to be computed.
   2567 // 3) When only params->r[1] > 0. In this case only H[1][1] and C[1] are
   2568 //    non-zero and need to be computed.
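        // The resulting system H * xq = C is then solved by the caller to
        // obtain the self-guided projection coefficients xq.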
   2569 void av1_calc_proj_params_neon(const uint8_t *src8, int width, int height,
   2570                               int src_stride, const uint8_t *dat8,
   2571                               int dat_stride, int32_t *flt0, int flt0_stride,
   2572                               int32_t *flt1, int flt1_stride, int64_t H[2][2],
   2573                               int64_t C[2], const sgr_params_type *params) {
   2574  if ((params->r[0] > 0) && (params->r[1] > 0)) {
   2575    calc_proj_params_r0_r1_neon(src8, width, height, src_stride, dat8,
   2576                                dat_stride, flt0, flt0_stride, flt1,
   2577                                flt1_stride, H, C);
   2578  } else if (params->r[0] > 0) {
   2579    calc_proj_params_r0_neon(src8, width, height, src_stride, dat8, dat_stride,
   2580                             flt0, flt0_stride, H, C);
   2581  } else if (params->r[1] > 0) {
   2582    calc_proj_params_r1_neon(src8, width, height, src_stride, dat8, dat_stride,
   2583                             flt1, flt1_stride, H, C);
   2584  }
   2585 }