tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

highbd_pickrst_neon.c (78726B)


      1 /*
      2 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <arm_neon.h>
     13 #include <assert.h>
     14 #include <stdint.h>
     15 
     16 #include "aom_dsp/arm/mem_neon.h"
     17 #include "aom_dsp/arm/sum_neon.h"
     18 #include "aom_dsp/arm/transpose_neon.h"
     19 #include "av1/encoder/arm/pickrst_neon.h"
     20 #include "av1/encoder/pickrst.h"
     21 
// Computes every element of the 2x2 matrix H and the 2-element vector C used
// to derive the sgrproj projection parameters, for the case where both
// restoration filter outputs (flt0 and flt1) are in use.
//
// Per pixel: u = dat << SGRPROJ_RST_BITS, s = (src << SGRPROJ_RST_BITS) - u,
// f0 = flt0 - u, f1 = flt1 - u. H accumulates sum(f0*f0), sum(f0*f1),
// sum(f1*f1) and C accumulates sum(f0*s), sum(f1*s); each total is divided by
// the pixel count at the end. H is symmetric, so H[1][0] is copied from
// H[0][1].
static inline void highbd_calc_proj_params_r0_r1_neon(
    const uint8_t *src8, int width, int height, int src_stride,
    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
    int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
  assert(width % 8 == 0);
  const int size = width * height;
  // src8/dat8 are CONVERT_TO_BYTEPTR-style handles wrapping 16-bit pixels.
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);

  // 64-bit accumulators. Each statistic keeps a lo/hi pair (one per half of
  // the 8-wide pixel vector); the pairs are merged only after the loops.
  int64x2_t h00_lo = vdupq_n_s64(0);
  int64x2_t h00_hi = vdupq_n_s64(0);
  int64x2_t h11_lo = vdupq_n_s64(0);
  int64x2_t h11_hi = vdupq_n_s64(0);
  int64x2_t h01_lo = vdupq_n_s64(0);
  int64x2_t h01_hi = vdupq_n_s64(0);
  int64x2_t c0_lo = vdupq_n_s64(0);
  int64x2_t c0_hi = vdupq_n_s64(0);
  int64x2_t c1_lo = vdupq_n_s64(0);
  int64x2_t c1_hi = vdupq_n_s64(0);

  do {  // Rows.
    const uint16_t *src_ptr = src;
    const uint16_t *dat_ptr = dat;
    int32_t *flt0_ptr = flt0;
    int32_t *flt1_ptr = flt1;
    int w = width;

    do {  // Columns, 8 pixels per iteration (width % 8 == 0 asserted above).
      uint16x8_t s = vld1q_u16(src_ptr);
      uint16x8_t d = vld1q_u16(dat_ptr);
      int32x4_t f0_lo = vld1q_s32(flt0_ptr);
      int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4);
      int32x4_t f1_lo = vld1q_s32(flt1_ptr);
      int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4);

      // Widen dat/src to 32 bits and scale up by SGRPROJ_RST_BITS so they are
      // in the same fixed-point domain as the filter outputs.
      int32x4_t u_lo =
          vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS));
      int32x4_t u_hi = vreinterpretq_s32_u32(
          vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS));
      int32x4_t s_lo =
          vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS));
      int32x4_t s_hi = vreinterpretq_s32_u32(
          vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS));
      // s := s - u, f0 := flt0 - u, f1 := flt1 - u.
      s_lo = vsubq_s32(s_lo, u_lo);
      s_hi = vsubq_s32(s_hi, u_hi);

      f0_lo = vsubq_s32(f0_lo, u_lo);
      f0_hi = vsubq_s32(f0_hi, u_hi);
      f1_lo = vsubq_s32(f1_lo, u_lo);
      f1_hi = vsubq_s32(f1_hi, u_hi);

      // H[0][0] += f0 * f0 (32x32 -> 64-bit multiply-accumulate).
      h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo));
      h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo));
      h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi));
      h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi));

      // H[1][1] += f1 * f1.
      h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo));
      h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo));
      h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi));
      h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi));

      // H[0][1] += f0 * f1.
      h01_lo = vmlal_s32(h01_lo, vget_low_s32(f0_lo), vget_low_s32(f1_lo));
      h01_lo = vmlal_s32(h01_lo, vget_high_s32(f0_lo), vget_high_s32(f1_lo));
      h01_hi = vmlal_s32(h01_hi, vget_low_s32(f0_hi), vget_low_s32(f1_hi));
      h01_hi = vmlal_s32(h01_hi, vget_high_s32(f0_hi), vget_high_s32(f1_hi));

      // C[0] += f0 * s.
      c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo));
      c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo));
      c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi));
      c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi));

      // C[1] += f1 * s.
      c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo));
      c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo));
      c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi));
      c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi));

      src_ptr += 8;
      dat_ptr += 8;
      flt0_ptr += 8;
      flt1_ptr += 8;
      w -= 8;
    } while (w != 0);

    src += src_stride;
    dat += dat_stride;
    flt0 += flt0_stride;
    flt1 += flt1_stride;
  } while (--height != 0);

  // Reduce the vector accumulators to scalars and normalize by pixel count.
  H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size;
  H[0][1] = horizontal_add_s64x2(vaddq_s64(h01_lo, h01_hi)) / size;
  H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size;
  H[1][0] = H[0][1];  // H is symmetric.
  C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size;
  C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size;
}
    118 
// Computes only H[0][0] and C[0], for the case where just the first
// restoration filter output (flt0, i.e. params->r[0] > 0) is in use.
//
// Per pixel: u = dat << SGRPROJ_RST_BITS, s = (src << SGRPROJ_RST_BITS) - u,
// f0 = flt0 - u. H[0][0] accumulates sum(f0*f0) and C[0] accumulates
// sum(f0*s); both totals are divided by the pixel count.
static inline void highbd_calc_proj_params_r0_neon(
    const uint8_t *src8, int width, int height, int src_stride,
    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
    int64_t H[2][2], int64_t C[2]) {
  assert(width % 8 == 0);
  const int size = width * height;
  // src8/dat8 are CONVERT_TO_BYTEPTR-style handles wrapping 16-bit pixels.
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);

  // 64-bit lo/hi accumulator pairs, merged after the loops.
  int64x2_t h00_lo = vdupq_n_s64(0);
  int64x2_t h00_hi = vdupq_n_s64(0);
  int64x2_t c0_lo = vdupq_n_s64(0);
  int64x2_t c0_hi = vdupq_n_s64(0);

  do {  // Rows.
    const uint16_t *src_ptr = src;
    const uint16_t *dat_ptr = dat;
    int32_t *flt0_ptr = flt0;
    int w = width;

    do {  // Columns, 8 pixels per iteration.
      uint16x8_t s = vld1q_u16(src_ptr);
      uint16x8_t d = vld1q_u16(dat_ptr);
      int32x4_t f0_lo = vld1q_s32(flt0_ptr);
      int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4);

      // Widen dat/src and scale up by SGRPROJ_RST_BITS to match the
      // fixed-point domain of the filter output.
      int32x4_t u_lo =
          vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS));
      int32x4_t u_hi = vreinterpretq_s32_u32(
          vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS));
      int32x4_t s_lo =
          vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS));
      int32x4_t s_hi = vreinterpretq_s32_u32(
          vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS));
      // s := s - u, f0 := flt0 - u.
      s_lo = vsubq_s32(s_lo, u_lo);
      s_hi = vsubq_s32(s_hi, u_hi);

      f0_lo = vsubq_s32(f0_lo, u_lo);
      f0_hi = vsubq_s32(f0_hi, u_hi);

      // H[0][0] += f0 * f0 (32x32 -> 64-bit multiply-accumulate).
      h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo));
      h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo));
      h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi));
      h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi));

      // C[0] += f0 * s.
      c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo));
      c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo));
      c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi));
      c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi));

      src_ptr += 8;
      dat_ptr += 8;
      flt0_ptr += 8;
      w -= 8;
    } while (w != 0);

    src += src_stride;
    dat += dat_stride;
    flt0 += flt0_stride;
  } while (--height != 0);

  // Reduce to scalars and normalize by pixel count.
  H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size;
  C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size;
}
    183 
// Computes only H[1][1] and C[1], for the case where just the second
// restoration filter output (flt1, i.e. params->r[1] > 0) is in use.
//
// Per pixel: u = dat << SGRPROJ_RST_BITS, s = (src << SGRPROJ_RST_BITS) - u,
// f1 = flt1 - u. H[1][1] accumulates sum(f1*f1) and C[1] accumulates
// sum(f1*s); both totals are divided by the pixel count.
static inline void highbd_calc_proj_params_r1_neon(
    const uint8_t *src8, int width, int height, int src_stride,
    const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride,
    int64_t H[2][2], int64_t C[2]) {
  assert(width % 8 == 0);
  const int size = width * height;
  // src8/dat8 are CONVERT_TO_BYTEPTR-style handles wrapping 16-bit pixels.
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);

  // 64-bit lo/hi accumulator pairs, merged after the loops.
  int64x2_t h11_lo = vdupq_n_s64(0);
  int64x2_t h11_hi = vdupq_n_s64(0);
  int64x2_t c1_lo = vdupq_n_s64(0);
  int64x2_t c1_hi = vdupq_n_s64(0);

  do {  // Rows.
    const uint16_t *src_ptr = src;
    const uint16_t *dat_ptr = dat;
    int32_t *flt1_ptr = flt1;
    int w = width;

    do {  // Columns, 8 pixels per iteration.
      uint16x8_t s = vld1q_u16(src_ptr);
      uint16x8_t d = vld1q_u16(dat_ptr);
      int32x4_t f1_lo = vld1q_s32(flt1_ptr);
      int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4);

      // Widen dat/src and scale up by SGRPROJ_RST_BITS to match the
      // fixed-point domain of the filter output.
      int32x4_t u_lo =
          vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS));
      int32x4_t u_hi = vreinterpretq_s32_u32(
          vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS));
      int32x4_t s_lo =
          vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS));
      int32x4_t s_hi = vreinterpretq_s32_u32(
          vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS));
      // s := s - u, f1 := flt1 - u.
      s_lo = vsubq_s32(s_lo, u_lo);
      s_hi = vsubq_s32(s_hi, u_hi);

      f1_lo = vsubq_s32(f1_lo, u_lo);
      f1_hi = vsubq_s32(f1_hi, u_hi);

      // H[1][1] += f1 * f1 (32x32 -> 64-bit multiply-accumulate).
      h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo));
      h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo));
      h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi));
      h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi));

      // C[1] += f1 * s.
      c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo));
      c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo));
      c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi));
      c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi));

      src_ptr += 8;
      dat_ptr += 8;
      flt1_ptr += 8;
      w -= 8;
    } while (w != 0);

    src += src_stride;
    dat += dat_stride;
    flt1 += flt1_stride;
  } while (--height != 0);

  // Reduce to scalars and normalize by pixel count.
  H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size;
  C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size;
}
    248 
    249 // The function calls 3 subfunctions for the following cases :
    250 // 1) When params->r[0] > 0 and params->r[1] > 0. In this case all elements
    251 //    of C and H need to be computed.
    252 // 2) When only params->r[0] > 0. In this case only H[0][0] and C[0] are
    253 //    non-zero and need to be computed.
    254 // 3) When only params->r[1] > 0. In this case only H[1][1] and C[1] are
    255 //    non-zero and need to be computed.
    256 void av1_calc_proj_params_high_bd_neon(const uint8_t *src8, int width,
    257                                       int height, int src_stride,
    258                                       const uint8_t *dat8, int dat_stride,
    259                                       int32_t *flt0, int flt0_stride,
    260                                       int32_t *flt1, int flt1_stride,
    261                                       int64_t H[2][2], int64_t C[2],
    262                                       const sgr_params_type *params) {
    263  if ((params->r[0] > 0) && (params->r[1] > 0)) {
    264    highbd_calc_proj_params_r0_r1_neon(src8, width, height, src_stride, dat8,
    265                                       dat_stride, flt0, flt0_stride, flt1,
    266                                       flt1_stride, H, C);
    267  } else if (params->r[0] > 0) {
    268    highbd_calc_proj_params_r0_neon(src8, width, height, src_stride, dat8,
    269                                    dat_stride, flt0, flt0_stride, H, C);
    270  } else if (params->r[1] > 0) {
    271    highbd_calc_proj_params_r1_neon(src8, width, height, src_stride, dat8,
    272                                    dat_stride, flt1, flt1_stride, H, C);
    273  }
    274 }
    275 
    276 static inline void hadd_update_4_stats_neon(const int64_t *const src,
    277                                            const int32x4_t *deltas,
    278                                            int64_t *const dst) {
    279  int64x2_t delta0_s64 = vpaddlq_s32(deltas[0]);
    280  int64x2_t delta1_s64 = vpaddlq_s32(deltas[1]);
    281  int64x2_t delta2_s64 = vpaddlq_s32(deltas[2]);
    282  int64x2_t delta3_s64 = vpaddlq_s32(deltas[3]);
    283 
    284 #if AOM_ARCH_AARCH64
    285  int64x2_t delta01 = vpaddq_s64(delta0_s64, delta1_s64);
    286  int64x2_t delta23 = vpaddq_s64(delta2_s64, delta3_s64);
    287 
    288  int64x2_t src0 = vld1q_s64(src);
    289  int64x2_t src1 = vld1q_s64(src + 2);
    290  vst1q_s64(dst, vaddq_s64(src0, delta01));
    291  vst1q_s64(dst + 2, vaddq_s64(src1, delta23));
    292 #else
    293  dst[0] = src[0] + horizontal_add_s64x2(delta0_s64);
    294  dst[1] = src[1] + horizontal_add_s64x2(delta1_s64);
    295  dst[2] = src[2] + horizontal_add_s64x2(delta2_s64);
    296  dst[3] = src[3] + horizontal_add_s64x2(delta3_s64);
    297 #endif
    298 }
    299 
    300 static inline void compute_stats_win5_highbd_neon(
    301    const int16_t *const d, const int32_t d_stride, const int16_t *const s,
    302    const int32_t s_stride, const int32_t width, const int32_t height,
    303    int64_t *const M, int64_t *const H, aom_bit_depth_t bit_depth) {
    304  const int32_t wiener_win = WIENER_WIN_CHROMA;
    305  const int32_t wiener_win2 = wiener_win * wiener_win;
    306  const int32_t w16 = width & ~15;
    307  const int32_t h8 = height & ~7;
    308  int16x8_t mask[2];
    309  mask[0] = vld1q_s16(&(mask_16bit[16]) - width % 16);
    310  mask[1] = vld1q_s16(&(mask_16bit[16]) - width % 16 + 8);
    311  int32_t i, j, x, y;
    312 
    313  const int32_t num_bit_left =
    314      32 - 1 /* sign */ - 2 * bit_depth /* energy */ + 2 /* SIMD */;
    315  const int32_t h_allowed =
    316      (1 << num_bit_left) / (w16 + ((w16 != width) ? 16 : 0));
    317 
    318  // Step 1: Calculate the top edge of the whole matrix, i.e., the top
    319  // edge of each triangle and square on the top row.
    320  j = 0;
    321  do {
    322    const int16_t *s_t = s;
    323    const int16_t *d_t = d;
    324    int32_t height_t = 0;
    325    int64x2_t sum_m[WIENER_WIN_CHROMA] = { vdupq_n_s64(0) };
    326    int64x2_t sum_h[WIENER_WIN_CHROMA] = { vdupq_n_s64(0) };
    327    int16x8_t src[2], dgd[2];
    328 
    329    do {
    330      const int32_t h_t =
    331          ((height - height_t) < h_allowed) ? (height - height_t) : h_allowed;
    332      int32x4_t row_m[WIENER_WIN_CHROMA] = { vdupq_n_s32(0) };
    333      int32x4_t row_h[WIENER_WIN_CHROMA] = { vdupq_n_s32(0) };
    334 
    335      y = h_t;
    336      do {
    337        x = 0;
    338        while (x < w16) {
    339          src[0] = vld1q_s16(s_t + x + 0);
    340          src[1] = vld1q_s16(s_t + x + 8);
    341          dgd[0] = vld1q_s16(d_t + x + 0);
    342          dgd[1] = vld1q_s16(d_t + x + 8);
    343          stats_top_win5_neon(src, dgd, d_t + j + x, d_stride, row_m, row_h);
    344          x += 16;
    345        }
    346 
    347        if (w16 != width) {
    348          src[0] = vld1q_s16(s_t + w16 + 0);
    349          src[1] = vld1q_s16(s_t + w16 + 8);
    350          dgd[0] = vld1q_s16(d_t + w16 + 0);
    351          dgd[1] = vld1q_s16(d_t + w16 + 8);
    352          src[0] = vandq_s16(src[0], mask[0]);
    353          src[1] = vandq_s16(src[1], mask[1]);
    354          dgd[0] = vandq_s16(dgd[0], mask[0]);
    355          dgd[1] = vandq_s16(dgd[1], mask[1]);
    356          stats_top_win5_neon(src, dgd, d_t + j + w16, d_stride, row_m, row_h);
    357        }
    358 
    359        s_t += s_stride;
    360        d_t += d_stride;
    361      } while (--y);
    362 
    363      sum_m[0] = vpadalq_s32(sum_m[0], row_m[0]);
    364      sum_m[1] = vpadalq_s32(sum_m[1], row_m[1]);
    365      sum_m[2] = vpadalq_s32(sum_m[2], row_m[2]);
    366      sum_m[3] = vpadalq_s32(sum_m[3], row_m[3]);
    367      sum_m[4] = vpadalq_s32(sum_m[4], row_m[4]);
    368      sum_h[0] = vpadalq_s32(sum_h[0], row_h[0]);
    369      sum_h[1] = vpadalq_s32(sum_h[1], row_h[1]);
    370      sum_h[2] = vpadalq_s32(sum_h[2], row_h[2]);
    371      sum_h[3] = vpadalq_s32(sum_h[3], row_h[3]);
    372      sum_h[4] = vpadalq_s32(sum_h[4], row_h[4]);
    373 
    374      height_t += h_t;
    375    } while (height_t < height);
    376 
    377 #if AOM_ARCH_AARCH64
    378    int64x2_t sum_m0 = vpaddq_s64(sum_m[0], sum_m[1]);
    379    int64x2_t sum_m2 = vpaddq_s64(sum_m[2], sum_m[3]);
    380    vst1q_s64(&M[wiener_win * j + 0], sum_m0);
    381    vst1q_s64(&M[wiener_win * j + 2], sum_m2);
    382    M[wiener_win * j + 4] = vaddvq_s64(sum_m[4]);
    383 
    384    int64x2_t sum_h0 = vpaddq_s64(sum_h[0], sum_h[1]);
    385    int64x2_t sum_h2 = vpaddq_s64(sum_h[2], sum_h[3]);
    386    vst1q_s64(&H[wiener_win * j + 0], sum_h0);
    387    vst1q_s64(&H[wiener_win * j + 2], sum_h2);
    388    H[wiener_win * j + 4] = vaddvq_s64(sum_h[4]);
    389 #else
    390    M[wiener_win * j + 0] = horizontal_add_s64x2(sum_m[0]);
    391    M[wiener_win * j + 1] = horizontal_add_s64x2(sum_m[1]);
    392    M[wiener_win * j + 2] = horizontal_add_s64x2(sum_m[2]);
    393    M[wiener_win * j + 3] = horizontal_add_s64x2(sum_m[3]);
    394    M[wiener_win * j + 4] = horizontal_add_s64x2(sum_m[4]);
    395 
    396    H[wiener_win * j + 0] = horizontal_add_s64x2(sum_h[0]);
    397    H[wiener_win * j + 1] = horizontal_add_s64x2(sum_h[1]);
    398    H[wiener_win * j + 2] = horizontal_add_s64x2(sum_h[2]);
    399    H[wiener_win * j + 3] = horizontal_add_s64x2(sum_h[3]);
    400    H[wiener_win * j + 4] = horizontal_add_s64x2(sum_h[4]);
    401 #endif  // AOM_ARCH_AARCH64
    402  } while (++j < wiener_win);
    403 
    404  // Step 2: Calculate the left edge of each square on the top row.
    405  j = 1;
    406  do {
    407    const int16_t *d_t = d;
    408    int32_t height_t = 0;
    409    int64x2_t sum_h[WIENER_WIN_CHROMA - 1] = { vdupq_n_s64(0) };
    410    int16x8_t dgd[2];
    411 
    412    do {
    413      const int32_t h_t =
    414          ((height - height_t) < h_allowed) ? (height - height_t) : h_allowed;
    415      int32x4_t row_h[WIENER_WIN_CHROMA - 1] = { vdupq_n_s32(0) };
    416 
    417      y = h_t;
    418      do {
    419        x = 0;
    420        while (x < w16) {
    421          dgd[0] = vld1q_s16(d_t + j + x + 0);
    422          dgd[1] = vld1q_s16(d_t + j + x + 8);
    423          stats_left_win5_neon(dgd, d_t + x, d_stride, row_h);
    424          x += 16;
    425        }
    426 
    427        if (w16 != width) {
    428          dgd[0] = vld1q_s16(d_t + j + x + 0);
    429          dgd[1] = vld1q_s16(d_t + j + x + 8);
    430          dgd[0] = vandq_s16(dgd[0], mask[0]);
    431          dgd[1] = vandq_s16(dgd[1], mask[1]);
    432          stats_left_win5_neon(dgd, d_t + x, d_stride, row_h);
    433        }
    434 
    435        d_t += d_stride;
    436      } while (--y);
    437 
    438      sum_h[0] = vpadalq_s32(sum_h[0], row_h[0]);
    439      sum_h[1] = vpadalq_s32(sum_h[1], row_h[1]);
    440      sum_h[2] = vpadalq_s32(sum_h[2], row_h[2]);
    441      sum_h[3] = vpadalq_s32(sum_h[3], row_h[3]);
    442 
    443      height_t += h_t;
    444    } while (height_t < height);
    445 
    446 #if AOM_ARCH_AARCH64
    447    int64x2_t sum_h0 = vpaddq_s64(sum_h[0], sum_h[1]);
    448    int64x2_t sum_h1 = vpaddq_s64(sum_h[2], sum_h[3]);
    449    vst1_s64(&H[1 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h0));
    450    vst1_s64(&H[2 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h0));
    451    vst1_s64(&H[3 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h1));
    452    vst1_s64(&H[4 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h1));
    453 #else
    454    H[1 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[0]);
    455    H[2 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[1]);
    456    H[3 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[2]);
    457    H[4 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[3]);
    458 #endif  // AOM_ARCH_AARCH64
    459  } while (++j < wiener_win);
    460 
    461  // Step 3: Derive the top edge of each triangle along the diagonal. No
    462  // triangle in top row.
    463  {
    464    const int16_t *d_t = d;
    465 
    466    if (height % 2) {
    467      int32x4_t deltas[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
    468      int32x4_t deltas_tr[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
    469      int16x8_t ds[WIENER_WIN * 2];
    470 
    471      load_s16_8x4(d_t, d_stride, &ds[0], &ds[2], &ds[4], &ds[6]);
    472      load_s16_8x4(d_t + width, d_stride, &ds[1], &ds[3], &ds[5], &ds[7]);
    473      d_t += 4 * d_stride;
    474 
    475      step3_win5_oneline_neon(&d_t, d_stride, width, height, ds, deltas);
    476      transpose_arrays_s32_8x8(deltas, deltas_tr);
    477 
    478      update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
    479                          deltas_tr[0], vgetq_lane_s32(deltas_tr[4], 0),
    480                          H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
    481 
    482      update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
    483                          deltas_tr[1], vgetq_lane_s32(deltas_tr[5], 0),
    484                          H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
    485 
    486      update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
    487                          deltas_tr[2], vgetq_lane_s32(deltas_tr[6], 0),
    488                          H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
    489 
    490      update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
    491                          deltas_tr[3], vgetq_lane_s32(deltas_tr[7], 0),
    492                          H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
    493 
    494    } else {
    495      int32x4_t deltas[WIENER_WIN_CHROMA * 2] = { vdupq_n_s32(0) };
    496      int16x8_t ds[WIENER_WIN_CHROMA * 2];
    497 
    498      ds[0] = load_unaligned_s16_4x2(d_t + 0 * d_stride, width);
    499      ds[1] = load_unaligned_s16_4x2(d_t + 1 * d_stride, width);
    500      ds[2] = load_unaligned_s16_4x2(d_t + 2 * d_stride, width);
    501      ds[3] = load_unaligned_s16_4x2(d_t + 3 * d_stride, width);
    502 
    503      step3_win5_neon(d_t + 4 * d_stride, d_stride, width, height, ds, deltas);
    504 
    505      transpose_elems_inplace_s32_4x4(&deltas[0], &deltas[1], &deltas[2],
    506                                      &deltas[3]);
    507 
    508      update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
    509                          deltas[0], vgetq_lane_s32(deltas[4], 0),
    510                          H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
    511 
    512      update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
    513                          deltas[1], vgetq_lane_s32(deltas[4], 1),
    514                          H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
    515 
    516      update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
    517                          deltas[2], vgetq_lane_s32(deltas[4], 2),
    518                          H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
    519 
    520      update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
    521                          deltas[3], vgetq_lane_s32(deltas[4], 3),
    522                          H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
    523    }
    524  }
    525 
    526  // Step 4: Derive the top and left edge of each square. No square in top and
    527  // bottom row.
    528 
    529  {
    530    y = h8;
    531 
    532    int16x4_t d_s[12];
    533    int16x4_t d_e[12];
    534    const int16_t *d_t = d;
    535    int16x4_t zeros = vdup_n_s16(0);
    536    load_s16_4x4(d_t, d_stride, &d_s[0], &d_s[1], &d_s[2], &d_s[3]);
    537    load_s16_4x4(d_t + width, d_stride, &d_e[0], &d_e[1], &d_e[2], &d_e[3]);
    538    int32x4_t deltas[6][18] = { { vdupq_n_s32(0) }, { vdupq_n_s32(0) } };
    539 
    540    while (y >= 8) {
    541      load_s16_4x8(d_t + 4 * d_stride, d_stride, &d_s[4], &d_s[5], &d_s[6],
    542                   &d_s[7], &d_s[8], &d_s[9], &d_s[10], &d_s[11]);
    543      load_s16_4x8(d_t + width + 4 * d_stride, d_stride, &d_e[4], &d_e[5],
    544                   &d_e[6], &d_e[7], &d_e[8], &d_e[9], &d_e[10], &d_e[11]);
    545 
    546      int16x8_t s_tr[8], e_tr[8];
    547      transpose_elems_s16_4x8(d_s[0], d_s[1], d_s[2], d_s[3], d_s[4], d_s[5],
    548                              d_s[6], d_s[7], &s_tr[0], &s_tr[1], &s_tr[2],
    549                              &s_tr[3]);
    550      transpose_elems_s16_4x8(d_s[8], d_s[9], d_s[10], d_s[11], zeros, zeros,
    551                              zeros, zeros, &s_tr[4], &s_tr[5], &s_tr[6],
    552                              &s_tr[7]);
    553 
    554      transpose_elems_s16_4x8(d_e[0], d_e[1], d_e[2], d_e[3], d_e[4], d_e[5],
    555                              d_e[6], d_e[7], &e_tr[0], &e_tr[1], &e_tr[2],
    556                              &e_tr[3]);
    557      transpose_elems_s16_4x8(d_e[8], d_e[9], d_e[10], d_e[11], zeros, zeros,
    558                              zeros, zeros, &e_tr[4], &e_tr[5], &e_tr[6],
    559                              &e_tr[7]);
    560 
    561      int16x8_t start_col0[5], start_col1[5], start_col2[5], start_col3[5];
    562      start_col0[0] = s_tr[0];
    563      start_col0[1] = vextq_s16(s_tr[0], s_tr[4], 1);
    564      start_col0[2] = vextq_s16(s_tr[0], s_tr[4], 2);
    565      start_col0[3] = vextq_s16(s_tr[0], s_tr[4], 3);
    566      start_col0[4] = vextq_s16(s_tr[0], s_tr[4], 4);
    567 
    568      start_col1[0] = s_tr[1];
    569      start_col1[1] = vextq_s16(s_tr[1], s_tr[5], 1);
    570      start_col1[2] = vextq_s16(s_tr[1], s_tr[5], 2);
    571      start_col1[3] = vextq_s16(s_tr[1], s_tr[5], 3);
    572      start_col1[4] = vextq_s16(s_tr[1], s_tr[5], 4);
    573 
    574      start_col2[0] = s_tr[2];
    575      start_col2[1] = vextq_s16(s_tr[2], s_tr[6], 1);
    576      start_col2[2] = vextq_s16(s_tr[2], s_tr[6], 2);
    577      start_col2[3] = vextq_s16(s_tr[2], s_tr[6], 3);
    578      start_col2[4] = vextq_s16(s_tr[2], s_tr[6], 4);
    579 
    580      start_col3[0] = s_tr[3];
    581      start_col3[1] = vextq_s16(s_tr[3], s_tr[7], 1);
    582      start_col3[2] = vextq_s16(s_tr[3], s_tr[7], 2);
    583      start_col3[3] = vextq_s16(s_tr[3], s_tr[7], 3);
    584      start_col3[4] = vextq_s16(s_tr[3], s_tr[7], 4);
    585 
    586      // i = 1, j = 2;
    587      sub_deltas_step4(start_col0, start_col1, deltas[0]);
    588 
    589      // i = 1, j = 3;
    590      sub_deltas_step4(start_col0, start_col2, deltas[1]);
    591 
    592      // i = 1, j = 4
    593      sub_deltas_step4(start_col0, start_col3, deltas[2]);
    594 
    595      // i = 2, j =3
    596      sub_deltas_step4(start_col1, start_col2, deltas[3]);
    597 
    598      // i = 2, j = 4
    599      sub_deltas_step4(start_col1, start_col3, deltas[4]);
    600 
    601      // i = 3, j = 4
    602      sub_deltas_step4(start_col2, start_col3, deltas[5]);
    603 
    604      int16x8_t end_col0[5], end_col1[5], end_col2[5], end_col3[5];
    605      end_col0[0] = e_tr[0];
    606      end_col0[1] = vextq_s16(e_tr[0], e_tr[4], 1);
    607      end_col0[2] = vextq_s16(e_tr[0], e_tr[4], 2);
    608      end_col0[3] = vextq_s16(e_tr[0], e_tr[4], 3);
    609      end_col0[4] = vextq_s16(e_tr[0], e_tr[4], 4);
    610 
    611      end_col1[0] = e_tr[1];
    612      end_col1[1] = vextq_s16(e_tr[1], e_tr[5], 1);
    613      end_col1[2] = vextq_s16(e_tr[1], e_tr[5], 2);
    614      end_col1[3] = vextq_s16(e_tr[1], e_tr[5], 3);
    615      end_col1[4] = vextq_s16(e_tr[1], e_tr[5], 4);
    616 
    617      end_col2[0] = e_tr[2];
    618      end_col2[1] = vextq_s16(e_tr[2], e_tr[6], 1);
    619      end_col2[2] = vextq_s16(e_tr[2], e_tr[6], 2);
    620      end_col2[3] = vextq_s16(e_tr[2], e_tr[6], 3);
    621      end_col2[4] = vextq_s16(e_tr[2], e_tr[6], 4);
    622 
    623      end_col3[0] = e_tr[3];
    624      end_col3[1] = vextq_s16(e_tr[3], e_tr[7], 1);
    625      end_col3[2] = vextq_s16(e_tr[3], e_tr[7], 2);
    626      end_col3[3] = vextq_s16(e_tr[3], e_tr[7], 3);
    627      end_col3[4] = vextq_s16(e_tr[3], e_tr[7], 4);
    628 
    629      // i = 1, j = 2;
    630      add_deltas_step4(end_col0, end_col1, deltas[0]);
    631 
    632      // i = 1, j = 3;
    633      add_deltas_step4(end_col0, end_col2, deltas[1]);
    634 
    635      // i = 1, j = 4
    636      add_deltas_step4(end_col0, end_col3, deltas[2]);
    637 
    638      // i = 2, j =3
    639      add_deltas_step4(end_col1, end_col2, deltas[3]);
    640 
    641      // i = 2, j = 4
    642      add_deltas_step4(end_col1, end_col3, deltas[4]);
    643 
    644      // i = 3, j = 4
    645      add_deltas_step4(end_col2, end_col3, deltas[5]);
    646 
    647      d_s[0] = d_s[8];
    648      d_s[1] = d_s[9];
    649      d_s[2] = d_s[10];
    650      d_s[3] = d_s[11];
    651      d_e[0] = d_e[8];
    652      d_e[1] = d_e[9];
    653      d_e[2] = d_e[10];
    654      d_e[3] = d_e[11];
    655 
    656      d_t += 8 * d_stride;
    657      y -= 8;
    658    }
    659 
    660    if (h8 != height) {
    661      const int16x8_t mask_h = vld1q_s16(&mask_16bit[16] - (height % 8));
    662 
    663      load_s16_4x8(d_t + 4 * d_stride, d_stride, &d_s[4], &d_s[5], &d_s[6],
    664                   &d_s[7], &d_s[8], &d_s[9], &d_s[10], &d_s[11]);
    665      load_s16_4x8(d_t + width + 4 * d_stride, d_stride, &d_e[4], &d_e[5],
    666                   &d_e[6], &d_e[7], &d_e[8], &d_e[9], &d_e[10], &d_e[11]);
    667      int16x8_t s_tr[8], e_tr[8];
    668      transpose_elems_s16_4x8(d_s[0], d_s[1], d_s[2], d_s[3], d_s[4], d_s[5],
    669                              d_s[6], d_s[7], &s_tr[0], &s_tr[1], &s_tr[2],
    670                              &s_tr[3]);
    671      transpose_elems_s16_4x8(d_s[8], d_s[9], d_s[10], d_s[11], zeros, zeros,
    672                              zeros, zeros, &s_tr[4], &s_tr[5], &s_tr[6],
    673                              &s_tr[7]);
    674      transpose_elems_s16_4x8(d_e[0], d_e[1], d_e[2], d_e[3], d_e[4], d_e[5],
    675                              d_e[6], d_e[7], &e_tr[0], &e_tr[1], &e_tr[2],
    676                              &e_tr[3]);
    677      transpose_elems_s16_4x8(d_e[8], d_e[9], d_e[10], d_e[11], zeros, zeros,
    678                              zeros, zeros, &e_tr[4], &e_tr[5], &e_tr[6],
    679                              &e_tr[7]);
    680 
    681      int16x8_t start_col0[5], start_col1[5], start_col2[5], start_col3[5];
    682      start_col0[0] = vandq_s16(s_tr[0], mask_h);
    683      start_col0[1] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 1), mask_h);
    684      start_col0[2] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 2), mask_h);
    685      start_col0[3] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 3), mask_h);
    686      start_col0[4] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 4), mask_h);
    687 
    688      start_col1[0] = vandq_s16(s_tr[1], mask_h);
    689      start_col1[1] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 1), mask_h);
    690      start_col1[2] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 2), mask_h);
    691      start_col1[3] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 3), mask_h);
    692      start_col1[4] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 4), mask_h);
    693 
    694      start_col2[0] = vandq_s16(s_tr[2], mask_h);
    695      start_col2[1] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 1), mask_h);
    696      start_col2[2] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 2), mask_h);
    697      start_col2[3] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 3), mask_h);
    698      start_col2[4] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 4), mask_h);
    699 
    700      start_col3[0] = vandq_s16(s_tr[3], mask_h);
    701      start_col3[1] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 1), mask_h);
    702      start_col3[2] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 2), mask_h);
    703      start_col3[3] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 3), mask_h);
    704      start_col3[4] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 4), mask_h);
    705 
    706      // i = 1, j = 2;
    707      sub_deltas_step4(start_col0, start_col1, deltas[0]);
    708 
    709      // i = 1, j = 3;
    710      sub_deltas_step4(start_col0, start_col2, deltas[1]);
    711 
    712      // i = 1, j = 4
    713      sub_deltas_step4(start_col0, start_col3, deltas[2]);
    714 
    715      // i = 2, j = 3
    716      sub_deltas_step4(start_col1, start_col2, deltas[3]);
    717 
    718      // i = 2, j = 4
    719      sub_deltas_step4(start_col1, start_col3, deltas[4]);
    720 
    721      // i = 3, j = 4
    722      sub_deltas_step4(start_col2, start_col3, deltas[5]);
    723 
    724      int16x8_t end_col0[5], end_col1[5], end_col2[5], end_col3[5];
    725      end_col0[0] = vandq_s16(e_tr[0], mask_h);
    726      end_col0[1] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 1), mask_h);
    727      end_col0[2] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 2), mask_h);
    728      end_col0[3] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 3), mask_h);
    729      end_col0[4] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 4), mask_h);
    730 
    731      end_col1[0] = vandq_s16(e_tr[1], mask_h);
    732      end_col1[1] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 1), mask_h);
    733      end_col1[2] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 2), mask_h);
    734      end_col1[3] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 3), mask_h);
    735      end_col1[4] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 4), mask_h);
    736 
    737      end_col2[0] = vandq_s16(e_tr[2], mask_h);
    738      end_col2[1] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 1), mask_h);
    739      end_col2[2] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 2), mask_h);
    740      end_col2[3] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 3), mask_h);
    741      end_col2[4] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 4), mask_h);
    742 
    743      end_col3[0] = vandq_s16(e_tr[3], mask_h);
    744      end_col3[1] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 1), mask_h);
    745      end_col3[2] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 2), mask_h);
    746      end_col3[3] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 3), mask_h);
    747      end_col3[4] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 4), mask_h);
    748 
    749      // i = 1, j = 2;
    750      add_deltas_step4(end_col0, end_col1, deltas[0]);
    751 
    752      // i = 1, j = 3;
    753      add_deltas_step4(end_col0, end_col2, deltas[1]);
    754 
    755      // i = 1, j = 4
    756      add_deltas_step4(end_col0, end_col3, deltas[2]);
    757 
    758      // i = 2, j =3
    759      add_deltas_step4(end_col1, end_col2, deltas[3]);
    760 
    761      // i = 2, j = 4
    762      add_deltas_step4(end_col1, end_col3, deltas[4]);
    763 
    764      // i = 3, j = 4
    765      add_deltas_step4(end_col2, end_col3, deltas[5]);
    766    }
    767 
    768    int32x4_t delta[6][2];
    769    int32_t single_delta[6];
    770 
    771    delta[0][0] = horizontal_add_4d_s32x4(&deltas[0][0]);
    772    delta[1][0] = horizontal_add_4d_s32x4(&deltas[1][0]);
    773    delta[2][0] = horizontal_add_4d_s32x4(&deltas[2][0]);
    774    delta[3][0] = horizontal_add_4d_s32x4(&deltas[3][0]);
    775    delta[4][0] = horizontal_add_4d_s32x4(&deltas[4][0]);
    776    delta[5][0] = horizontal_add_4d_s32x4(&deltas[5][0]);
    777 
    778    delta[0][1] = horizontal_add_4d_s32x4(&deltas[0][5]);
    779    delta[1][1] = horizontal_add_4d_s32x4(&deltas[1][5]);
    780    delta[2][1] = horizontal_add_4d_s32x4(&deltas[2][5]);
    781    delta[3][1] = horizontal_add_4d_s32x4(&deltas[3][5]);
    782    delta[4][1] = horizontal_add_4d_s32x4(&deltas[4][5]);
    783    delta[5][1] = horizontal_add_4d_s32x4(&deltas[5][5]);
    784 
    785    single_delta[0] = horizontal_add_s32x4(deltas[0][4]);
    786    single_delta[1] = horizontal_add_s32x4(deltas[1][4]);
    787    single_delta[2] = horizontal_add_s32x4(deltas[2][4]);
    788    single_delta[3] = horizontal_add_s32x4(deltas[3][4]);
    789    single_delta[4] = horizontal_add_s32x4(deltas[4][4]);
    790    single_delta[5] = horizontal_add_s32x4(deltas[5][4]);
    791 
    792    int idx = 0;
    793    for (i = 1; i < wiener_win - 1; i++) {
    794      for (j = i + 1; j < wiener_win; j++) {
    795        update_4_stats_neon(
    796            H + (i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win,
    797            delta[idx][0], H + i * wiener_win * wiener_win2 + j * wiener_win);
    798        H[i * wiener_win * wiener_win2 + j * wiener_win + 4] =
    799            H[(i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win + 4] +
    800            single_delta[idx];
    801 
    802        H[(i * wiener_win + 1) * wiener_win2 + j * wiener_win] =
    803            H[((i - 1) * wiener_win + 1) * wiener_win2 + (j - 1) * wiener_win] +
    804            vgetq_lane_s32(delta[idx][1], 0);
    805        H[(i * wiener_win + 2) * wiener_win2 + j * wiener_win] =
    806            H[((i - 1) * wiener_win + 2) * wiener_win2 + (j - 1) * wiener_win] +
    807            vgetq_lane_s32(delta[idx][1], 1);
    808        H[(i * wiener_win + 3) * wiener_win2 + j * wiener_win] =
    809            H[((i - 1) * wiener_win + 3) * wiener_win2 + (j - 1) * wiener_win] +
    810            vgetq_lane_s32(delta[idx][1], 2);
    811        H[(i * wiener_win + 4) * wiener_win2 + j * wiener_win] =
    812            H[((i - 1) * wiener_win + 4) * wiener_win2 + (j - 1) * wiener_win] +
    813            vgetq_lane_s32(delta[idx][1], 3);
    814 
    815        idx++;
    816      }
    817    }
    818  }
    819 
    820  // Step 5: Derive other points of each square. No square in bottom row.
    821  i = 0;
    822  do {
    823    const int16_t *const di = d + i;
    824 
    825    j = i + 1;
    826    do {
    827      const int16_t *const dj = d + j;
    828      int32x4_t deltas[WIENER_WIN_CHROMA - 1][WIENER_WIN_CHROMA - 1] = {
    829        { vdupq_n_s32(0) }, { vdupq_n_s32(0) }
    830      };
    831      int16x8_t d_is[WIN_CHROMA], d_ie[WIN_CHROMA];
    832      int16x8_t d_js[WIN_CHROMA], d_je[WIN_CHROMA];
    833 
    834      x = 0;
    835      while (x < w16) {
    836        load_square_win5_neon(di + x, dj + x, d_stride, height, d_is, d_ie,
    837                              d_js, d_je);
    838        derive_square_win5_neon(d_is, d_ie, d_js, d_je, deltas);
    839        x += 16;
    840      }
    841 
    842      if (w16 != width) {
    843        load_square_win5_neon(di + x, dj + x, d_stride, height, d_is, d_ie,
    844                              d_js, d_je);
    845        d_is[0] = vandq_s16(d_is[0], mask[0]);
    846        d_is[1] = vandq_s16(d_is[1], mask[1]);
    847        d_is[2] = vandq_s16(d_is[2], mask[0]);
    848        d_is[3] = vandq_s16(d_is[3], mask[1]);
    849        d_is[4] = vandq_s16(d_is[4], mask[0]);
    850        d_is[5] = vandq_s16(d_is[5], mask[1]);
    851        d_is[6] = vandq_s16(d_is[6], mask[0]);
    852        d_is[7] = vandq_s16(d_is[7], mask[1]);
    853        d_ie[0] = vandq_s16(d_ie[0], mask[0]);
    854        d_ie[1] = vandq_s16(d_ie[1], mask[1]);
    855        d_ie[2] = vandq_s16(d_ie[2], mask[0]);
    856        d_ie[3] = vandq_s16(d_ie[3], mask[1]);
    857        d_ie[4] = vandq_s16(d_ie[4], mask[0]);
    858        d_ie[5] = vandq_s16(d_ie[5], mask[1]);
    859        d_ie[6] = vandq_s16(d_ie[6], mask[0]);
    860        d_ie[7] = vandq_s16(d_ie[7], mask[1]);
    861        derive_square_win5_neon(d_is, d_ie, d_js, d_je, deltas);
    862      }
    863 
    864      hadd_update_4_stats_neon(
    865          H + (i * wiener_win + 0) * wiener_win2 + j * wiener_win, deltas[0],
    866          H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win + 1);
    867      hadd_update_4_stats_neon(
    868          H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win, deltas[1],
    869          H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win + 1);
    870      hadd_update_4_stats_neon(
    871          H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win, deltas[2],
    872          H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win + 1);
    873      hadd_update_4_stats_neon(
    874          H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win, deltas[3],
    875          H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win + 1);
    876    } while (++j < wiener_win);
    877  } while (++i < wiener_win - 1);
    878 
    879  // Step 6: Derive other points of each upper triangle along the diagonal.
    880  i = 0;
    881  do {
    882    const int16_t *const di = d + i;
    883    int32x4_t deltas[WIENER_WIN_CHROMA * 2 + 1] = { vdupq_n_s32(0) };
    884    int16x8_t d_is[WIN_CHROMA], d_ie[WIN_CHROMA];
    885 
    886    x = 0;
    887    while (x < w16) {
    888      load_triangle_win5_neon(di + x, d_stride, height, d_is, d_ie);
    889      derive_triangle_win5_neon(d_is, d_ie, deltas);
    890      x += 16;
    891    }
    892 
    893    if (w16 != width) {
    894      load_triangle_win5_neon(di + x, d_stride, height, d_is, d_ie);
    895      d_is[0] = vandq_s16(d_is[0], mask[0]);
    896      d_is[1] = vandq_s16(d_is[1], mask[1]);
    897      d_is[2] = vandq_s16(d_is[2], mask[0]);
    898      d_is[3] = vandq_s16(d_is[3], mask[1]);
    899      d_is[4] = vandq_s16(d_is[4], mask[0]);
    900      d_is[5] = vandq_s16(d_is[5], mask[1]);
    901      d_is[6] = vandq_s16(d_is[6], mask[0]);
    902      d_is[7] = vandq_s16(d_is[7], mask[1]);
    903      d_ie[0] = vandq_s16(d_ie[0], mask[0]);
    904      d_ie[1] = vandq_s16(d_ie[1], mask[1]);
    905      d_ie[2] = vandq_s16(d_ie[2], mask[0]);
    906      d_ie[3] = vandq_s16(d_ie[3], mask[1]);
    907      d_ie[4] = vandq_s16(d_ie[4], mask[0]);
    908      d_ie[5] = vandq_s16(d_ie[5], mask[1]);
    909      d_ie[6] = vandq_s16(d_ie[6], mask[0]);
    910      d_ie[7] = vandq_s16(d_ie[7], mask[1]);
    911      derive_triangle_win5_neon(d_is, d_ie, deltas);
    912    }
    913 
    914    // Row 1: 4 points
    915    hadd_update_4_stats_neon(
    916        H + (i * wiener_win + 0) * wiener_win2 + i * wiener_win, deltas,
    917        H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1);
    918 
    919    // Row 2: 3 points
    920    int64x2_t delta4_s64 = vpaddlq_s32(deltas[4]);
    921    int64x2_t delta5_s64 = vpaddlq_s32(deltas[5]);
    922 
    923 #if AOM_ARCH_AARCH64
    924    int64x2_t deltas45 = vpaddq_s64(delta4_s64, delta5_s64);
    925    int64x2_t src =
    926        vld1q_s64(H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1);
    927    int64x2_t dst = vaddq_s64(src, deltas45);
    928    vst1q_s64(H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2, dst);
    929 #else
    930    H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2 + 0] =
    931        H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1 + 0] +
    932        horizontal_add_s64x2(delta4_s64);
    933    H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2 + 1] =
    934        H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1 + 1] +
    935        horizontal_add_s64x2(delta5_s64);
    936 #endif  // AOM_ARCH_AARCH64
    937 
    938    H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 4] =
    939        H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 3] +
    940        horizontal_long_add_s32x4(deltas[6]);
    941 
    942    // Row 3: 2 points
    943    int64x2_t delta7_s64 = vpaddlq_s32(deltas[7]);
    944    int64x2_t delta8_s64 = vpaddlq_s32(deltas[8]);
    945 
    946 #if AOM_ARCH_AARCH64
    947    int64x2_t deltas78 = vpaddq_s64(delta7_s64, delta8_s64);
    948    vst1q_s64(H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3,
    949              vaddq_s64(dst, deltas78));
    950 #else
    951    H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3 + 0] =
    952        H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2 + 0] +
    953        horizontal_add_s64x2(delta7_s64);
    954    H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3 + 1] =
    955        H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2 + 1] +
    956        horizontal_add_s64x2(delta8_s64);
    957 #endif  // AOM_ARCH_AARCH64
    958 
    959    // Row 4: 1 point
    960    H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4] =
    961        H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3] +
    962        horizontal_long_add_s32x4(deltas[9]);
    963  } while (++i < wiener_win);
    964 }
    965 
    966 static inline void hadd_update_6_stats_neon(const int64_t *const src,
    967                                            const int32x4_t *deltas,
    968                                            int64_t *const dst) {
    969  int64x2_t delta0_s64 = vpaddlq_s32(deltas[0]);
    970  int64x2_t delta1_s64 = vpaddlq_s32(deltas[1]);
    971  int64x2_t delta2_s64 = vpaddlq_s32(deltas[2]);
    972  int64x2_t delta3_s64 = vpaddlq_s32(deltas[3]);
    973  int64x2_t delta4_s64 = vpaddlq_s32(deltas[4]);
    974  int64x2_t delta5_s64 = vpaddlq_s32(deltas[5]);
    975 
    976 #if AOM_ARCH_AARCH64
    977  int64x2_t delta01 = vpaddq_s64(delta0_s64, delta1_s64);
    978  int64x2_t delta23 = vpaddq_s64(delta2_s64, delta3_s64);
    979  int64x2_t delta45 = vpaddq_s64(delta4_s64, delta5_s64);
    980 
    981  int64x2_t src0 = vld1q_s64(src);
    982  int64x2_t src1 = vld1q_s64(src + 2);
    983  int64x2_t src2 = vld1q_s64(src + 4);
    984 
    985  vst1q_s64(dst, vaddq_s64(src0, delta01));
    986  vst1q_s64(dst + 2, vaddq_s64(src1, delta23));
    987  vst1q_s64(dst + 4, vaddq_s64(src2, delta45));
    988 #else
    989  dst[0] = src[0] + horizontal_add_s64x2(delta0_s64);
    990  dst[1] = src[1] + horizontal_add_s64x2(delta1_s64);
    991  dst[2] = src[2] + horizontal_add_s64x2(delta2_s64);
    992  dst[3] = src[3] + horizontal_add_s64x2(delta3_s64);
    993  dst[4] = src[4] + horizontal_add_s64x2(delta4_s64);
    994  dst[5] = src[5] + horizontal_add_s64x2(delta5_s64);
    995 #endif
    996 }
    997 
    998 static inline void compute_stats_win7_highbd_neon(
    999    const int16_t *const d, const int32_t d_stride, const int16_t *const s,
   1000    const int32_t s_stride, const int32_t width, const int32_t height,
   1001    int64_t *const M, int64_t *const H, aom_bit_depth_t bit_depth) {
   1002  const int32_t wiener_win = WIENER_WIN;
   1003  const int32_t wiener_win2 = wiener_win * wiener_win;
   1004  const int32_t w16 = width & ~15;
   1005  const int32_t h8 = height & ~7;
   1006  int16x8_t mask[2];
   1007  mask[0] = vld1q_s16(&(mask_16bit[16]) - width % 16);
   1008  mask[1] = vld1q_s16(&(mask_16bit[16]) - width % 16 + 8);
   1009  int32_t i, j, x, y;
   1010 
   1011  const int32_t num_bit_left =
   1012      32 - 1 /* sign */ - 2 * bit_depth /* energy */ + 2 /* SIMD */;
   1013  const int32_t h_allowed =
   1014      (1 << num_bit_left) / (w16 + ((w16 != width) ? 16 : 0));
   1015 
   1016  // Step 1: Calculate the top edge of the whole matrix, i.e., the top
   1017  // edge of each triangle and square on the top row.
   1018  j = 0;
   1019  do {
   1020    const int16_t *s_t = s;
   1021    const int16_t *d_t = d;
   1022    int32_t height_t = 0;
   1023    int64x2_t sum_m[WIENER_WIN] = { vdupq_n_s64(0) };
   1024    int64x2_t sum_h[WIENER_WIN] = { vdupq_n_s64(0) };
   1025    int16x8_t src[2], dgd[2];
   1026 
   1027    do {
   1028      const int32_t h_t =
   1029          ((height - height_t) < h_allowed) ? (height - height_t) : h_allowed;
   1030      int32x4_t row_m[WIENER_WIN * 2] = { vdupq_n_s32(0) };
   1031      int32x4_t row_h[WIENER_WIN * 2] = { vdupq_n_s32(0) };
   1032 
   1033      y = h_t;
   1034      do {
   1035        x = 0;
   1036        while (x < w16) {
   1037          src[0] = vld1q_s16(s_t + x);
   1038          src[1] = vld1q_s16(s_t + x + 8);
   1039          dgd[0] = vld1q_s16(d_t + x);
   1040          dgd[1] = vld1q_s16(d_t + x + 8);
   1041          stats_top_win7_neon(src, dgd, d_t + j + x, d_stride, row_m, row_h);
   1042          x += 16;
   1043        }
   1044 
   1045        if (w16 != width) {
   1046          src[0] = vld1q_s16(s_t + w16);
   1047          src[1] = vld1q_s16(s_t + w16 + 8);
   1048          dgd[0] = vld1q_s16(d_t + w16);
   1049          dgd[1] = vld1q_s16(d_t + w16 + 8);
   1050          src[0] = vandq_s16(src[0], mask[0]);
   1051          src[1] = vandq_s16(src[1], mask[1]);
   1052          dgd[0] = vandq_s16(dgd[0], mask[0]);
   1053          dgd[1] = vandq_s16(dgd[1], mask[1]);
   1054          stats_top_win7_neon(src, dgd, d_t + j + w16, d_stride, row_m, row_h);
   1055        }
   1056 
   1057        s_t += s_stride;
   1058        d_t += d_stride;
   1059      } while (--y);
   1060 
   1061      sum_m[0] = vpadalq_s32(sum_m[0], row_m[0]);
   1062      sum_m[1] = vpadalq_s32(sum_m[1], row_m[1]);
   1063      sum_m[2] = vpadalq_s32(sum_m[2], row_m[2]);
   1064      sum_m[3] = vpadalq_s32(sum_m[3], row_m[3]);
   1065      sum_m[4] = vpadalq_s32(sum_m[4], row_m[4]);
   1066      sum_m[5] = vpadalq_s32(sum_m[5], row_m[5]);
   1067      sum_m[6] = vpadalq_s32(sum_m[6], row_m[6]);
   1068 
   1069      sum_h[0] = vpadalq_s32(sum_h[0], row_h[0]);
   1070      sum_h[1] = vpadalq_s32(sum_h[1], row_h[1]);
   1071      sum_h[2] = vpadalq_s32(sum_h[2], row_h[2]);
   1072      sum_h[3] = vpadalq_s32(sum_h[3], row_h[3]);
   1073      sum_h[4] = vpadalq_s32(sum_h[4], row_h[4]);
   1074      sum_h[5] = vpadalq_s32(sum_h[5], row_h[5]);
   1075      sum_h[6] = vpadalq_s32(sum_h[6], row_h[6]);
   1076 
   1077      height_t += h_t;
   1078    } while (height_t < height);
   1079 
   1080 #if AOM_ARCH_AARCH64
   1081    vst1q_s64(M + wiener_win * j + 0, vpaddq_s64(sum_m[0], sum_m[1]));
   1082    vst1q_s64(M + wiener_win * j + 2, vpaddq_s64(sum_m[2], sum_m[3]));
   1083    vst1q_s64(M + wiener_win * j + 4, vpaddq_s64(sum_m[4], sum_m[5]));
   1084    M[wiener_win * j + 6] = vaddvq_s64(sum_m[6]);
   1085 
   1086    vst1q_s64(H + wiener_win * j + 0, vpaddq_s64(sum_h[0], sum_h[1]));
   1087    vst1q_s64(H + wiener_win * j + 2, vpaddq_s64(sum_h[2], sum_h[3]));
   1088    vst1q_s64(H + wiener_win * j + 4, vpaddq_s64(sum_h[4], sum_h[5]));
   1089    H[wiener_win * j + 6] = vaddvq_s64(sum_h[6]);
   1090 #else
   1091    M[wiener_win * j + 0] = horizontal_add_s64x2(sum_m[0]);
   1092    M[wiener_win * j + 1] = horizontal_add_s64x2(sum_m[1]);
   1093    M[wiener_win * j + 2] = horizontal_add_s64x2(sum_m[2]);
   1094    M[wiener_win * j + 3] = horizontal_add_s64x2(sum_m[3]);
   1095    M[wiener_win * j + 4] = horizontal_add_s64x2(sum_m[4]);
   1096    M[wiener_win * j + 5] = horizontal_add_s64x2(sum_m[5]);
   1097    M[wiener_win * j + 6] = horizontal_add_s64x2(sum_m[6]);
   1098 
   1099    H[wiener_win * j + 0] = horizontal_add_s64x2(sum_h[0]);
   1100    H[wiener_win * j + 1] = horizontal_add_s64x2(sum_h[1]);
   1101    H[wiener_win * j + 2] = horizontal_add_s64x2(sum_h[2]);
   1102    H[wiener_win * j + 3] = horizontal_add_s64x2(sum_h[3]);
   1103    H[wiener_win * j + 4] = horizontal_add_s64x2(sum_h[4]);
   1104    H[wiener_win * j + 5] = horizontal_add_s64x2(sum_h[5]);
   1105    H[wiener_win * j + 6] = horizontal_add_s64x2(sum_h[6]);
   1106 #endif  // AOM_ARCH_AARCH64
   1107  } while (++j < wiener_win);
   1108 
   1109  // Step 2: Calculate the left edge of each square on the top row.
   1110  j = 1;
   1111  do {
   1112    const int16_t *d_t = d;
   1113    int32_t height_t = 0;
   1114    int64x2_t sum_h[WIENER_WIN - 1] = { vdupq_n_s64(0) };
   1115    int16x8_t dgd[2];
   1116 
   1117    do {
   1118      const int32_t h_t =
   1119          ((height - height_t) < h_allowed) ? (height - height_t) : h_allowed;
   1120      int32x4_t row_h[WIENER_WIN - 1] = { vdupq_n_s32(0) };
   1121 
   1122      y = h_t;
   1123      do {
   1124        x = 0;
   1125        while (x < w16) {
   1126          dgd[0] = vld1q_s16(d_t + j + x + 0);
   1127          dgd[1] = vld1q_s16(d_t + j + x + 8);
   1128          stats_left_win7_neon(dgd, d_t + x, d_stride, row_h);
   1129          x += 16;
   1130        }
   1131 
   1132        if (w16 != width) {
   1133          dgd[0] = vld1q_s16(d_t + j + x + 0);
   1134          dgd[1] = vld1q_s16(d_t + j + x + 8);
   1135          dgd[0] = vandq_s16(dgd[0], mask[0]);
   1136          dgd[1] = vandq_s16(dgd[1], mask[1]);
   1137          stats_left_win7_neon(dgd, d_t + x, d_stride, row_h);
   1138        }
   1139 
   1140        d_t += d_stride;
   1141      } while (--y);
   1142 
   1143      sum_h[0] = vpadalq_s32(sum_h[0], row_h[0]);
   1144      sum_h[1] = vpadalq_s32(sum_h[1], row_h[1]);
   1145      sum_h[2] = vpadalq_s32(sum_h[2], row_h[2]);
   1146      sum_h[3] = vpadalq_s32(sum_h[3], row_h[3]);
   1147      sum_h[4] = vpadalq_s32(sum_h[4], row_h[4]);
   1148      sum_h[5] = vpadalq_s32(sum_h[5], row_h[5]);
   1149 
   1150      height_t += h_t;
   1151    } while (height_t < height);
   1152 
   1153 #if AOM_ARCH_AARCH64
   1154    int64x2_t sum_h0 = vpaddq_s64(sum_h[0], sum_h[1]);
   1155    int64x2_t sum_h2 = vpaddq_s64(sum_h[2], sum_h[3]);
   1156    int64x2_t sum_h4 = vpaddq_s64(sum_h[4], sum_h[5]);
   1157    vst1_s64(&H[1 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h0));
   1158    vst1_s64(&H[2 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h0));
   1159    vst1_s64(&H[3 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h2));
   1160    vst1_s64(&H[4 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h2));
   1161    vst1_s64(&H[5 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h4));
   1162    vst1_s64(&H[6 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h4));
   1163 #else
   1164    H[1 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[0]);
   1165    H[2 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[1]);
   1166    H[3 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[2]);
   1167    H[4 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[3]);
   1168    H[5 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[4]);
   1169    H[6 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[5]);
   1170 #endif  // AOM_ARCH_AARCH64
   1171 
   1172  } while (++j < wiener_win);
   1173 
   1174  // Step 3: Derive the top edge of each triangle along the diagonal. No
   1175  // triangle in top row.
   1176  {
   1177    const int16_t *d_t = d;
   1178    // Pad to call transpose function.
   1179    int32x4_t deltas[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
   1180    int32x4_t deltas_tr[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
   1181    int16x8_t ds[WIENER_WIN * 2];
   1182 
   1183    load_s16_8x6(d_t, d_stride, &ds[0], &ds[2], &ds[4], &ds[6], &ds[8],
   1184                 &ds[10]);
   1185    load_s16_8x6(d_t + width, d_stride, &ds[1], &ds[3], &ds[5], &ds[7], &ds[9],
   1186                 &ds[11]);
   1187 
   1188    d_t += 6 * d_stride;
   1189 
   1190    step3_win7_neon(d_t, d_stride, width, height, ds, deltas);
   1191    transpose_arrays_s32_8x8(deltas, deltas_tr);
   1192 
   1193    update_8_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
   1194                        deltas_tr[0], deltas_tr[4],
   1195                        H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
   1196    update_8_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
   1197                        deltas_tr[1], deltas_tr[5],
   1198                        H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
   1199    update_8_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
   1200                        deltas_tr[2], deltas_tr[6],
   1201                        H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
   1202    update_8_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
   1203                        deltas_tr[3], deltas_tr[7],
   1204                        H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
   1205    update_8_stats_neon(H + 4 * wiener_win * wiener_win2 + 4 * wiener_win,
   1206                        deltas_tr[8], deltas_tr[12],
   1207                        H + 5 * wiener_win * wiener_win2 + 5 * wiener_win);
   1208    update_8_stats_neon(H + 5 * wiener_win * wiener_win2 + 5 * wiener_win,
   1209                        deltas_tr[9], deltas_tr[13],
   1210                        H + 6 * wiener_win * wiener_win2 + 6 * wiener_win);
   1211  }
   1212 
   1213  // Step 4: Derive the top and left edge of each square. No square in top and
   1214  // bottom row.
   1215 
   1216  i = 1;
   1217  do {
   1218    j = i + 1;
   1219    do {
   1220      const int16_t *di = d + i - 1;
   1221      const int16_t *dj = d + j - 1;
   1222      int32x4_t deltas[(2 * WIENER_WIN - 1) * 2] = { vdupq_n_s32(0) };
   1223      int16x8_t dd[WIENER_WIN * 2], ds[WIENER_WIN * 2];
   1224 
   1225      dd[5] = vdupq_n_s16(0);  // Initialize to avoid warning.
   1226      const int16_t dd0_values[] = { di[0 * d_stride],
   1227                                     di[1 * d_stride],
   1228                                     di[2 * d_stride],
   1229                                     di[3 * d_stride],
   1230                                     di[4 * d_stride],
   1231                                     di[5 * d_stride],
   1232                                     0,
   1233                                     0 };
   1234      dd[0] = vld1q_s16(dd0_values);
   1235      const int16_t dd1_values[] = { di[0 * d_stride + width],
   1236                                     di[1 * d_stride + width],
   1237                                     di[2 * d_stride + width],
   1238                                     di[3 * d_stride + width],
   1239                                     di[4 * d_stride + width],
   1240                                     di[5 * d_stride + width],
   1241                                     0,
   1242                                     0 };
   1243      dd[1] = vld1q_s16(dd1_values);
   1244      const int16_t ds0_values[] = { dj[0 * d_stride],
   1245                                     dj[1 * d_stride],
   1246                                     dj[2 * d_stride],
   1247                                     dj[3 * d_stride],
   1248                                     dj[4 * d_stride],
   1249                                     dj[5 * d_stride],
   1250                                     0,
   1251                                     0 };
   1252      ds[0] = vld1q_s16(ds0_values);
   1253      int16_t ds1_values[] = { dj[0 * d_stride + width],
   1254                               dj[1 * d_stride + width],
   1255                               dj[2 * d_stride + width],
   1256                               dj[3 * d_stride + width],
   1257                               dj[4 * d_stride + width],
   1258                               dj[5 * d_stride + width],
   1259                               0,
   1260                               0 };
   1261      ds[1] = vld1q_s16(ds1_values);
   1262 
   1263      y = 0;
   1264      while (y < h8) {
   1265        // 00s 10s 20s 30s 40s 50s 60s 70s  00e 10e 20e 30e 40e 50e 60e 70e
   1266        dd[0] = vsetq_lane_s16(di[6 * d_stride], dd[0], 6);
   1267        dd[0] = vsetq_lane_s16(di[7 * d_stride], dd[0], 7);
   1268        dd[1] = vsetq_lane_s16(di[6 * d_stride + width], dd[1], 6);
   1269        dd[1] = vsetq_lane_s16(di[7 * d_stride + width], dd[1], 7);
   1270 
   1271        // 00s 10s 20s 30s 40s 50s 60s 70s  00e 10e 20e 30e 40e 50e 60e 70e
   1272        // 01s 11s 21s 31s 41s 51s 61s 71s  01e 11e 21e 31e 41e 51e 61e 71e
   1273        ds[0] = vsetq_lane_s16(dj[6 * d_stride], ds[0], 6);
   1274        ds[0] = vsetq_lane_s16(dj[7 * d_stride], ds[0], 7);
   1275        ds[1] = vsetq_lane_s16(dj[6 * d_stride + width], ds[1], 6);
   1276        ds[1] = vsetq_lane_s16(dj[7 * d_stride + width], ds[1], 7);
   1277 
   1278        load_more_16_neon(di + 8 * d_stride, width, &dd[0], &dd[2]);
   1279        load_more_16_neon(dj + 8 * d_stride, width, &ds[0], &ds[2]);
   1280        load_more_16_neon(di + 9 * d_stride, width, &dd[2], &dd[4]);
   1281        load_more_16_neon(dj + 9 * d_stride, width, &ds[2], &ds[4]);
   1282        load_more_16_neon(di + 10 * d_stride, width, &dd[4], &dd[6]);
   1283        load_more_16_neon(dj + 10 * d_stride, width, &ds[4], &ds[6]);
   1284        load_more_16_neon(di + 11 * d_stride, width, &dd[6], &dd[8]);
   1285        load_more_16_neon(dj + 11 * d_stride, width, &ds[6], &ds[8]);
   1286        load_more_16_neon(di + 12 * d_stride, width, &dd[8], &dd[10]);
   1287        load_more_16_neon(dj + 12 * d_stride, width, &ds[8], &ds[10]);
   1288        load_more_16_neon(di + 13 * d_stride, width, &dd[10], &dd[12]);
   1289        load_more_16_neon(dj + 13 * d_stride, width, &ds[10], &ds[12]);
   1290 
   1291        madd_neon(&deltas[0], dd[0], ds[0]);
   1292        madd_neon(&deltas[1], dd[1], ds[1]);
   1293        madd_neon(&deltas[2], dd[0], ds[2]);
   1294        madd_neon(&deltas[3], dd[1], ds[3]);
   1295        madd_neon(&deltas[4], dd[0], ds[4]);
   1296        madd_neon(&deltas[5], dd[1], ds[5]);
   1297        madd_neon(&deltas[6], dd[0], ds[6]);
   1298        madd_neon(&deltas[7], dd[1], ds[7]);
   1299        madd_neon(&deltas[8], dd[0], ds[8]);
   1300        madd_neon(&deltas[9], dd[1], ds[9]);
   1301        madd_neon(&deltas[10], dd[0], ds[10]);
   1302        madd_neon(&deltas[11], dd[1], ds[11]);
   1303        madd_neon(&deltas[12], dd[0], ds[12]);
   1304        madd_neon(&deltas[13], dd[1], ds[13]);
   1305        madd_neon(&deltas[14], dd[2], ds[0]);
   1306        madd_neon(&deltas[15], dd[3], ds[1]);
   1307        madd_neon(&deltas[16], dd[4], ds[0]);
   1308        madd_neon(&deltas[17], dd[5], ds[1]);
   1309        madd_neon(&deltas[18], dd[6], ds[0]);
   1310        madd_neon(&deltas[19], dd[7], ds[1]);
   1311        madd_neon(&deltas[20], dd[8], ds[0]);
   1312        madd_neon(&deltas[21], dd[9], ds[1]);
   1313        madd_neon(&deltas[22], dd[10], ds[0]);
   1314        madd_neon(&deltas[23], dd[11], ds[1]);
   1315        madd_neon(&deltas[24], dd[12], ds[0]);
   1316        madd_neon(&deltas[25], dd[13], ds[1]);
   1317 
   1318        dd[0] = vextq_s16(dd[12], vdupq_n_s16(0), 2);
   1319        dd[1] = vextq_s16(dd[13], vdupq_n_s16(0), 2);
   1320        ds[0] = vextq_s16(ds[12], vdupq_n_s16(0), 2);
   1321        ds[1] = vextq_s16(ds[13], vdupq_n_s16(0), 2);
   1322 
   1323        di += 8 * d_stride;
   1324        dj += 8 * d_stride;
   1325        y += 8;
   1326      }
   1327 
   1328      deltas[0] = hadd_four_32_neon(deltas[0], deltas[2], deltas[4], deltas[6]);
   1329      deltas[1] = hadd_four_32_neon(deltas[1], deltas[3], deltas[5], deltas[7]);
   1330      deltas[2] =
   1331          hadd_four_32_neon(deltas[8], deltas[10], deltas[12], deltas[12]);
   1332      deltas[3] =
   1333          hadd_four_32_neon(deltas[9], deltas[11], deltas[13], deltas[13]);
   1334      deltas[4] =
   1335          hadd_four_32_neon(deltas[14], deltas[16], deltas[18], deltas[20]);
   1336      deltas[5] =
   1337          hadd_four_32_neon(deltas[15], deltas[17], deltas[19], deltas[21]);
   1338      deltas[6] =
   1339          hadd_four_32_neon(deltas[22], deltas[24], deltas[22], deltas[24]);
   1340      deltas[7] =
   1341          hadd_four_32_neon(deltas[23], deltas[25], deltas[23], deltas[25]);
   1342      deltas[0] = vsubq_s32(deltas[1], deltas[0]);
   1343      deltas[1] = vsubq_s32(deltas[3], deltas[2]);
   1344      deltas[2] = vsubq_s32(deltas[5], deltas[4]);
   1345      deltas[3] = vsubq_s32(deltas[7], deltas[6]);
   1346 
   1347      if (h8 != height) {
   1348        const int16_t ds0_vals[] = {
   1349          dj[0 * d_stride], dj[0 * d_stride + width],
   1350          dj[1 * d_stride], dj[1 * d_stride + width],
   1351          dj[2 * d_stride], dj[2 * d_stride + width],
   1352          dj[3 * d_stride], dj[3 * d_stride + width]
   1353        };
   1354        ds[0] = vld1q_s16(ds0_vals);
   1355 
   1356        ds[1] = vsetq_lane_s16(dj[4 * d_stride], ds[1], 0);
   1357        ds[1] = vsetq_lane_s16(dj[4 * d_stride + width], ds[1], 1);
   1358        ds[1] = vsetq_lane_s16(dj[5 * d_stride], ds[1], 2);
   1359        ds[1] = vsetq_lane_s16(dj[5 * d_stride + width], ds[1], 3);
   1360        const int16_t dd4_vals[] = {
   1361          -di[1 * d_stride], di[1 * d_stride + width],
   1362          -di[2 * d_stride], di[2 * d_stride + width],
   1363          -di[3 * d_stride], di[3 * d_stride + width],
   1364          -di[4 * d_stride], di[4 * d_stride + width]
   1365        };
   1366        dd[4] = vld1q_s16(dd4_vals);
   1367 
   1368        dd[5] = vsetq_lane_s16(-di[5 * d_stride], dd[5], 0);
   1369        dd[5] = vsetq_lane_s16(di[5 * d_stride + width], dd[5], 1);
   1370        do {
   1371          dd[0] = vdupq_n_s16(-di[0 * d_stride]);
   1372          dd[2] = dd[3] = vdupq_n_s16(di[0 * d_stride + width]);
   1373          dd[0] = dd[1] = vzipq_s16(dd[0], dd[2]).val[0];
   1374 
   1375          ds[4] = vdupq_n_s16(dj[0 * d_stride]);
   1376          ds[6] = ds[7] = vdupq_n_s16(dj[0 * d_stride + width]);
   1377          ds[4] = ds[5] = vzipq_s16(ds[4], ds[6]).val[0];
   1378 
   1379          dd[5] = vsetq_lane_s16(-di[6 * d_stride], dd[5], 2);
   1380          dd[5] = vsetq_lane_s16(di[6 * d_stride + width], dd[5], 3);
   1381          ds[1] = vsetq_lane_s16(dj[6 * d_stride], ds[1], 4);
   1382          ds[1] = vsetq_lane_s16(dj[6 * d_stride + width], ds[1], 5);
   1383 
   1384          madd_neon_pairwise(&deltas[0], dd[0], ds[0]);
   1385          madd_neon_pairwise(&deltas[1], dd[1], ds[1]);
   1386          madd_neon_pairwise(&deltas[2], dd[4], ds[4]);
   1387          madd_neon_pairwise(&deltas[3], dd[5], ds[5]);
   1388 
   1389          int32_t tmp0 = vgetq_lane_s32(vreinterpretq_s32_s16(ds[0]), 0);
   1390          ds[0] = vextq_s16(ds[0], ds[1], 2);
   1391          ds[1] = vextq_s16(ds[1], ds[0], 2);
   1392          ds[1] = vreinterpretq_s16_s32(
   1393              vsetq_lane_s32(tmp0, vreinterpretq_s32_s16(ds[1]), 3));
   1394          int32_t tmp1 = vgetq_lane_s32(vreinterpretq_s32_s16(dd[4]), 0);
   1395          dd[4] = vextq_s16(dd[4], dd[5], 2);
   1396          dd[5] = vextq_s16(dd[5], dd[4], 2);
   1397          dd[5] = vreinterpretq_s16_s32(
   1398              vsetq_lane_s32(tmp1, vreinterpretq_s32_s16(dd[5]), 3));
   1399          di += d_stride;
   1400          dj += d_stride;
   1401        } while (++y < height);
   1402      }
   1403 
   1404      // Writing one more element on the top edge of a square falls to
   1405      // the next square in the same row or the first element in the next
   1406      // row, which will just be overwritten later.
   1407      update_8_stats_neon(
   1408          H + (i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win,
   1409          deltas[0], deltas[1],
   1410          H + i * wiener_win * wiener_win2 + j * wiener_win);
   1411 
   1412      H[(i * wiener_win + 1) * wiener_win2 + j * wiener_win] =
   1413          H[((i - 1) * wiener_win + 1) * wiener_win2 + (j - 1) * wiener_win] +
   1414          vgetq_lane_s32(deltas[2], 0);
   1415      H[(i * wiener_win + 2) * wiener_win2 + j * wiener_win] =
   1416          H[((i - 1) * wiener_win + 2) * wiener_win2 + (j - 1) * wiener_win] +
   1417          vgetq_lane_s32(deltas[2], 1);
   1418      H[(i * wiener_win + 3) * wiener_win2 + j * wiener_win] =
   1419          H[((i - 1) * wiener_win + 3) * wiener_win2 + (j - 1) * wiener_win] +
   1420          vgetq_lane_s32(deltas[2], 2);
   1421      H[(i * wiener_win + 4) * wiener_win2 + j * wiener_win] =
   1422          H[((i - 1) * wiener_win + 4) * wiener_win2 + (j - 1) * wiener_win] +
   1423          vgetq_lane_s32(deltas[2], 3);
   1424      H[(i * wiener_win + 5) * wiener_win2 + j * wiener_win] =
   1425          H[((i - 1) * wiener_win + 5) * wiener_win2 + (j - 1) * wiener_win] +
   1426          vgetq_lane_s32(deltas[3], 0);
   1427      H[(i * wiener_win + 6) * wiener_win2 + j * wiener_win] =
   1428          H[((i - 1) * wiener_win + 6) * wiener_win2 + (j - 1) * wiener_win] +
   1429          vgetq_lane_s32(deltas[3], 1);
   1430    } while (++j < wiener_win);
   1431  } while (++i < wiener_win - 1);
   1432 
   1433  // Step 5: Derive other points of each square. No square in bottom row.
   1434  i = 0;
   1435  do {
   1436    const int16_t *const di = d + i;
   1437 
   1438    j = i + 1;
   1439    do {
   1440      const int16_t *const dj = d + j;
   1441      int32x4_t deltas[WIENER_WIN - 1][WIN_7] = { { vdupq_n_s32(0) },
   1442                                                  { vdupq_n_s32(0) } };
   1443      int16x8_t d_is[WIN_7];
   1444      int16x8_t d_ie[WIN_7];
   1445      int16x8_t d_js[WIN_7];
   1446      int16x8_t d_je[WIN_7];
   1447 
   1448      x = 0;
   1449      while (x < w16) {
   1450        load_square_win7_neon(di + x, dj + x, d_stride, height, d_is, d_ie,
   1451                              d_js, d_je);
   1452        derive_square_win7_neon(d_is, d_ie, d_js, d_je, deltas);
   1453        x += 16;
   1454      }
   1455 
   1456      if (w16 != width) {
   1457        load_square_win7_neon(di + x, dj + x, d_stride, height, d_is, d_ie,
   1458                              d_js, d_je);
   1459        d_is[0] = vandq_s16(d_is[0], mask[0]);
   1460        d_is[1] = vandq_s16(d_is[1], mask[1]);
   1461        d_is[2] = vandq_s16(d_is[2], mask[0]);
   1462        d_is[3] = vandq_s16(d_is[3], mask[1]);
   1463        d_is[4] = vandq_s16(d_is[4], mask[0]);
   1464        d_is[5] = vandq_s16(d_is[5], mask[1]);
   1465        d_is[6] = vandq_s16(d_is[6], mask[0]);
   1466        d_is[7] = vandq_s16(d_is[7], mask[1]);
   1467        d_is[8] = vandq_s16(d_is[8], mask[0]);
   1468        d_is[9] = vandq_s16(d_is[9], mask[1]);
   1469        d_is[10] = vandq_s16(d_is[10], mask[0]);
   1470        d_is[11] = vandq_s16(d_is[11], mask[1]);
   1471        d_ie[0] = vandq_s16(d_ie[0], mask[0]);
   1472        d_ie[1] = vandq_s16(d_ie[1], mask[1]);
   1473        d_ie[2] = vandq_s16(d_ie[2], mask[0]);
   1474        d_ie[3] = vandq_s16(d_ie[3], mask[1]);
   1475        d_ie[4] = vandq_s16(d_ie[4], mask[0]);
   1476        d_ie[5] = vandq_s16(d_ie[5], mask[1]);
   1477        d_ie[6] = vandq_s16(d_ie[6], mask[0]);
   1478        d_ie[7] = vandq_s16(d_ie[7], mask[1]);
   1479        d_ie[8] = vandq_s16(d_ie[8], mask[0]);
   1480        d_ie[9] = vandq_s16(d_ie[9], mask[1]);
   1481        d_ie[10] = vandq_s16(d_ie[10], mask[0]);
   1482        d_ie[11] = vandq_s16(d_ie[11], mask[1]);
   1483        derive_square_win7_neon(d_is, d_ie, d_js, d_je, deltas);
   1484      }
   1485 
   1486      hadd_update_6_stats_neon(
   1487          H + (i * wiener_win + 0) * wiener_win2 + j * wiener_win, deltas[0],
   1488          H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win + 1);
   1489      hadd_update_6_stats_neon(
   1490          H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win, deltas[1],
   1491          H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win + 1);
   1492      hadd_update_6_stats_neon(
   1493          H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win, deltas[2],
   1494          H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win + 1);
   1495      hadd_update_6_stats_neon(
   1496          H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win, deltas[3],
   1497          H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win + 1);
   1498      hadd_update_6_stats_neon(
   1499          H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win, deltas[4],
   1500          H + (i * wiener_win + 5) * wiener_win2 + j * wiener_win + 1);
   1501      hadd_update_6_stats_neon(
   1502          H + (i * wiener_win + 5) * wiener_win2 + j * wiener_win, deltas[5],
   1503          H + (i * wiener_win + 6) * wiener_win2 + j * wiener_win + 1);
   1504    } while (++j < wiener_win);
   1505  } while (++i < wiener_win - 1);
   1506 
   1507  // Step 6: Derive other points of each upper triangle along the diagonal.
   1508  i = 0;
   1509  do {
   1510    const int16_t *const di = d + i;
   1511    int32x4_t deltas[WIENER_WIN * (WIENER_WIN - 1)] = { vdupq_n_s32(0) };
   1512    int16x8_t d_is[WIN_7], d_ie[WIN_7];
   1513 
   1514    x = 0;
   1515    while (x < w16) {
   1516      load_triangle_win7_neon(di + x, d_stride, height, d_is, d_ie);
   1517      derive_triangle_win7_neon(d_is, d_ie, deltas);
   1518      x += 16;
   1519    }
   1520 
   1521    if (w16 != width) {
   1522      load_triangle_win7_neon(di + x, d_stride, height, d_is, d_ie);
   1523      d_is[0] = vandq_s16(d_is[0], mask[0]);
   1524      d_is[1] = vandq_s16(d_is[1], mask[1]);
   1525      d_is[2] = vandq_s16(d_is[2], mask[0]);
   1526      d_is[3] = vandq_s16(d_is[3], mask[1]);
   1527      d_is[4] = vandq_s16(d_is[4], mask[0]);
   1528      d_is[5] = vandq_s16(d_is[5], mask[1]);
   1529      d_is[6] = vandq_s16(d_is[6], mask[0]);
   1530      d_is[7] = vandq_s16(d_is[7], mask[1]);
   1531      d_is[8] = vandq_s16(d_is[8], mask[0]);
   1532      d_is[9] = vandq_s16(d_is[9], mask[1]);
   1533      d_is[10] = vandq_s16(d_is[10], mask[0]);
   1534      d_is[11] = vandq_s16(d_is[11], mask[1]);
   1535      d_ie[0] = vandq_s16(d_ie[0], mask[0]);
   1536      d_ie[1] = vandq_s16(d_ie[1], mask[1]);
   1537      d_ie[2] = vandq_s16(d_ie[2], mask[0]);
   1538      d_ie[3] = vandq_s16(d_ie[3], mask[1]);
   1539      d_ie[4] = vandq_s16(d_ie[4], mask[0]);
   1540      d_ie[5] = vandq_s16(d_ie[5], mask[1]);
   1541      d_ie[6] = vandq_s16(d_ie[6], mask[0]);
   1542      d_ie[7] = vandq_s16(d_ie[7], mask[1]);
   1543      d_ie[8] = vandq_s16(d_ie[8], mask[0]);
   1544      d_ie[9] = vandq_s16(d_ie[9], mask[1]);
   1545      d_ie[10] = vandq_s16(d_ie[10], mask[0]);
   1546      d_ie[11] = vandq_s16(d_ie[11], mask[1]);
   1547      derive_triangle_win7_neon(d_is, d_ie, deltas);
   1548    }
   1549 
   1550    // Row 1: 6 points
   1551    hadd_update_6_stats_neon(
   1552        H + (i * wiener_win + 0) * wiener_win2 + i * wiener_win, deltas,
   1553        H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1);
   1554 
   1555    // Row 2: 5 points
   1556    hadd_update_4_stats_neon(
   1557        H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1, deltas + 6,
   1558        H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2);
   1559    H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 6] =
   1560        H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 5] +
   1561        horizontal_long_add_s32x4(deltas[10]);
   1562 
   1563    // Row 3: 4 points
   1564    hadd_update_4_stats_neon(
   1565        H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2,
   1566        deltas + 11,
   1567        H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3);
   1568 
   1569    // Row 4: 3 points
   1570 #if AOM_ARCH_AARCH64
   1571    int64x2_t delta15_s64 = vpaddlq_s32(deltas[15]);
   1572    int64x2_t delta16_s64 = vpaddlq_s32(deltas[16]);
   1573    int64x2_t delta1516 = vpaddq_s64(delta15_s64, delta16_s64);
   1574 
   1575    int64x2_t h0 =
   1576        vld1q_s64(H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3);
   1577    vst1q_s64(H + (i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4,
   1578              vaddq_s64(h0, delta1516));
   1579 #else
   1580    H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4 + 0] =
   1581        H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3 + 0] +
   1582        horizontal_long_add_s32x4(deltas[15]);
   1583    H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4 + 1] =
   1584        H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3 + 1] +
   1585        horizontal_long_add_s32x4(deltas[16]);
   1586 #endif  // AOM_ARCH_AARCH64
   1587 
   1588    H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 6] =
   1589        H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 5] +
   1590        horizontal_long_add_s32x4(deltas[17]);
   1591 
   1592    // Row 5: 2 points
   1593    int64x2_t delta18_s64 = vpaddlq_s32(deltas[18]);
   1594    int64x2_t delta19_s64 = vpaddlq_s32(deltas[19]);
   1595 
   1596 #if AOM_ARCH_AARCH64
   1597    int64x2_t delta1819 = vpaddq_s64(delta18_s64, delta19_s64);
   1598 
   1599    int64x2_t h1 =
   1600        vld1q_s64(H + (i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4);
   1601    vst1q_s64(H + (i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5,
   1602              vaddq_s64(h1, delta1819));
   1603 #else
   1604    H[(i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5] =
   1605        H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4] +
   1606        horizontal_add_s64x2(delta18_s64);
   1607    H[(i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5 + 1] =
   1608        H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4 + 1] +
   1609        horizontal_add_s64x2(delta19_s64);
   1610 #endif  // AOM_ARCH_AARCH64
   1611 
   1612    // Row 6: 1 points
   1613    H[(i * wiener_win + 6) * wiener_win2 + i * wiener_win + 6] =
   1614        H[(i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5] +
   1615        horizontal_long_add_s32x4(deltas[20]);
   1616  } while (++i < wiener_win);
   1617 }
   1618 
   1619 static inline void sub_avg_block_highbd_neon(const uint16_t *src,
   1620                                             const int32_t src_stride,
   1621                                             const uint16_t avg,
   1622                                             const int32_t width,
   1623                                             const int32_t height, int16_t *dst,
   1624                                             const int32_t dst_stride) {
   1625  const uint16x8_t a = vdupq_n_u16(avg);
   1626 
   1627  int32_t i = height + 1;
   1628  do {
   1629    int32_t j = 0;
   1630    while (j < width) {
   1631      const uint16x8_t s = vld1q_u16(src + j);
   1632      const uint16x8_t d = vsubq_u16(s, a);
   1633      vst1q_s16(dst + j, vreinterpretq_s16_u16(d));
   1634      j += 8;
   1635    }
   1636 
   1637    src += src_stride;
   1638    dst += dst_stride;
   1639  } while (--i);
   1640 }
   1641 
   1642 static inline uint16_t highbd_find_average_neon(const uint16_t *src,
   1643                                                int src_stride, int width,
   1644                                                int height) {
   1645  assert(width > 0);
   1646  assert(height > 0);
   1647 
   1648  uint64x2_t sum_u64 = vdupq_n_u64(0);
   1649  uint64_t sum = 0;
   1650  const uint16x8_t mask =
   1651      vreinterpretq_u16_s16(vld1q_s16(&mask_16bit[16] - (width % 8)));
   1652 
   1653  int h = height;
   1654  do {
   1655    uint32x4_t sum_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
   1656 
   1657    int w = width;
   1658    const uint16_t *row = src;
   1659    while (w >= 32) {
   1660      uint16x8_t s0 = vld1q_u16(row + 0);
   1661      uint16x8_t s1 = vld1q_u16(row + 8);
   1662      uint16x8_t s2 = vld1q_u16(row + 16);
   1663      uint16x8_t s3 = vld1q_u16(row + 24);
   1664 
   1665      s0 = vaddq_u16(s0, s1);
   1666      s2 = vaddq_u16(s2, s3);
   1667      sum_u32[0] = vpadalq_u16(sum_u32[0], s0);
   1668      sum_u32[1] = vpadalq_u16(sum_u32[1], s2);
   1669 
   1670      row += 32;
   1671      w -= 32;
   1672    }
   1673 
   1674    if (w >= 16) {
   1675      uint16x8_t s0 = vld1q_u16(row + 0);
   1676      uint16x8_t s1 = vld1q_u16(row + 8);
   1677 
   1678      s0 = vaddq_u16(s0, s1);
   1679      sum_u32[0] = vpadalq_u16(sum_u32[0], s0);
   1680 
   1681      row += 16;
   1682      w -= 16;
   1683    }
   1684 
   1685    if (w >= 8) {
   1686      uint16x8_t s0 = vld1q_u16(row);
   1687      sum_u32[1] = vpadalq_u16(sum_u32[1], s0);
   1688 
   1689      row += 8;
   1690      w -= 8;
   1691    }
   1692 
   1693    if (w) {
   1694      uint16x8_t s0 = vandq_u16(vld1q_u16(row), mask);
   1695      sum_u32[1] = vpadalq_u16(sum_u32[1], s0);
   1696 
   1697      row += 8;
   1698      w -= 8;
   1699    }
   1700 
   1701    sum_u64 = vpadalq_u32(sum_u64, vaddq_u32(sum_u32[0], sum_u32[1]));
   1702 
   1703    src += src_stride;
   1704  } while (--h != 0);
   1705 
   1706  return (uint16_t)((horizontal_add_u64x2(sum_u64) + sum) / (height * width));
   1707 }
   1708 
// Computes the auto-correlation matrix H and cross-correlation vector M used
// by the Wiener filter parameter search, for high bit-depth input.
//
// Both the source and degraded blocks are first re-centred around the
// degraded block's average (into src_avg / dgd_avg scratch buffers) so the
// per-tap accumulations fit in narrower integer types. For 10- and 12-bit
// input the accumulated stats are scaled down by 4 and 16 respectively at
// the end, matching the C reference implementation.
//
// dgd_avg/src_avg: caller-provided scratch buffers for the re-centred data.
// [h_start, h_end) x [v_start, v_end): the restoration unit being analysed.
// M, H: outputs, sized wiener_win2 and wiener_win2 * wiener_win2.
void av1_compute_stats_highbd_neon(int32_t wiener_win, const uint8_t *dgd8,
                                  const uint8_t *src8, int16_t *dgd_avg,
                                  int16_t *src_avg, int32_t h_start,
                                  int32_t h_end, int32_t v_start,
                                  int32_t v_end, int32_t dgd_stride,
                                  int32_t src_stride, int64_t *M, int64_t *H,
                                  aom_bit_depth_t bit_depth) {
  const int32_t wiener_win2 = wiener_win * wiener_win;
  const int32_t wiener_halfwin = (wiener_win >> 1);
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
  const int32_t width = h_end - h_start;
  const int32_t height = v_end - v_start;
  const uint16_t *dgd_start = dgd + h_start + v_start * dgd_stride;
  const uint16_t avg =
      highbd_find_average_neon(dgd_start, dgd_stride, width, height);
  // Scratch strides rounded up to a multiple of 16 so the stats kernels can
  // always consume whole vectors.
  const int32_t d_stride = (width + 2 * wiener_halfwin + 15) & ~15;
  const int32_t s_stride = (width + 15) & ~15;

  // Re-centre both blocks around `avg`. The dgd copy includes a
  // wiener_halfwin-wide border on every side because the filter taps reach
  // outside the restoration unit.
  sub_avg_block_highbd_neon(src + v_start * src_stride + h_start, src_stride,
                            avg, width, height, src_avg, s_stride);
  sub_avg_block_highbd_neon(
      dgd + (v_start - wiener_halfwin) * dgd_stride + h_start - wiener_halfwin,
      dgd_stride, avg, width + 2 * wiener_halfwin, height + 2 * wiener_halfwin,
      dgd_avg, d_stride);

  if (wiener_win == WIENER_WIN) {
    compute_stats_win7_highbd_neon(dgd_avg, d_stride, src_avg, s_stride, width,
                                   height, M, H, bit_depth);
  } else if (wiener_win == WIENER_WIN_CHROMA) {
    compute_stats_win5_highbd_neon(dgd_avg, d_stride, src_avg, s_stride, width,
                                   height, M, H, bit_depth);
  }

  // H is a symmetric matrix, so we only need to fill out the upper triangle.
  // We can copy it down to the lower triangle outside the (i, j) loops.
  if (bit_depth == AOM_BITS_8) {
    diagonal_copy_stats_neon(wiener_win2, H);
  } else if (bit_depth == AOM_BITS_10) {
    // 10-bit: scale M and H down by 4 to keep the same dynamic range as the
    // 8-bit path. Only the diagonal entries at k = 0, 4, 8, ... are divided
    // here; NOTE(review): the remaining H entries appear to be divided
    // inside div4_diagonal_copy_stats_neon -- confirm against pickrst_neon.h.
    const int32_t k4 = wiener_win2 & ~3;

    int32_t k = 0;
    do {
      // Divide M four entries at a time.
      int64x2_t dst = div4_neon(vld1q_s64(M + k));
      vst1q_s64(M + k, dst);
      dst = div4_neon(vld1q_s64(M + k + 2));
      vst1q_s64(M + k + 2, dst);
      H[k * wiener_win2 + k] /= 4;
      k += 4;
    } while (k < k4);

    H[k * wiener_win2 + k] /= 4;

    // Scalar tail of M when wiener_win2 is not a multiple of 4 (e.g. 25, 49).
    for (; k < wiener_win2; ++k) {
      M[k] /= 4;
    }

    div4_diagonal_copy_stats_neon(wiener_win2, H);
  } else {  // bit_depth == AOM_BITS_12
    // 12-bit: same structure as the 10-bit path, but scaling by 16.
    const int32_t k4 = wiener_win2 & ~3;

    int32_t k = 0;
    do {
      int64x2_t dst = div16_neon(vld1q_s64(M + k));
      vst1q_s64(M + k, dst);
      dst = div16_neon(vld1q_s64(M + k + 2));
      vst1q_s64(M + k + 2, dst);
      H[k * wiener_win2 + k] /= 16;
      k += 4;
    } while (k < k4);

    H[k * wiener_win2 + k] /= 16;

    for (; k < wiener_win2; ++k) {
      M[k] /= 16;
    }

    div16_diagonal_copy_stats_neon(wiener_win2, H);
  }
}
// Computes the sum of squared errors between the source and the
// self-guided-projected reconstruction, for high bit-depth input.
//
// Three paths, selected by which SGR filters are enabled in `params`:
//  1. both r[0] and r[1] > 0: blend flt0/flt1 with weights xq[0]/xq[1];
//  2. exactly one enabled: single filter with its weight;
//  3. neither enabled: plain SSE between dat and src.
// Each vector loop handles width in multiples of 8 and is followed by a
// scalar tail that implements the identical arithmetic for the remainder.
int64_t av1_highbd_pixel_proj_error_neon(
    const uint8_t *src8, int width, int height, int src_stride,
    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
    int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
  int64_t sse = 0;  // accumulates the scalar-tail contributions
  int64x2_t sse_s64 = vdupq_n_s64(0);

  if (params->r[0] > 0 && params->r[1] > 0) {
    int32x2_t xq_v = vld1_s32(xq);
    // (xq[0] + xq[1]) << 4 in both lanes: combined weight applied to `dat`,
    // which is implicitly upscaled by 16 relative to the filter outputs
    // (matches the `dat[k] << 4` in the scalar tail below).
    int32x2_t xq_sum_v = vshl_n_s32(vpadd_s32(xq_v, xq_v), 4);

    do {
      int j = 0;
      // Per-row 32-bit accumulator, widened into sse_s64 once per row.
      int32x4_t sse_s32 = vdupq_n_s32(0);

      do {
        const uint16x8_t d = vld1q_u16(&dat[j]);
        const uint16x8_t s = vld1q_u16(&src[j]);
        int32x4_t flt0_0 = vld1q_s32(&flt0[j]);
        int32x4_t flt0_1 = vld1q_s32(&flt0[j + 4]);
        int32x4_t flt1_0 = vld1q_s32(&flt1[j]);
        int32x4_t flt1_1 = vld1q_s32(&flt1[j + 4]);

        // d * (xq[0] + xq[1]) * 16, widened to 32 bits.
        int32x4_t d_s32_lo = vreinterpretq_s32_u32(
            vmull_lane_u16(vget_low_u16(d), vreinterpret_u16_s32(xq_sum_v), 0));
        int32x4_t d_s32_hi = vreinterpretq_s32_u32(vmull_lane_u16(
            vget_high_u16(d), vreinterpret_u16_s32(xq_sum_v), 0));

        // Start from the rounding constant minus the dat term...
        int32x4_t v0 = vsubq_s32(
            vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)),
            d_s32_lo);
        int32x4_t v1 = vsubq_s32(
            vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)),
            d_s32_hi);

        // ...then add the weighted filter outputs.
        v0 = vmlaq_lane_s32(v0, flt0_0, xq_v, 0);
        v1 = vmlaq_lane_s32(v1, flt0_1, xq_v, 0);
        v0 = vmlaq_lane_s32(v0, flt1_0, xq_v, 1);
        v1 = vmlaq_lane_s32(v1, flt1_1, xq_v, 1);

        // Narrowing shift back to pixel precision.
        int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
        int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);

        // e = projected correction + (dat - src); unsigned subtract then
        // reinterpret yields the correct signed difference.
        int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1),
                                vreinterpretq_s16_u16(vsubq_u16(d, s)));
        int16x4_t e_lo = vget_low_s16(e);
        int16x4_t e_hi = vget_high_s16(e);

        sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo);
        sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi);

        j += 8;
      } while (j <= width - 8);

      // Scalar tail: same computation as the vector loop above.
      for (int k = j; k < width; ++k) {
        int32_t v = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1);
        v += xq[0] * (flt0[k]) + xq[1] * (flt1[k]);
        v -= (xq[1] + xq[0]) * (int32_t)(dat[k] << 4);
        int32_t e =
            (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[k] - src[k];
        sse += ((int64_t)e * e);
      }

      sse_s64 = vpadalq_s32(sse_s64, sse_s32);

      dat += dat_stride;
      src += src_stride;
      flt0 += flt0_stride;
      flt1 += flt1_stride;
    } while (--height != 0);
  } else if (params->r[0] > 0 || params->r[1] > 0) {
    // Exactly one filter enabled: pick its weight, output and stride.
    int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
    int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
    int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
    int32x4_t xq_v = vdupq_n_s32(xq_active);

    do {
      int j = 0;
      int32x4_t sse_s32 = vdupq_n_s32(0);
      do {
        const uint16x8_t d0 = vld1q_u16(&dat[j]);
        const uint16x8_t s0 = vld1q_u16(&src[j]);
        int32x4_t flt0_0 = vld1q_s32(&flt[j]);
        int32x4_t flt0_1 = vld1q_s32(&flt[j + 4]);

        // flt - (dat << 4): bring dat to the filter's fixed-point scale
        // before taking the difference.
        uint16x8_t d_u16 = vshlq_n_u16(d0, 4);
        int32x4_t sub0 = vreinterpretq_s32_u32(
            vsubw_u16(vreinterpretq_u32_s32(flt0_0), vget_low_u16(d_u16)));
        int32x4_t sub1 = vreinterpretq_s32_u32(
            vsubw_u16(vreinterpretq_u32_s32(flt0_1), vget_high_u16(d_u16)));

        // rounding constant + xq * (flt - dat*16)
        int32x4_t v0 = vmlaq_s32(
            vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)), sub0,
            xq_v);
        int32x4_t v1 = vmlaq_s32(
            vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)), sub1,
            xq_v);

        int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
        int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);

        int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1),
                                vreinterpretq_s16_u16(vsubq_u16(d0, s0)));
        int16x4_t e_lo = vget_low_s16(e);
        int16x4_t e_hi = vget_high_s16(e);

        sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo);
        sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi);

        j += 8;
      } while (j <= width - 8);

      // Scalar tail, mirroring the vector arithmetic above.
      for (int k = j; k < width; ++k) {
        int32_t v = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1);
        v += xq_active * (int32_t)((uint32_t)flt[k] - (uint16_t)(dat[k] << 4));
        const int32_t e =
            (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[k] - src[k];
        sse += ((int64_t)e * e);
      }

      sse_s64 = vpadalq_s32(sse_s64, sse_s32);

      dat += dat_stride;
      flt += flt_stride;
      src += src_stride;
    } while (--height != 0);
  } else {
    // No SGR filters: plain sum of squared differences.
    do {
      int j = 0;

      do {
        const uint16x8_t d = vld1q_u16(&dat[j]);
        const uint16x8_t s = vld1q_u16(&src[j]);

        uint16x8_t diff = vabdq_u16(d, s);
        uint16x4_t diff_lo = vget_low_u16(diff);
        uint16x4_t diff_hi = vget_high_u16(diff);

        uint32x4_t sqr_lo = vmull_u16(diff_lo, diff_lo);
        uint32x4_t sqr_hi = vmull_u16(diff_hi, diff_hi);

        sse_s64 = vpadalq_s32(sse_s64, vreinterpretq_s32_u32(sqr_lo));
        sse_s64 = vpadalq_s32(sse_s64, vreinterpretq_s32_u32(sqr_hi));

        j += 8;
      } while (j <= width - 8);

      for (int k = j; k < width; ++k) {
        // e * e fits in int32 for pixel depths up to 12 bits (AV1's maximum
        // high bit-depth), so plain int arithmetic is safe here.
        int32_t e = dat[k] - src[k];
        sse += e * e;
      }

      dat += dat_stride;
      src += src_stride;
    } while (--height != 0);
  }

  // Fold the vector accumulator into the scalar-tail total.
  sse += horizontal_add_s64x2(sse_s64);
  return sse;
}