tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

pickrst_neon.h (50674B)


      1 /*
      2 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #ifndef AOM_AV1_ENCODER_ARM_PICKRST_NEON_H_
     13 #define AOM_AV1_ENCODER_ARM_PICKRST_NEON_H_
     14 
     15 #include <arm_neon.h>
     16 
     17 #include "av1/common/restoration.h"
     18 
// Array dimensions derived from the Wiener window sizes; used to size the
// vector working sets below (WIN_7 = (WIENER_WIN - 1) * 2 for the 7x7
// window, WIN_CHROMA = (WIENER_WIN_CHROMA - 1) * 2 for the 5x5 chroma
// window).
#define WIN_7 ((WIENER_WIN - 1) * 2)
#define WIN_CHROMA ((WIENER_WIN_CHROMA - 1) * 2)

// Aligned sizes for Wiener filters: the squared window sizes rounded up to
// a multiple of 2^2 (ALIGN2) and 2^3 (ALIGN3) so whole NEON vectors can be
// loaded/stored without tail handling.
#define WIENER_WIN2_ALIGN2 ALIGN_POWER_OF_TWO(WIENER_WIN2, 2)
#define WIENER_WIN2_ALIGN3 ALIGN_POWER_OF_TWO(WIENER_WIN2, 3)
#define WIENER_WIN2_REDUCED ((WIENER_WIN_REDUCED) * (WIENER_WIN_REDUCED))
#define WIENER_WIN2_REDUCED_ALIGN2 ALIGN_POWER_OF_TWO(WIENER_WIN2_REDUCED, 2)
#define WIENER_WIN2_REDUCED_ALIGN3 ALIGN_POWER_OF_TWO(WIENER_WIN2_REDUCED, 3)
     28 
     29 // Compute 8 values of M (cross correlation) for a single source pixel and
     30 // accumulate.
     31 static inline void update_M_1pixel(int32_t *M_s32, int16x4_t src_avg,
     32                                   int16x8_t dgd_avg) {
     33  int32x4_t lo = vld1q_s32(M_s32 + 0);
     34  int32x4_t hi = vld1q_s32(M_s32 + 4);
     35 
     36  lo = vmlal_s16(lo, vget_low_s16(dgd_avg), src_avg);
     37  hi = vmlal_s16(hi, vget_high_s16(dgd_avg), src_avg);
     38 
     39  vst1q_s32(M_s32 + 0, lo);
     40  vst1q_s32(M_s32 + 4, hi);
     41 }
     42 
     43 // Compute 8 values of M (cross correlation) for two source pixels and
     44 // accumulate.
     45 static inline void update_M_2pixels(int32_t *M_s32, int16x4_t src_avg0,
     46                                    int16x4_t src_avg1, int16x8_t dgd_avg0,
     47                                    int16x8_t dgd_avg1) {
     48  int32x4_t lo = vld1q_s32(M_s32 + 0);
     49  int32x4_t hi = vld1q_s32(M_s32 + 4);
     50 
     51  lo = vmlal_s16(lo, vget_low_s16(dgd_avg0), src_avg0);
     52  hi = vmlal_s16(hi, vget_high_s16(dgd_avg0), src_avg0);
     53  lo = vmlal_s16(lo, vget_low_s16(dgd_avg1), src_avg1);
     54  hi = vmlal_s16(hi, vget_high_s16(dgd_avg1), src_avg1);
     55 
     56  vst1q_s32(M_s32 + 0, lo);
     57  vst1q_s32(M_s32 + 4, hi);
     58 }
     59 
     60 static inline void update_H_1pixel(int32_t *H_s32, const int16_t *dgd_avg,
     61                                   int width, int height) {
     62  for (int i = 0; i < height; i += 4) {
     63    int16x4_t di = vld1_s16(dgd_avg + i);
     64 
     65    for (int j = i; j < width; j += 4) {
     66      int16x4_t dj = vld1_s16(dgd_avg + j);
     67      int32x4_t h0 = vld1q_s32(H_s32 + 0 * width + j);
     68      int32x4_t h1 = vld1q_s32(H_s32 + 1 * width + j);
     69      int32x4_t h2 = vld1q_s32(H_s32 + 2 * width + j);
     70      int32x4_t h3 = vld1q_s32(H_s32 + 3 * width + j);
     71 
     72      h0 = vmlal_lane_s16(h0, dj, di, 0);
     73      h1 = vmlal_lane_s16(h1, dj, di, 1);
     74      h2 = vmlal_lane_s16(h2, dj, di, 2);
     75      h3 = vmlal_lane_s16(h3, dj, di, 3);
     76 
     77      vst1q_s32(H_s32 + 0 * width + j, h0);
     78      vst1q_s32(H_s32 + 1 * width + j, h1);
     79      vst1q_s32(H_s32 + 2 * width + j, h2);
     80      vst1q_s32(H_s32 + 3 * width + j, h3);
     81    }
     82    H_s32 += 4 * width;
     83  }
     84 }
     85 
     86 static inline void update_H_5x5_2pixels(int32_t *H_s32, const int16_t *dgd_avg0,
     87                                        const int16_t *dgd_avg1) {
     88  for (int i = 0; i < 24; i += 4) {
     89    int16x4_t di0 = vld1_s16(dgd_avg0 + i);
     90    int16x4_t di1 = vld1_s16(dgd_avg1 + i);
     91 
     92    for (int j = i + 0; j < WIENER_WIN2_REDUCED_ALIGN2; j += 4) {
     93      int16x4_t dj0 = vld1_s16(dgd_avg0 + j);
     94      int16x4_t dj1 = vld1_s16(dgd_avg1 + j);
     95      int32x4_t h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_REDUCED_ALIGN2 + j);
     96      int32x4_t h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_REDUCED_ALIGN2 + j);
     97      int32x4_t h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_REDUCED_ALIGN2 + j);
     98      int32x4_t h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_REDUCED_ALIGN2 + j);
     99 
    100      h0 = vmlal_lane_s16(h0, dj0, di0, 0);
    101      h0 = vmlal_lane_s16(h0, dj1, di1, 0);
    102      h1 = vmlal_lane_s16(h1, dj0, di0, 1);
    103      h1 = vmlal_lane_s16(h1, dj1, di1, 1);
    104      h2 = vmlal_lane_s16(h2, dj0, di0, 2);
    105      h2 = vmlal_lane_s16(h2, dj1, di1, 2);
    106      h3 = vmlal_lane_s16(h3, dj0, di0, 3);
    107      h3 = vmlal_lane_s16(h3, dj1, di1, 3);
    108 
    109      vst1q_s32(H_s32 + 0 * WIENER_WIN2_REDUCED_ALIGN2 + j, h0);
    110      vst1q_s32(H_s32 + 1 * WIENER_WIN2_REDUCED_ALIGN2 + j, h1);
    111      vst1q_s32(H_s32 + 2 * WIENER_WIN2_REDUCED_ALIGN2 + j, h2);
    112      vst1q_s32(H_s32 + 3 * WIENER_WIN2_REDUCED_ALIGN2 + j, h3);
    113    }
    114    H_s32 += 4 * WIENER_WIN2_REDUCED_ALIGN2;
    115  }
    116 }
    117 
    118 static inline void update_H_7x7_2pixels(int32_t *H_s32, const int16_t *dgd_avg0,
    119                                        const int16_t *dgd_avg1) {
    120  for (int i = 0; i < 48; i += 4) {
    121    int16x4_t di0 = vld1_s16(dgd_avg0 + i);
    122    int16x4_t di1 = vld1_s16(dgd_avg1 + i);
    123 
    124    int32x4_t h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + i);
    125    int32x4_t h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + i);
    126    int32x4_t h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + i);
    127    int32x4_t h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + i);
    128 
    129    h0 = vmlal_lane_s16(h0, di0, di0, 0);
    130    h0 = vmlal_lane_s16(h0, di1, di1, 0);
    131    h1 = vmlal_lane_s16(h1, di0, di0, 1);
    132    h1 = vmlal_lane_s16(h1, di1, di1, 1);
    133    h2 = vmlal_lane_s16(h2, di0, di0, 2);
    134    h2 = vmlal_lane_s16(h2, di1, di1, 2);
    135    h3 = vmlal_lane_s16(h3, di0, di0, 3);
    136    h3 = vmlal_lane_s16(h3, di1, di1, 3);
    137 
    138    vst1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + i, h0);
    139    vst1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + i, h1);
    140    vst1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + i, h2);
    141    vst1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + i, h3);
    142 
    143    for (int j = i + 4; j < WIENER_WIN2_ALIGN2; j += 4) {
    144      int16x4_t dj0 = vld1_s16(dgd_avg0 + j);
    145      int16x4_t dj1 = vld1_s16(dgd_avg1 + j);
    146      h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + j);
    147      h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + j);
    148      h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + j);
    149      h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + j);
    150 
    151      h0 = vmlal_lane_s16(h0, dj0, di0, 0);
    152      h0 = vmlal_lane_s16(h0, dj1, di1, 0);
    153      h1 = vmlal_lane_s16(h1, dj0, di0, 1);
    154      h1 = vmlal_lane_s16(h1, dj1, di1, 1);
    155      h2 = vmlal_lane_s16(h2, dj0, di0, 2);
    156      h2 = vmlal_lane_s16(h2, dj1, di1, 2);
    157      h3 = vmlal_lane_s16(h3, dj0, di0, 3);
    158      h3 = vmlal_lane_s16(h3, dj1, di1, 3);
    159 
    160      vst1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + j, h0);
    161      vst1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + j, h1);
    162      vst1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + j, h2);
    163      vst1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + j, h3);
    164    }
    165    H_s32 += 4 * WIENER_WIN2_ALIGN2;
    166  }
    167 }
    168 
    169 // Widen 32-bit src data and accumulate into 64-bit dst. Clear src data.
    170 static inline void accumulate_and_clear(int64_t *dst, int32_t *src,
    171                                        int length) {
    172  do {
    173    int32x4_t s32 = vld1q_s32(src);
    174    vst1q_s32(src, vdupq_n_s32(0));
    175    src += 4;
    176 
    177    int64x2_t d_lo = vld1q_s64(dst + 0);
    178    int64x2_t d_hi = vld1q_s64(dst + 2);
    179 
    180    d_lo = vaddw_s32(d_lo, vget_low_s32(s32));
    181    d_hi = vaddw_s32(d_hi, vget_high_s32(s32));
    182 
    183    vst1q_s64(dst + 0, d_lo);
    184    vst1q_s64(dst + 2, d_hi);
    185 
    186    dst += 4;
    187    length -= 4;
    188  } while (length > 0);
    189 }
    190 
    191 // clang-format off
    192 // Constant pool to act as a mask to zero n top elements in an int16x8_t vector.
    193 // The index we load from depends on n.
    194 static const int16_t mask_16bit[32] = {
    195  0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
    196  0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
    197       0,      0,      0,      0,      0,      0,      0,      0,
    198       0,      0,      0,      0,      0,      0,      0,      0,
    199 };
    200 // clang-format on
    201 
    202 static inline void madd_neon_pairwise(int32x4_t *sum, const int16x8_t src,
    203                                      const int16x8_t dgd) {
    204  const int32x4_t sd =
    205      horizontal_add_2d_s32(vmull_s16(vget_low_s16(src), vget_low_s16(dgd)),
    206                            vmull_s16(vget_high_s16(src), vget_high_s16(dgd)));
    207  *sum = vaddq_s32(*sum, sd);
    208 }
    209 
    210 static inline void madd_neon(int32x4_t *sum, const int16x8_t src,
    211                             const int16x8_t dgd) {
    212  *sum = vmlal_s16(*sum, vget_low_s16(src), vget_low_s16(dgd));
    213  *sum = vmlal_s16(*sum, vget_high_s16(src), vget_high_s16(dgd));
    214 }
    215 
    216 static inline void msub_neon(int32x4_t *sum, const int16x8_t src,
    217                             const int16x8_t dgd) {
    218  *sum = vmlsl_s16(*sum, vget_low_s16(src), vget_low_s16(dgd));
    219  *sum = vmlsl_s16(*sum, vget_high_s16(src), vget_high_s16(dgd));
    220 }
    221 
    222 static inline void compute_delta_step3(int32x4_t *sum0, int32x4_t *sum1,
    223                                       const int16x8_t src0,
    224                                       const int16x8_t src1,
    225                                       const int16x8_t dgd0,
    226                                       const int16x8_t dgd1) {
    227  *sum0 = vmlsl_s16(*sum0, vget_low_s16(src0), vget_low_s16(dgd0));
    228  *sum0 = vmlal_s16(*sum0, vget_low_s16(src1), vget_low_s16(dgd1));
    229  *sum1 = vmlsl_s16(*sum1, vget_high_s16(src0), vget_high_s16(dgd0));
    230  *sum1 = vmlal_s16(*sum1, vget_high_s16(src1), vget_high_s16(dgd1));
    231 }
    232 
    233 static inline int32x4_t hadd_four_32_neon(const int32x4_t src0,
    234                                          const int32x4_t src1,
    235                                          const int32x4_t src2,
    236                                          const int32x4_t src3) {
    237  int32x4_t src[4] = { src0, src1, src2, src3 };
    238  return horizontal_add_4d_s32x4(src);
    239 }
    240 
    241 static inline void update_4_stats_neon(const int64_t *const src,
    242                                       const int32x4_t delta,
    243                                       int64_t *const dst) {
    244  const int64x2_t s1 = vld1q_s64(src);
    245  const int64x2_t s2 = vld1q_s64(src + 2);
    246 
    247  const int64x2_t d1 = vaddw_s32(s1, vget_low_s32(delta));
    248  const int64x2_t d2 = vaddw_s32(s2, vget_high_s32(delta));
    249 
    250  vst1q_s64(dst, d1);
    251  vst1q_s64(dst + 2, d2);
    252 }
    253 
    254 static inline void load_more_16_neon(const int16_t *const src,
    255                                     const int32_t width,
    256                                     const int16x8_t org[2], int16x8_t dst[2]) {
    257  int16x8_t s0 = vld1q_dup_s16(src);
    258  int16x8_t s1 = vld1q_dup_s16(src + width);
    259  dst[0] = vextq_s16(org[0], s0, 1);
    260  dst[1] = vextq_s16(org[1], s1, 1);
    261 }
    262 
    263 static inline void stats_top_win5_neon(const int16x8_t src[2],
    264                                       const int16x8_t dgd[2],
    265                                       const int16_t *const d,
    266                                       const int32_t d_stride, int32x4_t *sum_m,
    267                                       int32x4_t *sum_h) {
    268  int16x8_t dgds[WIENER_WIN_CHROMA * 2];
    269 
    270  load_s16_8x5(d + 0, d_stride, &dgds[0], &dgds[2], &dgds[4], &dgds[6],
    271               &dgds[8]);
    272  load_s16_8x5(d + 8, d_stride, &dgds[1], &dgds[3], &dgds[5], &dgds[7],
    273               &dgds[9]);
    274 
    275  madd_neon(&sum_m[0], src[0], dgds[0]);
    276  madd_neon(&sum_m[0], src[1], dgds[1]);
    277  madd_neon(&sum_m[1], src[0], dgds[2]);
    278  madd_neon(&sum_m[1], src[1], dgds[3]);
    279  madd_neon(&sum_m[2], src[0], dgds[4]);
    280  madd_neon(&sum_m[2], src[1], dgds[5]);
    281  madd_neon(&sum_m[3], src[0], dgds[6]);
    282  madd_neon(&sum_m[3], src[1], dgds[7]);
    283  madd_neon(&sum_m[4], src[0], dgds[8]);
    284  madd_neon(&sum_m[4], src[1], dgds[9]);
    285 
    286  madd_neon(&sum_h[0], dgd[0], dgds[0]);
    287  madd_neon(&sum_h[0], dgd[1], dgds[1]);
    288  madd_neon(&sum_h[1], dgd[0], dgds[2]);
    289  madd_neon(&sum_h[1], dgd[1], dgds[3]);
    290  madd_neon(&sum_h[2], dgd[0], dgds[4]);
    291  madd_neon(&sum_h[2], dgd[1], dgds[5]);
    292  madd_neon(&sum_h[3], dgd[0], dgds[6]);
    293  madd_neon(&sum_h[3], dgd[1], dgds[7]);
    294  madd_neon(&sum_h[4], dgd[0], dgds[8]);
    295  madd_neon(&sum_h[4], dgd[1], dgds[9]);
    296 }
    297 
    298 static inline void stats_left_win5_neon(const int16x8_t src[2],
    299                                        const int16_t *d,
    300                                        const int32_t d_stride,
    301                                        int32x4_t *sum) {
    302  int16x8_t dgds[WIN_CHROMA];
    303 
    304  load_s16_8x4(d + d_stride + 0, d_stride, &dgds[0], &dgds[2], &dgds[4],
    305               &dgds[6]);
    306  load_s16_8x4(d + d_stride + 8, d_stride, &dgds[1], &dgds[3], &dgds[5],
    307               &dgds[7]);
    308 
    309  madd_neon(&sum[0], src[0], dgds[0]);
    310  madd_neon(&sum[0], src[1], dgds[1]);
    311  madd_neon(&sum[1], src[0], dgds[2]);
    312  madd_neon(&sum[1], src[1], dgds[3]);
    313  madd_neon(&sum[2], src[0], dgds[4]);
    314  madd_neon(&sum[2], src[1], dgds[5]);
    315  madd_neon(&sum[3], src[0], dgds[6]);
    316  madd_neon(&sum[3], src[1], dgds[7]);
    317 }
    318 
    319 static inline void derive_square_win5_neon(
    320    const int16x8_t *d_is, const int16x8_t *d_ie, const int16x8_t *d_js,
    321    const int16x8_t *d_je,
    322    int32x4_t deltas[WIENER_WIN_CHROMA - 1][WIENER_WIN_CHROMA - 1]) {
    323  msub_neon(&deltas[0][0], d_is[0], d_js[0]);
    324  msub_neon(&deltas[0][0], d_is[1], d_js[1]);
    325  msub_neon(&deltas[0][1], d_is[0], d_js[2]);
    326  msub_neon(&deltas[0][1], d_is[1], d_js[3]);
    327  msub_neon(&deltas[0][2], d_is[0], d_js[4]);
    328  msub_neon(&deltas[0][2], d_is[1], d_js[5]);
    329  msub_neon(&deltas[0][3], d_is[0], d_js[6]);
    330  msub_neon(&deltas[0][3], d_is[1], d_js[7]);
    331 
    332  msub_neon(&deltas[1][0], d_is[2], d_js[0]);
    333  msub_neon(&deltas[1][0], d_is[3], d_js[1]);
    334  msub_neon(&deltas[1][1], d_is[2], d_js[2]);
    335  msub_neon(&deltas[1][1], d_is[3], d_js[3]);
    336  msub_neon(&deltas[1][2], d_is[2], d_js[4]);
    337  msub_neon(&deltas[1][2], d_is[3], d_js[5]);
    338  msub_neon(&deltas[1][3], d_is[2], d_js[6]);
    339  msub_neon(&deltas[1][3], d_is[3], d_js[7]);
    340 
    341  msub_neon(&deltas[2][0], d_is[4], d_js[0]);
    342  msub_neon(&deltas[2][0], d_is[5], d_js[1]);
    343  msub_neon(&deltas[2][1], d_is[4], d_js[2]);
    344  msub_neon(&deltas[2][1], d_is[5], d_js[3]);
    345  msub_neon(&deltas[2][2], d_is[4], d_js[4]);
    346  msub_neon(&deltas[2][2], d_is[5], d_js[5]);
    347  msub_neon(&deltas[2][3], d_is[4], d_js[6]);
    348  msub_neon(&deltas[2][3], d_is[5], d_js[7]);
    349 
    350  msub_neon(&deltas[3][0], d_is[6], d_js[0]);
    351  msub_neon(&deltas[3][0], d_is[7], d_js[1]);
    352  msub_neon(&deltas[3][1], d_is[6], d_js[2]);
    353  msub_neon(&deltas[3][1], d_is[7], d_js[3]);
    354  msub_neon(&deltas[3][2], d_is[6], d_js[4]);
    355  msub_neon(&deltas[3][2], d_is[7], d_js[5]);
    356  msub_neon(&deltas[3][3], d_is[6], d_js[6]);
    357  msub_neon(&deltas[3][3], d_is[7], d_js[7]);
    358 
    359  madd_neon(&deltas[0][0], d_ie[0], d_je[0]);
    360  madd_neon(&deltas[0][0], d_ie[1], d_je[1]);
    361  madd_neon(&deltas[0][1], d_ie[0], d_je[2]);
    362  madd_neon(&deltas[0][1], d_ie[1], d_je[3]);
    363  madd_neon(&deltas[0][2], d_ie[0], d_je[4]);
    364  madd_neon(&deltas[0][2], d_ie[1], d_je[5]);
    365  madd_neon(&deltas[0][3], d_ie[0], d_je[6]);
    366  madd_neon(&deltas[0][3], d_ie[1], d_je[7]);
    367 
    368  madd_neon(&deltas[1][0], d_ie[2], d_je[0]);
    369  madd_neon(&deltas[1][0], d_ie[3], d_je[1]);
    370  madd_neon(&deltas[1][1], d_ie[2], d_je[2]);
    371  madd_neon(&deltas[1][1], d_ie[3], d_je[3]);
    372  madd_neon(&deltas[1][2], d_ie[2], d_je[4]);
    373  madd_neon(&deltas[1][2], d_ie[3], d_je[5]);
    374  madd_neon(&deltas[1][3], d_ie[2], d_je[6]);
    375  madd_neon(&deltas[1][3], d_ie[3], d_je[7]);
    376 
    377  madd_neon(&deltas[2][0], d_ie[4], d_je[0]);
    378  madd_neon(&deltas[2][0], d_ie[5], d_je[1]);
    379  madd_neon(&deltas[2][1], d_ie[4], d_je[2]);
    380  madd_neon(&deltas[2][1], d_ie[5], d_je[3]);
    381  madd_neon(&deltas[2][2], d_ie[4], d_je[4]);
    382  madd_neon(&deltas[2][2], d_ie[5], d_je[5]);
    383  madd_neon(&deltas[2][3], d_ie[4], d_je[6]);
    384  madd_neon(&deltas[2][3], d_ie[5], d_je[7]);
    385 
    386  madd_neon(&deltas[3][0], d_ie[6], d_je[0]);
    387  madd_neon(&deltas[3][0], d_ie[7], d_je[1]);
    388  madd_neon(&deltas[3][1], d_ie[6], d_je[2]);
    389  madd_neon(&deltas[3][1], d_ie[7], d_je[3]);
    390  madd_neon(&deltas[3][2], d_ie[6], d_je[4]);
    391  madd_neon(&deltas[3][2], d_ie[7], d_je[5]);
    392  madd_neon(&deltas[3][3], d_ie[6], d_je[6]);
    393  madd_neon(&deltas[3][3], d_ie[7], d_je[7]);
    394 }
    395 
    396 static inline void load_square_win5_neon(const int16_t *const di,
    397                                         const int16_t *const dj,
    398                                         const int32_t d_stride,
    399                                         const int32_t height, int16x8_t *d_is,
    400                                         int16x8_t *d_ie, int16x8_t *d_js,
    401                                         int16x8_t *d_je) {
    402  load_s16_8x4(di + 0, d_stride, &d_is[0], &d_is[2], &d_is[4], &d_is[6]);
    403  load_s16_8x4(di + 8, d_stride, &d_is[1], &d_is[3], &d_is[5], &d_is[7]);
    404  load_s16_8x4(dj + 0, d_stride, &d_js[0], &d_js[2], &d_js[4], &d_js[6]);
    405  load_s16_8x4(dj + 8, d_stride, &d_js[1], &d_js[3], &d_js[5], &d_js[7]);
    406 
    407  load_s16_8x4(di + height * d_stride + 0, d_stride, &d_ie[0], &d_ie[2],
    408               &d_ie[4], &d_ie[6]);
    409  load_s16_8x4(di + height * d_stride + 8, d_stride, &d_ie[1], &d_ie[3],
    410               &d_ie[5], &d_ie[7]);
    411  load_s16_8x4(dj + height * d_stride + 0, d_stride, &d_je[0], &d_je[2],
    412               &d_je[4], &d_je[6]);
    413  load_s16_8x4(dj + height * d_stride + 8, d_stride, &d_je[1], &d_je[3],
    414               &d_je[5], &d_je[7]);
    415 }
    416 
    417 static inline void update_5_stats_neon(const int64_t *const src,
    418                                       const int32x4_t delta,
    419                                       const int64_t delta4,
    420                                       int64_t *const dst) {
    421  update_4_stats_neon(src + 0, delta, dst + 0);
    422  dst[4] = src[4] + delta4;
    423 }
    424 
    425 static inline void compute_delta_step3_two_lines(int32x4_t *sum,
    426                                                 const int16x8_t src,
    427                                                 const int16x8_t dgd) {
    428  *sum = vmlsl_s16(*sum, vget_low_s16(src), vget_low_s16(dgd));
    429  *sum = vmlal_s16(*sum, vget_high_s16(src), vget_high_s16(dgd));
    430 }
    431 
    432 static inline void step3_win5_neon(const int16_t *d, const int32_t d_stride,
    433                                   const int32_t width, const int32_t height,
    434                                   int16x8_t *ds, int32x4_t *deltas) {
    435  int32_t y = height;
    436  do {
    437    ds[4] = load_unaligned_s16_4x2(d + 0 * d_stride, width);
    438    ds[5] = load_unaligned_s16_4x2(d + 1 * d_stride, width);
    439 
    440    compute_delta_step3_two_lines(&deltas[0], ds[0], ds[0]);
    441    compute_delta_step3_two_lines(&deltas[1], ds[0], ds[1]);
    442    compute_delta_step3_two_lines(&deltas[2], ds[0], ds[2]);
    443    compute_delta_step3_two_lines(&deltas[3], ds[0], ds[3]);
    444    compute_delta_step3_two_lines(&deltas[4], ds[0], ds[4]);
    445    compute_delta_step3_two_lines(&deltas[0], ds[1], ds[1]);
    446    compute_delta_step3_two_lines(&deltas[1], ds[1], ds[2]);
    447    compute_delta_step3_two_lines(&deltas[2], ds[1], ds[3]);
    448    compute_delta_step3_two_lines(&deltas[3], ds[1], ds[4]);
    449    compute_delta_step3_two_lines(&deltas[4], ds[1], ds[5]);
    450 
    451    ds[0] = ds[2];
    452    ds[1] = ds[3];
    453    ds[2] = ds[4];
    454    ds[3] = ds[5];
    455 
    456    d += 2 * d_stride;
    457    y -= 2;
    458  } while (y);
    459 }
    460 
    461 static inline void step3_win5_oneline_neon(const int16_t **const d,
    462                                           const int32_t d_stride,
    463                                           const int32_t width,
    464                                           const int32_t height, int16x8_t *ds,
    465                                           int32x4_t *deltas) {
    466  int32_t y = height;
    467  do {
    468    ds[8] = vld1q_s16(*d);
    469    ds[9] = vld1q_s16(*d + width);
    470 
    471    compute_delta_step3(&deltas[0], &deltas[4], ds[0], ds[1], ds[0], ds[1]);
    472    compute_delta_step3(&deltas[1], &deltas[5], ds[0], ds[1], ds[2], ds[3]);
    473    compute_delta_step3(&deltas[2], &deltas[6], ds[0], ds[1], ds[4], ds[5]);
    474    compute_delta_step3(&deltas[3], &deltas[7], ds[0], ds[1], ds[6], ds[7]);
    475    compute_delta_step3(&deltas[8], &deltas[12], ds[0], ds[1], ds[8], ds[9]);
    476 
    477    ds[0] = ds[2];
    478    ds[1] = ds[3];
    479    ds[2] = ds[4];
    480    ds[3] = ds[5];
    481    ds[4] = ds[6];
    482    ds[5] = ds[7];
    483    ds[6] = ds[8];
    484    ds[7] = ds[9];
    485 
    486    *d += d_stride;
    487  } while (--y);
    488 }
    489 
    490 static inline void derive_triangle_win5_neon(const int16x8_t *d_is,
    491                                             const int16x8_t *d_ie,
    492                                             int32x4_t *deltas) {
    493  msub_neon(&deltas[0], d_is[0], d_is[0]);
    494  msub_neon(&deltas[0], d_is[1], d_is[1]);
    495  msub_neon(&deltas[1], d_is[0], d_is[2]);
    496  msub_neon(&deltas[1], d_is[1], d_is[3]);
    497  msub_neon(&deltas[2], d_is[0], d_is[4]);
    498  msub_neon(&deltas[2], d_is[1], d_is[5]);
    499  msub_neon(&deltas[3], d_is[0], d_is[6]);
    500  msub_neon(&deltas[3], d_is[1], d_is[7]);
    501  msub_neon(&deltas[4], d_is[2], d_is[2]);
    502  msub_neon(&deltas[4], d_is[3], d_is[3]);
    503  msub_neon(&deltas[5], d_is[2], d_is[4]);
    504  msub_neon(&deltas[5], d_is[3], d_is[5]);
    505  msub_neon(&deltas[6], d_is[2], d_is[6]);
    506  msub_neon(&deltas[6], d_is[3], d_is[7]);
    507  msub_neon(&deltas[7], d_is[4], d_is[4]);
    508  msub_neon(&deltas[7], d_is[5], d_is[5]);
    509  msub_neon(&deltas[8], d_is[4], d_is[6]);
    510  msub_neon(&deltas[8], d_is[5], d_is[7]);
    511  msub_neon(&deltas[9], d_is[6], d_is[6]);
    512  msub_neon(&deltas[9], d_is[7], d_is[7]);
    513 
    514  madd_neon(&deltas[0], d_ie[0], d_ie[0]);
    515  madd_neon(&deltas[0], d_ie[1], d_ie[1]);
    516  madd_neon(&deltas[1], d_ie[0], d_ie[2]);
    517  madd_neon(&deltas[1], d_ie[1], d_ie[3]);
    518  madd_neon(&deltas[2], d_ie[0], d_ie[4]);
    519  madd_neon(&deltas[2], d_ie[1], d_ie[5]);
    520  madd_neon(&deltas[3], d_ie[0], d_ie[6]);
    521  madd_neon(&deltas[3], d_ie[1], d_ie[7]);
    522  madd_neon(&deltas[4], d_ie[2], d_ie[2]);
    523  madd_neon(&deltas[4], d_ie[3], d_ie[3]);
    524  madd_neon(&deltas[5], d_ie[2], d_ie[4]);
    525  madd_neon(&deltas[5], d_ie[3], d_ie[5]);
    526  madd_neon(&deltas[6], d_ie[2], d_ie[6]);
    527  madd_neon(&deltas[6], d_ie[3], d_ie[7]);
    528  madd_neon(&deltas[7], d_ie[4], d_ie[4]);
    529  madd_neon(&deltas[7], d_ie[5], d_ie[5]);
    530  madd_neon(&deltas[8], d_ie[4], d_ie[6]);
    531  madd_neon(&deltas[8], d_ie[5], d_ie[7]);
    532  madd_neon(&deltas[9], d_ie[6], d_ie[6]);
    533  madd_neon(&deltas[9], d_ie[7], d_ie[7]);
    534 }
    535 
    536 static inline void load_triangle_win5_neon(const int16_t *const di,
    537                                           const int32_t d_stride,
    538                                           const int32_t height,
    539                                           int16x8_t *d_is, int16x8_t *d_ie) {
    540  load_s16_8x4(di + 0, d_stride, &d_is[0], &d_is[2], &d_is[4], &d_is[6]);
    541  load_s16_8x4(di + 8, d_stride, &d_is[1], &d_is[3], &d_is[5], &d_is[7]);
    542 
    543  load_s16_8x4(di + height * d_stride + 0, d_stride, &d_ie[0], &d_ie[2],
    544               &d_ie[4], &d_ie[6]);
    545  load_s16_8x4(di + height * d_stride + 8, d_stride, &d_ie[1], &d_ie[3],
    546               &d_ie[5], &d_ie[7]);
    547 }
    548 
    549 static inline void sub_deltas_step4(int16x8_t *A, int16x8_t *B,
    550                                    int32x4_t *deltas) {
    551  deltas[0] = vmlsl_s16(deltas[0], vget_low_s16(A[0]), vget_low_s16(B[0]));
    552  deltas[0] = vmlsl_s16(deltas[0], vget_high_s16(A[0]), vget_high_s16(B[0]));
    553  deltas[1] = vmlsl_s16(deltas[1], vget_low_s16(A[0]), vget_low_s16(B[1]));
    554  deltas[1] = vmlsl_s16(deltas[1], vget_high_s16(A[0]), vget_high_s16(B[1]));
    555  deltas[2] = vmlsl_s16(deltas[2], vget_low_s16(A[0]), vget_low_s16(B[2]));
    556  deltas[2] = vmlsl_s16(deltas[2], vget_high_s16(A[0]), vget_high_s16(B[2]));
    557  deltas[3] = vmlsl_s16(deltas[3], vget_low_s16(A[0]), vget_low_s16(B[3]));
    558  deltas[3] = vmlsl_s16(deltas[3], vget_high_s16(A[0]), vget_high_s16(B[3]));
    559  deltas[4] = vmlsl_s16(deltas[4], vget_low_s16(A[0]), vget_low_s16(B[4]));
    560  deltas[4] = vmlsl_s16(deltas[4], vget_high_s16(A[0]), vget_high_s16(B[4]));
    561  deltas[5] = vmlsl_s16(deltas[5], vget_low_s16(A[1]), vget_low_s16(B[0]));
    562  deltas[5] = vmlsl_s16(deltas[5], vget_high_s16(A[1]), vget_high_s16(B[0]));
    563  deltas[6] = vmlsl_s16(deltas[6], vget_low_s16(A[2]), vget_low_s16(B[0]));
    564  deltas[6] = vmlsl_s16(deltas[6], vget_high_s16(A[2]), vget_high_s16(B[0]));
    565  deltas[7] = vmlsl_s16(deltas[7], vget_low_s16(A[3]), vget_low_s16(B[0]));
    566  deltas[7] = vmlsl_s16(deltas[7], vget_high_s16(A[3]), vget_high_s16(B[0]));
    567  deltas[8] = vmlsl_s16(deltas[8], vget_low_s16(A[4]), vget_low_s16(B[0]));
    568  deltas[8] = vmlsl_s16(deltas[8], vget_high_s16(A[4]), vget_high_s16(B[0]));
    569 }
    570 
    571 static inline void add_deltas_step4(int16x8_t *A, int16x8_t *B,
    572                                    int32x4_t *deltas) {
    573  deltas[0] = vmlal_s16(deltas[0], vget_low_s16(A[0]), vget_low_s16(B[0]));
    574  deltas[0] = vmlal_s16(deltas[0], vget_high_s16(A[0]), vget_high_s16(B[0]));
    575  deltas[1] = vmlal_s16(deltas[1], vget_low_s16(A[0]), vget_low_s16(B[1]));
    576  deltas[1] = vmlal_s16(deltas[1], vget_high_s16(A[0]), vget_high_s16(B[1]));
    577  deltas[2] = vmlal_s16(deltas[2], vget_low_s16(A[0]), vget_low_s16(B[2]));
    578  deltas[2] = vmlal_s16(deltas[2], vget_high_s16(A[0]), vget_high_s16(B[2]));
    579  deltas[3] = vmlal_s16(deltas[3], vget_low_s16(A[0]), vget_low_s16(B[3]));
    580  deltas[3] = vmlal_s16(deltas[3], vget_high_s16(A[0]), vget_high_s16(B[3]));
    581  deltas[4] = vmlal_s16(deltas[4], vget_low_s16(A[0]), vget_low_s16(B[4]));
    582  deltas[4] = vmlal_s16(deltas[4], vget_high_s16(A[0]), vget_high_s16(B[4]));
    583  deltas[5] = vmlal_s16(deltas[5], vget_low_s16(A[1]), vget_low_s16(B[0]));
    584  deltas[5] = vmlal_s16(deltas[5], vget_high_s16(A[1]), vget_high_s16(B[0]));
    585  deltas[6] = vmlal_s16(deltas[6], vget_low_s16(A[2]), vget_low_s16(B[0]));
    586  deltas[6] = vmlal_s16(deltas[6], vget_high_s16(A[2]), vget_high_s16(B[0]));
    587  deltas[7] = vmlal_s16(deltas[7], vget_low_s16(A[3]), vget_low_s16(B[0]));
    588  deltas[7] = vmlal_s16(deltas[7], vget_high_s16(A[3]), vget_high_s16(B[0]));
    589  deltas[8] = vmlal_s16(deltas[8], vget_low_s16(A[4]), vget_low_s16(B[0]));
    590  deltas[8] = vmlal_s16(deltas[8], vget_high_s16(A[4]), vget_high_s16(B[0]));
    591 }
    592 
    593 static inline void stats_top_win7_neon(const int16x8_t src[2],
    594                                       const int16x8_t dgd[2],
    595                                       const int16_t *const d,
    596                                       const int32_t d_stride, int32x4_t *sum_m,
    597                                       int32x4_t *sum_h) {
    598  int16x8_t dgds[WIENER_WIN * 2];
    599 
    600  load_s16_8x7(d + 0, d_stride, &dgds[0], &dgds[2], &dgds[4], &dgds[6],
    601               &dgds[8], &dgds[10], &dgds[12]);
    602  load_s16_8x7(d + 8, d_stride, &dgds[1], &dgds[3], &dgds[5], &dgds[7],
    603               &dgds[9], &dgds[11], &dgds[13]);
    604 
    605  madd_neon(&sum_m[0], src[0], dgds[0]);
    606  madd_neon(&sum_m[0], src[1], dgds[1]);
    607  madd_neon(&sum_m[1], src[0], dgds[2]);
    608  madd_neon(&sum_m[1], src[1], dgds[3]);
    609  madd_neon(&sum_m[2], src[0], dgds[4]);
    610  madd_neon(&sum_m[2], src[1], dgds[5]);
    611  madd_neon(&sum_m[3], src[0], dgds[6]);
    612  madd_neon(&sum_m[3], src[1], dgds[7]);
    613  madd_neon(&sum_m[4], src[0], dgds[8]);
    614  madd_neon(&sum_m[4], src[1], dgds[9]);
    615  madd_neon(&sum_m[5], src[0], dgds[10]);
    616  madd_neon(&sum_m[5], src[1], dgds[11]);
    617  madd_neon(&sum_m[6], src[0], dgds[12]);
    618  madd_neon(&sum_m[6], src[1], dgds[13]);
    619 
    620  madd_neon(&sum_h[0], dgd[0], dgds[0]);
    621  madd_neon(&sum_h[0], dgd[1], dgds[1]);
    622  madd_neon(&sum_h[1], dgd[0], dgds[2]);
    623  madd_neon(&sum_h[1], dgd[1], dgds[3]);
    624  madd_neon(&sum_h[2], dgd[0], dgds[4]);
    625  madd_neon(&sum_h[2], dgd[1], dgds[5]);
    626  madd_neon(&sum_h[3], dgd[0], dgds[6]);
    627  madd_neon(&sum_h[3], dgd[1], dgds[7]);
    628  madd_neon(&sum_h[4], dgd[0], dgds[8]);
    629  madd_neon(&sum_h[4], dgd[1], dgds[9]);
    630  madd_neon(&sum_h[5], dgd[0], dgds[10]);
    631  madd_neon(&sum_h[5], dgd[1], dgds[11]);
    632  madd_neon(&sum_h[6], dgd[0], dgds[12]);
    633  madd_neon(&sum_h[6], dgd[1], dgds[13]);
    634 }
    635 
    636 static inline void derive_square_win7_neon(const int16x8_t *d_is,
    637                                           const int16x8_t *d_ie,
    638                                           const int16x8_t *d_js,
    639                                           const int16x8_t *d_je,
    640                                           int32x4_t deltas[][WIN_7]) {
    641  msub_neon(&deltas[0][0], d_is[0], d_js[0]);
    642  msub_neon(&deltas[0][0], d_is[1], d_js[1]);
    643  msub_neon(&deltas[0][1], d_is[0], d_js[2]);
    644  msub_neon(&deltas[0][1], d_is[1], d_js[3]);
    645  msub_neon(&deltas[0][2], d_is[0], d_js[4]);
    646  msub_neon(&deltas[0][2], d_is[1], d_js[5]);
    647  msub_neon(&deltas[0][3], d_is[0], d_js[6]);
    648  msub_neon(&deltas[0][3], d_is[1], d_js[7]);
    649  msub_neon(&deltas[0][4], d_is[0], d_js[8]);
    650  msub_neon(&deltas[0][4], d_is[1], d_js[9]);
    651  msub_neon(&deltas[0][5], d_is[0], d_js[10]);
    652  msub_neon(&deltas[0][5], d_is[1], d_js[11]);
    653 
    654  msub_neon(&deltas[1][0], d_is[2], d_js[0]);
    655  msub_neon(&deltas[1][0], d_is[3], d_js[1]);
    656  msub_neon(&deltas[1][1], d_is[2], d_js[2]);
    657  msub_neon(&deltas[1][1], d_is[3], d_js[3]);
    658  msub_neon(&deltas[1][2], d_is[2], d_js[4]);
    659  msub_neon(&deltas[1][2], d_is[3], d_js[5]);
    660  msub_neon(&deltas[1][3], d_is[2], d_js[6]);
    661  msub_neon(&deltas[1][3], d_is[3], d_js[7]);
    662  msub_neon(&deltas[1][4], d_is[2], d_js[8]);
    663  msub_neon(&deltas[1][4], d_is[3], d_js[9]);
    664  msub_neon(&deltas[1][5], d_is[2], d_js[10]);
    665  msub_neon(&deltas[1][5], d_is[3], d_js[11]);
    666 
    667  msub_neon(&deltas[2][0], d_is[4], d_js[0]);
    668  msub_neon(&deltas[2][0], d_is[5], d_js[1]);
    669  msub_neon(&deltas[2][1], d_is[4], d_js[2]);
    670  msub_neon(&deltas[2][1], d_is[5], d_js[3]);
    671  msub_neon(&deltas[2][2], d_is[4], d_js[4]);
    672  msub_neon(&deltas[2][2], d_is[5], d_js[5]);
    673  msub_neon(&deltas[2][3], d_is[4], d_js[6]);
    674  msub_neon(&deltas[2][3], d_is[5], d_js[7]);
    675  msub_neon(&deltas[2][4], d_is[4], d_js[8]);
    676  msub_neon(&deltas[2][4], d_is[5], d_js[9]);
    677  msub_neon(&deltas[2][5], d_is[4], d_js[10]);
    678  msub_neon(&deltas[2][5], d_is[5], d_js[11]);
    679 
    680  msub_neon(&deltas[3][0], d_is[6], d_js[0]);
    681  msub_neon(&deltas[3][0], d_is[7], d_js[1]);
    682  msub_neon(&deltas[3][1], d_is[6], d_js[2]);
    683  msub_neon(&deltas[3][1], d_is[7], d_js[3]);
    684  msub_neon(&deltas[3][2], d_is[6], d_js[4]);
    685  msub_neon(&deltas[3][2], d_is[7], d_js[5]);
    686  msub_neon(&deltas[3][3], d_is[6], d_js[6]);
    687  msub_neon(&deltas[3][3], d_is[7], d_js[7]);
    688  msub_neon(&deltas[3][4], d_is[6], d_js[8]);
    689  msub_neon(&deltas[3][4], d_is[7], d_js[9]);
    690  msub_neon(&deltas[3][5], d_is[6], d_js[10]);
    691  msub_neon(&deltas[3][5], d_is[7], d_js[11]);
    692 
    693  msub_neon(&deltas[4][0], d_is[8], d_js[0]);
    694  msub_neon(&deltas[4][0], d_is[9], d_js[1]);
    695  msub_neon(&deltas[4][1], d_is[8], d_js[2]);
    696  msub_neon(&deltas[4][1], d_is[9], d_js[3]);
    697  msub_neon(&deltas[4][2], d_is[8], d_js[4]);
    698  msub_neon(&deltas[4][2], d_is[9], d_js[5]);
    699  msub_neon(&deltas[4][3], d_is[8], d_js[6]);
    700  msub_neon(&deltas[4][3], d_is[9], d_js[7]);
    701  msub_neon(&deltas[4][4], d_is[8], d_js[8]);
    702  msub_neon(&deltas[4][4], d_is[9], d_js[9]);
    703  msub_neon(&deltas[4][5], d_is[8], d_js[10]);
    704  msub_neon(&deltas[4][5], d_is[9], d_js[11]);
    705 
    706  msub_neon(&deltas[5][0], d_is[10], d_js[0]);
    707  msub_neon(&deltas[5][0], d_is[11], d_js[1]);
    708  msub_neon(&deltas[5][1], d_is[10], d_js[2]);
    709  msub_neon(&deltas[5][1], d_is[11], d_js[3]);
    710  msub_neon(&deltas[5][2], d_is[10], d_js[4]);
    711  msub_neon(&deltas[5][2], d_is[11], d_js[5]);
    712  msub_neon(&deltas[5][3], d_is[10], d_js[6]);
    713  msub_neon(&deltas[5][3], d_is[11], d_js[7]);
    714  msub_neon(&deltas[5][4], d_is[10], d_js[8]);
    715  msub_neon(&deltas[5][4], d_is[11], d_js[9]);
    716  msub_neon(&deltas[5][5], d_is[10], d_js[10]);
    717  msub_neon(&deltas[5][5], d_is[11], d_js[11]);
    718 
    719  madd_neon(&deltas[0][0], d_ie[0], d_je[0]);
    720  madd_neon(&deltas[0][0], d_ie[1], d_je[1]);
    721  madd_neon(&deltas[0][1], d_ie[0], d_je[2]);
    722  madd_neon(&deltas[0][1], d_ie[1], d_je[3]);
    723  madd_neon(&deltas[0][2], d_ie[0], d_je[4]);
    724  madd_neon(&deltas[0][2], d_ie[1], d_je[5]);
    725  madd_neon(&deltas[0][3], d_ie[0], d_je[6]);
    726  madd_neon(&deltas[0][3], d_ie[1], d_je[7]);
    727  madd_neon(&deltas[0][4], d_ie[0], d_je[8]);
    728  madd_neon(&deltas[0][4], d_ie[1], d_je[9]);
    729  madd_neon(&deltas[0][5], d_ie[0], d_je[10]);
    730  madd_neon(&deltas[0][5], d_ie[1], d_je[11]);
    731 
    732  madd_neon(&deltas[1][0], d_ie[2], d_je[0]);
    733  madd_neon(&deltas[1][0], d_ie[3], d_je[1]);
    734  madd_neon(&deltas[1][1], d_ie[2], d_je[2]);
    735  madd_neon(&deltas[1][1], d_ie[3], d_je[3]);
    736  madd_neon(&deltas[1][2], d_ie[2], d_je[4]);
    737  madd_neon(&deltas[1][2], d_ie[3], d_je[5]);
    738  madd_neon(&deltas[1][3], d_ie[2], d_je[6]);
    739  madd_neon(&deltas[1][3], d_ie[3], d_je[7]);
    740  madd_neon(&deltas[1][4], d_ie[2], d_je[8]);
    741  madd_neon(&deltas[1][4], d_ie[3], d_je[9]);
    742  madd_neon(&deltas[1][5], d_ie[2], d_je[10]);
    743  madd_neon(&deltas[1][5], d_ie[3], d_je[11]);
    744 
    745  madd_neon(&deltas[2][0], d_ie[4], d_je[0]);
    746  madd_neon(&deltas[2][0], d_ie[5], d_je[1]);
    747  madd_neon(&deltas[2][1], d_ie[4], d_je[2]);
    748  madd_neon(&deltas[2][1], d_ie[5], d_je[3]);
    749  madd_neon(&deltas[2][2], d_ie[4], d_je[4]);
    750  madd_neon(&deltas[2][2], d_ie[5], d_je[5]);
    751  madd_neon(&deltas[2][3], d_ie[4], d_je[6]);
    752  madd_neon(&deltas[2][3], d_ie[5], d_je[7]);
    753  madd_neon(&deltas[2][4], d_ie[4], d_je[8]);
    754  madd_neon(&deltas[2][4], d_ie[5], d_je[9]);
    755  madd_neon(&deltas[2][5], d_ie[4], d_je[10]);
    756  madd_neon(&deltas[2][5], d_ie[5], d_je[11]);
    757 
    758  madd_neon(&deltas[3][0], d_ie[6], d_je[0]);
    759  madd_neon(&deltas[3][0], d_ie[7], d_je[1]);
    760  madd_neon(&deltas[3][1], d_ie[6], d_je[2]);
    761  madd_neon(&deltas[3][1], d_ie[7], d_je[3]);
    762  madd_neon(&deltas[3][2], d_ie[6], d_je[4]);
    763  madd_neon(&deltas[3][2], d_ie[7], d_je[5]);
    764  madd_neon(&deltas[3][3], d_ie[6], d_je[6]);
    765  madd_neon(&deltas[3][3], d_ie[7], d_je[7]);
    766  madd_neon(&deltas[3][4], d_ie[6], d_je[8]);
    767  madd_neon(&deltas[3][4], d_ie[7], d_je[9]);
    768  madd_neon(&deltas[3][5], d_ie[6], d_je[10]);
    769  madd_neon(&deltas[3][5], d_ie[7], d_je[11]);
    770 
    771  madd_neon(&deltas[4][0], d_ie[8], d_je[0]);
    772  madd_neon(&deltas[4][0], d_ie[9], d_je[1]);
    773  madd_neon(&deltas[4][1], d_ie[8], d_je[2]);
    774  madd_neon(&deltas[4][1], d_ie[9], d_je[3]);
    775  madd_neon(&deltas[4][2], d_ie[8], d_je[4]);
    776  madd_neon(&deltas[4][2], d_ie[9], d_je[5]);
    777  madd_neon(&deltas[4][3], d_ie[8], d_je[6]);
    778  madd_neon(&deltas[4][3], d_ie[9], d_je[7]);
    779  madd_neon(&deltas[4][4], d_ie[8], d_je[8]);
    780  madd_neon(&deltas[4][4], d_ie[9], d_je[9]);
    781  madd_neon(&deltas[4][5], d_ie[8], d_je[10]);
    782  madd_neon(&deltas[4][5], d_ie[9], d_je[11]);
    783 
    784  madd_neon(&deltas[5][0], d_ie[10], d_je[0]);
    785  madd_neon(&deltas[5][0], d_ie[11], d_je[1]);
    786  madd_neon(&deltas[5][1], d_ie[10], d_je[2]);
    787  madd_neon(&deltas[5][1], d_ie[11], d_je[3]);
    788  madd_neon(&deltas[5][2], d_ie[10], d_je[4]);
    789  madd_neon(&deltas[5][2], d_ie[11], d_je[5]);
    790  madd_neon(&deltas[5][3], d_ie[10], d_je[6]);
    791  madd_neon(&deltas[5][3], d_ie[11], d_je[7]);
    792  madd_neon(&deltas[5][4], d_ie[10], d_je[8]);
    793  madd_neon(&deltas[5][4], d_ie[11], d_je[9]);
    794  madd_neon(&deltas[5][5], d_ie[10], d_je[10]);
    795  madd_neon(&deltas[5][5], d_ie[11], d_je[11]);
    796 }
    797 
    798 static inline void update_8_stats_neon(const int64_t *const src,
    799                                       const int32x4_t delta0,
    800                                       const int32x4_t delta1,
    801                                       int64_t *const dst) {
    802  update_4_stats_neon(src + 0, delta0, dst + 0);
    803  update_4_stats_neon(src + 4, delta1, dst + 4);
    804 }
    805 
    806 static inline void load_square_win7_neon(const int16_t *const di,
    807                                         const int16_t *const dj,
    808                                         const int32_t d_stride,
    809                                         const int32_t height, int16x8_t *d_is,
    810                                         int16x8_t *d_ie, int16x8_t *d_js,
    811                                         int16x8_t *d_je) {
    812  load_s16_8x6(di + 0, d_stride, &d_is[0], &d_is[2], &d_is[4], &d_is[6],
    813               &d_is[8], &d_is[10]);
    814  load_s16_8x6(di + 8, d_stride, &d_is[1], &d_is[3], &d_is[5], &d_is[7],
    815               &d_is[9], &d_is[11]);
    816  load_s16_8x6(dj + 0, d_stride, &d_js[0], &d_js[2], &d_js[4], &d_js[6],
    817               &d_js[8], &d_js[10]);
    818  load_s16_8x6(dj + 8, d_stride, &d_js[1], &d_js[3], &d_js[5], &d_js[7],
    819               &d_js[9], &d_js[11]);
    820 
    821  load_s16_8x6(di + height * d_stride + 0, d_stride, &d_ie[0], &d_ie[2],
    822               &d_ie[4], &d_ie[6], &d_ie[8], &d_ie[10]);
    823  load_s16_8x6(di + height * d_stride + 8, d_stride, &d_ie[1], &d_ie[3],
    824               &d_ie[5], &d_ie[7], &d_ie[9], &d_ie[11]);
    825  load_s16_8x6(dj + height * d_stride + 0, d_stride, &d_je[0], &d_je[2],
    826               &d_je[4], &d_je[6], &d_je[8], &d_je[10]);
    827  load_s16_8x6(dj + height * d_stride + 8, d_stride, &d_je[1], &d_je[3],
    828               &d_je[5], &d_je[7], &d_je[9], &d_je[11]);
    829 }
    830 
    831 static inline void load_triangle_win7_neon(const int16_t *const di,
    832                                           const int32_t d_stride,
    833                                           const int32_t height,
    834                                           int16x8_t *d_is, int16x8_t *d_ie) {
    835  load_s16_8x6(di, d_stride, &d_is[0], &d_is[2], &d_is[4], &d_is[6], &d_is[8],
    836               &d_is[10]);
    837  load_s16_8x6(di + 8, d_stride, &d_is[1], &d_is[3], &d_is[5], &d_is[7],
    838               &d_is[9], &d_is[11]);
    839 
    840  load_s16_8x6(di + height * d_stride, d_stride, &d_ie[0], &d_ie[2], &d_ie[4],
    841               &d_ie[6], &d_ie[8], &d_ie[10]);
    842  load_s16_8x6(di + height * d_stride + 8, d_stride, &d_ie[1], &d_ie[3],
    843               &d_ie[5], &d_ie[7], &d_ie[9], &d_ie[11]);
    844 }
    845 
    846 static inline void stats_left_win7_neon(const int16x8_t src[2],
    847                                        const int16_t *d,
    848                                        const int32_t d_stride,
    849                                        int32x4_t *sum) {
    850  int16x8_t dgds[WIN_7];
    851 
    852  load_s16_8x6(d + d_stride + 0, d_stride, &dgds[0], &dgds[2], &dgds[4],
    853               &dgds[6], &dgds[8], &dgds[10]);
    854  load_s16_8x6(d + d_stride + 8, d_stride, &dgds[1], &dgds[3], &dgds[5],
    855               &dgds[7], &dgds[9], &dgds[11]);
    856 
    857  madd_neon(&sum[0], src[0], dgds[0]);
    858  madd_neon(&sum[0], src[1], dgds[1]);
    859  madd_neon(&sum[1], src[0], dgds[2]);
    860  madd_neon(&sum[1], src[1], dgds[3]);
    861  madd_neon(&sum[2], src[0], dgds[4]);
    862  madd_neon(&sum[2], src[1], dgds[5]);
    863  madd_neon(&sum[3], src[0], dgds[6]);
    864  madd_neon(&sum[3], src[1], dgds[7]);
    865  madd_neon(&sum[4], src[0], dgds[8]);
    866  madd_neon(&sum[4], src[1], dgds[9]);
    867  madd_neon(&sum[5], src[0], dgds[10]);
    868  madd_neon(&sum[5], src[1], dgds[11]);
    869 }
    870 
    871 static inline void step3_win7_neon(const int16_t *d, const int32_t d_stride,
    872                                   const int32_t width, const int32_t height,
    873                                   int16x8_t *ds, int32x4_t *deltas) {
    874  int32_t y = height;
    875  do {
    876    ds[12] = vld1q_s16(d);
    877    ds[13] = vld1q_s16(d + width);
    878 
    879    compute_delta_step3(&deltas[0], &deltas[4], ds[0], ds[1], ds[0], ds[1]);
    880    compute_delta_step3(&deltas[1], &deltas[5], ds[0], ds[1], ds[2], ds[3]);
    881    compute_delta_step3(&deltas[2], &deltas[6], ds[0], ds[1], ds[4], ds[5]);
    882    compute_delta_step3(&deltas[3], &deltas[7], ds[0], ds[1], ds[6], ds[7]);
    883    compute_delta_step3(&deltas[8], &deltas[12], ds[0], ds[1], ds[8], ds[9]);
    884    compute_delta_step3(&deltas[9], &deltas[13], ds[0], ds[1], ds[10], ds[11]);
    885    compute_delta_step3(&deltas[10], &deltas[14], ds[0], ds[1], ds[12], ds[13]);
    886 
    887    ds[0] = ds[2];
    888    ds[1] = ds[3];
    889    ds[2] = ds[4];
    890    ds[3] = ds[5];
    891    ds[4] = ds[6];
    892    ds[5] = ds[7];
    893    ds[6] = ds[8];
    894    ds[7] = ds[9];
    895    ds[8] = ds[10];
    896    ds[9] = ds[11];
    897    ds[10] = ds[12];
    898    ds[11] = ds[13];
    899 
    900    d += d_stride;
    901  } while (--y);
    902 }
    903 
    904 static inline void derive_triangle_win7_neon(const int16x8_t *d_is,
    905                                             const int16x8_t *d_ie,
    906                                             int32x4_t *deltas) {
    907  msub_neon(&deltas[0], d_is[0], d_is[0]);
    908  msub_neon(&deltas[0], d_is[1], d_is[1]);
    909  msub_neon(&deltas[1], d_is[0], d_is[2]);
    910  msub_neon(&deltas[1], d_is[1], d_is[3]);
    911  msub_neon(&deltas[2], d_is[0], d_is[4]);
    912  msub_neon(&deltas[2], d_is[1], d_is[5]);
    913  msub_neon(&deltas[3], d_is[0], d_is[6]);
    914  msub_neon(&deltas[3], d_is[1], d_is[7]);
    915  msub_neon(&deltas[4], d_is[0], d_is[8]);
    916  msub_neon(&deltas[4], d_is[1], d_is[9]);
    917  msub_neon(&deltas[5], d_is[0], d_is[10]);
    918  msub_neon(&deltas[5], d_is[1], d_is[11]);
    919 
    920  msub_neon(&deltas[6], d_is[2], d_is[2]);
    921  msub_neon(&deltas[6], d_is[3], d_is[3]);
    922  msub_neon(&deltas[7], d_is[2], d_is[4]);
    923  msub_neon(&deltas[7], d_is[3], d_is[5]);
    924  msub_neon(&deltas[8], d_is[2], d_is[6]);
    925  msub_neon(&deltas[8], d_is[3], d_is[7]);
    926  msub_neon(&deltas[9], d_is[2], d_is[8]);
    927  msub_neon(&deltas[9], d_is[3], d_is[9]);
    928  msub_neon(&deltas[10], d_is[2], d_is[10]);
    929  msub_neon(&deltas[10], d_is[3], d_is[11]);
    930 
    931  msub_neon(&deltas[11], d_is[4], d_is[4]);
    932  msub_neon(&deltas[11], d_is[5], d_is[5]);
    933  msub_neon(&deltas[12], d_is[4], d_is[6]);
    934  msub_neon(&deltas[12], d_is[5], d_is[7]);
    935  msub_neon(&deltas[13], d_is[4], d_is[8]);
    936  msub_neon(&deltas[13], d_is[5], d_is[9]);
    937  msub_neon(&deltas[14], d_is[4], d_is[10]);
    938  msub_neon(&deltas[14], d_is[5], d_is[11]);
    939 
    940  msub_neon(&deltas[15], d_is[6], d_is[6]);
    941  msub_neon(&deltas[15], d_is[7], d_is[7]);
    942  msub_neon(&deltas[16], d_is[6], d_is[8]);
    943  msub_neon(&deltas[16], d_is[7], d_is[9]);
    944  msub_neon(&deltas[17], d_is[6], d_is[10]);
    945  msub_neon(&deltas[17], d_is[7], d_is[11]);
    946 
    947  msub_neon(&deltas[18], d_is[8], d_is[8]);
    948  msub_neon(&deltas[18], d_is[9], d_is[9]);
    949  msub_neon(&deltas[19], d_is[8], d_is[10]);
    950  msub_neon(&deltas[19], d_is[9], d_is[11]);
    951 
    952  msub_neon(&deltas[20], d_is[10], d_is[10]);
    953  msub_neon(&deltas[20], d_is[11], d_is[11]);
    954 
    955  madd_neon(&deltas[0], d_ie[0], d_ie[0]);
    956  madd_neon(&deltas[0], d_ie[1], d_ie[1]);
    957  madd_neon(&deltas[1], d_ie[0], d_ie[2]);
    958  madd_neon(&deltas[1], d_ie[1], d_ie[3]);
    959  madd_neon(&deltas[2], d_ie[0], d_ie[4]);
    960  madd_neon(&deltas[2], d_ie[1], d_ie[5]);
    961  madd_neon(&deltas[3], d_ie[0], d_ie[6]);
    962  madd_neon(&deltas[3], d_ie[1], d_ie[7]);
    963  madd_neon(&deltas[4], d_ie[0], d_ie[8]);
    964  madd_neon(&deltas[4], d_ie[1], d_ie[9]);
    965  madd_neon(&deltas[5], d_ie[0], d_ie[10]);
    966  madd_neon(&deltas[5], d_ie[1], d_ie[11]);
    967 
    968  madd_neon(&deltas[6], d_ie[2], d_ie[2]);
    969  madd_neon(&deltas[6], d_ie[3], d_ie[3]);
    970  madd_neon(&deltas[7], d_ie[2], d_ie[4]);
    971  madd_neon(&deltas[7], d_ie[3], d_ie[5]);
    972  madd_neon(&deltas[8], d_ie[2], d_ie[6]);
    973  madd_neon(&deltas[8], d_ie[3], d_ie[7]);
    974  madd_neon(&deltas[9], d_ie[2], d_ie[8]);
    975  madd_neon(&deltas[9], d_ie[3], d_ie[9]);
    976  madd_neon(&deltas[10], d_ie[2], d_ie[10]);
    977  madd_neon(&deltas[10], d_ie[3], d_ie[11]);
    978 
    979  madd_neon(&deltas[11], d_ie[4], d_ie[4]);
    980  madd_neon(&deltas[11], d_ie[5], d_ie[5]);
    981  madd_neon(&deltas[12], d_ie[4], d_ie[6]);
    982  madd_neon(&deltas[12], d_ie[5], d_ie[7]);
    983  madd_neon(&deltas[13], d_ie[4], d_ie[8]);
    984  madd_neon(&deltas[13], d_ie[5], d_ie[9]);
    985  madd_neon(&deltas[14], d_ie[4], d_ie[10]);
    986  madd_neon(&deltas[14], d_ie[5], d_ie[11]);
    987 
    988  madd_neon(&deltas[15], d_ie[6], d_ie[6]);
    989  madd_neon(&deltas[15], d_ie[7], d_ie[7]);
    990  madd_neon(&deltas[16], d_ie[6], d_ie[8]);
    991  madd_neon(&deltas[16], d_ie[7], d_ie[9]);
    992  madd_neon(&deltas[17], d_ie[6], d_ie[10]);
    993  madd_neon(&deltas[17], d_ie[7], d_ie[11]);
    994 
    995  madd_neon(&deltas[18], d_ie[8], d_ie[8]);
    996  madd_neon(&deltas[18], d_ie[9], d_ie[9]);
    997  madd_neon(&deltas[19], d_ie[8], d_ie[10]);
    998  madd_neon(&deltas[19], d_ie[9], d_ie[11]);
    999 
   1000  madd_neon(&deltas[20], d_ie[10], d_ie[10]);
   1001  madd_neon(&deltas[20], d_ie[11], d_ie[11]);
   1002 }
   1003 
   1004 static inline void diagonal_copy_stats_neon(const int32_t wiener_win2,
   1005                                            int64_t *const H) {
   1006  for (int32_t i = 0; i < wiener_win2 - 1; i += 4) {
   1007    int64x2_t in[8], out[8];
   1008 
   1009    in[0] = vld1q_s64(H + (i + 0) * wiener_win2 + i + 1);
   1010    in[1] = vld1q_s64(H + (i + 0) * wiener_win2 + i + 3);
   1011    in[2] = vld1q_s64(H + (i + 1) * wiener_win2 + i + 1);
   1012    in[3] = vld1q_s64(H + (i + 1) * wiener_win2 + i + 3);
   1013    in[4] = vld1q_s64(H + (i + 2) * wiener_win2 + i + 1);
   1014    in[5] = vld1q_s64(H + (i + 2) * wiener_win2 + i + 3);
   1015    in[6] = vld1q_s64(H + (i + 3) * wiener_win2 + i + 1);
   1016    in[7] = vld1q_s64(H + (i + 3) * wiener_win2 + i + 3);
   1017 
   1018    transpose_arrays_s64_4x4(in, out);
   1019 
   1020    vst1_s64(H + (i + 1) * wiener_win2 + i, vget_low_s64(out[0]));
   1021    vst1q_s64(H + (i + 2) * wiener_win2 + i, out[2]);
   1022    vst1q_s64(H + (i + 3) * wiener_win2 + i, out[4]);
   1023    vst1q_s64(H + (i + 3) * wiener_win2 + i + 2, out[5]);
   1024    vst1q_s64(H + (i + 4) * wiener_win2 + i, out[6]);
   1025    vst1q_s64(H + (i + 4) * wiener_win2 + i + 2, out[7]);
   1026 
   1027    for (int32_t j = i + 5; j < wiener_win2; j += 4) {
   1028      in[0] = vld1q_s64(H + (i + 0) * wiener_win2 + j);
   1029      in[1] = vld1q_s64(H + (i + 0) * wiener_win2 + j + 2);
   1030      in[2] = vld1q_s64(H + (i + 1) * wiener_win2 + j);
   1031      in[3] = vld1q_s64(H + (i + 1) * wiener_win2 + j + 2);
   1032      in[4] = vld1q_s64(H + (i + 2) * wiener_win2 + j);
   1033      in[5] = vld1q_s64(H + (i + 2) * wiener_win2 + j + 2);
   1034      in[6] = vld1q_s64(H + (i + 3) * wiener_win2 + j);
   1035      in[7] = vld1q_s64(H + (i + 3) * wiener_win2 + j + 2);
   1036 
   1037      transpose_arrays_s64_4x4(in, out);
   1038 
   1039      vst1q_s64(H + (j + 0) * wiener_win2 + i, out[0]);
   1040      vst1q_s64(H + (j + 0) * wiener_win2 + i + 2, out[1]);
   1041      vst1q_s64(H + (j + 1) * wiener_win2 + i, out[2]);
   1042      vst1q_s64(H + (j + 1) * wiener_win2 + i + 2, out[3]);
   1043      vst1q_s64(H + (j + 2) * wiener_win2 + i, out[4]);
   1044      vst1q_s64(H + (j + 2) * wiener_win2 + i + 2, out[5]);
   1045      vst1q_s64(H + (j + 3) * wiener_win2 + i, out[6]);
   1046      vst1q_s64(H + (j + 3) * wiener_win2 + i + 2, out[7]);
   1047    }
   1048  }
   1049 }
   1050 
   1051 static inline int64x2_t div4_neon(const int64x2_t src) {
   1052 #if AOM_ARCH_AARCH64
   1053  uint64x2_t sign = vcltzq_s64(src);
   1054  int64x2_t abs = vabsq_s64(src);
   1055  // divide by 4
   1056  abs = vshrq_n_s64(abs, 2);
   1057  // re-apply sign
   1058  return vbslq_s64(sign, vnegq_s64(abs), abs);
   1059 #else
   1060  int64x2_t sign = vshrq_n_s64(src, 63);
   1061  int64x2_t abs = vsubq_s64(veorq_s64(src, sign), sign);
   1062  // divide by 4
   1063  abs = vshrq_n_s64(abs, 2);
   1064  // re-apply sign
   1065  return vsubq_s64(veorq_s64(abs, sign), sign);
   1066 #endif  // AOM_ARCH_AARCH64
   1067 }
   1068 
   1069 static inline void div4_4x4_neon(const int32_t wiener_win2, int64_t *const H,
   1070                                 int64x2_t out[8]) {
   1071  out[0] = vld1q_s64(H + 0 * wiener_win2 + 0);
   1072  out[1] = vld1q_s64(H + 0 * wiener_win2 + 2);
   1073  out[2] = vld1q_s64(H + 1 * wiener_win2 + 0);
   1074  out[3] = vld1q_s64(H + 1 * wiener_win2 + 2);
   1075  out[4] = vld1q_s64(H + 2 * wiener_win2 + 0);
   1076  out[5] = vld1q_s64(H + 2 * wiener_win2 + 2);
   1077  out[6] = vld1q_s64(H + 3 * wiener_win2 + 0);
   1078  out[7] = vld1q_s64(H + 3 * wiener_win2 + 2);
   1079 
   1080  out[0] = div4_neon(out[0]);
   1081  out[1] = div4_neon(out[1]);
   1082  out[2] = div4_neon(out[2]);
   1083  out[3] = div4_neon(out[3]);
   1084  out[4] = div4_neon(out[4]);
   1085  out[5] = div4_neon(out[5]);
   1086  out[6] = div4_neon(out[6]);
   1087  out[7] = div4_neon(out[7]);
   1088 
   1089  vst1q_s64(H + 0 * wiener_win2 + 0, out[0]);
   1090  vst1q_s64(H + 0 * wiener_win2 + 2, out[1]);
   1091  vst1q_s64(H + 1 * wiener_win2 + 0, out[2]);
   1092  vst1q_s64(H + 1 * wiener_win2 + 2, out[3]);
   1093  vst1q_s64(H + 2 * wiener_win2 + 0, out[4]);
   1094  vst1q_s64(H + 2 * wiener_win2 + 2, out[5]);
   1095  vst1q_s64(H + 3 * wiener_win2 + 0, out[6]);
   1096  vst1q_s64(H + 3 * wiener_win2 + 2, out[7]);
   1097 }
   1098 
   1099 static inline int64x2_t div16_neon(const int64x2_t src) {
   1100 #if AOM_ARCH_AARCH64
   1101  uint64x2_t sign = vcltzq_s64(src);
   1102  int64x2_t abs = vabsq_s64(src);
   1103  // divide by 16
   1104  abs = vshrq_n_s64(abs, 4);
   1105  // re-apply sign
   1106  return vbslq_s64(sign, vnegq_s64(abs), abs);
   1107 #else
   1108  int64x2_t sign = vshrq_n_s64(src, 63);
   1109  int64x2_t abs = vsubq_s64(veorq_s64(src, sign), sign);
   1110  // divide by 16
   1111  abs = vshrq_n_s64(abs, 4);
   1112  // re-apply sign
   1113  return vsubq_s64(veorq_s64(abs, sign), sign);
   1114 #endif  // AOM_ARCH_AARCH64
   1115 }
   1116 
   1117 static inline void div16_4x4_neon(const int32_t wiener_win2, int64_t *const H,
   1118                                  int64x2_t out[8]) {
   1119  out[0] = vld1q_s64(H + 0 * wiener_win2 + 0);
   1120  out[1] = vld1q_s64(H + 0 * wiener_win2 + 2);
   1121  out[2] = vld1q_s64(H + 1 * wiener_win2 + 0);
   1122  out[3] = vld1q_s64(H + 1 * wiener_win2 + 2);
   1123  out[4] = vld1q_s64(H + 2 * wiener_win2 + 0);
   1124  out[5] = vld1q_s64(H + 2 * wiener_win2 + 2);
   1125  out[6] = vld1q_s64(H + 3 * wiener_win2 + 0);
   1126  out[7] = vld1q_s64(H + 3 * wiener_win2 + 2);
   1127 
   1128  out[0] = div16_neon(out[0]);
   1129  out[1] = div16_neon(out[1]);
   1130  out[2] = div16_neon(out[2]);
   1131  out[3] = div16_neon(out[3]);
   1132  out[4] = div16_neon(out[4]);
   1133  out[5] = div16_neon(out[5]);
   1134  out[6] = div16_neon(out[6]);
   1135  out[7] = div16_neon(out[7]);
   1136 
   1137  vst1q_s64(H + 0 * wiener_win2 + 0, out[0]);
   1138  vst1q_s64(H + 0 * wiener_win2 + 2, out[1]);
   1139  vst1q_s64(H + 1 * wiener_win2 + 0, out[2]);
   1140  vst1q_s64(H + 1 * wiener_win2 + 2, out[3]);
   1141  vst1q_s64(H + 2 * wiener_win2 + 0, out[4]);
   1142  vst1q_s64(H + 2 * wiener_win2 + 2, out[5]);
   1143  vst1q_s64(H + 3 * wiener_win2 + 0, out[6]);
   1144  vst1q_s64(H + 3 * wiener_win2 + 2, out[7]);
   1145 }
   1146 
   1147 static inline void div4_diagonal_copy_stats_neon(const int32_t wiener_win2,
   1148                                                 int64_t *const H) {
   1149  for (int32_t i = 0; i < wiener_win2 - 1; i += 4) {
   1150    int64x2_t in[8], out[8];
   1151 
   1152    div4_4x4_neon(wiener_win2, H + i * wiener_win2 + i + 1, in);
   1153    transpose_arrays_s64_4x4(in, out);
   1154 
   1155    vst1_s64(H + (i + 1) * wiener_win2 + i + 0, vget_low_s64(out[0]));
   1156    vst1q_s64(H + (i + 2) * wiener_win2 + i + 0, out[2]);
   1157    vst1q_s64(H + (i + 3) * wiener_win2 + i + 0, out[4]);
   1158    vst1q_s64(H + (i + 3) * wiener_win2 + i + 2, out[5]);
   1159    vst1q_s64(H + (i + 4) * wiener_win2 + i + 0, out[6]);
   1160    vst1q_s64(H + (i + 4) * wiener_win2 + i + 2, out[7]);
   1161 
   1162    for (int32_t j = i + 5; j < wiener_win2; j += 4) {
   1163      div4_4x4_neon(wiener_win2, H + i * wiener_win2 + j, in);
   1164      transpose_arrays_s64_4x4(in, out);
   1165 
   1166      vst1q_s64(H + (j + 0) * wiener_win2 + i + 0, out[0]);
   1167      vst1q_s64(H + (j + 0) * wiener_win2 + i + 2, out[1]);
   1168      vst1q_s64(H + (j + 1) * wiener_win2 + i + 0, out[2]);
   1169      vst1q_s64(H + (j + 1) * wiener_win2 + i + 2, out[3]);
   1170      vst1q_s64(H + (j + 2) * wiener_win2 + i + 0, out[4]);
   1171      vst1q_s64(H + (j + 2) * wiener_win2 + i + 2, out[5]);
   1172      vst1q_s64(H + (j + 3) * wiener_win2 + i + 0, out[6]);
   1173      vst1q_s64(H + (j + 3) * wiener_win2 + i + 2, out[7]);
   1174    }
   1175  }
   1176 }
   1177 
   1178 static inline void div16_diagonal_copy_stats_neon(const int32_t wiener_win2,
   1179                                                  int64_t *const H) {
   1180  for (int32_t i = 0; i < wiener_win2 - 1; i += 4) {
   1181    int64x2_t in[8], out[8];
   1182 
   1183    div16_4x4_neon(wiener_win2, H + i * wiener_win2 + i + 1, in);
   1184    transpose_arrays_s64_4x4(in, out);
   1185 
   1186    vst1_s64(H + (i + 1) * wiener_win2 + i + 0, vget_low_s64(out[0]));
   1187    vst1q_s64(H + (i + 2) * wiener_win2 + i + 0, out[2]);
   1188    vst1q_s64(H + (i + 3) * wiener_win2 + i + 0, out[4]);
   1189    vst1q_s64(H + (i + 3) * wiener_win2 + i + 2, out[5]);
   1190    vst1q_s64(H + (i + 4) * wiener_win2 + i + 0, out[6]);
   1191    vst1q_s64(H + (i + 4) * wiener_win2 + i + 2, out[7]);
   1192 
   1193    for (int32_t j = i + 5; j < wiener_win2; j += 4) {
   1194      div16_4x4_neon(wiener_win2, H + i * wiener_win2 + j, in);
   1195      transpose_arrays_s64_4x4(in, out);
   1196 
   1197      vst1q_s64(H + (j + 0) * wiener_win2 + i + 0, out[0]);
   1198      vst1q_s64(H + (j + 0) * wiener_win2 + i + 2, out[1]);
   1199      vst1q_s64(H + (j + 1) * wiener_win2 + i + 0, out[2]);
   1200      vst1q_s64(H + (j + 1) * wiener_win2 + i + 2, out[3]);
   1201      vst1q_s64(H + (j + 2) * wiener_win2 + i + 0, out[4]);
   1202      vst1q_s64(H + (j + 2) * wiener_win2 + i + 2, out[5]);
   1203      vst1q_s64(H + (j + 3) * wiener_win2 + i + 0, out[6]);
   1204      vst1q_s64(H + (j + 3) * wiener_win2 + i + 2, out[7]);
   1205    }
   1206  }
   1207 }
   1208 
   1209 #endif  // AOM_AV1_ENCODER_ARM_PICKRST_NEON_H_