pickrst_sve.h (90704B)
1 /* 2 * Copyright (c) 2024, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #ifndef AOM_AV1_ENCODER_ARM_PICKRST_SVE_H_ 13 #define AOM_AV1_ENCODER_ARM_PICKRST_SVE_H_ 14 15 #include <arm_neon.h> 16 #include <arm_sve.h> 17 18 #include "aom_dsp/arm/aom_neon_sve_bridge.h" 19 #include "av1/encoder/arm/pickrst_neon.h" 20 21 // Swap each half of the dgd vectors so that we can accumulate the result of 22 // the dot-products directly in the destination matrix. 23 static inline int16x8x2_t transpose_dgd(int16x8_t dgd0, int16x8_t dgd1) { 24 int16x8_t dgd_trn0 = vreinterpretq_s16_s64( 25 vzip1q_s64(vreinterpretq_s64_s16(dgd0), vreinterpretq_s64_s16(dgd1))); 26 int16x8_t dgd_trn1 = vreinterpretq_s16_s64( 27 vzip2q_s64(vreinterpretq_s64_s16(dgd0), vreinterpretq_s64_s16(dgd1))); 28 29 return (struct int16x8x2_t){ dgd_trn0, dgd_trn1 }; 30 } 31 32 static inline void compute_M_one_row_win5(int16x8_t src, int16x8_t dgd[5], 33 int64_t *M, int row) { 34 const int wiener_win = 5; 35 36 int64x2_t m01 = vld1q_s64(M + row * wiener_win + 0); 37 int16x8x2_t dgd01 = transpose_dgd(dgd[0], dgd[1]); 38 39 int64x2_t cross_corr01 = aom_svdot_lane_s16(m01, dgd01.val[0], src, 0); 40 cross_corr01 = aom_svdot_lane_s16(cross_corr01, dgd01.val[1], src, 1); 41 vst1q_s64(M + row * wiener_win + 0, cross_corr01); 42 43 int64x2_t m23 = vld1q_s64(M + row * wiener_win + 2); 44 int16x8x2_t dgd23 = transpose_dgd(dgd[2], dgd[3]); 45 46 int64x2_t cross_corr23 = aom_svdot_lane_s16(m23, dgd23.val[0], src, 0); 47 cross_corr23 = 
aom_svdot_lane_s16(cross_corr23, dgd23.val[1], src, 1); 48 vst1q_s64(M + row * wiener_win + 2, cross_corr23); 49 50 int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), src, dgd[4]); 51 M[row * wiener_win + 4] += vaddvq_s64(m4); 52 } 53 54 static inline void compute_M_one_row_win7(int16x8_t src, int16x8_t dgd[7], 55 int64_t *M, int row) { 56 const int wiener_win = 7; 57 58 int64x2_t m01 = vld1q_s64(M + row * wiener_win + 0); 59 int16x8x2_t dgd01 = transpose_dgd(dgd[0], dgd[1]); 60 61 int64x2_t cross_corr01 = aom_svdot_lane_s16(m01, dgd01.val[0], src, 0); 62 cross_corr01 = aom_svdot_lane_s16(cross_corr01, dgd01.val[1], src, 1); 63 vst1q_s64(M + row * wiener_win + 0, cross_corr01); 64 65 int64x2_t m23 = vld1q_s64(M + row * wiener_win + 2); 66 int16x8x2_t dgd23 = transpose_dgd(dgd[2], dgd[3]); 67 68 int64x2_t cross_corr23 = aom_svdot_lane_s16(m23, dgd23.val[0], src, 0); 69 cross_corr23 = aom_svdot_lane_s16(cross_corr23, dgd23.val[1], src, 1); 70 vst1q_s64(M + row * wiener_win + 2, cross_corr23); 71 72 int64x2_t m45 = vld1q_s64(M + row * wiener_win + 4); 73 int16x8x2_t dgd45 = transpose_dgd(dgd[4], dgd[5]); 74 75 int64x2_t cross_corr45 = aom_svdot_lane_s16(m45, dgd45.val[0], src, 0); 76 cross_corr45 = aom_svdot_lane_s16(cross_corr45, dgd45.val[1], src, 1); 77 vst1q_s64(M + row * wiener_win + 4, cross_corr45); 78 79 int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), src, dgd[6]); 80 M[row * wiener_win + 6] += vaddvq_s64(m6); 81 } 82 83 static inline void compute_H_one_col(int16x8_t *dgd, int col, int64_t *H, 84 const int wiener_win, 85 const int wiener_win2) { 86 for (int row0 = 0; row0 < wiener_win; row0++) { 87 for (int row1 = row0; row1 < wiener_win; row1++) { 88 int auto_cov_idx = 89 (col * wiener_win + row0) * wiener_win2 + (col * wiener_win) + row1; 90 91 int64x2_t auto_cov = aom_sdotq_s16(vdupq_n_s64(0), dgd[row0], dgd[row1]); 92 H[auto_cov_idx] += vaddvq_s64(auto_cov); 93 } 94 } 95 } 96 97 static inline void compute_H_two_rows_win5(int16x8_t *dgd0, int16x8_t *dgd1, 98 int 
row0, int row1, int64_t *H) { 99 for (int col0 = 0; col0 < 5; col0++) { 100 int auto_cov_idx = (row0 * 5 + col0) * 25 + (row1 * 5); 101 102 int64x2_t h01 = vld1q_s64(H + auto_cov_idx); 103 int16x8x2_t dgd01 = transpose_dgd(dgd1[0], dgd1[1]); 104 105 int64x2_t auto_cov01 = aom_svdot_lane_s16(h01, dgd01.val[0], dgd0[col0], 0); 106 auto_cov01 = aom_svdot_lane_s16(auto_cov01, dgd01.val[1], dgd0[col0], 1); 107 vst1q_s64(H + auto_cov_idx, auto_cov01); 108 109 int64x2_t h23 = vld1q_s64(H + auto_cov_idx + 2); 110 int16x8x2_t dgd23 = transpose_dgd(dgd1[2], dgd1[3]); 111 112 int64x2_t auto_cov23 = aom_svdot_lane_s16(h23, dgd23.val[0], dgd0[col0], 0); 113 auto_cov23 = aom_svdot_lane_s16(auto_cov23, dgd23.val[1], dgd0[col0], 1); 114 vst1q_s64(H + auto_cov_idx + 2, auto_cov23); 115 116 int64x2_t auto_cov4 = aom_sdotq_s16(vdupq_n_s64(0), dgd0[col0], dgd1[4]); 117 H[auto_cov_idx + 4] += vaddvq_s64(auto_cov4); 118 } 119 } 120 121 static inline void compute_H_two_rows_win7(int16x8_t *dgd0, int16x8_t *dgd1, 122 int row0, int row1, int64_t *H) { 123 for (int col0 = 0; col0 < 7; col0++) { 124 int auto_cov_idx = (row0 * 7 + col0) * 49 + (row1 * 7); 125 126 int64x2_t h01 = vld1q_s64(H + auto_cov_idx); 127 int16x8x2_t dgd01 = transpose_dgd(dgd1[0], dgd1[1]); 128 129 int64x2_t auto_cov01 = aom_svdot_lane_s16(h01, dgd01.val[0], dgd0[col0], 0); 130 auto_cov01 = aom_svdot_lane_s16(auto_cov01, dgd01.val[1], dgd0[col0], 1); 131 vst1q_s64(H + auto_cov_idx, auto_cov01); 132 133 int64x2_t h23 = vld1q_s64(H + auto_cov_idx + 2); 134 int16x8x2_t dgd23 = transpose_dgd(dgd1[2], dgd1[3]); 135 136 int64x2_t auto_cov23 = aom_svdot_lane_s16(h23, dgd23.val[0], dgd0[col0], 0); 137 auto_cov23 = aom_svdot_lane_s16(auto_cov23, dgd23.val[1], dgd0[col0], 1); 138 vst1q_s64(H + auto_cov_idx + 2, auto_cov23); 139 140 int64x2_t h45 = vld1q_s64(H + auto_cov_idx + 4); 141 int16x8x2_t dgd45 = transpose_dgd(dgd1[4], dgd1[5]); 142 143 int64x2_t auto_cov45 = aom_svdot_lane_s16(h45, dgd45.val[0], dgd0[col0], 0); 144 
auto_cov45 = aom_svdot_lane_s16(auto_cov45, dgd45.val[1], dgd0[col0], 1); 145 vst1q_s64(H + auto_cov_idx + 4, auto_cov45); 146 147 int64x2_t auto_cov6 = aom_sdotq_s16(vdupq_n_s64(0), dgd0[col0], dgd1[6]); 148 H[auto_cov_idx + 6] += vaddvq_s64(auto_cov6); 149 } 150 } 151 152 static inline void stats_top_win5_sve(const int16x8_t src[2], 153 const int16x8_t dgd[2], 154 const int16_t *const d, 155 const int32_t d_stride, int64x2_t *sum_m, 156 int64x2_t *sum_h) { 157 int16x8_t dgds[WIENER_WIN_CHROMA * 2]; 158 159 load_s16_8x5(d + 0, d_stride, &dgds[0], &dgds[2], &dgds[4], &dgds[6], 160 &dgds[8]); 161 load_s16_8x5(d + 8, d_stride, &dgds[1], &dgds[3], &dgds[5], &dgds[7], 162 &dgds[9]); 163 164 sum_m[0] = aom_sdotq_s16(sum_m[0], src[0], dgds[0]); 165 sum_m[0] = aom_sdotq_s16(sum_m[0], src[1], dgds[1]); 166 sum_m[1] = aom_sdotq_s16(sum_m[1], src[0], dgds[2]); 167 sum_m[1] = aom_sdotq_s16(sum_m[1], src[1], dgds[3]); 168 sum_m[2] = aom_sdotq_s16(sum_m[2], src[0], dgds[4]); 169 sum_m[2] = aom_sdotq_s16(sum_m[2], src[1], dgds[5]); 170 sum_m[3] = aom_sdotq_s16(sum_m[3], src[0], dgds[6]); 171 sum_m[3] = aom_sdotq_s16(sum_m[3], src[1], dgds[7]); 172 sum_m[4] = aom_sdotq_s16(sum_m[4], src[0], dgds[8]); 173 sum_m[4] = aom_sdotq_s16(sum_m[4], src[1], dgds[9]); 174 175 sum_h[0] = aom_sdotq_s16(sum_h[0], dgd[0], dgds[0]); 176 sum_h[0] = aom_sdotq_s16(sum_h[0], dgd[1], dgds[1]); 177 sum_h[1] = aom_sdotq_s16(sum_h[1], dgd[0], dgds[2]); 178 sum_h[1] = aom_sdotq_s16(sum_h[1], dgd[1], dgds[3]); 179 sum_h[2] = aom_sdotq_s16(sum_h[2], dgd[0], dgds[4]); 180 sum_h[2] = aom_sdotq_s16(sum_h[2], dgd[1], dgds[5]); 181 sum_h[3] = aom_sdotq_s16(sum_h[3], dgd[0], dgds[6]); 182 sum_h[3] = aom_sdotq_s16(sum_h[3], dgd[1], dgds[7]); 183 sum_h[4] = aom_sdotq_s16(sum_h[4], dgd[0], dgds[8]); 184 sum_h[4] = aom_sdotq_s16(sum_h[4], dgd[1], dgds[9]); 185 } 186 187 static inline void stats_left_win5_sve(const int16x8_t src[2], const int16_t *d, 188 const int32_t d_stride, int64x2_t *sum) { 189 int16x8_t 
dgds[WIN_CHROMA]; 190 191 load_s16_8x4(d + d_stride + 0, d_stride, &dgds[0], &dgds[2], &dgds[4], 192 &dgds[6]); 193 load_s16_8x4(d + d_stride + 8, d_stride, &dgds[1], &dgds[3], &dgds[5], 194 &dgds[7]); 195 196 sum[0] = aom_sdotq_s16(sum[0], src[0], dgds[0]); 197 sum[0] = aom_sdotq_s16(sum[0], src[1], dgds[1]); 198 sum[1] = aom_sdotq_s16(sum[1], src[0], dgds[2]); 199 sum[1] = aom_sdotq_s16(sum[1], src[1], dgds[3]); 200 sum[2] = aom_sdotq_s16(sum[2], src[0], dgds[4]); 201 sum[2] = aom_sdotq_s16(sum[2], src[1], dgds[5]); 202 sum[3] = aom_sdotq_s16(sum[3], src[0], dgds[6]); 203 sum[3] = aom_sdotq_s16(sum[3], src[1], dgds[7]); 204 } 205 206 static inline void sub_deltas_step4_sve(int16x8_t *A, int16x8_t *B, 207 int64x2_t *deltas) { 208 deltas[0] = aom_sdotq_s16(deltas[0], vnegq_s16(A[0]), B[0]); 209 deltas[1] = aom_sdotq_s16(deltas[1], vnegq_s16(A[0]), B[1]); 210 deltas[2] = aom_sdotq_s16(deltas[2], vnegq_s16(A[0]), B[2]); 211 deltas[3] = aom_sdotq_s16(deltas[3], vnegq_s16(A[0]), B[3]); 212 deltas[4] = aom_sdotq_s16(deltas[4], vnegq_s16(A[0]), B[4]); 213 deltas[5] = aom_sdotq_s16(deltas[5], vnegq_s16(A[1]), B[0]); 214 deltas[6] = aom_sdotq_s16(deltas[6], vnegq_s16(A[2]), B[0]); 215 deltas[7] = aom_sdotq_s16(deltas[7], vnegq_s16(A[3]), B[0]); 216 deltas[8] = aom_sdotq_s16(deltas[8], vnegq_s16(A[4]), B[0]); 217 } 218 219 static inline void add_deltas_step4_sve(int16x8_t *A, int16x8_t *B, 220 int64x2_t *deltas) { 221 deltas[0] = aom_sdotq_s16(deltas[0], A[0], B[0]); 222 deltas[1] = aom_sdotq_s16(deltas[1], A[0], B[1]); 223 deltas[2] = aom_sdotq_s16(deltas[2], A[0], B[2]); 224 deltas[3] = aom_sdotq_s16(deltas[3], A[0], B[3]); 225 deltas[4] = aom_sdotq_s16(deltas[4], A[0], B[4]); 226 deltas[5] = aom_sdotq_s16(deltas[5], A[1], B[0]); 227 deltas[6] = aom_sdotq_s16(deltas[6], A[2], B[0]); 228 deltas[7] = aom_sdotq_s16(deltas[7], A[3], B[0]); 229 deltas[8] = aom_sdotq_s16(deltas[8], A[4], B[0]); 230 } 231 232 static inline void load_square_win5_sve( 233 const int16_t *const di, 
const int16_t *const dj, const int32_t d_stride, 234 const int32_t height, int16x8_t *d_is, int16x8_t *d_ie, int16x8_t *d_js, 235 int16x8_t *d_je, svbool_t p0, svbool_t p1) { 236 d_is[0] = svget_neonq_s16(svld1_s16(p0, di + 0 * d_stride + 0)); 237 d_is[1] = svget_neonq_s16(svld1_s16(p1, di + 0 * d_stride + 8)); 238 d_is[2] = svget_neonq_s16(svld1_s16(p0, di + 1 * d_stride + 0)); 239 d_is[3] = svget_neonq_s16(svld1_s16(p1, di + 1 * d_stride + 8)); 240 d_is[4] = svget_neonq_s16(svld1_s16(p0, di + 2 * d_stride + 0)); 241 d_is[5] = svget_neonq_s16(svld1_s16(p1, di + 2 * d_stride + 8)); 242 d_is[6] = svget_neonq_s16(svld1_s16(p0, di + 3 * d_stride + 0)); 243 d_is[7] = svget_neonq_s16(svld1_s16(p1, di + 3 * d_stride + 8)); 244 245 d_ie[0] = svget_neonq_s16(svld1_s16(p0, di + (height + 0) * d_stride + 0)); 246 d_ie[1] = svget_neonq_s16(svld1_s16(p1, di + (height + 0) * d_stride + 8)); 247 d_ie[2] = svget_neonq_s16(svld1_s16(p0, di + (height + 1) * d_stride + 0)); 248 d_ie[3] = svget_neonq_s16(svld1_s16(p1, di + (height + 1) * d_stride + 8)); 249 d_ie[4] = svget_neonq_s16(svld1_s16(p0, di + (height + 2) * d_stride + 0)); 250 d_ie[5] = svget_neonq_s16(svld1_s16(p1, di + (height + 2) * d_stride + 8)); 251 d_ie[6] = svget_neonq_s16(svld1_s16(p0, di + (height + 3) * d_stride + 0)); 252 d_ie[7] = svget_neonq_s16(svld1_s16(p1, di + (height + 3) * d_stride + 8)); 253 254 load_s16_8x4(dj + 0, d_stride, &d_js[0], &d_js[2], &d_js[4], &d_js[6]); 255 load_s16_8x4(dj + 8, d_stride, &d_js[1], &d_js[3], &d_js[5], &d_js[7]); 256 load_s16_8x4(dj + height * d_stride + 0, d_stride, &d_je[0], &d_je[2], 257 &d_je[4], &d_je[6]); 258 load_s16_8x4(dj + height * d_stride + 8, d_stride, &d_je[1], &d_je[3], 259 &d_je[5], &d_je[7]); 260 } 261 262 static inline void update_4_stats_sve(const int64_t *const src, 263 const int64x2_t *delta, 264 int64_t *const dst) { 265 const int64x2_t s1 = vld1q_s64(src); 266 const int64x2_t s2 = vld1q_s64(src + 2); 267 268 vst1q_s64(dst + 0, vaddq_s64(s1, delta[0])); 
269 vst1q_s64(dst + 2, vaddq_s64(s2, delta[1])); 270 } 271 272 static inline void derive_square_win5_sve( 273 int16x8_t *d_is, const int16x8_t *d_ie, const int16x8_t *d_js, 274 const int16x8_t *d_je, 275 int64x2_t deltas[WIENER_WIN_CHROMA - 1][WIENER_WIN_CHROMA - 1]) { 276 d_is[0] = vnegq_s16(d_is[0]); 277 d_is[1] = vnegq_s16(d_is[1]); 278 d_is[2] = vnegq_s16(d_is[2]); 279 d_is[3] = vnegq_s16(d_is[3]); 280 d_is[4] = vnegq_s16(d_is[4]); 281 d_is[5] = vnegq_s16(d_is[5]); 282 d_is[6] = vnegq_s16(d_is[6]); 283 d_is[7] = vnegq_s16(d_is[7]); 284 285 deltas[0][0] = aom_sdotq_s16(deltas[0][0], d_is[0], d_js[0]); 286 deltas[0][0] = aom_sdotq_s16(deltas[0][0], d_is[1], d_js[1]); 287 deltas[0][1] = aom_sdotq_s16(deltas[0][1], d_is[0], d_js[2]); 288 deltas[0][1] = aom_sdotq_s16(deltas[0][1], d_is[1], d_js[3]); 289 deltas[0][2] = aom_sdotq_s16(deltas[0][2], d_is[0], d_js[4]); 290 deltas[0][2] = aom_sdotq_s16(deltas[0][2], d_is[1], d_js[5]); 291 deltas[0][3] = aom_sdotq_s16(deltas[0][3], d_is[0], d_js[6]); 292 deltas[0][3] = aom_sdotq_s16(deltas[0][3], d_is[1], d_js[7]); 293 294 deltas[1][0] = aom_sdotq_s16(deltas[1][0], d_is[2], d_js[0]); 295 deltas[1][0] = aom_sdotq_s16(deltas[1][0], d_is[3], d_js[1]); 296 deltas[1][1] = aom_sdotq_s16(deltas[1][1], d_is[2], d_js[2]); 297 deltas[1][1] = aom_sdotq_s16(deltas[1][1], d_is[3], d_js[3]); 298 deltas[1][2] = aom_sdotq_s16(deltas[1][2], d_is[2], d_js[4]); 299 deltas[1][2] = aom_sdotq_s16(deltas[1][2], d_is[3], d_js[5]); 300 deltas[1][3] = aom_sdotq_s16(deltas[1][3], d_is[2], d_js[6]); 301 deltas[1][3] = aom_sdotq_s16(deltas[1][3], d_is[3], d_js[7]); 302 303 deltas[2][0] = aom_sdotq_s16(deltas[2][0], d_is[4], d_js[0]); 304 deltas[2][0] = aom_sdotq_s16(deltas[2][0], d_is[5], d_js[1]); 305 deltas[2][1] = aom_sdotq_s16(deltas[2][1], d_is[4], d_js[2]); 306 deltas[2][1] = aom_sdotq_s16(deltas[2][1], d_is[5], d_js[3]); 307 deltas[2][2] = aom_sdotq_s16(deltas[2][2], d_is[4], d_js[4]); 308 deltas[2][2] = aom_sdotq_s16(deltas[2][2], d_is[5], 
d_js[5]); 309 deltas[2][3] = aom_sdotq_s16(deltas[2][3], d_is[4], d_js[6]); 310 deltas[2][3] = aom_sdotq_s16(deltas[2][3], d_is[5], d_js[7]); 311 312 deltas[3][0] = aom_sdotq_s16(deltas[3][0], d_is[6], d_js[0]); 313 deltas[3][0] = aom_sdotq_s16(deltas[3][0], d_is[7], d_js[1]); 314 deltas[3][1] = aom_sdotq_s16(deltas[3][1], d_is[6], d_js[2]); 315 deltas[3][1] = aom_sdotq_s16(deltas[3][1], d_is[7], d_js[3]); 316 deltas[3][2] = aom_sdotq_s16(deltas[3][2], d_is[6], d_js[4]); 317 deltas[3][2] = aom_sdotq_s16(deltas[3][2], d_is[7], d_js[5]); 318 deltas[3][3] = aom_sdotq_s16(deltas[3][3], d_is[6], d_js[6]); 319 deltas[3][3] = aom_sdotq_s16(deltas[3][3], d_is[7], d_js[7]); 320 321 deltas[0][0] = aom_sdotq_s16(deltas[0][0], d_ie[0], d_je[0]); 322 deltas[0][0] = aom_sdotq_s16(deltas[0][0], d_ie[1], d_je[1]); 323 deltas[0][1] = aom_sdotq_s16(deltas[0][1], d_ie[0], d_je[2]); 324 deltas[0][1] = aom_sdotq_s16(deltas[0][1], d_ie[1], d_je[3]); 325 deltas[0][2] = aom_sdotq_s16(deltas[0][2], d_ie[0], d_je[4]); 326 deltas[0][2] = aom_sdotq_s16(deltas[0][2], d_ie[1], d_je[5]); 327 deltas[0][3] = aom_sdotq_s16(deltas[0][3], d_ie[0], d_je[6]); 328 deltas[0][3] = aom_sdotq_s16(deltas[0][3], d_ie[1], d_je[7]); 329 330 deltas[1][0] = aom_sdotq_s16(deltas[1][0], d_ie[2], d_je[0]); 331 deltas[1][0] = aom_sdotq_s16(deltas[1][0], d_ie[3], d_je[1]); 332 deltas[1][1] = aom_sdotq_s16(deltas[1][1], d_ie[2], d_je[2]); 333 deltas[1][1] = aom_sdotq_s16(deltas[1][1], d_ie[3], d_je[3]); 334 deltas[1][2] = aom_sdotq_s16(deltas[1][2], d_ie[2], d_je[4]); 335 deltas[1][2] = aom_sdotq_s16(deltas[1][2], d_ie[3], d_je[5]); 336 deltas[1][3] = aom_sdotq_s16(deltas[1][3], d_ie[2], d_je[6]); 337 deltas[1][3] = aom_sdotq_s16(deltas[1][3], d_ie[3], d_je[7]); 338 339 deltas[2][0] = aom_sdotq_s16(deltas[2][0], d_ie[4], d_je[0]); 340 deltas[2][0] = aom_sdotq_s16(deltas[2][0], d_ie[5], d_je[1]); 341 deltas[2][1] = aom_sdotq_s16(deltas[2][1], d_ie[4], d_je[2]); 342 deltas[2][1] = aom_sdotq_s16(deltas[2][1], d_ie[5], 
d_je[3]); 343 deltas[2][2] = aom_sdotq_s16(deltas[2][2], d_ie[4], d_je[4]); 344 deltas[2][2] = aom_sdotq_s16(deltas[2][2], d_ie[5], d_je[5]); 345 deltas[2][3] = aom_sdotq_s16(deltas[2][3], d_ie[4], d_je[6]); 346 deltas[2][3] = aom_sdotq_s16(deltas[2][3], d_ie[5], d_je[7]); 347 348 deltas[3][0] = aom_sdotq_s16(deltas[3][0], d_ie[6], d_je[0]); 349 deltas[3][0] = aom_sdotq_s16(deltas[3][0], d_ie[7], d_je[1]); 350 deltas[3][1] = aom_sdotq_s16(deltas[3][1], d_ie[6], d_je[2]); 351 deltas[3][1] = aom_sdotq_s16(deltas[3][1], d_ie[7], d_je[3]); 352 deltas[3][2] = aom_sdotq_s16(deltas[3][2], d_ie[6], d_je[4]); 353 deltas[3][2] = aom_sdotq_s16(deltas[3][2], d_ie[7], d_je[5]); 354 deltas[3][3] = aom_sdotq_s16(deltas[3][3], d_ie[6], d_je[6]); 355 deltas[3][3] = aom_sdotq_s16(deltas[3][3], d_ie[7], d_je[7]); 356 } 357 358 static inline void hadd_update_4_stats_sve(const int64_t *const src, 359 const int64x2_t *deltas, 360 int64_t *const dst) { 361 int64x2_t src0 = vld1q_s64(src); 362 int64x2_t src1 = vld1q_s64(src + 2); 363 vst1q_s64(dst + 0, vaddq_s64(src0, vpaddq_s64(deltas[0], deltas[1]))); 364 vst1q_s64(dst + 2, vaddq_s64(src1, vpaddq_s64(deltas[2], deltas[3]))); 365 } 366 367 static inline void load_triangle_win5_sve(const int16_t *const di, 368 const int32_t d_stride, 369 const int32_t height, int16x8_t *d_is, 370 int16x8_t *d_ie, svbool_t p0, 371 svbool_t p1) { 372 d_is[0] = svget_neonq_s16(svld1_s16(p0, di + 0 * d_stride + 0)); 373 d_is[1] = svget_neonq_s16(svld1_s16(p1, di + 0 * d_stride + 8)); 374 d_is[2] = svget_neonq_s16(svld1_s16(p0, di + 1 * d_stride + 0)); 375 d_is[3] = svget_neonq_s16(svld1_s16(p1, di + 1 * d_stride + 8)); 376 d_is[4] = svget_neonq_s16(svld1_s16(p0, di + 2 * d_stride + 0)); 377 d_is[5] = svget_neonq_s16(svld1_s16(p1, di + 2 * d_stride + 8)); 378 d_is[6] = svget_neonq_s16(svld1_s16(p0, di + 3 * d_stride + 0)); 379 d_is[7] = svget_neonq_s16(svld1_s16(p1, di + 3 * d_stride + 8)); 380 d_ie[0] = svget_neonq_s16(svld1_s16(p0, di + (height + 0) * 
d_stride + 0)); 381 d_ie[1] = svget_neonq_s16(svld1_s16(p1, di + (height + 0) * d_stride + 8)); 382 d_ie[2] = svget_neonq_s16(svld1_s16(p0, di + (height + 1) * d_stride + 0)); 383 d_ie[3] = svget_neonq_s16(svld1_s16(p1, di + (height + 1) * d_stride + 8)); 384 d_ie[4] = svget_neonq_s16(svld1_s16(p0, di + (height + 2) * d_stride + 0)); 385 d_ie[5] = svget_neonq_s16(svld1_s16(p1, di + (height + 2) * d_stride + 8)); 386 d_ie[6] = svget_neonq_s16(svld1_s16(p0, di + (height + 3) * d_stride + 0)); 387 d_ie[7] = svget_neonq_s16(svld1_s16(p1, di + (height + 3) * d_stride + 8)); 388 } 389 390 static inline void derive_triangle_win5_sve(const int16x8_t *d_is, 391 const int16x8_t *d_ie, 392 int64x2_t *deltas) { 393 deltas[0] = aom_sdotq_s16(deltas[0], vnegq_s16(d_is[0]), d_is[0]); 394 deltas[0] = aom_sdotq_s16(deltas[0], vnegq_s16(d_is[1]), d_is[1]); 395 deltas[1] = aom_sdotq_s16(deltas[1], vnegq_s16(d_is[0]), d_is[2]); 396 deltas[1] = aom_sdotq_s16(deltas[1], vnegq_s16(d_is[1]), d_is[3]); 397 deltas[2] = aom_sdotq_s16(deltas[2], vnegq_s16(d_is[0]), d_is[4]); 398 deltas[2] = aom_sdotq_s16(deltas[2], vnegq_s16(d_is[1]), d_is[5]); 399 deltas[3] = aom_sdotq_s16(deltas[3], vnegq_s16(d_is[0]), d_is[6]); 400 deltas[3] = aom_sdotq_s16(deltas[3], vnegq_s16(d_is[1]), d_is[7]); 401 deltas[4] = aom_sdotq_s16(deltas[4], vnegq_s16(d_is[2]), d_is[2]); 402 deltas[4] = aom_sdotq_s16(deltas[4], vnegq_s16(d_is[3]), d_is[3]); 403 deltas[5] = aom_sdotq_s16(deltas[5], vnegq_s16(d_is[2]), d_is[4]); 404 deltas[5] = aom_sdotq_s16(deltas[5], vnegq_s16(d_is[3]), d_is[5]); 405 deltas[6] = aom_sdotq_s16(deltas[6], vnegq_s16(d_is[2]), d_is[6]); 406 deltas[6] = aom_sdotq_s16(deltas[6], vnegq_s16(d_is[3]), d_is[7]); 407 deltas[7] = aom_sdotq_s16(deltas[7], vnegq_s16(d_is[4]), d_is[4]); 408 deltas[7] = aom_sdotq_s16(deltas[7], vnegq_s16(d_is[5]), d_is[5]); 409 deltas[8] = aom_sdotq_s16(deltas[8], vnegq_s16(d_is[4]), d_is[6]); 410 deltas[8] = aom_sdotq_s16(deltas[8], vnegq_s16(d_is[5]), d_is[7]); 411 
deltas[9] = aom_sdotq_s16(deltas[9], vnegq_s16(d_is[6]), d_is[6]); 412 deltas[9] = aom_sdotq_s16(deltas[9], vnegq_s16(d_is[7]), d_is[7]); 413 414 deltas[0] = aom_sdotq_s16(deltas[0], d_ie[0], d_ie[0]); 415 deltas[0] = aom_sdotq_s16(deltas[0], d_ie[1], d_ie[1]); 416 deltas[1] = aom_sdotq_s16(deltas[1], d_ie[0], d_ie[2]); 417 deltas[1] = aom_sdotq_s16(deltas[1], d_ie[1], d_ie[3]); 418 deltas[2] = aom_sdotq_s16(deltas[2], d_ie[0], d_ie[4]); 419 deltas[2] = aom_sdotq_s16(deltas[2], d_ie[1], d_ie[5]); 420 deltas[3] = aom_sdotq_s16(deltas[3], d_ie[0], d_ie[6]); 421 deltas[3] = aom_sdotq_s16(deltas[3], d_ie[1], d_ie[7]); 422 deltas[4] = aom_sdotq_s16(deltas[4], d_ie[2], d_ie[2]); 423 deltas[4] = aom_sdotq_s16(deltas[4], d_ie[3], d_ie[3]); 424 deltas[5] = aom_sdotq_s16(deltas[5], d_ie[2], d_ie[4]); 425 deltas[5] = aom_sdotq_s16(deltas[5], d_ie[3], d_ie[5]); 426 deltas[6] = aom_sdotq_s16(deltas[6], d_ie[2], d_ie[6]); 427 deltas[6] = aom_sdotq_s16(deltas[6], d_ie[3], d_ie[7]); 428 deltas[7] = aom_sdotq_s16(deltas[7], d_ie[4], d_ie[4]); 429 deltas[7] = aom_sdotq_s16(deltas[7], d_ie[5], d_ie[5]); 430 deltas[8] = aom_sdotq_s16(deltas[8], d_ie[4], d_ie[6]); 431 deltas[8] = aom_sdotq_s16(deltas[8], d_ie[5], d_ie[7]); 432 deltas[9] = aom_sdotq_s16(deltas[9], d_ie[6], d_ie[6]); 433 deltas[9] = aom_sdotq_s16(deltas[9], d_ie[7], d_ie[7]); 434 } 435 436 static inline void compute_stats_win5_sve( 437 const int16_t *const d, const int32_t d_stride, const int16_t *const s, 438 const int32_t s_stride, const int32_t width, const int32_t height, 439 int64_t *const M, int64_t *const H) { 440 const int32_t wiener_win = WIENER_WIN_CHROMA; 441 const int32_t wiener_win2 = wiener_win * wiener_win; 442 const int32_t h8 = height & ~7; 443 int32_t i, j, x, y; 444 445 // Use a predicate to compute the last columns. 446 svbool_t p0 = svwhilelt_b16_u32(0, width % 16 == 0 ? 16 : width % 16); 447 svbool_t p1 = svwhilelt_b16_u32(8, width % 16 == 0 ? 
16 : width % 16); 448 449 // Step 1: Calculate the top edge of the whole matrix, i.e., the top 450 // edge of each triangle and square on the top row. 451 j = 0; 452 do { 453 const int16_t *s_t = s; 454 const int16_t *d_t = d; 455 int64x2_t sum_m[WIENER_WIN_CHROMA] = { vdupq_n_s64(0) }; 456 int64x2_t sum_h[WIENER_WIN_CHROMA] = { vdupq_n_s64(0) }; 457 int16x8_t src[2], dgd[2]; 458 459 y = height; 460 do { 461 x = 0; 462 while (x < width - 16) { 463 src[0] = vld1q_s16(s_t + x + 0); 464 src[1] = vld1q_s16(s_t + x + 8); 465 dgd[0] = vld1q_s16(d_t + x + 0); 466 dgd[1] = vld1q_s16(d_t + x + 8); 467 stats_top_win5_sve(src, dgd, d_t + j + x, d_stride, sum_m, sum_h); 468 x += 16; 469 } 470 471 src[0] = svget_neonq_s16(svld1_s16(p0, s_t + x + 0)); 472 src[1] = svget_neonq_s16(svld1_s16(p1, s_t + x + 8)); 473 dgd[0] = svget_neonq_s16(svld1_s16(p0, d_t + x + 0)); 474 dgd[1] = svget_neonq_s16(svld1_s16(p1, d_t + x + 8)); 475 476 stats_top_win5_sve(src, dgd, d_t + j + x, d_stride, sum_m, sum_h); 477 478 s_t += s_stride; 479 d_t += d_stride; 480 } while (--y); 481 482 vst1q_s64(&M[wiener_win * j + 0], vpaddq_s64(sum_m[0], sum_m[1])); 483 vst1q_s64(&M[wiener_win * j + 2], vpaddq_s64(sum_m[2], sum_m[3])); 484 M[wiener_win * j + 4] = vaddvq_s64(sum_m[4]); 485 486 vst1q_s64(&H[wiener_win * j + 0], vpaddq_s64(sum_h[0], sum_h[1])); 487 vst1q_s64(&H[wiener_win * j + 2], vpaddq_s64(sum_h[2], sum_h[3])); 488 H[wiener_win * j + 4] = vaddvq_s64(sum_h[4]); 489 } while (++j < wiener_win); 490 491 // Step 2: Calculate the left edge of each square on the top row. 
492 j = 1; 493 do { 494 const int16_t *d_t = d; 495 int64x2_t sum_h[WIENER_WIN_CHROMA - 1] = { vdupq_n_s64(0) }; 496 int16x8_t dgd[2]; 497 498 y = height; 499 do { 500 x = 0; 501 while (x < width - 16) { 502 dgd[0] = vld1q_s16(d_t + j + x + 0); 503 dgd[1] = vld1q_s16(d_t + j + x + 8); 504 stats_left_win5_sve(dgd, d_t + x, d_stride, sum_h); 505 x += 16; 506 } 507 508 dgd[0] = svget_neonq_s16(svld1_s16(p0, d_t + j + x + 0)); 509 dgd[1] = svget_neonq_s16(svld1_s16(p1, d_t + j + x + 8)); 510 511 stats_left_win5_sve(dgd, d_t + x, d_stride, sum_h); 512 513 d_t += d_stride; 514 } while (--y); 515 516 int64x2_t sum_h01 = vpaddq_s64(sum_h[0], sum_h[1]); 517 int64x2_t sum_h23 = vpaddq_s64(sum_h[2], sum_h[3]); 518 vst1_s64(&H[1 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h01)); 519 vst1_s64(&H[2 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h01)); 520 vst1_s64(&H[3 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h23)); 521 vst1_s64(&H[4 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h23)); 522 523 } while (++j < wiener_win); 524 525 // Step 3: Derive the top edge of each triangle along the diagonal. No 526 // triangle in top row. 
527 { 528 const int16_t *d_t = d; 529 530 if (height % 2) { 531 int32x4_t deltas[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) }; 532 int32x4_t deltas_tr[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) }; 533 int16x8_t ds[WIENER_WIN * 2]; 534 535 load_s16_8x4(d_t, d_stride, &ds[0], &ds[2], &ds[4], &ds[6]); 536 load_s16_8x4(d_t + width, d_stride, &ds[1], &ds[3], &ds[5], &ds[7]); 537 d_t += 4 * d_stride; 538 539 step3_win5_oneline_neon(&d_t, d_stride, width, height, ds, deltas); 540 transpose_arrays_s32_8x8(deltas, deltas_tr); 541 542 update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win, 543 deltas_tr[0], vgetq_lane_s32(deltas_tr[4], 0), 544 H + 1 * wiener_win * wiener_win2 + 1 * wiener_win); 545 546 update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win, 547 deltas_tr[1], vgetq_lane_s32(deltas_tr[5], 0), 548 H + 2 * wiener_win * wiener_win2 + 2 * wiener_win); 549 550 update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win, 551 deltas_tr[2], vgetq_lane_s32(deltas_tr[6], 0), 552 H + 3 * wiener_win * wiener_win2 + 3 * wiener_win); 553 554 update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win, 555 deltas_tr[3], vgetq_lane_s32(deltas_tr[7], 0), 556 H + 4 * wiener_win * wiener_win2 + 4 * wiener_win); 557 558 } else { 559 int32x4_t deltas[WIENER_WIN_CHROMA * 2] = { vdupq_n_s32(0) }; 560 int16x8_t ds[WIENER_WIN_CHROMA * 2]; 561 562 ds[0] = load_unaligned_s16_4x2(d_t + 0 * d_stride, width); 563 ds[1] = load_unaligned_s16_4x2(d_t + 1 * d_stride, width); 564 ds[2] = load_unaligned_s16_4x2(d_t + 2 * d_stride, width); 565 ds[3] = load_unaligned_s16_4x2(d_t + 3 * d_stride, width); 566 567 step3_win5_neon(d_t + 4 * d_stride, d_stride, width, height, ds, deltas); 568 569 transpose_elems_inplace_s32_4x4(&deltas[0], &deltas[1], &deltas[2], 570 &deltas[3]); 571 572 update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win, 573 deltas[0], vgetq_lane_s32(deltas[4], 0), 574 H + 1 * wiener_win * wiener_win2 + 1 * wiener_win); 
575 576 update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win, 577 deltas[1], vgetq_lane_s32(deltas[4], 1), 578 H + 2 * wiener_win * wiener_win2 + 2 * wiener_win); 579 580 update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win, 581 deltas[2], vgetq_lane_s32(deltas[4], 2), 582 H + 3 * wiener_win * wiener_win2 + 3 * wiener_win); 583 584 update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win, 585 deltas[3], vgetq_lane_s32(deltas[4], 3), 586 H + 4 * wiener_win * wiener_win2 + 4 * wiener_win); 587 } 588 } 589 590 // Step 4: Derive the top and left edge of each square. No square in top and 591 // bottom row. 592 { 593 y = h8; 594 595 int16x4_t d_s[12]; 596 int16x4_t d_e[12]; 597 const int16_t *d_t = d; 598 int16x4_t zeros = vdup_n_s16(0); 599 load_s16_4x4(d_t, d_stride, &d_s[0], &d_s[1], &d_s[2], &d_s[3]); 600 load_s16_4x4(d_t + width, d_stride, &d_e[0], &d_e[1], &d_e[2], &d_e[3]); 601 int64x2_t deltas[6][18] = { { vdupq_n_s64(0) }, { vdupq_n_s64(0) } }; 602 603 while (y >= 8) { 604 load_s16_4x8(d_t + 4 * d_stride, d_stride, &d_s[4], &d_s[5], &d_s[6], 605 &d_s[7], &d_s[8], &d_s[9], &d_s[10], &d_s[11]); 606 load_s16_4x8(d_t + width + 4 * d_stride, d_stride, &d_e[4], &d_e[5], 607 &d_e[6], &d_e[7], &d_e[8], &d_e[9], &d_e[10], &d_e[11]); 608 609 int16x8_t s_tr[8], e_tr[8]; 610 transpose_elems_s16_4x8(d_s[0], d_s[1], d_s[2], d_s[3], d_s[4], d_s[5], 611 d_s[6], d_s[7], &s_tr[0], &s_tr[1], &s_tr[2], 612 &s_tr[3]); 613 transpose_elems_s16_4x8(d_s[8], d_s[9], d_s[10], d_s[11], zeros, zeros, 614 zeros, zeros, &s_tr[4], &s_tr[5], &s_tr[6], 615 &s_tr[7]); 616 617 transpose_elems_s16_4x8(d_e[0], d_e[1], d_e[2], d_e[3], d_e[4], d_e[5], 618 d_e[6], d_e[7], &e_tr[0], &e_tr[1], &e_tr[2], 619 &e_tr[3]); 620 transpose_elems_s16_4x8(d_e[8], d_e[9], d_e[10], d_e[11], zeros, zeros, 621 zeros, zeros, &e_tr[4], &e_tr[5], &e_tr[6], 622 &e_tr[7]); 623 624 int16x8_t start_col0[5], start_col1[5], start_col2[5], start_col3[5]; 625 start_col0[0] = 
s_tr[0]; 626 start_col0[1] = vextq_s16(s_tr[0], s_tr[4], 1); 627 start_col0[2] = vextq_s16(s_tr[0], s_tr[4], 2); 628 start_col0[3] = vextq_s16(s_tr[0], s_tr[4], 3); 629 start_col0[4] = vextq_s16(s_tr[0], s_tr[4], 4); 630 631 start_col1[0] = s_tr[1]; 632 start_col1[1] = vextq_s16(s_tr[1], s_tr[5], 1); 633 start_col1[2] = vextq_s16(s_tr[1], s_tr[5], 2); 634 start_col1[3] = vextq_s16(s_tr[1], s_tr[5], 3); 635 start_col1[4] = vextq_s16(s_tr[1], s_tr[5], 4); 636 637 start_col2[0] = s_tr[2]; 638 start_col2[1] = vextq_s16(s_tr[2], s_tr[6], 1); 639 start_col2[2] = vextq_s16(s_tr[2], s_tr[6], 2); 640 start_col2[3] = vextq_s16(s_tr[2], s_tr[6], 3); 641 start_col2[4] = vextq_s16(s_tr[2], s_tr[6], 4); 642 643 start_col3[0] = s_tr[3]; 644 start_col3[1] = vextq_s16(s_tr[3], s_tr[7], 1); 645 start_col3[2] = vextq_s16(s_tr[3], s_tr[7], 2); 646 start_col3[3] = vextq_s16(s_tr[3], s_tr[7], 3); 647 start_col3[4] = vextq_s16(s_tr[3], s_tr[7], 4); 648 649 // i = 1, j = 2; 650 sub_deltas_step4_sve(start_col0, start_col1, deltas[0]); 651 652 // i = 1, j = 3; 653 sub_deltas_step4_sve(start_col0, start_col2, deltas[1]); 654 655 // i = 1, j = 4 656 sub_deltas_step4_sve(start_col0, start_col3, deltas[2]); 657 658 // i = 2, j =3 659 sub_deltas_step4_sve(start_col1, start_col2, deltas[3]); 660 661 // i = 2, j = 4 662 sub_deltas_step4_sve(start_col1, start_col3, deltas[4]); 663 664 // i = 3, j = 4 665 sub_deltas_step4_sve(start_col2, start_col3, deltas[5]); 666 667 int16x8_t end_col0[5], end_col1[5], end_col2[5], end_col3[5]; 668 end_col0[0] = e_tr[0]; 669 end_col0[1] = vextq_s16(e_tr[0], e_tr[4], 1); 670 end_col0[2] = vextq_s16(e_tr[0], e_tr[4], 2); 671 end_col0[3] = vextq_s16(e_tr[0], e_tr[4], 3); 672 end_col0[4] = vextq_s16(e_tr[0], e_tr[4], 4); 673 674 end_col1[0] = e_tr[1]; 675 end_col1[1] = vextq_s16(e_tr[1], e_tr[5], 1); 676 end_col1[2] = vextq_s16(e_tr[1], e_tr[5], 2); 677 end_col1[3] = vextq_s16(e_tr[1], e_tr[5], 3); 678 end_col1[4] = vextq_s16(e_tr[1], e_tr[5], 4); 679 680 end_col2[0] 
= e_tr[2]; 681 end_col2[1] = vextq_s16(e_tr[2], e_tr[6], 1); 682 end_col2[2] = vextq_s16(e_tr[2], e_tr[6], 2); 683 end_col2[3] = vextq_s16(e_tr[2], e_tr[6], 3); 684 end_col2[4] = vextq_s16(e_tr[2], e_tr[6], 4); 685 686 end_col3[0] = e_tr[3]; 687 end_col3[1] = vextq_s16(e_tr[3], e_tr[7], 1); 688 end_col3[2] = vextq_s16(e_tr[3], e_tr[7], 2); 689 end_col3[3] = vextq_s16(e_tr[3], e_tr[7], 3); 690 end_col3[4] = vextq_s16(e_tr[3], e_tr[7], 4); 691 692 // i = 1, j = 2; 693 add_deltas_step4_sve(end_col0, end_col1, deltas[0]); 694 695 // i = 1, j = 3; 696 add_deltas_step4_sve(end_col0, end_col2, deltas[1]); 697 698 // i = 1, j = 4 699 add_deltas_step4_sve(end_col0, end_col3, deltas[2]); 700 701 // i = 2, j =3 702 add_deltas_step4_sve(end_col1, end_col2, deltas[3]); 703 704 // i = 2, j = 4 705 add_deltas_step4_sve(end_col1, end_col3, deltas[4]); 706 707 // i = 3, j = 4 708 add_deltas_step4_sve(end_col2, end_col3, deltas[5]); 709 710 d_s[0] = d_s[8]; 711 d_s[1] = d_s[9]; 712 d_s[2] = d_s[10]; 713 d_s[3] = d_s[11]; 714 d_e[0] = d_e[8]; 715 d_e[1] = d_e[9]; 716 d_e[2] = d_e[10]; 717 d_e[3] = d_e[11]; 718 719 d_t += 8 * d_stride; 720 y -= 8; 721 } 722 723 if (h8 != height) { 724 const int16x8_t mask_h = vld1q_s16(&mask_16bit[16] - (height % 8)); 725 726 load_s16_4x8(d_t + 4 * d_stride, d_stride, &d_s[4], &d_s[5], &d_s[6], 727 &d_s[7], &d_s[8], &d_s[9], &d_s[10], &d_s[11]); 728 load_s16_4x8(d_t + width + 4 * d_stride, d_stride, &d_e[4], &d_e[5], 729 &d_e[6], &d_e[7], &d_e[8], &d_e[9], &d_e[10], &d_e[11]); 730 int16x8_t s_tr[8], e_tr[8]; 731 transpose_elems_s16_4x8(d_s[0], d_s[1], d_s[2], d_s[3], d_s[4], d_s[5], 732 d_s[6], d_s[7], &s_tr[0], &s_tr[1], &s_tr[2], 733 &s_tr[3]); 734 transpose_elems_s16_4x8(d_s[8], d_s[9], d_s[10], d_s[11], zeros, zeros, 735 zeros, zeros, &s_tr[4], &s_tr[5], &s_tr[6], 736 &s_tr[7]); 737 transpose_elems_s16_4x8(d_e[0], d_e[1], d_e[2], d_e[3], d_e[4], d_e[5], 738 d_e[6], d_e[7], &e_tr[0], &e_tr[1], &e_tr[2], 739 &e_tr[3]); 740 
transpose_elems_s16_4x8(d_e[8], d_e[9], d_e[10], d_e[11], zeros, zeros, 741 zeros, zeros, &e_tr[4], &e_tr[5], &e_tr[6], 742 &e_tr[7]); 743 744 int16x8_t start_col0[5], start_col1[5], start_col2[5], start_col3[5]; 745 start_col0[0] = vandq_s16(s_tr[0], mask_h); 746 start_col0[1] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 1), mask_h); 747 start_col0[2] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 2), mask_h); 748 start_col0[3] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 3), mask_h); 749 start_col0[4] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 4), mask_h); 750 751 start_col1[0] = vandq_s16(s_tr[1], mask_h); 752 start_col1[1] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 1), mask_h); 753 start_col1[2] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 2), mask_h); 754 start_col1[3] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 3), mask_h); 755 start_col1[4] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 4), mask_h); 756 757 start_col2[0] = vandq_s16(s_tr[2], mask_h); 758 start_col2[1] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 1), mask_h); 759 start_col2[2] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 2), mask_h); 760 start_col2[3] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 3), mask_h); 761 start_col2[4] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 4), mask_h); 762 763 start_col3[0] = vandq_s16(s_tr[3], mask_h); 764 start_col3[1] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 1), mask_h); 765 start_col3[2] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 2), mask_h); 766 start_col3[3] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 3), mask_h); 767 start_col3[4] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 4), mask_h); 768 769 // i = 1, j = 2; 770 sub_deltas_step4_sve(start_col0, start_col1, deltas[0]); 771 772 // i = 1, j = 3; 773 sub_deltas_step4_sve(start_col0, start_col2, deltas[1]); 774 775 // i = 1, j = 4 776 sub_deltas_step4_sve(start_col0, start_col3, deltas[2]); 777 778 // i = 2, j = 3 779 sub_deltas_step4_sve(start_col1, start_col2, deltas[3]); 780 781 // i = 2, j = 4 782 sub_deltas_step4_sve(start_col1, start_col3, deltas[4]); 783 784 // i = 3, j 
= 4 785 sub_deltas_step4_sve(start_col2, start_col3, deltas[5]); 786 787 int16x8_t end_col0[5], end_col1[5], end_col2[5], end_col3[5]; 788 end_col0[0] = vandq_s16(e_tr[0], mask_h); 789 end_col0[1] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 1), mask_h); 790 end_col0[2] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 2), mask_h); 791 end_col0[3] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 3), mask_h); 792 end_col0[4] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 4), mask_h); 793 794 end_col1[0] = vandq_s16(e_tr[1], mask_h); 795 end_col1[1] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 1), mask_h); 796 end_col1[2] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 2), mask_h); 797 end_col1[3] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 3), mask_h); 798 end_col1[4] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 4), mask_h); 799 800 end_col2[0] = vandq_s16(e_tr[2], mask_h); 801 end_col2[1] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 1), mask_h); 802 end_col2[2] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 2), mask_h); 803 end_col2[3] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 3), mask_h); 804 end_col2[4] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 4), mask_h); 805 806 end_col3[0] = vandq_s16(e_tr[3], mask_h); 807 end_col3[1] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 1), mask_h); 808 end_col3[2] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 2), mask_h); 809 end_col3[3] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 3), mask_h); 810 end_col3[4] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 4), mask_h); 811 812 // i = 1, j = 2; 813 add_deltas_step4_sve(end_col0, end_col1, deltas[0]); 814 815 // i = 1, j = 3; 816 add_deltas_step4_sve(end_col0, end_col2, deltas[1]); 817 818 // i = 1, j = 4 819 add_deltas_step4_sve(end_col0, end_col3, deltas[2]); 820 821 // i = 2, j =3 822 add_deltas_step4_sve(end_col1, end_col2, deltas[3]); 823 824 // i = 2, j = 4 825 add_deltas_step4_sve(end_col1, end_col3, deltas[4]); 826 827 // i = 3, j = 4 828 add_deltas_step4_sve(end_col2, end_col3, deltas[5]); 829 } 830 831 int64_t single_delta[6]; 832 833 deltas[0][0] = 
vpaddq_s64(deltas[0][0], deltas[0][1]); 834 deltas[0][1] = vpaddq_s64(deltas[0][2], deltas[0][3]); 835 deltas[1][0] = vpaddq_s64(deltas[1][0], deltas[1][1]); 836 deltas[1][1] = vpaddq_s64(deltas[1][2], deltas[1][3]); 837 deltas[2][0] = vpaddq_s64(deltas[2][0], deltas[2][1]); 838 deltas[2][1] = vpaddq_s64(deltas[2][2], deltas[2][3]); 839 deltas[3][0] = vpaddq_s64(deltas[3][0], deltas[3][1]); 840 deltas[3][1] = vpaddq_s64(deltas[3][2], deltas[3][3]); 841 deltas[4][0] = vpaddq_s64(deltas[4][0], deltas[4][1]); 842 deltas[4][1] = vpaddq_s64(deltas[4][2], deltas[4][3]); 843 deltas[5][0] = vpaddq_s64(deltas[5][0], deltas[5][1]); 844 deltas[5][1] = vpaddq_s64(deltas[5][2], deltas[5][3]); 845 846 deltas[0][5] = vpaddq_s64(deltas[0][5], deltas[0][6]); 847 deltas[0][7] = vpaddq_s64(deltas[0][7], deltas[0][8]); 848 deltas[1][5] = vpaddq_s64(deltas[1][5], deltas[1][6]); 849 deltas[1][7] = vpaddq_s64(deltas[1][7], deltas[1][8]); 850 deltas[2][5] = vpaddq_s64(deltas[2][5], deltas[2][6]); 851 deltas[2][7] = vpaddq_s64(deltas[2][7], deltas[2][8]); 852 deltas[3][5] = vpaddq_s64(deltas[3][5], deltas[3][6]); 853 deltas[3][7] = vpaddq_s64(deltas[3][7], deltas[3][8]); 854 deltas[4][5] = vpaddq_s64(deltas[4][5], deltas[4][6]); 855 deltas[4][7] = vpaddq_s64(deltas[4][7], deltas[4][8]); 856 deltas[5][5] = vpaddq_s64(deltas[5][5], deltas[5][6]); 857 deltas[5][7] = vpaddq_s64(deltas[5][7], deltas[5][8]); 858 859 vst1q_s64(single_delta + 0, vpaddq_s64(deltas[0][4], deltas[1][4])); 860 vst1q_s64(single_delta + 2, vpaddq_s64(deltas[2][4], deltas[3][4])); 861 vst1q_s64(single_delta + 4, vpaddq_s64(deltas[4][4], deltas[5][4])); 862 863 int idx = 0; 864 for (i = 1; i < wiener_win - 1; i++) { 865 for (j = i + 1; j < wiener_win; j++) { 866 update_4_stats_sve( 867 H + (i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win, 868 deltas[idx], H + i * wiener_win * wiener_win2 + j * wiener_win); 869 H[i * wiener_win * wiener_win2 + j * wiener_win + 4] = 870 H[(i - 1) * wiener_win * wiener_win2 + (j - 
1) * wiener_win + 4] + 871 single_delta[idx]; 872 873 H[(i * wiener_win + 1) * wiener_win2 + j * wiener_win] = 874 H[((i - 1) * wiener_win + 1) * wiener_win2 + (j - 1) * wiener_win] + 875 vgetq_lane_s64(deltas[idx][5], 0); 876 H[(i * wiener_win + 2) * wiener_win2 + j * wiener_win] = 877 H[((i - 1) * wiener_win + 2) * wiener_win2 + (j - 1) * wiener_win] + 878 vgetq_lane_s64(deltas[idx][5], 1); 879 H[(i * wiener_win + 3) * wiener_win2 + j * wiener_win] = 880 H[((i - 1) * wiener_win + 3) * wiener_win2 + (j - 1) * wiener_win] + 881 vgetq_lane_s64(deltas[idx][7], 0); 882 H[(i * wiener_win + 4) * wiener_win2 + j * wiener_win] = 883 H[((i - 1) * wiener_win + 4) * wiener_win2 + (j - 1) * wiener_win] + 884 vgetq_lane_s64(deltas[idx][7], 1); 885 886 idx++; 887 } 888 } 889 } 890 891 // Step 5: Derive other points of each square. No square in bottom row. 892 i = 0; 893 do { 894 const int16_t *const di = d + i; 895 896 j = i + 1; 897 do { 898 const int16_t *const dj = d + j; 899 int64x2_t deltas[WIENER_WIN_CHROMA - 1][WIENER_WIN_CHROMA - 1] = { 900 { vdupq_n_s64(0) }, { vdupq_n_s64(0) } 901 }; 902 int16x8_t d_is[WIN_CHROMA], d_ie[WIN_CHROMA]; 903 int16x8_t d_js[WIN_CHROMA], d_je[WIN_CHROMA]; 904 905 x = 0; 906 while (x < width - 16) { 907 load_square_win5_neon(di + x, dj + x, d_stride, height, d_is, d_ie, 908 d_js, d_je); 909 derive_square_win5_sve(d_is, d_ie, d_js, d_je, deltas); 910 x += 16; 911 } 912 913 load_square_win5_sve(di + x, dj + x, d_stride, height, d_is, d_ie, d_js, 914 d_je, p0, p1); 915 derive_square_win5_sve(d_is, d_ie, d_js, d_je, deltas); 916 917 hadd_update_4_stats_sve( 918 H + (i * wiener_win + 0) * wiener_win2 + j * wiener_win, deltas[0], 919 H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win + 1); 920 hadd_update_4_stats_sve( 921 H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win, deltas[1], 922 H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win + 1); 923 hadd_update_4_stats_sve( 924 H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win, 
deltas[2], 925 H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win + 1); 926 hadd_update_4_stats_sve( 927 H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win, deltas[3], 928 H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win + 1); 929 } while (++j < wiener_win); 930 } while (++i < wiener_win - 1); 931 932 // Step 6: Derive other points of each upper triangle along the diagonal. 933 i = 0; 934 do { 935 const int16_t *const di = d + i; 936 int64x2_t deltas[WIENER_WIN_CHROMA * 2 + 1] = { vdupq_n_s64(0) }; 937 int16x8_t d_is[WIN_CHROMA], d_ie[WIN_CHROMA]; 938 939 x = 0; 940 while (x < width - 16) { 941 load_triangle_win5_neon(di + x, d_stride, height, d_is, d_ie); 942 derive_triangle_win5_sve(d_is, d_ie, deltas); 943 x += 16; 944 } 945 946 load_triangle_win5_sve(di + x, d_stride, height, d_is, d_ie, p0, p1); 947 derive_triangle_win5_sve(d_is, d_ie, deltas); 948 949 // Row 1: 4 points 950 hadd_update_4_stats_sve( 951 H + (i * wiener_win + 0) * wiener_win2 + i * wiener_win, deltas, 952 H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1); 953 954 // Row 2: 3 points 955 int64x2_t src0 = 956 vld1q_s64(H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1); 957 vst1q_s64(H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2, 958 vaddq_s64(src0, vpaddq_s64(deltas[4], deltas[5]))); 959 960 int64x2_t deltas69 = vpaddq_s64(deltas[6], deltas[9]); 961 962 H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 4] = 963 H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 3] + 964 vgetq_lane_s64(deltas69, 0); 965 966 // Row 3: 2 points 967 int64x2_t src1 = 968 vld1q_s64(H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2); 969 vst1q_s64(H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3, 970 vaddq_s64(src1, vpaddq_s64(deltas[7], deltas[8]))); 971 972 // Row 4: 1 point 973 H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4] = 974 H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3] + 975 vgetq_lane_s64(deltas69, 1); 
  } while (++i < wiener_win);
}

// Accumulate one 16-wide column chunk's contribution to the top edge of the
// statistics matrices for the 7-tap (WIENER_WIN) window: the cross-correlation
// sums (sum_m, later folded into M) between the source samples src[2] and the
// 7 window rows starting at d, and the auto-covariance sums (sum_h, later
// folded into H) between the degraded samples dgd[2] and the same rows.
// Even-indexed dgds[] vectors hold columns 0-7 of each row, odd-indexed ones
// columns 8-15.
static inline void stats_top_win7_sve(const int16x8_t src[2],
                                      const int16x8_t dgd[2],
                                      const int16_t *const d,
                                      const int32_t d_stride, int64x2_t *sum_m,
                                      int64x2_t *sum_h) {
  int16x8_t dgds[WIENER_WIN * 2];

  load_s16_8x7(d + 0, d_stride, &dgds[0], &dgds[2], &dgds[4], &dgds[6],
               &dgds[8], &dgds[10], &dgds[12]);
  load_s16_8x7(d + 8, d_stride, &dgds[1], &dgds[3], &dgds[5], &dgds[7],
               &dgds[9], &dgds[11], &dgds[13]);

  // sum_m[r] accumulates dot(src, row r) over both 8-lane halves.
  sum_m[0] = aom_sdotq_s16(sum_m[0], src[0], dgds[0]);
  sum_m[0] = aom_sdotq_s16(sum_m[0], src[1], dgds[1]);
  sum_m[1] = aom_sdotq_s16(sum_m[1], src[0], dgds[2]);
  sum_m[1] = aom_sdotq_s16(sum_m[1], src[1], dgds[3]);
  sum_m[2] = aom_sdotq_s16(sum_m[2], src[0], dgds[4]);
  sum_m[2] = aom_sdotq_s16(sum_m[2], src[1], dgds[5]);
  sum_m[3] = aom_sdotq_s16(sum_m[3], src[0], dgds[6]);
  sum_m[3] = aom_sdotq_s16(sum_m[3], src[1], dgds[7]);
  sum_m[4] = aom_sdotq_s16(sum_m[4], src[0], dgds[8]);
  sum_m[4] = aom_sdotq_s16(sum_m[4], src[1], dgds[9]);
  sum_m[5] = aom_sdotq_s16(sum_m[5], src[0], dgds[10]);
  sum_m[5] = aom_sdotq_s16(sum_m[5], src[1], dgds[11]);
  sum_m[6] = aom_sdotq_s16(sum_m[6], src[0], dgds[12]);
  sum_m[6] = aom_sdotq_s16(sum_m[6], src[1], dgds[13]);

  // sum_h[r] accumulates dot(dgd, row r) over both 8-lane halves.
  sum_h[0] = aom_sdotq_s16(sum_h[0], dgd[0], dgds[0]);
  sum_h[0] = aom_sdotq_s16(sum_h[0], dgd[1], dgds[1]);
  sum_h[1] = aom_sdotq_s16(sum_h[1], dgd[0], dgds[2]);
  sum_h[1] = aom_sdotq_s16(sum_h[1], dgd[1], dgds[3]);
  sum_h[2] = aom_sdotq_s16(sum_h[2], dgd[0], dgds[4]);
  sum_h[2] = aom_sdotq_s16(sum_h[2], dgd[1], dgds[5]);
  sum_h[3] = aom_sdotq_s16(sum_h[3], dgd[0], dgds[6]);
  sum_h[3] = aom_sdotq_s16(sum_h[3], dgd[1], dgds[7]);
  sum_h[4] = aom_sdotq_s16(sum_h[4], dgd[0], dgds[8]);
  sum_h[4] = aom_sdotq_s16(sum_h[4], dgd[1], dgds[9]);
  sum_h[5] = aom_sdotq_s16(sum_h[5], dgd[0], dgds[10]);
  sum_h[5] = aom_sdotq_s16(sum_h[5], dgd[1], dgds[11]);
  sum_h[6] = aom_sdotq_s16(sum_h[6], dgd[0], dgds[12]);
  sum_h[6] = aom_sdotq_s16(sum_h[6], dgd[1], dgds[13]);
}

// Accumulate one 16-wide column chunk's contribution to the left edge of each
// square: dot products of the source samples src[2] with 6 window rows
// (WIN_7 == 12 half-vectors), starting one row below d (d + d_stride).
static inline void stats_left_win7_sve(const int16x8_t src[2], const int16_t *d,
                                       const int32_t d_stride, int64x2_t *sum) {
  int16x8_t dgds[WIN_7];

  load_s16_8x6(d + d_stride + 0, d_stride, &dgds[0], &dgds[2], &dgds[4],
               &dgds[6], &dgds[8], &dgds[10]);
  load_s16_8x6(d + d_stride + 8, d_stride, &dgds[1], &dgds[3], &dgds[5],
               &dgds[7], &dgds[9], &dgds[11]);

  sum[0] = aom_sdotq_s16(sum[0], src[0], dgds[0]);
  sum[0] = aom_sdotq_s16(sum[0], src[1], dgds[1]);
  sum[1] = aom_sdotq_s16(sum[1], src[0], dgds[2]);
  sum[1] = aom_sdotq_s16(sum[1], src[1], dgds[3]);
  sum[2] = aom_sdotq_s16(sum[2], src[0], dgds[4]);
  sum[2] = aom_sdotq_s16(sum[2], src[1], dgds[5]);
  sum[3] = aom_sdotq_s16(sum[3], src[0], dgds[6]);
  sum[3] = aom_sdotq_s16(sum[3], src[1], dgds[7]);
  sum[4] = aom_sdotq_s16(sum[4], src[0], dgds[8]);
  sum[4] = aom_sdotq_s16(sum[4], src[1], dgds[9]);
  sum[5] = aom_sdotq_s16(sum[5], src[0], dgds[10]);
  sum[5] = aom_sdotq_s16(sum[5], src[1], dgds[11]);
}

// Load the 6 "start" rows (d_is) and 6 "end" rows (d_ie, height rows further
// down) at column i, plus the corresponding rows at column j (d_js/d_je), as
// 12 half-vectors each. The i-column loads use the SVE predicates p0/p1 so
// that lanes beyond the image width read as zero; the j-column loads use
// plain (full-width) NEON loads.
static inline void load_square_win7_sve(
    const int16_t *const di, const int16_t *const dj, const int32_t d_stride,
    const int32_t height, int16x8_t *d_is, int16x8_t *d_ie, int16x8_t *d_js,
    int16x8_t *d_je, svbool_t p0, svbool_t p1) {
  d_is[0] = svget_neonq_s16(svld1_s16(p0, di + 0 * d_stride + 0));
  d_is[1] = svget_neonq_s16(svld1_s16(p1, di + 0 * d_stride + 8));
  d_is[2] = svget_neonq_s16(svld1_s16(p0, di + 1 * d_stride + 0));
  d_is[3] = svget_neonq_s16(svld1_s16(p1, di + 1 * d_stride + 8));
  d_is[4] = svget_neonq_s16(svld1_s16(p0, di + 2 * d_stride + 0));
  d_is[5] = svget_neonq_s16(svld1_s16(p1, di + 2 * d_stride + 8));
  d_is[6] = svget_neonq_s16(svld1_s16(p0, di + 3 * d_stride + 0));
  d_is[7] = svget_neonq_s16(svld1_s16(p1, di + 3 * d_stride + 8));
  d_is[8] = svget_neonq_s16(svld1_s16(p0, di + 4 * d_stride + 0));
  d_is[9] = svget_neonq_s16(svld1_s16(p1, di + 4 * d_stride + 8));
  d_is[10] = svget_neonq_s16(svld1_s16(p0, di + 5 * d_stride + 0));
  d_is[11] = svget_neonq_s16(svld1_s16(p1, di + 5 * d_stride + 8));

  d_ie[0] = svget_neonq_s16(svld1_s16(p0, di + (height + 0) * d_stride + 0));
  d_ie[1] = svget_neonq_s16(svld1_s16(p1, di + (height + 0) * d_stride + 8));
  d_ie[2] = svget_neonq_s16(svld1_s16(p0, di + (height + 1) * d_stride + 0));
  d_ie[3] = svget_neonq_s16(svld1_s16(p1, di + (height + 1) * d_stride + 8));
  d_ie[4] = svget_neonq_s16(svld1_s16(p0, di + (height + 2) * d_stride + 0));
  d_ie[5] = svget_neonq_s16(svld1_s16(p1, di + (height + 2) * d_stride + 8));
  d_ie[6] = svget_neonq_s16(svld1_s16(p0, di + (height + 3) * d_stride + 0));
  d_ie[7] = svget_neonq_s16(svld1_s16(p1, di + (height + 3) * d_stride + 8));
  d_ie[8] = svget_neonq_s16(svld1_s16(p0, di + (height + 4) * d_stride + 0));
  d_ie[9] = svget_neonq_s16(svld1_s16(p1, di + (height + 4) * d_stride + 8));
  d_ie[10] = svget_neonq_s16(svld1_s16(p0, di + (height + 5) * d_stride + 0));
  d_ie[11] = svget_neonq_s16(svld1_s16(p1, di + (height + 5) * d_stride + 8));

  load_s16_8x6(dj + 0, d_stride, &d_js[0], &d_js[2], &d_js[4], &d_js[6],
               &d_js[8], &d_js[10]);
  load_s16_8x6(dj + 8, d_stride, &d_js[1], &d_js[3], &d_js[5], &d_js[7],
               &d_js[9], &d_js[11]);
  load_s16_8x6(dj + height * d_stride + 0, d_stride, &d_je[0], &d_je[2],
               &d_je[4], &d_je[6], &d_je[8], &d_je[10]);
  load_s16_8x6(dj + height * d_stride + 8, d_stride, &d_je[1], &d_je[3],
               &d_je[5], &d_je[7], &d_je[9], &d_je[11]);
}

// Accumulate the 6x6 delta block for a pair of columns (i, j):
// deltas[r][c] += dot(d_ie[row r], d_je[row c]) - dot(d_is[row r], d_js[row c]).
// The subtraction is implemented by negating all d_is vectors up front (note:
// d_is is clobbered), then using plain dot-product accumulation for both the
// start (negated) and end contributions. Row r of a column spans the two
// half-vectors d_*[2r] and d_*[2r + 1].
static inline void derive_square_win7_sve(int16x8_t *d_is,
                                          const int16x8_t *d_ie,
                                          const int16x8_t *d_js,
                                          const int16x8_t *d_je,
                                          int64x2_t deltas[][WIN_7]) {
  d_is[0] = vnegq_s16(d_is[0]);
  d_is[1] = vnegq_s16(d_is[1]);
  d_is[2] = vnegq_s16(d_is[2]);
  d_is[3] = vnegq_s16(d_is[3]);
  d_is[4] = vnegq_s16(d_is[4]);
  d_is[5] = vnegq_s16(d_is[5]);
  d_is[6] = vnegq_s16(d_is[6]);
  d_is[7] = vnegq_s16(d_is[7]);
  d_is[8] = vnegq_s16(d_is[8]);
  d_is[9] = vnegq_s16(d_is[9]);
  d_is[10] = vnegq_s16(d_is[10]);
  d_is[11] = vnegq_s16(d_is[11]);

  // Subtract the "start" contributions (d_is already negated).
  deltas[0][0] = aom_sdotq_s16(deltas[0][0], d_is[0], d_js[0]);
  deltas[0][0] = aom_sdotq_s16(deltas[0][0], d_is[1], d_js[1]);
  deltas[0][1] = aom_sdotq_s16(deltas[0][1], d_is[0], d_js[2]);
  deltas[0][1] = aom_sdotq_s16(deltas[0][1], d_is[1], d_js[3]);
  deltas[0][2] = aom_sdotq_s16(deltas[0][2], d_is[0], d_js[4]);
  deltas[0][2] = aom_sdotq_s16(deltas[0][2], d_is[1], d_js[5]);
  deltas[0][3] = aom_sdotq_s16(deltas[0][3], d_is[0], d_js[6]);
  deltas[0][3] = aom_sdotq_s16(deltas[0][3], d_is[1], d_js[7]);
  deltas[0][4] = aom_sdotq_s16(deltas[0][4], d_is[0], d_js[8]);
  deltas[0][4] = aom_sdotq_s16(deltas[0][4], d_is[1], d_js[9]);
  deltas[0][5] = aom_sdotq_s16(deltas[0][5], d_is[0], d_js[10]);
  deltas[0][5] = aom_sdotq_s16(deltas[0][5], d_is[1], d_js[11]);

  deltas[1][0] = aom_sdotq_s16(deltas[1][0], d_is[2], d_js[0]);
  deltas[1][0] = aom_sdotq_s16(deltas[1][0], d_is[3], d_js[1]);
  deltas[1][1] = aom_sdotq_s16(deltas[1][1], d_is[2], d_js[2]);
  deltas[1][1] = aom_sdotq_s16(deltas[1][1], d_is[3], d_js[3]);
  deltas[1][2] = aom_sdotq_s16(deltas[1][2], d_is[2], d_js[4]);
  deltas[1][2] = aom_sdotq_s16(deltas[1][2], d_is[3], d_js[5]);
  deltas[1][3] = aom_sdotq_s16(deltas[1][3], d_is[2], d_js[6]);
  deltas[1][3] = aom_sdotq_s16(deltas[1][3], d_is[3], d_js[7]);
  deltas[1][4] = aom_sdotq_s16(deltas[1][4], d_is[2], d_js[8]);
  deltas[1][4] = aom_sdotq_s16(deltas[1][4], d_is[3], d_js[9]);
  deltas[1][5] = aom_sdotq_s16(deltas[1][5], d_is[2], d_js[10]);
  deltas[1][5] = aom_sdotq_s16(deltas[1][5], d_is[3], d_js[11]);

  deltas[2][0] = aom_sdotq_s16(deltas[2][0], d_is[4], d_js[0]);
  deltas[2][0] = aom_sdotq_s16(deltas[2][0], d_is[5], d_js[1]);
  deltas[2][1] = aom_sdotq_s16(deltas[2][1], d_is[4], d_js[2]);
  deltas[2][1] = aom_sdotq_s16(deltas[2][1], d_is[5], d_js[3]);
  deltas[2][2] = aom_sdotq_s16(deltas[2][2], d_is[4], d_js[4]);
  deltas[2][2] = aom_sdotq_s16(deltas[2][2], d_is[5], d_js[5]);
  deltas[2][3] = aom_sdotq_s16(deltas[2][3], d_is[4], d_js[6]);
  deltas[2][3] = aom_sdotq_s16(deltas[2][3], d_is[5], d_js[7]);
  deltas[2][4] = aom_sdotq_s16(deltas[2][4], d_is[4], d_js[8]);
  deltas[2][4] = aom_sdotq_s16(deltas[2][4], d_is[5], d_js[9]);
  deltas[2][5] = aom_sdotq_s16(deltas[2][5], d_is[4], d_js[10]);
  deltas[2][5] = aom_sdotq_s16(deltas[2][5], d_is[5], d_js[11]);

  deltas[3][0] = aom_sdotq_s16(deltas[3][0], d_is[6], d_js[0]);
  deltas[3][0] = aom_sdotq_s16(deltas[3][0], d_is[7], d_js[1]);
  deltas[3][1] = aom_sdotq_s16(deltas[3][1], d_is[6], d_js[2]);
  deltas[3][1] = aom_sdotq_s16(deltas[3][1], d_is[7], d_js[3]);
  deltas[3][2] = aom_sdotq_s16(deltas[3][2], d_is[6], d_js[4]);
  deltas[3][2] = aom_sdotq_s16(deltas[3][2], d_is[7], d_js[5]);
  deltas[3][3] = aom_sdotq_s16(deltas[3][3], d_is[6], d_js[6]);
  deltas[3][3] = aom_sdotq_s16(deltas[3][3], d_is[7], d_js[7]);
  deltas[3][4] = aom_sdotq_s16(deltas[3][4], d_is[6], d_js[8]);
  deltas[3][4] = aom_sdotq_s16(deltas[3][4], d_is[7], d_js[9]);
  deltas[3][5] = aom_sdotq_s16(deltas[3][5], d_is[6], d_js[10]);
  deltas[3][5] = aom_sdotq_s16(deltas[3][5], d_is[7], d_js[11]);

  deltas[4][0] = aom_sdotq_s16(deltas[4][0], d_is[8], d_js[0]);
  deltas[4][0] = aom_sdotq_s16(deltas[4][0], d_is[9], d_js[1]);
  deltas[4][1] = aom_sdotq_s16(deltas[4][1], d_is[8], d_js[2]);
  deltas[4][1] = aom_sdotq_s16(deltas[4][1], d_is[9], d_js[3]);
  deltas[4][2] = aom_sdotq_s16(deltas[4][2], d_is[8], d_js[4]);
  deltas[4][2] = aom_sdotq_s16(deltas[4][2], d_is[9], d_js[5]);
  deltas[4][3] = aom_sdotq_s16(deltas[4][3], d_is[8], d_js[6]);
  deltas[4][3] = aom_sdotq_s16(deltas[4][3], d_is[9], d_js[7]);
  deltas[4][4] = aom_sdotq_s16(deltas[4][4], d_is[8], d_js[8]);
  deltas[4][4] = aom_sdotq_s16(deltas[4][4], d_is[9], d_js[9]);
  deltas[4][5] = aom_sdotq_s16(deltas[4][5], d_is[8], d_js[10]);
  deltas[4][5] = aom_sdotq_s16(deltas[4][5], d_is[9], d_js[11]);

  deltas[5][0] = aom_sdotq_s16(deltas[5][0], d_is[10], d_js[0]);
  deltas[5][0] = aom_sdotq_s16(deltas[5][0], d_is[11], d_js[1]);
  deltas[5][1] = aom_sdotq_s16(deltas[5][1], d_is[10], d_js[2]);
  deltas[5][1] = aom_sdotq_s16(deltas[5][1], d_is[11], d_js[3]);
  deltas[5][2] = aom_sdotq_s16(deltas[5][2], d_is[10], d_js[4]);
  deltas[5][2] = aom_sdotq_s16(deltas[5][2], d_is[11], d_js[5]);
  deltas[5][3] = aom_sdotq_s16(deltas[5][3], d_is[10], d_js[6]);
  deltas[5][3] = aom_sdotq_s16(deltas[5][3], d_is[11], d_js[7]);
  deltas[5][4] = aom_sdotq_s16(deltas[5][4], d_is[10], d_js[8]);
  deltas[5][4] = aom_sdotq_s16(deltas[5][4], d_is[11], d_js[9]);
  deltas[5][5] = aom_sdotq_s16(deltas[5][5], d_is[10], d_js[10]);
  deltas[5][5] = aom_sdotq_s16(deltas[5][5], d_is[11], d_js[11]);

  // Add the "end" contributions.
  deltas[0][0] = aom_sdotq_s16(deltas[0][0], d_ie[0], d_je[0]);
  deltas[0][0] = aom_sdotq_s16(deltas[0][0], d_ie[1], d_je[1]);
  deltas[0][1] = aom_sdotq_s16(deltas[0][1], d_ie[0], d_je[2]);
  deltas[0][1] = aom_sdotq_s16(deltas[0][1], d_ie[1], d_je[3]);
  deltas[0][2] = aom_sdotq_s16(deltas[0][2], d_ie[0], d_je[4]);
  deltas[0][2] = aom_sdotq_s16(deltas[0][2], d_ie[1], d_je[5]);
  deltas[0][3] = aom_sdotq_s16(deltas[0][3], d_ie[0], d_je[6]);
  deltas[0][3] = aom_sdotq_s16(deltas[0][3], d_ie[1], d_je[7]);
  deltas[0][4] = aom_sdotq_s16(deltas[0][4], d_ie[0], d_je[8]);
  deltas[0][4] = aom_sdotq_s16(deltas[0][4], d_ie[1], d_je[9]);
  deltas[0][5] = aom_sdotq_s16(deltas[0][5], d_ie[0], d_je[10]);
  deltas[0][5] = aom_sdotq_s16(deltas[0][5], d_ie[1], d_je[11]);

  deltas[1][0] = aom_sdotq_s16(deltas[1][0], d_ie[2], d_je[0]);
  deltas[1][0] = aom_sdotq_s16(deltas[1][0], d_ie[3], d_je[1]);
  deltas[1][1] = aom_sdotq_s16(deltas[1][1], d_ie[2], d_je[2]);
  deltas[1][1] = aom_sdotq_s16(deltas[1][1], d_ie[3], d_je[3]);
  deltas[1][2] = aom_sdotq_s16(deltas[1][2], d_ie[2], d_je[4]);
  deltas[1][2] = aom_sdotq_s16(deltas[1][2], d_ie[3], d_je[5]);
  deltas[1][3] = aom_sdotq_s16(deltas[1][3], d_ie[2], d_je[6]);
  deltas[1][3] = aom_sdotq_s16(deltas[1][3], d_ie[3], d_je[7]);
  deltas[1][4] = aom_sdotq_s16(deltas[1][4], d_ie[2], d_je[8]);
  deltas[1][4] = aom_sdotq_s16(deltas[1][4], d_ie[3], d_je[9]);
  deltas[1][5] = aom_sdotq_s16(deltas[1][5], d_ie[2], d_je[10]);
  deltas[1][5] = aom_sdotq_s16(deltas[1][5], d_ie[3], d_je[11]);

  deltas[2][0] = aom_sdotq_s16(deltas[2][0], d_ie[4], d_je[0]);
  deltas[2][0] = aom_sdotq_s16(deltas[2][0], d_ie[5], d_je[1]);
  deltas[2][1] = aom_sdotq_s16(deltas[2][1], d_ie[4], d_je[2]);
  deltas[2][1] = aom_sdotq_s16(deltas[2][1], d_ie[5], d_je[3]);
  deltas[2][2] = aom_sdotq_s16(deltas[2][2], d_ie[4], d_je[4]);
  deltas[2][2] = aom_sdotq_s16(deltas[2][2], d_ie[5], d_je[5]);
  deltas[2][3] = aom_sdotq_s16(deltas[2][3], d_ie[4], d_je[6]);
  deltas[2][3] = aom_sdotq_s16(deltas[2][3], d_ie[5], d_je[7]);
  deltas[2][4] = aom_sdotq_s16(deltas[2][4], d_ie[4], d_je[8]);
  deltas[2][4] = aom_sdotq_s16(deltas[2][4], d_ie[5], d_je[9]);
  deltas[2][5] = aom_sdotq_s16(deltas[2][5], d_ie[4], d_je[10]);
  deltas[2][5] = aom_sdotq_s16(deltas[2][5], d_ie[5], d_je[11]);

  deltas[3][0] = aom_sdotq_s16(deltas[3][0], d_ie[6], d_je[0]);
  deltas[3][0] = aom_sdotq_s16(deltas[3][0], d_ie[7], d_je[1]);
  deltas[3][1] = aom_sdotq_s16(deltas[3][1], d_ie[6], d_je[2]);
  deltas[3][1] = aom_sdotq_s16(deltas[3][1], d_ie[7], d_je[3]);
  deltas[3][2] = aom_sdotq_s16(deltas[3][2], d_ie[6], d_je[4]);
  deltas[3][2] = aom_sdotq_s16(deltas[3][2], d_ie[7], d_je[5]);
  deltas[3][3] = aom_sdotq_s16(deltas[3][3], d_ie[6], d_je[6]);
  deltas[3][3] = aom_sdotq_s16(deltas[3][3], d_ie[7], d_je[7]);
  deltas[3][4] = aom_sdotq_s16(deltas[3][4], d_ie[6], d_je[8]);
  deltas[3][4] = aom_sdotq_s16(deltas[3][4], d_ie[7], d_je[9]);
  deltas[3][5] = aom_sdotq_s16(deltas[3][5], d_ie[6], d_je[10]);
  deltas[3][5] = aom_sdotq_s16(deltas[3][5], d_ie[7], d_je[11]);

  deltas[4][0] = aom_sdotq_s16(deltas[4][0], d_ie[8], d_je[0]);
  deltas[4][0] = aom_sdotq_s16(deltas[4][0], d_ie[9], d_je[1]);
  deltas[4][1] = aom_sdotq_s16(deltas[4][1], d_ie[8], d_je[2]);
  deltas[4][1] = aom_sdotq_s16(deltas[4][1], d_ie[9], d_je[3]);
  deltas[4][2] = aom_sdotq_s16(deltas[4][2], d_ie[8], d_je[4]);
  deltas[4][2] = aom_sdotq_s16(deltas[4][2], d_ie[9], d_je[5]);
  deltas[4][3] = aom_sdotq_s16(deltas[4][3], d_ie[8], d_je[6]);
  deltas[4][3] = aom_sdotq_s16(deltas[4][3], d_ie[9], d_je[7]);
  deltas[4][4] = aom_sdotq_s16(deltas[4][4], d_ie[8], d_je[8]);
  deltas[4][4] = aom_sdotq_s16(deltas[4][4], d_ie[9], d_je[9]);
  deltas[4][5] = aom_sdotq_s16(deltas[4][5], d_ie[8], d_je[10]);
  deltas[4][5] = aom_sdotq_s16(deltas[4][5], d_ie[9], d_je[11]);

  deltas[5][0] = aom_sdotq_s16(deltas[5][0], d_ie[10], d_je[0]);
  deltas[5][0] = aom_sdotq_s16(deltas[5][0], d_ie[11], d_je[1]);
  deltas[5][1] = aom_sdotq_s16(deltas[5][1], d_ie[10], d_je[2]);
  deltas[5][1] = aom_sdotq_s16(deltas[5][1], d_ie[11], d_je[3]);
  deltas[5][2] = aom_sdotq_s16(deltas[5][2], d_ie[10], d_je[4]);
  deltas[5][2] = aom_sdotq_s16(deltas[5][2], d_ie[11], d_je[5]);
  deltas[5][3] = aom_sdotq_s16(deltas[5][3], d_ie[10], d_je[6]);
  deltas[5][3] = aom_sdotq_s16(deltas[5][3], d_ie[11], d_je[7]);
  deltas[5][4] = aom_sdotq_s16(deltas[5][4], d_ie[10], d_je[8]);
  deltas[5][4] = aom_sdotq_s16(deltas[5][4], d_ie[11], d_je[9]);
  deltas[5][5] = aom_sdotq_s16(deltas[5][5], d_ie[10], d_je[10]);
  deltas[5][5] = aom_sdotq_s16(deltas[5][5], d_ie[11], d_je[11]);
}

// Horizontally add 6 delta accumulators pairwise and add them to 6
// consecutive stats loaded from src, storing the result at dst.
// dst[k] = src[k] + horizontal_sum(deltas[k]).
static inline void hadd_update_6_stats_sve(const int64_t *const src,
                                           const int64x2_t *deltas,
                                           int64_t *const dst) {
  int64x2_t src0 = vld1q_s64(src + 0);
  int64x2_t src1 = vld1q_s64(src + 2);
  int64x2_t src2 = vld1q_s64(src + 4);

  // vpaddq_s64 reduces each accumulator to one lane: { sum(d[0]), sum(d[1]) }.
  int64x2_t deltas01 = vpaddq_s64(deltas[0], deltas[1]);
  int64x2_t deltas23 = vpaddq_s64(deltas[2], deltas[3]);
  int64x2_t deltas45 = vpaddq_s64(deltas[4], deltas[5]);

  vst1q_s64(dst + 0, vaddq_s64(src0, deltas01));
  vst1q_s64(dst + 2, vaddq_s64(src1, deltas23));
  vst1q_s64(dst + 4, vaddq_s64(src2, deltas45));
}

// Load the 6 "start" rows (d_is) and 6 "end" rows (d_ie, height rows further
// down) of a single column for the triangle computation, as 12 half-vectors
// each, using the predicates p0/p1 to zero out-of-range lanes.
static inline void load_triangle_win7_sve(const int16_t *const di,
                                          const int32_t d_stride,
                                          const int32_t height, int16x8_t *d_is,
                                          int16x8_t *d_ie, svbool_t p0,
                                          svbool_t p1) {
  d_is[0] = svget_neonq_s16(svld1_s16(p0, di + 0 * d_stride + 0));
  d_is[1] = svget_neonq_s16(svld1_s16(p1, di + 0 * d_stride + 8));
  d_is[2] = svget_neonq_s16(svld1_s16(p0, di + 1 * d_stride + 0));
  d_is[3] = svget_neonq_s16(svld1_s16(p1, di + 1 * d_stride + 8));
  d_is[4] = svget_neonq_s16(svld1_s16(p0, di + 2 * d_stride + 0));
  d_is[5] = svget_neonq_s16(svld1_s16(p1, di + 2 * d_stride + 8));
  d_is[6] = svget_neonq_s16(svld1_s16(p0, di + 3 * d_stride + 0));
  d_is[7] = svget_neonq_s16(svld1_s16(p1, di + 3 * d_stride + 8));
  d_is[8] = svget_neonq_s16(svld1_s16(p0, di + 4 * d_stride + 0));
  d_is[9] = svget_neonq_s16(svld1_s16(p1, di + 4 * d_stride + 8));
  d_is[10] = svget_neonq_s16(svld1_s16(p0, di + 5 * d_stride + 0));
  d_is[11] = svget_neonq_s16(svld1_s16(p1, di + 5 * d_stride + 8));

  d_ie[0] = svget_neonq_s16(svld1_s16(p0, di + (height + 0) * d_stride + 0));
  d_ie[1] = svget_neonq_s16(svld1_s16(p1, di + (height + 0) * d_stride + 8));
  d_ie[2] = svget_neonq_s16(svld1_s16(p0, di + (height + 1) * d_stride + 0));
  d_ie[3] = svget_neonq_s16(svld1_s16(p1, di + (height + 1) * d_stride + 8));
  d_ie[4] = svget_neonq_s16(svld1_s16(p0, di + (height + 2) * d_stride + 0));
  d_ie[5] = svget_neonq_s16(svld1_s16(p1, di + (height + 2) * d_stride + 8));
  d_ie[6] = svget_neonq_s16(svld1_s16(p0, di + (height + 3) * d_stride + 0));
  d_ie[7] = svget_neonq_s16(svld1_s16(p1, di + (height + 3) * d_stride + 8));
  d_ie[8] = svget_neonq_s16(svld1_s16(p0, di + (height + 4) * d_stride + 0));
  d_ie[9] = svget_neonq_s16(svld1_s16(p1, di + (height + 4) * d_stride + 8));
  d_ie[10] = svget_neonq_s16(svld1_s16(p0, di + (height + 5) * d_stride + 0));
  d_ie[11] = svget_neonq_s16(svld1_s16(p1, di + (height + 5) * d_stride + 8));
}

// Accumulate the 21 upper-triangle deltas for one column: for each row pair
// (r0, r1) with r0 <= r1 over 6 rows, deltas[k] accumulates
// dot(d_ie[r0], d_ie[r1]) - dot(d_is[r0], d_is[r1]). The negation of the
// first operand implements the subtraction; d_is itself is not modified
// (unlike derive_square_win7_sve, which negates in place).
static inline void derive_triangle_win7_sve(const int16x8_t *d_is,
                                            const int16x8_t *d_ie,
                                            int64x2_t *deltas) {
  // Row 0 vs rows 0..5 (deltas 0-5), start samples subtracted.
  deltas[0] = aom_sdotq_s16(deltas[0], vnegq_s16(d_is[0]), d_is[0]);
  deltas[0] = aom_sdotq_s16(deltas[0], vnegq_s16(d_is[1]), d_is[1]);
  deltas[1] = aom_sdotq_s16(deltas[1], vnegq_s16(d_is[0]), d_is[2]);
  deltas[1] = aom_sdotq_s16(deltas[1], vnegq_s16(d_is[1]), d_is[3]);
  deltas[2] = aom_sdotq_s16(deltas[2], vnegq_s16(d_is[0]), d_is[4]);
  deltas[2] = aom_sdotq_s16(deltas[2], vnegq_s16(d_is[1]), d_is[5]);
  deltas[3] = aom_sdotq_s16(deltas[3], vnegq_s16(d_is[0]), d_is[6]);
  deltas[3] = aom_sdotq_s16(deltas[3], vnegq_s16(d_is[1]), d_is[7]);
  deltas[4] = aom_sdotq_s16(deltas[4], vnegq_s16(d_is[0]), d_is[8]);
  deltas[4] = aom_sdotq_s16(deltas[4], vnegq_s16(d_is[1]), d_is[9]);
  deltas[5] = aom_sdotq_s16(deltas[5], vnegq_s16(d_is[0]), d_is[10]);
  deltas[5] = aom_sdotq_s16(deltas[5], vnegq_s16(d_is[1]), d_is[11]);

  // Row 1 vs rows 1..5 (deltas 6-10).
  deltas[6] = aom_sdotq_s16(deltas[6], vnegq_s16(d_is[2]), d_is[2]);
  deltas[6] = aom_sdotq_s16(deltas[6], vnegq_s16(d_is[3]), d_is[3]);
  deltas[7] = aom_sdotq_s16(deltas[7], vnegq_s16(d_is[2]), d_is[4]);
  deltas[7] = aom_sdotq_s16(deltas[7], vnegq_s16(d_is[3]), d_is[5]);
  deltas[8] = aom_sdotq_s16(deltas[8], vnegq_s16(d_is[2]), d_is[6]);
  deltas[8] = aom_sdotq_s16(deltas[8], vnegq_s16(d_is[3]), d_is[7]);
  deltas[9] = aom_sdotq_s16(deltas[9], vnegq_s16(d_is[2]), d_is[8]);
  deltas[9] = aom_sdotq_s16(deltas[9], vnegq_s16(d_is[3]), d_is[9]);
  deltas[10] = aom_sdotq_s16(deltas[10], vnegq_s16(d_is[2]), d_is[10]);
  deltas[10] = aom_sdotq_s16(deltas[10], vnegq_s16(d_is[3]), d_is[11]);

  // Row 2 vs rows 2..5 (deltas 11-14).
  deltas[11] = aom_sdotq_s16(deltas[11], vnegq_s16(d_is[4]), d_is[4]);
  deltas[11] = aom_sdotq_s16(deltas[11], vnegq_s16(d_is[5]), d_is[5]);
  deltas[12] = aom_sdotq_s16(deltas[12], vnegq_s16(d_is[4]), d_is[6]);
  deltas[12] = aom_sdotq_s16(deltas[12], vnegq_s16(d_is[5]), d_is[7]);
  deltas[13] = aom_sdotq_s16(deltas[13], vnegq_s16(d_is[4]), d_is[8]);
  deltas[13] = aom_sdotq_s16(deltas[13], vnegq_s16(d_is[5]), d_is[9]);
  deltas[14] = aom_sdotq_s16(deltas[14], vnegq_s16(d_is[4]), d_is[10]);
  deltas[14] = aom_sdotq_s16(deltas[14], vnegq_s16(d_is[5]), d_is[11]);

  // Row 3 vs rows 3..5 (deltas 15-17).
  deltas[15] = aom_sdotq_s16(deltas[15], vnegq_s16(d_is[6]), d_is[6]);
  deltas[15] = aom_sdotq_s16(deltas[15], vnegq_s16(d_is[7]), d_is[7]);
  deltas[16] = aom_sdotq_s16(deltas[16], vnegq_s16(d_is[6]), d_is[8]);
  deltas[16] = aom_sdotq_s16(deltas[16], vnegq_s16(d_is[7]), d_is[9]);
  deltas[17] = aom_sdotq_s16(deltas[17], vnegq_s16(d_is[6]), d_is[10]);
  deltas[17] = aom_sdotq_s16(deltas[17], vnegq_s16(d_is[7]), d_is[11]);

  // Row 4 vs rows 4..5 (deltas 18-19).
  deltas[18] = aom_sdotq_s16(deltas[18], vnegq_s16(d_is[8]), d_is[8]);
  deltas[18] = aom_sdotq_s16(deltas[18], vnegq_s16(d_is[9]), d_is[9]);
  deltas[19] = aom_sdotq_s16(deltas[19], vnegq_s16(d_is[8]), d_is[10]);
  deltas[19] = aom_sdotq_s16(deltas[19], vnegq_s16(d_is[9]), d_is[11]);

  // Row 5 vs row 5 (delta 20).
  deltas[20] = aom_sdotq_s16(deltas[20], vnegq_s16(d_is[10]), d_is[10]);
  deltas[20] = aom_sdotq_s16(deltas[20], vnegq_s16(d_is[11]), d_is[11]);

  // Same pairings for the end samples, added with positive sign.
  deltas[0] = aom_sdotq_s16(deltas[0], d_ie[0], d_ie[0]);
  deltas[0] = aom_sdotq_s16(deltas[0], d_ie[1], d_ie[1]);
  deltas[1] = aom_sdotq_s16(deltas[1], d_ie[0], d_ie[2]);
  deltas[1] = aom_sdotq_s16(deltas[1], d_ie[1], d_ie[3]);
  deltas[2] = aom_sdotq_s16(deltas[2], d_ie[0], d_ie[4]);
  deltas[2] = aom_sdotq_s16(deltas[2], d_ie[1], d_ie[5]);
  deltas[3] = aom_sdotq_s16(deltas[3], d_ie[0], d_ie[6]);
  deltas[3] = aom_sdotq_s16(deltas[3], d_ie[1], d_ie[7]);
  deltas[4] = aom_sdotq_s16(deltas[4], d_ie[0], d_ie[8]);
  deltas[4] = aom_sdotq_s16(deltas[4], d_ie[1], d_ie[9]);
  deltas[5] = aom_sdotq_s16(deltas[5], d_ie[0], d_ie[10]);
  deltas[5] = aom_sdotq_s16(deltas[5], d_ie[1], d_ie[11]);

  deltas[6] = aom_sdotq_s16(deltas[6], d_ie[2], d_ie[2]);
  deltas[6] = aom_sdotq_s16(deltas[6], d_ie[3], d_ie[3]);
  deltas[7] = aom_sdotq_s16(deltas[7], d_ie[2], d_ie[4]);
  deltas[7] = aom_sdotq_s16(deltas[7], d_ie[3], d_ie[5]);
  deltas[8] = aom_sdotq_s16(deltas[8], d_ie[2], d_ie[6]);
  deltas[8] = aom_sdotq_s16(deltas[8], d_ie[3], d_ie[7]);
  deltas[9] = aom_sdotq_s16(deltas[9], d_ie[2], d_ie[8]);
  deltas[9] = aom_sdotq_s16(deltas[9], d_ie[3], d_ie[9]);
  deltas[10] = aom_sdotq_s16(deltas[10], d_ie[2], d_ie[10]);
  deltas[10] = aom_sdotq_s16(deltas[10], d_ie[3], d_ie[11]);

  deltas[11] = aom_sdotq_s16(deltas[11], d_ie[4], d_ie[4]);
  deltas[11] = aom_sdotq_s16(deltas[11], d_ie[5], d_ie[5]);
  deltas[12] = aom_sdotq_s16(deltas[12], d_ie[4], d_ie[6]);
  deltas[12] = aom_sdotq_s16(deltas[12], d_ie[5], d_ie[7]);
  deltas[13] = aom_sdotq_s16(deltas[13], d_ie[4], d_ie[8]);
  deltas[13] = aom_sdotq_s16(deltas[13], d_ie[5], d_ie[9]);
  deltas[14] = aom_sdotq_s16(deltas[14], d_ie[4], d_ie[10]);
  deltas[14] = aom_sdotq_s16(deltas[14], d_ie[5], d_ie[11]);

  deltas[15] = aom_sdotq_s16(deltas[15], d_ie[6], d_ie[6]);
  deltas[15] = aom_sdotq_s16(deltas[15], d_ie[7], d_ie[7]);
  deltas[16] = aom_sdotq_s16(deltas[16], d_ie[6], d_ie[8]);
  deltas[16] = aom_sdotq_s16(deltas[16], d_ie[7], d_ie[9]);
  deltas[17] = aom_sdotq_s16(deltas[17], d_ie[6], d_ie[10]);
  deltas[17] = aom_sdotq_s16(deltas[17], d_ie[7], d_ie[11]);

  deltas[18] = aom_sdotq_s16(deltas[18], d_ie[8], d_ie[8]);
  deltas[18] = aom_sdotq_s16(deltas[18], d_ie[9], d_ie[9]);
  deltas[19] = aom_sdotq_s16(deltas[19], d_ie[8], d_ie[10]);
  deltas[19] = aom_sdotq_s16(deltas[19], d_ie[9], d_ie[11]);

  deltas[20] = aom_sdotq_s16(deltas[20], d_ie[10], d_ie[10]);
  deltas[20] = aom_sdotq_s16(deltas[20], d_ie[11], d_ie[11]);
}

// Compute the Wiener-filter statistics (cross-correlation matrix M and
// auto-covariance matrix H) for the 7-tap window from degraded samples d and
// source samples s.
static inline void compute_stats_win7_sve(
    const int16_t *const d, const int32_t d_stride, const int16_t *const s,
    const int32_t s_stride, const int32_t width, const int32_t height,
    int64_t *const M, int64_t *const H) {
  const int32_t wiener_win = WIENER_WIN;
  const int32_t wiener_win2 = wiener_win * wiener_win;
  const int32_t h8 = height & ~7;  // Height rounded down to a multiple of 8.
  int32_t i, j, x, y;

  // Use a predicate to compute the last columns.
  svbool_t p0 = svwhilelt_b16_u32(0, width % 16 == 0 ? 16 : width % 16);
  svbool_t p1 = svwhilelt_b16_u32(8, width % 16 == 0 ? 16 : width % 16);

  // Step 1: Calculate the top edge of the whole matrix, i.e., the top
  // edge of each triangle and square on the top row.
1423 j = 0; 1424 do { 1425 const int16_t *s_t = s; 1426 const int16_t *d_t = d; 1427 int64x2_t sum_m[WIENER_WIN] = { vdupq_n_s64(0) }; 1428 int64x2_t sum_h[WIENER_WIN] = { vdupq_n_s64(0) }; 1429 int16x8_t src[2], dgd[2]; 1430 1431 y = height; 1432 do { 1433 x = 0; 1434 while (x < width - 16) { 1435 src[0] = vld1q_s16(s_t + x + 0); 1436 src[1] = vld1q_s16(s_t + x + 8); 1437 dgd[0] = vld1q_s16(d_t + x + 0); 1438 dgd[1] = vld1q_s16(d_t + x + 8); 1439 stats_top_win7_sve(src, dgd, d_t + j + x, d_stride, sum_m, sum_h); 1440 x += 16; 1441 } 1442 1443 src[0] = svget_neonq_s16(svld1_s16(p0, s_t + x + 0)); 1444 src[1] = svget_neonq_s16(svld1_s16(p1, s_t + x + 8)); 1445 dgd[0] = svget_neonq_s16(svld1_s16(p0, d_t + x + 0)); 1446 dgd[1] = svget_neonq_s16(svld1_s16(p1, d_t + x + 8)); 1447 stats_top_win7_sve(src, dgd, d_t + j + x, d_stride, sum_m, sum_h); 1448 1449 s_t += s_stride; 1450 d_t += d_stride; 1451 } while (--y); 1452 1453 vst1q_s64(M + wiener_win * j + 0, vpaddq_s64(sum_m[0], sum_m[1])); 1454 vst1q_s64(M + wiener_win * j + 2, vpaddq_s64(sum_m[2], sum_m[3])); 1455 vst1q_s64(M + wiener_win * j + 4, vpaddq_s64(sum_m[4], sum_m[5])); 1456 M[wiener_win * j + 6] = vaddvq_s64(sum_m[6]); 1457 1458 vst1q_s64(H + wiener_win * j + 0, vpaddq_s64(sum_h[0], sum_h[1])); 1459 vst1q_s64(H + wiener_win * j + 2, vpaddq_s64(sum_h[2], sum_h[3])); 1460 vst1q_s64(H + wiener_win * j + 4, vpaddq_s64(sum_h[4], sum_h[5])); 1461 H[wiener_win * j + 6] = vaddvq_s64(sum_h[6]); 1462 } while (++j < wiener_win); 1463 1464 // Step 2: Calculate the left edge of each square on the top row. 
1465 j = 1; 1466 do { 1467 const int16_t *d_t = d; 1468 int64x2_t sum_h[WIENER_WIN - 1] = { vdupq_n_s64(0) }; 1469 int16x8_t dgd[2]; 1470 1471 y = height; 1472 do { 1473 x = 0; 1474 while (x < width - 16) { 1475 dgd[0] = vld1q_s16(d_t + j + x + 0); 1476 dgd[1] = vld1q_s16(d_t + j + x + 8); 1477 stats_left_win7_sve(dgd, d_t + x, d_stride, sum_h); 1478 x += 16; 1479 } 1480 1481 dgd[0] = svget_neonq_s16(svld1_s16(p0, d_t + j + x + 0)); 1482 dgd[1] = svget_neonq_s16(svld1_s16(p1, d_t + j + x + 8)); 1483 stats_left_win7_sve(dgd, d_t + x, d_stride, sum_h); 1484 1485 d_t += d_stride; 1486 } while (--y); 1487 1488 int64x2_t sum_h01 = vpaddq_s64(sum_h[0], sum_h[1]); 1489 int64x2_t sum_h23 = vpaddq_s64(sum_h[2], sum_h[3]); 1490 int64x2_t sum_h45 = vpaddq_s64(sum_h[4], sum_h[5]); 1491 vst1_s64(&H[1 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h01)); 1492 vst1_s64(&H[2 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h01)); 1493 vst1_s64(&H[3 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h23)); 1494 vst1_s64(&H[4 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h23)); 1495 vst1_s64(&H[5 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h45)); 1496 vst1_s64(&H[6 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h45)); 1497 } while (++j < wiener_win); 1498 1499 // Step 3: Derive the top edge of each triangle along the diagonal. No 1500 // triangle in top row. 1501 { 1502 const int16_t *d_t = d; 1503 // Pad to call transpose function. 
1504 int32x4_t deltas[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) }; 1505 int32x4_t deltas_tr[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) }; 1506 int16x8_t ds[WIENER_WIN * 2]; 1507 1508 load_s16_8x6(d_t, d_stride, &ds[0], &ds[2], &ds[4], &ds[6], &ds[8], 1509 &ds[10]); 1510 load_s16_8x6(d_t + width, d_stride, &ds[1], &ds[3], &ds[5], &ds[7], &ds[9], 1511 &ds[11]); 1512 1513 d_t += 6 * d_stride; 1514 1515 step3_win7_neon(d_t, d_stride, width, height, ds, deltas); 1516 transpose_arrays_s32_8x8(deltas, deltas_tr); 1517 1518 update_8_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win, 1519 deltas_tr[0], deltas_tr[4], 1520 H + 1 * wiener_win * wiener_win2 + 1 * wiener_win); 1521 update_8_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win, 1522 deltas_tr[1], deltas_tr[5], 1523 H + 2 * wiener_win * wiener_win2 + 2 * wiener_win); 1524 update_8_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win, 1525 deltas_tr[2], deltas_tr[6], 1526 H + 3 * wiener_win * wiener_win2 + 3 * wiener_win); 1527 update_8_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win, 1528 deltas_tr[3], deltas_tr[7], 1529 H + 4 * wiener_win * wiener_win2 + 4 * wiener_win); 1530 update_8_stats_neon(H + 4 * wiener_win * wiener_win2 + 4 * wiener_win, 1531 deltas_tr[8], deltas_tr[12], 1532 H + 5 * wiener_win * wiener_win2 + 5 * wiener_win); 1533 update_8_stats_neon(H + 5 * wiener_win * wiener_win2 + 5 * wiener_win, 1534 deltas_tr[9], deltas_tr[13], 1535 H + 6 * wiener_win * wiener_win2 + 6 * wiener_win); 1536 } 1537 1538 // Step 4: Derive the top and left edge of each square. No square in top and 1539 // bottom row. 1540 1541 i = 1; 1542 do { 1543 j = i + 1; 1544 do { 1545 const int16_t *di = d + i - 1; 1546 const int16_t *dj = d + j - 1; 1547 int64x2_t deltas[(2 * WIENER_WIN - 1) * 2] = { vdupq_n_s64(0) }; 1548 int16x8_t dd[WIENER_WIN * 2], ds[WIENER_WIN * 2]; 1549 1550 dd[5] = vdupq_n_s16(0); // Initialize to avoid warning. 
1551 const int16_t dd0_values[] = { di[0 * d_stride], 1552 di[1 * d_stride], 1553 di[2 * d_stride], 1554 di[3 * d_stride], 1555 di[4 * d_stride], 1556 di[5 * d_stride], 1557 0, 1558 0 }; 1559 dd[0] = vld1q_s16(dd0_values); 1560 const int16_t dd1_values[] = { di[0 * d_stride + width], 1561 di[1 * d_stride + width], 1562 di[2 * d_stride + width], 1563 di[3 * d_stride + width], 1564 di[4 * d_stride + width], 1565 di[5 * d_stride + width], 1566 0, 1567 0 }; 1568 dd[1] = vld1q_s16(dd1_values); 1569 const int16_t ds0_values[] = { dj[0 * d_stride], 1570 dj[1 * d_stride], 1571 dj[2 * d_stride], 1572 dj[3 * d_stride], 1573 dj[4 * d_stride], 1574 dj[5 * d_stride], 1575 0, 1576 0 }; 1577 ds[0] = vld1q_s16(ds0_values); 1578 int16_t ds1_values[] = { dj[0 * d_stride + width], 1579 dj[1 * d_stride + width], 1580 dj[2 * d_stride + width], 1581 dj[3 * d_stride + width], 1582 dj[4 * d_stride + width], 1583 dj[5 * d_stride + width], 1584 0, 1585 0 }; 1586 ds[1] = vld1q_s16(ds1_values); 1587 1588 y = 0; 1589 while (y < h8) { 1590 // 00s 10s 20s 30s 40s 50s 60s 70s 00e 10e 20e 30e 40e 50e 60e 70e 1591 dd[0] = vsetq_lane_s16(di[6 * d_stride], dd[0], 6); 1592 dd[0] = vsetq_lane_s16(di[7 * d_stride], dd[0], 7); 1593 dd[1] = vsetq_lane_s16(di[6 * d_stride + width], dd[1], 6); 1594 dd[1] = vsetq_lane_s16(di[7 * d_stride + width], dd[1], 7); 1595 1596 // 00s 10s 20s 30s 40s 50s 60s 70s 00e 10e 20e 30e 40e 50e 60e 70e 1597 // 01s 11s 21s 31s 41s 51s 61s 71s 01e 11e 21e 31e 41e 51e 61e 71e 1598 ds[0] = vsetq_lane_s16(dj[6 * d_stride], ds[0], 6); 1599 ds[0] = vsetq_lane_s16(dj[7 * d_stride], ds[0], 7); 1600 ds[1] = vsetq_lane_s16(dj[6 * d_stride + width], ds[1], 6); 1601 ds[1] = vsetq_lane_s16(dj[7 * d_stride + width], ds[1], 7); 1602 1603 load_more_16_neon(di + 8 * d_stride, width, &dd[0], &dd[2]); 1604 load_more_16_neon(dj + 8 * d_stride, width, &ds[0], &ds[2]); 1605 load_more_16_neon(di + 9 * d_stride, width, &dd[2], &dd[4]); 1606 load_more_16_neon(dj + 9 * d_stride, width, &ds[2], &ds[4]); 
1607 load_more_16_neon(di + 10 * d_stride, width, &dd[4], &dd[6]); 1608 load_more_16_neon(dj + 10 * d_stride, width, &ds[4], &ds[6]); 1609 load_more_16_neon(di + 11 * d_stride, width, &dd[6], &dd[8]); 1610 load_more_16_neon(dj + 11 * d_stride, width, &ds[6], &ds[8]); 1611 load_more_16_neon(di + 12 * d_stride, width, &dd[8], &dd[10]); 1612 load_more_16_neon(dj + 12 * d_stride, width, &ds[8], &ds[10]); 1613 load_more_16_neon(di + 13 * d_stride, width, &dd[10], &dd[12]); 1614 load_more_16_neon(dj + 13 * d_stride, width, &ds[10], &ds[12]); 1615 1616 deltas[0] = aom_sdotq_s16(deltas[0], dd[0], ds[0]); 1617 deltas[1] = aom_sdotq_s16(deltas[1], dd[1], ds[1]); 1618 deltas[2] = aom_sdotq_s16(deltas[2], dd[0], ds[2]); 1619 deltas[3] = aom_sdotq_s16(deltas[3], dd[1], ds[3]); 1620 deltas[4] = aom_sdotq_s16(deltas[4], dd[0], ds[4]); 1621 deltas[5] = aom_sdotq_s16(deltas[5], dd[1], ds[5]); 1622 deltas[6] = aom_sdotq_s16(deltas[6], dd[0], ds[6]); 1623 deltas[7] = aom_sdotq_s16(deltas[7], dd[1], ds[7]); 1624 deltas[8] = aom_sdotq_s16(deltas[8], dd[0], ds[8]); 1625 deltas[9] = aom_sdotq_s16(deltas[9], dd[1], ds[9]); 1626 deltas[10] = aom_sdotq_s16(deltas[10], dd[0], ds[10]); 1627 deltas[11] = aom_sdotq_s16(deltas[11], dd[1], ds[11]); 1628 deltas[12] = aom_sdotq_s16(deltas[12], dd[0], ds[12]); 1629 deltas[13] = aom_sdotq_s16(deltas[13], dd[1], ds[13]); 1630 deltas[14] = aom_sdotq_s16(deltas[14], dd[2], ds[0]); 1631 deltas[15] = aom_sdotq_s16(deltas[15], dd[3], ds[1]); 1632 deltas[16] = aom_sdotq_s16(deltas[16], dd[4], ds[0]); 1633 deltas[17] = aom_sdotq_s16(deltas[17], dd[5], ds[1]); 1634 deltas[18] = aom_sdotq_s16(deltas[18], dd[6], ds[0]); 1635 deltas[19] = aom_sdotq_s16(deltas[19], dd[7], ds[1]); 1636 deltas[20] = aom_sdotq_s16(deltas[20], dd[8], ds[0]); 1637 deltas[21] = aom_sdotq_s16(deltas[21], dd[9], ds[1]); 1638 deltas[22] = aom_sdotq_s16(deltas[22], dd[10], ds[0]); 1639 deltas[23] = aom_sdotq_s16(deltas[23], dd[11], ds[1]); 1640 deltas[24] = aom_sdotq_s16(deltas[24], 
dd[12], ds[0]); 1641 deltas[25] = aom_sdotq_s16(deltas[25], dd[13], ds[1]); 1642 1643 dd[0] = vextq_s16(dd[12], vdupq_n_s16(0), 2); 1644 dd[1] = vextq_s16(dd[13], vdupq_n_s16(0), 2); 1645 ds[0] = vextq_s16(ds[12], vdupq_n_s16(0), 2); 1646 ds[1] = vextq_s16(ds[13], vdupq_n_s16(0), 2); 1647 1648 di += 8 * d_stride; 1649 dj += 8 * d_stride; 1650 y += 8; 1651 } 1652 1653 int64x2_t deltas02 = vpaddq_s64(deltas[0], deltas[2]); 1654 int64x2_t deltas13 = vpaddq_s64(deltas[1], deltas[3]); 1655 int64x2_t deltas46 = vpaddq_s64(deltas[4], deltas[6]); 1656 int64x2_t deltas57 = vpaddq_s64(deltas[5], deltas[7]); 1657 int64x2_t deltas810 = vpaddq_s64(deltas[8], deltas[10]); 1658 int64x2_t deltas911 = vpaddq_s64(deltas[9], deltas[11]); 1659 int64x2_t deltas1212 = vpaddq_s64(deltas[12], deltas[12]); 1660 int64x2_t deltas1313 = vpaddq_s64(deltas[13], deltas[13]); 1661 int64x2_t deltas1416 = vpaddq_s64(deltas[14], deltas[16]); 1662 int64x2_t deltas1820 = vpaddq_s64(deltas[18], deltas[20]); 1663 int64x2_t deltas1517 = vpaddq_s64(deltas[15], deltas[17]); 1664 int64x2_t deltas1921 = vpaddq_s64(deltas[19], deltas[21]); 1665 int64x2_t deltas2224 = vpaddq_s64(deltas[22], deltas[24]); 1666 int64x2_t deltas2325 = vpaddq_s64(deltas[23], deltas[25]); 1667 deltas02 = vsubq_s64(deltas13, deltas02); 1668 deltas46 = vsubq_s64(deltas57, deltas46); 1669 deltas810 = vsubq_s64(deltas911, deltas810); 1670 deltas1212 = vsubq_s64(deltas1313, deltas1212); 1671 deltas1416 = vsubq_s64(deltas1517, deltas1416); 1672 deltas1820 = vsubq_s64(deltas1921, deltas1820); 1673 deltas2224 = vsubq_s64(deltas2325, deltas2224); 1674 1675 if (h8 != height) { 1676 const int16_t ds0_vals[] = { 1677 dj[0 * d_stride], dj[0 * d_stride + width], 1678 dj[1 * d_stride], dj[1 * d_stride + width], 1679 dj[2 * d_stride], dj[2 * d_stride + width], 1680 dj[3 * d_stride], dj[3 * d_stride + width] 1681 }; 1682 ds[0] = vld1q_s16(ds0_vals); 1683 1684 ds[1] = vsetq_lane_s16(dj[4 * d_stride], ds[1], 0); 1685 ds[1] = vsetq_lane_s16(dj[4 * 
d_stride + width], ds[1], 1); 1686 ds[1] = vsetq_lane_s16(dj[5 * d_stride], ds[1], 2); 1687 ds[1] = vsetq_lane_s16(dj[5 * d_stride + width], ds[1], 3); 1688 const int16_t dd4_vals[] = { 1689 -di[1 * d_stride], di[1 * d_stride + width], 1690 -di[2 * d_stride], di[2 * d_stride + width], 1691 -di[3 * d_stride], di[3 * d_stride + width], 1692 -di[4 * d_stride], di[4 * d_stride + width] 1693 }; 1694 dd[4] = vld1q_s16(dd4_vals); 1695 1696 dd[5] = vsetq_lane_s16(-di[5 * d_stride], dd[5], 0); 1697 dd[5] = vsetq_lane_s16(di[5 * d_stride + width], dd[5], 1); 1698 do { 1699 dd[0] = vdupq_n_s16(-di[0 * d_stride]); 1700 dd[2] = dd[3] = vdupq_n_s16(di[0 * d_stride + width]); 1701 dd[0] = dd[1] = vzip1q_s16(dd[0], dd[2]); 1702 1703 ds[4] = vdupq_n_s16(dj[0 * d_stride]); 1704 ds[6] = ds[7] = vdupq_n_s16(dj[0 * d_stride + width]); 1705 ds[4] = ds[5] = vzip1q_s16(ds[4], ds[6]); 1706 1707 dd[5] = vsetq_lane_s16(-di[6 * d_stride], dd[5], 2); 1708 dd[5] = vsetq_lane_s16(di[6 * d_stride + width], dd[5], 3); 1709 ds[1] = vsetq_lane_s16(dj[6 * d_stride], ds[1], 4); 1710 ds[1] = vsetq_lane_s16(dj[6 * d_stride + width], ds[1], 5); 1711 1712 const int32x4_t res0 = 1713 vpaddq_s32(vmull_s16(vget_low_s16(dd[0]), vget_low_s16(ds[0])), 1714 vmull_s16(vget_high_s16(dd[0]), vget_high_s16(ds[0]))); 1715 deltas02 = vaddw_s32(deltas02, vget_low_s32(res0)); 1716 deltas46 = vaddw_s32(deltas46, vget_high_s32(res0)); 1717 const int32x4_t res1 = 1718 vpaddq_s32(vmull_s16(vget_low_s16(dd[1]), vget_low_s16(ds[1])), 1719 vmull_s16(vget_high_s16(dd[1]), vget_high_s16(ds[1]))); 1720 deltas810 = vaddw_s32(deltas810, vget_low_s32(res1)); 1721 deltas1212 = vaddw_s32(deltas1212, vget_high_s32(res1)); 1722 const int32x4_t res2 = 1723 vpaddq_s32(vmull_s16(vget_low_s16(dd[4]), vget_low_s16(ds[4])), 1724 vmull_s16(vget_high_s16(dd[4]), vget_high_s16(ds[4]))); 1725 deltas1416 = vaddw_s32(deltas1416, vget_low_s32(res2)); 1726 deltas1820 = vaddw_s32(deltas1820, vget_high_s32(res2)); 1727 const int32x4_t res3 = 1728 
vpaddq_s32(vmull_s16(vget_low_s16(dd[5]), vget_low_s16(ds[5])), 1729 vmull_s16(vget_high_s16(dd[5]), vget_high_s16(ds[5]))); 1730 deltas2224 = vaddw_s32(deltas2224, vget_low_s32(res3)); 1731 1732 int32_t tmp0 = vgetq_lane_s32(vreinterpretq_s32_s16(ds[0]), 0); 1733 ds[0] = vextq_s16(ds[0], ds[1], 2); 1734 ds[1] = vextq_s16(ds[1], ds[0], 2); 1735 ds[1] = vreinterpretq_s16_s32( 1736 vsetq_lane_s32(tmp0, vreinterpretq_s32_s16(ds[1]), 3)); 1737 int32_t tmp1 = vgetq_lane_s32(vreinterpretq_s32_s16(dd[4]), 0); 1738 dd[4] = vextq_s16(dd[4], dd[5], 2); 1739 dd[5] = vextq_s16(dd[5], dd[4], 2); 1740 dd[5] = vreinterpretq_s16_s32( 1741 vsetq_lane_s32(tmp1, vreinterpretq_s32_s16(dd[5]), 3)); 1742 di += d_stride; 1743 dj += d_stride; 1744 } while (++y < height); 1745 } 1746 1747 // Writing one more element on the top edge of a square falls to 1748 // the next square in the same row or the first element in the next 1749 // row, which will just be overwritten later. 1750 int64x2_t s0 = vld1q_s64(H + (i - 1) * wiener_win * wiener_win2 + 1751 (j - 1) * wiener_win + 0); 1752 int64x2_t s1 = vld1q_s64(H + (i - 1) * wiener_win * wiener_win2 + 1753 (j - 1) * wiener_win + 2); 1754 int64x2_t s2 = vld1q_s64(H + (i - 1) * wiener_win * wiener_win2 + 1755 (j - 1) * wiener_win + 4); 1756 int64x2_t s3 = vld1q_s64(H + (i - 1) * wiener_win * wiener_win2 + 1757 (j - 1) * wiener_win + 6); 1758 1759 vst1q_s64(H + i * wiener_win * wiener_win2 + j * wiener_win + 0, 1760 vaddq_s64(s0, deltas02)); 1761 vst1q_s64(H + i * wiener_win * wiener_win2 + j * wiener_win + 2, 1762 vaddq_s64(s1, deltas46)); 1763 vst1q_s64(H + i * wiener_win * wiener_win2 + j * wiener_win + 4, 1764 vaddq_s64(s2, deltas810)); 1765 vst1q_s64(H + i * wiener_win * wiener_win2 + j * wiener_win + 6, 1766 vaddq_s64(s3, deltas1212)); 1767 1768 H[(i * wiener_win + 1) * wiener_win2 + j * wiener_win] = 1769 H[((i - 1) * wiener_win + 1) * wiener_win2 + (j - 1) * wiener_win] + 1770 vgetq_lane_s64(deltas1416, 0); 1771 H[(i * wiener_win + 2) * 
wiener_win2 + j * wiener_win] = 1772 H[((i - 1) * wiener_win + 2) * wiener_win2 + (j - 1) * wiener_win] + 1773 vgetq_lane_s64(deltas1416, 1); 1774 H[(i * wiener_win + 3) * wiener_win2 + j * wiener_win] = 1775 H[((i - 1) * wiener_win + 3) * wiener_win2 + (j - 1) * wiener_win] + 1776 vgetq_lane_s64(deltas1820, 0); 1777 H[(i * wiener_win + 4) * wiener_win2 + j * wiener_win] = 1778 H[((i - 1) * wiener_win + 4) * wiener_win2 + (j - 1) * wiener_win] + 1779 vgetq_lane_s64(deltas1820, 1); 1780 H[(i * wiener_win + 5) * wiener_win2 + j * wiener_win] = 1781 H[((i - 1) * wiener_win + 5) * wiener_win2 + (j - 1) * wiener_win] + 1782 vgetq_lane_s64(deltas2224, 0); 1783 H[(i * wiener_win + 6) * wiener_win2 + j * wiener_win] = 1784 H[((i - 1) * wiener_win + 6) * wiener_win2 + (j - 1) * wiener_win] + 1785 vgetq_lane_s64(deltas2224, 1); 1786 } while (++j < wiener_win); 1787 } while (++i < wiener_win - 1); 1788 1789 // Step 5: Derive other points of each square. No square in bottom row. 1790 i = 0; 1791 do { 1792 const int16_t *const di = d + i; 1793 1794 j = i + 1; 1795 do { 1796 const int16_t *const dj = d + j; 1797 int64x2_t deltas[WIENER_WIN - 1][WIN_7] = { { vdupq_n_s64(0) }, 1798 { vdupq_n_s64(0) } }; 1799 int16x8_t d_is[WIN_7]; 1800 int16x8_t d_ie[WIN_7]; 1801 int16x8_t d_js[WIN_7]; 1802 int16x8_t d_je[WIN_7]; 1803 1804 x = 0; 1805 while (x < width - 16) { 1806 load_square_win7_neon(di + x, dj + x, d_stride, height, d_is, d_ie, 1807 d_js, d_je); 1808 derive_square_win7_sve(d_is, d_ie, d_js, d_je, deltas); 1809 x += 16; 1810 } 1811 1812 load_square_win7_sve(di + x, dj + x, d_stride, height, d_is, d_ie, d_js, 1813 d_je, p0, p1); 1814 derive_square_win7_sve(d_is, d_ie, d_js, d_je, deltas); 1815 1816 hadd_update_6_stats_sve( 1817 H + (i * wiener_win + 0) * wiener_win2 + j * wiener_win, deltas[0], 1818 H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win + 1); 1819 hadd_update_6_stats_sve( 1820 H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win, deltas[1], 1821 H + (i * 
wiener_win + 2) * wiener_win2 + j * wiener_win + 1); 1822 hadd_update_6_stats_sve( 1823 H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win, deltas[2], 1824 H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win + 1); 1825 hadd_update_6_stats_sve( 1826 H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win, deltas[3], 1827 H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win + 1); 1828 hadd_update_6_stats_sve( 1829 H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win, deltas[4], 1830 H + (i * wiener_win + 5) * wiener_win2 + j * wiener_win + 1); 1831 hadd_update_6_stats_sve( 1832 H + (i * wiener_win + 5) * wiener_win2 + j * wiener_win, deltas[5], 1833 H + (i * wiener_win + 6) * wiener_win2 + j * wiener_win + 1); 1834 } while (++j < wiener_win); 1835 } while (++i < wiener_win - 1); 1836 1837 // Step 6: Derive other points of each upper triangle along the diagonal. 1838 i = 0; 1839 do { 1840 const int16_t *const di = d + i; 1841 int64x2_t deltas[3 * WIENER_WIN] = { vdupq_n_s64(0) }; 1842 int16x8_t d_is[WIN_7], d_ie[WIN_7]; 1843 1844 x = 0; 1845 while (x < width - 16) { 1846 load_triangle_win7_neon(di + x, d_stride, height, d_is, d_ie); 1847 derive_triangle_win7_sve(d_is, d_ie, deltas); 1848 x += 16; 1849 } 1850 1851 load_triangle_win7_sve(di + x, d_stride, height, d_is, d_ie, p0, p1); 1852 derive_triangle_win7_sve(d_is, d_ie, deltas); 1853 1854 // Row 1: 6 points 1855 hadd_update_6_stats_sve( 1856 H + (i * wiener_win + 0) * wiener_win2 + i * wiener_win, deltas, 1857 H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1); 1858 1859 int64x2_t deltas1017 = vpaddq_s64(deltas[10], deltas[17]); 1860 1861 // Row 2: 5 points 1862 hadd_update_4_stats_sve( 1863 H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1, deltas + 6, 1864 H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2); 1865 H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 6] = 1866 H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 5] + 1867 vgetq_lane_s64(deltas1017, 
0); 1868 1869 // Row 3: 4 points 1870 hadd_update_4_stats_sve( 1871 H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2, 1872 deltas + 11, 1873 H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3); 1874 1875 // Row 4: 3 points 1876 int64x2_t h0 = 1877 vld1q_s64(H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3); 1878 vst1q_s64(H + (i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4, 1879 vaddq_s64(h0, vpaddq_s64(deltas[15], deltas[16]))); 1880 H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 6] = 1881 H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 5] + 1882 vgetq_lane_s64(deltas1017, 1); 1883 1884 // Row 5: 2 points 1885 int64x2_t h1 = 1886 vld1q_s64(H + (i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4); 1887 vst1q_s64(H + (i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5, 1888 vaddq_s64(h1, vpaddq_s64(deltas[18], deltas[19]))); 1889 1890 // Row 6: 1 points 1891 H[(i * wiener_win + 6) * wiener_win2 + i * wiener_win + 6] = 1892 H[(i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5] + 1893 vaddvq_s64(deltas[20]); 1894 } while (++i < wiener_win); 1895 } 1896 1897 #endif // AOM_AV1_ENCODER_ARM_PICKRST_SVE_H_