highbd_pickrst_neon.c (78726B)
1 /* 2 * Copyright (c) 2023, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <arm_neon.h> 13 #include <assert.h> 14 #include <stdint.h> 15 16 #include "aom_dsp/arm/mem_neon.h" 17 #include "aom_dsp/arm/sum_neon.h" 18 #include "aom_dsp/arm/transpose_neon.h" 19 #include "av1/encoder/arm/pickrst_neon.h" 20 #include "av1/encoder/pickrst.h" 21 22 static inline void highbd_calc_proj_params_r0_r1_neon( 23 const uint8_t *src8, int width, int height, int src_stride, 24 const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, 25 int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { 26 assert(width % 8 == 0); 27 const int size = width * height; 28 const uint16_t *src = CONVERT_TO_SHORTPTR(src8); 29 const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); 30 31 int64x2_t h00_lo = vdupq_n_s64(0); 32 int64x2_t h00_hi = vdupq_n_s64(0); 33 int64x2_t h11_lo = vdupq_n_s64(0); 34 int64x2_t h11_hi = vdupq_n_s64(0); 35 int64x2_t h01_lo = vdupq_n_s64(0); 36 int64x2_t h01_hi = vdupq_n_s64(0); 37 int64x2_t c0_lo = vdupq_n_s64(0); 38 int64x2_t c0_hi = vdupq_n_s64(0); 39 int64x2_t c1_lo = vdupq_n_s64(0); 40 int64x2_t c1_hi = vdupq_n_s64(0); 41 42 do { 43 const uint16_t *src_ptr = src; 44 const uint16_t *dat_ptr = dat; 45 int32_t *flt0_ptr = flt0; 46 int32_t *flt1_ptr = flt1; 47 int w = width; 48 49 do { 50 uint16x8_t s = vld1q_u16(src_ptr); 51 uint16x8_t d = vld1q_u16(dat_ptr); 52 int32x4_t f0_lo = vld1q_s32(flt0_ptr); 53 int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4); 54 int32x4_t f1_lo = vld1q_s32(flt1_ptr); 55 int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4); 56 57 int32x4_t u_lo = 58 vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS)); 59 int32x4_t u_hi = vreinterpretq_s32_u32( 60 vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS)); 61 int32x4_t s_lo = 62 vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS)); 63 int32x4_t s_hi = vreinterpretq_s32_u32( 64 vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS)); 65 s_lo = vsubq_s32(s_lo, u_lo); 66 s_hi = vsubq_s32(s_hi, u_hi); 67 68 f0_lo = vsubq_s32(f0_lo, u_lo); 69 f0_hi = vsubq_s32(f0_hi, u_hi); 70 f1_lo = vsubq_s32(f1_lo, u_lo); 71 f1_hi = vsubq_s32(f1_hi, u_hi); 72 73 h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo)); 74 h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo)); 75 h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi)); 76 h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi)); 77 78 h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo)); 79 h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo)); 80 h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi)); 81 h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi)); 82 83 h01_lo = vmlal_s32(h01_lo, vget_low_s32(f0_lo), vget_low_s32(f1_lo)); 84 h01_lo = vmlal_s32(h01_lo, vget_high_s32(f0_lo), vget_high_s32(f1_lo)); 85 h01_hi = vmlal_s32(h01_hi, vget_low_s32(f0_hi), vget_low_s32(f1_hi)); 86 h01_hi = vmlal_s32(h01_hi, vget_high_s32(f0_hi), vget_high_s32(f1_hi)); 87 88 c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo)); 89 c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo)); 90 c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi)); 91 c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi)); 92 93 c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo)); 94 c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo)); 95 c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi)); 96 c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi)); 97 98 src_ptr += 8; 99 dat_ptr += 8; 100 flt0_ptr += 8; 101 flt1_ptr += 8; 102 w -= 8; 103 } while (w != 0); 104 105 src += src_stride; 106 dat += dat_stride; 107 flt0 += flt0_stride; 108 flt1 += flt1_stride; 109 } while (--height != 0); 110 111 H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size; 112 H[0][1] = horizontal_add_s64x2(vaddq_s64(h01_lo, h01_hi)) / size; 113 H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size; 114 H[1][0] = H[0][1]; 115 C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size; 116 C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size; 117 } 118 119 static inline void highbd_calc_proj_params_r0_neon( 120 const uint8_t *src8, int width, int height, int src_stride, 121 const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, 122 int64_t H[2][2], int64_t C[2]) { 123 assert(width % 8 == 0); 124 const int size = width * height; 125 const uint16_t *src = CONVERT_TO_SHORTPTR(src8); 126 const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); 127 128 int64x2_t h00_lo = vdupq_n_s64(0); 129 int64x2_t h00_hi = vdupq_n_s64(0); 130 int64x2_t c0_lo = vdupq_n_s64(0); 131 int64x2_t c0_hi = vdupq_n_s64(0); 132 133 do { 134 const uint16_t *src_ptr = src; 135 const uint16_t *dat_ptr = dat; 136 int32_t *flt0_ptr = flt0; 137 int w = width; 138 139 do { 140 uint16x8_t s = vld1q_u16(src_ptr); 141 uint16x8_t d = vld1q_u16(dat_ptr); 142 int32x4_t f0_lo = vld1q_s32(flt0_ptr); 143 int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4); 144 145 int32x4_t u_lo = 146 vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS)); 147 int32x4_t u_hi = vreinterpretq_s32_u32( 148 vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS)); 149 int32x4_t s_lo = 150 vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS)); 151 int32x4_t s_hi = vreinterpretq_s32_u32( 152 vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS)); 153 s_lo = vsubq_s32(s_lo, u_lo); 154 s_hi = vsubq_s32(s_hi, u_hi); 155 156 f0_lo = vsubq_s32(f0_lo, u_lo); 157 f0_hi = vsubq_s32(f0_hi, u_hi); 158 159 h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo)); 160 h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo)); 161 h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi)); 162 h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi)); 163 164 c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo)); 165 c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo)); 166 c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi)); 167 c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi)); 168 169 src_ptr += 8; 170 dat_ptr += 8; 171 flt0_ptr += 8; 172 w -= 8; 173 } while (w != 0); 174 175 src += src_stride; 176 dat += dat_stride; 177 flt0 += flt0_stride; 178 } while (--height != 0); 179 180 H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size; 181 C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size; 182 } 183 184 static inline void highbd_calc_proj_params_r1_neon( 185 const uint8_t *src8, int width, int height, int src_stride, 186 const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride, 187 int64_t H[2][2], int64_t C[2]) { 188 assert(width % 8 == 0); 189 const int size = width * height; 190 const uint16_t *src = CONVERT_TO_SHORTPTR(src8); 191 const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); 192 193 int64x2_t h11_lo = vdupq_n_s64(0); 194 int64x2_t h11_hi = vdupq_n_s64(0); 195 int64x2_t c1_lo = vdupq_n_s64(0); 196 int64x2_t c1_hi = vdupq_n_s64(0); 197 198 do { 199 const uint16_t *src_ptr = src; 200 const uint16_t *dat_ptr = dat; 201 int32_t *flt1_ptr = flt1; 202 int w = width; 203 204 do { 205 uint16x8_t s = vld1q_u16(src_ptr); 206 uint16x8_t d = vld1q_u16(dat_ptr); 207 int32x4_t f1_lo = vld1q_s32(flt1_ptr); 208 int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4); 209 210 int32x4_t u_lo = 211 vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS)); 212 int32x4_t u_hi = vreinterpretq_s32_u32( 213 vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS)); 214 int32x4_t s_lo = 215 vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS)); 216 int32x4_t s_hi = vreinterpretq_s32_u32( 217 vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS)); 218 s_lo = vsubq_s32(s_lo, u_lo); 219 s_hi = vsubq_s32(s_hi, u_hi); 220 221 f1_lo = vsubq_s32(f1_lo, u_lo); 222 f1_hi = vsubq_s32(f1_hi, u_hi); 223 224 h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo)); 225 h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo)); 226 h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi)); 227 h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi)); 228 229 c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo)); 230 c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo)); 231 c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi)); 232 c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi)); 233 234 src_ptr += 8; 235 dat_ptr += 8; 236 flt1_ptr += 8; 237 w -= 8; 238 } while (w != 0); 239 240 src += src_stride; 241 dat += dat_stride; 242 flt1 += flt1_stride; 243 } while (--height != 0); 244 245 H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size; 246 C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size; 247 } 248 249 // The function calls 3 subfunctions for the following cases : 250 // 1) When params->r[0] > 0 and params->r[1] > 0. In this case all elements 251 // of C and H need to be computed. 252 // 2) When only params->r[0] > 0. In this case only H[0][0] and C[0] are 253 // non-zero and need to be computed. 254 // 3) When only params->r[1] > 0. In this case only H[1][1] and C[1] are 255 // non-zero and need to be computed. 256 void av1_calc_proj_params_high_bd_neon(const uint8_t *src8, int width, 257 int height, int src_stride, 258 const uint8_t *dat8, int dat_stride, 259 int32_t *flt0, int flt0_stride, 260 int32_t *flt1, int flt1_stride, 261 int64_t H[2][2], int64_t C[2], 262 const sgr_params_type *params) { 263 if ((params->r[0] > 0) && (params->r[1] > 0)) { 264 highbd_calc_proj_params_r0_r1_neon(src8, width, height, src_stride, dat8, 265 dat_stride, flt0, flt0_stride, flt1, 266 flt1_stride, H, C); 267 } else if (params->r[0] > 0) { 268 highbd_calc_proj_params_r0_neon(src8, width, height, src_stride, dat8, 269 dat_stride, flt0, flt0_stride, H, C); 270 } else if (params->r[1] > 0) { 271 highbd_calc_proj_params_r1_neon(src8, width, height, src_stride, dat8, 272 dat_stride, flt1, flt1_stride, H, C); 273 } 274 } 275 276 static inline void hadd_update_4_stats_neon(const int64_t *const src, 277 const int32x4_t *deltas, 278 int64_t *const dst) { 279 int64x2_t delta0_s64 = vpaddlq_s32(deltas[0]); 280 int64x2_t delta1_s64 = vpaddlq_s32(deltas[1]); 281 int64x2_t delta2_s64 = vpaddlq_s32(deltas[2]); 282 int64x2_t delta3_s64 = vpaddlq_s32(deltas[3]); 283 284 #if AOM_ARCH_AARCH64 285 int64x2_t delta01 = vpaddq_s64(delta0_s64, delta1_s64); 286 int64x2_t delta23 = vpaddq_s64(delta2_s64, delta3_s64); 287 288 int64x2_t src0 = vld1q_s64(src); 289 int64x2_t src1 = vld1q_s64(src + 2); 290 vst1q_s64(dst, vaddq_s64(src0, delta01)); 291 vst1q_s64(dst + 2, vaddq_s64(src1, delta23)); 292 #else 293 dst[0] = src[0] + horizontal_add_s64x2(delta0_s64); 294 dst[1] = src[1] + horizontal_add_s64x2(delta1_s64); 295 dst[2] = src[2] + horizontal_add_s64x2(delta2_s64); 296 dst[3] = src[3] + horizontal_add_s64x2(delta3_s64); 297 #endif 298 } 299 300 static inline void compute_stats_win5_highbd_neon( 301 const int16_t *const d, const int32_t d_stride, const int16_t *const s, 302 const int32_t s_stride, const int32_t width, const int32_t height, 303 int64_t *const M, int64_t *const H, aom_bit_depth_t bit_depth) { 304 const int32_t wiener_win = WIENER_WIN_CHROMA; 305 const int32_t wiener_win2 = wiener_win * wiener_win; 306 const int32_t w16 = width & ~15; 307 const int32_t h8 = height & ~7; 308 int16x8_t mask[2]; 309 mask[0] = vld1q_s16(&(mask_16bit[16]) - width % 16); 310 mask[1] = vld1q_s16(&(mask_16bit[16]) - width % 16 + 8); 311 int32_t i, j, x, y; 312 313 const int32_t num_bit_left = 314 32 - 1 /* sign */ - 2 * bit_depth /* energy */ + 2 /* SIMD */; 315 const int32_t h_allowed = 316 (1 << num_bit_left) / (w16 + ((w16 != width) ? 16 : 0)); 317 318 // Step 1: Calculate the top edge of the whole matrix, i.e., the top 319 // edge of each triangle and square on the top row. 320 j = 0; 321 do { 322 const int16_t *s_t = s; 323 const int16_t *d_t = d; 324 int32_t height_t = 0; 325 int64x2_t sum_m[WIENER_WIN_CHROMA] = { vdupq_n_s64(0) }; 326 int64x2_t sum_h[WIENER_WIN_CHROMA] = { vdupq_n_s64(0) }; 327 int16x8_t src[2], dgd[2]; 328 329 do { 330 const int32_t h_t = 331 ((height - height_t) < h_allowed) ? (height - height_t) : h_allowed; 332 int32x4_t row_m[WIENER_WIN_CHROMA] = { vdupq_n_s32(0) }; 333 int32x4_t row_h[WIENER_WIN_CHROMA] = { vdupq_n_s32(0) }; 334 335 y = h_t; 336 do { 337 x = 0; 338 while (x < w16) { 339 src[0] = vld1q_s16(s_t + x + 0); 340 src[1] = vld1q_s16(s_t + x + 8); 341 dgd[0] = vld1q_s16(d_t + x + 0); 342 dgd[1] = vld1q_s16(d_t + x + 8); 343 stats_top_win5_neon(src, dgd, d_t + j + x, d_stride, row_m, row_h); 344 x += 16; 345 } 346 347 if (w16 != width) { 348 src[0] = vld1q_s16(s_t + w16 + 0); 349 src[1] = vld1q_s16(s_t + w16 + 8); 350 dgd[0] = vld1q_s16(d_t + w16 + 0); 351 dgd[1] = vld1q_s16(d_t + w16 + 8); 352 src[0] = vandq_s16(src[0], mask[0]); 353 src[1] = vandq_s16(src[1], mask[1]); 354 dgd[0] = vandq_s16(dgd[0], mask[0]); 355 dgd[1] = vandq_s16(dgd[1], mask[1]); 356 stats_top_win5_neon(src, dgd, d_t + j + w16, d_stride, row_m, row_h); 357 } 358 359 s_t += s_stride; 360 d_t += d_stride; 361 } while (--y); 362 363 sum_m[0] = vpadalq_s32(sum_m[0], row_m[0]); 364 sum_m[1] = vpadalq_s32(sum_m[1], row_m[1]); 365 sum_m[2] = vpadalq_s32(sum_m[2], row_m[2]); 366 sum_m[3] = vpadalq_s32(sum_m[3], row_m[3]); 367 sum_m[4] = vpadalq_s32(sum_m[4], row_m[4]); 368 sum_h[0] = vpadalq_s32(sum_h[0], row_h[0]); 369 sum_h[1] = vpadalq_s32(sum_h[1], row_h[1]); 370 sum_h[2] = vpadalq_s32(sum_h[2], row_h[2]); 371 sum_h[3] = vpadalq_s32(sum_h[3], row_h[3]); 372 sum_h[4] = vpadalq_s32(sum_h[4], row_h[4]); 373 374 height_t += h_t; 375 } while (height_t < height); 376 377 #if AOM_ARCH_AARCH64 378 int64x2_t sum_m0 = vpaddq_s64(sum_m[0], sum_m[1]); 379 int64x2_t sum_m2 = vpaddq_s64(sum_m[2], sum_m[3]); 380 vst1q_s64(&M[wiener_win * j + 0], sum_m0); 381 vst1q_s64(&M[wiener_win * j + 2], sum_m2); 382 M[wiener_win * j + 4] = vaddvq_s64(sum_m[4]); 383 384 int64x2_t sum_h0 = vpaddq_s64(sum_h[0], sum_h[1]); 385 int64x2_t sum_h2 = vpaddq_s64(sum_h[2], sum_h[3]); 386 vst1q_s64(&H[wiener_win * j + 0], sum_h0); 387 vst1q_s64(&H[wiener_win * j + 2], sum_h2); 388 H[wiener_win * j + 4] = vaddvq_s64(sum_h[4]); 389 #else 390 M[wiener_win * j + 0] = horizontal_add_s64x2(sum_m[0]); 391 M[wiener_win * j + 1] = horizontal_add_s64x2(sum_m[1]); 392 M[wiener_win * j + 2] = horizontal_add_s64x2(sum_m[2]); 393 M[wiener_win * j + 3] = horizontal_add_s64x2(sum_m[3]); 394 M[wiener_win * j + 4] = horizontal_add_s64x2(sum_m[4]); 395 396 H[wiener_win * j + 0] = horizontal_add_s64x2(sum_h[0]); 397 H[wiener_win * j + 1] = horizontal_add_s64x2(sum_h[1]); 398 H[wiener_win * j + 2] = horizontal_add_s64x2(sum_h[2]); 399 H[wiener_win * j + 3] = horizontal_add_s64x2(sum_h[3]); 400 H[wiener_win * j + 4] = horizontal_add_s64x2(sum_h[4]); 401 #endif // AOM_ARCH_AARCH64 402 } while (++j < wiener_win); 403 404 // Step 2: Calculate the left edge of each square on the top row. 405 j = 1; 406 do { 407 const int16_t *d_t = d; 408 int32_t height_t = 0; 409 int64x2_t sum_h[WIENER_WIN_CHROMA - 1] = { vdupq_n_s64(0) }; 410 int16x8_t dgd[2]; 411 412 do { 413 const int32_t h_t = 414 ((height - height_t) < h_allowed) ? (height - height_t) : h_allowed; 415 int32x4_t row_h[WIENER_WIN_CHROMA - 1] = { vdupq_n_s32(0) }; 416 417 y = h_t; 418 do { 419 x = 0; 420 while (x < w16) { 421 dgd[0] = vld1q_s16(d_t + j + x + 0); 422 dgd[1] = vld1q_s16(d_t + j + x + 8); 423 stats_left_win5_neon(dgd, d_t + x, d_stride, row_h); 424 x += 16; 425 } 426 427 if (w16 != width) { 428 dgd[0] = vld1q_s16(d_t + j + x + 0); 429 dgd[1] = vld1q_s16(d_t + j + x + 8); 430 dgd[0] = vandq_s16(dgd[0], mask[0]); 431 dgd[1] = vandq_s16(dgd[1], mask[1]); 432 stats_left_win5_neon(dgd, d_t + x, d_stride, row_h); 433 } 434 435 d_t += d_stride; 436 } while (--y); 437 438 sum_h[0] = vpadalq_s32(sum_h[0], row_h[0]); 439 sum_h[1] = vpadalq_s32(sum_h[1], row_h[1]); 440 sum_h[2] = vpadalq_s32(sum_h[2], row_h[2]); 441 sum_h[3] = vpadalq_s32(sum_h[3], row_h[3]); 442 443 height_t += h_t; 444 } while (height_t < height); 445 446 #if AOM_ARCH_AARCH64 447 int64x2_t sum_h0 = vpaddq_s64(sum_h[0], sum_h[1]); 448 int64x2_t sum_h1 = vpaddq_s64(sum_h[2], sum_h[3]); 449 vst1_s64(&H[1 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h0)); 450 vst1_s64(&H[2 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h0)); 451 vst1_s64(&H[3 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h1)); 452 vst1_s64(&H[4 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h1)); 453 #else 454 H[1 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[0]); 455 H[2 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[1]); 456 H[3 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[2]); 457 H[4 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[3]); 458 #endif // AOM_ARCH_AARCH64 459 } while (++j < wiener_win); 460 461 // Step 3: Derive the top edge of each triangle along the diagonal. No 462 // triangle in top row. 463 { 464 const int16_t *d_t = d; 465 466 if (height % 2) { 467 int32x4_t deltas[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) }; 468 int32x4_t deltas_tr[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) }; 469 int16x8_t ds[WIENER_WIN * 2]; 470 471 load_s16_8x4(d_t, d_stride, &ds[0], &ds[2], &ds[4], &ds[6]); 472 load_s16_8x4(d_t + width, d_stride, &ds[1], &ds[3], &ds[5], &ds[7]); 473 d_t += 4 * d_stride; 474 475 step3_win5_oneline_neon(&d_t, d_stride, width, height, ds, deltas); 476 transpose_arrays_s32_8x8(deltas, deltas_tr); 477 478 update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win, 479 deltas_tr[0], vgetq_lane_s32(deltas_tr[4], 0), 480 H + 1 * wiener_win * wiener_win2 + 1 * wiener_win); 481 482 update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win, 483 deltas_tr[1], vgetq_lane_s32(deltas_tr[5], 0), 484 H + 2 * wiener_win * wiener_win2 + 2 * wiener_win); 485 486 update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win, 487 deltas_tr[2], vgetq_lane_s32(deltas_tr[6], 0), 488 H + 3 * wiener_win * wiener_win2 + 3 * wiener_win); 489 490 update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win, 491 deltas_tr[3], vgetq_lane_s32(deltas_tr[7], 0), 492 H + 4 * wiener_win * wiener_win2 + 4 * wiener_win); 493 494 } else { 495 int32x4_t deltas[WIENER_WIN_CHROMA * 2] = { vdupq_n_s32(0) }; 496 int16x8_t ds[WIENER_WIN_CHROMA * 2]; 497 498 ds[0] = load_unaligned_s16_4x2(d_t + 0 * d_stride, width); 499 ds[1] = load_unaligned_s16_4x2(d_t + 1 * d_stride, width); 500 ds[2] = load_unaligned_s16_4x2(d_t + 2 * d_stride, width); 501 ds[3] = load_unaligned_s16_4x2(d_t + 3 * d_stride, width); 502 503 step3_win5_neon(d_t + 4 * d_stride, d_stride, width, height, ds, deltas); 504 505 transpose_elems_inplace_s32_4x4(&deltas[0], &deltas[1], &deltas[2], 506 &deltas[3]); 507 508 update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win, 509 deltas[0], vgetq_lane_s32(deltas[4], 0), 510 H + 1 * wiener_win * wiener_win2 + 1 * wiener_win); 511 512 update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win, 513 deltas[1], vgetq_lane_s32(deltas[4], 1), 514 H + 2 * wiener_win * wiener_win2 + 2 * wiener_win); 515 516 update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win, 517 deltas[2], vgetq_lane_s32(deltas[4], 2), 518 H + 3 * wiener_win * wiener_win2 + 3 * wiener_win); 519 520 update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win, 521 deltas[3], vgetq_lane_s32(deltas[4], 3), 522 H + 4 * wiener_win * wiener_win2 + 4 * wiener_win); 523 } 524 } 525 526 // Step 4: Derive the top and left edge of each square. No square in top and 527 // bottom row. 528 529 { 530 y = h8; 531 532 int16x4_t d_s[12]; 533 int16x4_t d_e[12]; 534 const int16_t *d_t = d; 535 int16x4_t zeros = vdup_n_s16(0); 536 load_s16_4x4(d_t, d_stride, &d_s[0], &d_s[1], &d_s[2], &d_s[3]); 537 load_s16_4x4(d_t + width, d_stride, &d_e[0], &d_e[1], &d_e[2], &d_e[3]); 538 int32x4_t deltas[6][18] = { { vdupq_n_s32(0) }, { vdupq_n_s32(0) } }; 539 540 while (y >= 8) { 541 load_s16_4x8(d_t + 4 * d_stride, d_stride, &d_s[4], &d_s[5], &d_s[6], 542 &d_s[7], &d_s[8], &d_s[9], &d_s[10], &d_s[11]); 543 load_s16_4x8(d_t + width + 4 * d_stride, d_stride, &d_e[4], &d_e[5], 544 &d_e[6], &d_e[7], &d_e[8], &d_e[9], &d_e[10], &d_e[11]); 545 546 int16x8_t s_tr[8], e_tr[8]; 547 transpose_elems_s16_4x8(d_s[0], d_s[1], d_s[2], d_s[3], d_s[4], d_s[5], 548 d_s[6], d_s[7], &s_tr[0], &s_tr[1], &s_tr[2], 549 &s_tr[3]); 550 transpose_elems_s16_4x8(d_s[8], d_s[9], d_s[10], d_s[11], zeros, zeros, 551 zeros, zeros, &s_tr[4], &s_tr[5], &s_tr[6], 552 &s_tr[7]); 553 554 transpose_elems_s16_4x8(d_e[0], d_e[1], d_e[2], d_e[3], d_e[4], d_e[5], 555 d_e[6], d_e[7], &e_tr[0], &e_tr[1], &e_tr[2], 556 &e_tr[3]); 557 transpose_elems_s16_4x8(d_e[8], d_e[9], d_e[10], d_e[11], zeros, zeros, 558 zeros, zeros, &e_tr[4], &e_tr[5], &e_tr[6], 559 &e_tr[7]); 560 561 int16x8_t start_col0[5], start_col1[5], start_col2[5], start_col3[5]; 562 start_col0[0] = s_tr[0]; 563 start_col0[1] = vextq_s16(s_tr[0], s_tr[4], 1); 564 start_col0[2] = vextq_s16(s_tr[0], s_tr[4], 2); 565 start_col0[3] = vextq_s16(s_tr[0], s_tr[4], 3); 566 start_col0[4] = vextq_s16(s_tr[0], s_tr[4], 4); 567 568 start_col1[0] = s_tr[1]; 569 start_col1[1] = vextq_s16(s_tr[1], s_tr[5], 1); 570 start_col1[2] = vextq_s16(s_tr[1], s_tr[5], 2); 571 start_col1[3] = vextq_s16(s_tr[1], s_tr[5], 3); 572 start_col1[4] = vextq_s16(s_tr[1], s_tr[5], 4); 573 574 start_col2[0] = s_tr[2]; 575 start_col2[1] = vextq_s16(s_tr[2], s_tr[6], 1); 576 start_col2[2] = vextq_s16(s_tr[2], s_tr[6], 2); 577 start_col2[3] = vextq_s16(s_tr[2], s_tr[6], 3); 578 start_col2[4] = vextq_s16(s_tr[2], s_tr[6], 4); 579 580 start_col3[0] = s_tr[3]; 581 start_col3[1] = vextq_s16(s_tr[3], s_tr[7], 1); 582 start_col3[2] = vextq_s16(s_tr[3], s_tr[7], 2); 583 start_col3[3] = vextq_s16(s_tr[3], s_tr[7], 3); 584 start_col3[4] = vextq_s16(s_tr[3], s_tr[7], 4); 585 586 // i = 1, j = 2; 587 sub_deltas_step4(start_col0, start_col1, deltas[0]); 588 589 // i = 1, j = 3; 590 sub_deltas_step4(start_col0, start_col2, deltas[1]); 591 592 // i = 1, j = 4 593 sub_deltas_step4(start_col0, start_col3, deltas[2]); 594 595 // i = 2, j =3 596 sub_deltas_step4(start_col1, start_col2, deltas[3]); 597 598 // i = 2, j = 4 599 sub_deltas_step4(start_col1, start_col3, deltas[4]); 600 601 // i = 3, j = 4 602 sub_deltas_step4(start_col2, start_col3, deltas[5]); 603 604 int16x8_t end_col0[5], end_col1[5], end_col2[5], end_col3[5]; 605 end_col0[0] = e_tr[0]; 606 end_col0[1] = vextq_s16(e_tr[0], e_tr[4], 1); 607 end_col0[2] = vextq_s16(e_tr[0], e_tr[4], 2); 608 end_col0[3] = vextq_s16(e_tr[0], e_tr[4], 3); 609 end_col0[4] = vextq_s16(e_tr[0], e_tr[4], 4); 610 611 end_col1[0] = e_tr[1]; 612 end_col1[1] = vextq_s16(e_tr[1], e_tr[5], 1); 613 end_col1[2] = vextq_s16(e_tr[1], e_tr[5], 2); 614 end_col1[3] = vextq_s16(e_tr[1], e_tr[5], 3); 615 end_col1[4] = vextq_s16(e_tr[1], e_tr[5], 4); 616 617 end_col2[0] = e_tr[2]; 618 end_col2[1] = vextq_s16(e_tr[2], e_tr[6], 1); 619 end_col2[2] = vextq_s16(e_tr[2], e_tr[6], 2); 620 end_col2[3] = vextq_s16(e_tr[2], e_tr[6], 3); 621 end_col2[4] = vextq_s16(e_tr[2], e_tr[6], 4); 622 623 end_col3[0] = e_tr[3]; 624 end_col3[1] = vextq_s16(e_tr[3], e_tr[7], 1); 625 end_col3[2] = vextq_s16(e_tr[3], e_tr[7], 2); 626 end_col3[3] = vextq_s16(e_tr[3], e_tr[7], 3); 627 end_col3[4] = vextq_s16(e_tr[3], e_tr[7], 4); 628 629 // i = 1, j = 2; 630 add_deltas_step4(end_col0, end_col1, deltas[0]); 631 632 // i = 1, j = 3; 633 add_deltas_step4(end_col0, end_col2, deltas[1]); 634 635 // i = 1, j = 4 636 add_deltas_step4(end_col0, end_col3, deltas[2]); 637 638 // i = 2, j =3 639 add_deltas_step4(end_col1, end_col2, deltas[3]); 640 641 // i = 2, j = 4 642 add_deltas_step4(end_col1, end_col3, deltas[4]); 643 644 // i = 3, j = 4 645 add_deltas_step4(end_col2, end_col3, deltas[5]); 646 647 d_s[0] = d_s[8]; 648 d_s[1] = d_s[9]; 649 d_s[2] = d_s[10]; 650 d_s[3] = d_s[11]; 651 d_e[0] = d_e[8]; 652 d_e[1] = d_e[9]; 653 d_e[2] = d_e[10]; 654 d_e[3] = d_e[11]; 655 656 d_t += 8 * d_stride; 657 y -= 8; 658 } 659 660 if (h8 != height) { 661 const int16x8_t mask_h = vld1q_s16(&mask_16bit[16] - (height % 8)); 662 663 load_s16_4x8(d_t + 4 * d_stride, d_stride, &d_s[4], &d_s[5], &d_s[6], 664 &d_s[7], &d_s[8], &d_s[9], &d_s[10], &d_s[11]); 665 load_s16_4x8(d_t + width + 4 * d_stride, d_stride, &d_e[4], &d_e[5], 666 &d_e[6], &d_e[7], &d_e[8], &d_e[9], &d_e[10], &d_e[11]); 667 int16x8_t s_tr[8], e_tr[8]; 668 transpose_elems_s16_4x8(d_s[0], d_s[1], d_s[2], d_s[3], d_s[4], d_s[5], 669 d_s[6], d_s[7], &s_tr[0], &s_tr[1], &s_tr[2], 670 &s_tr[3]); 671 transpose_elems_s16_4x8(d_s[8], d_s[9], d_s[10], d_s[11], zeros, zeros, 672 zeros, zeros, &s_tr[4], &s_tr[5], &s_tr[6], 673 &s_tr[7]); 674 transpose_elems_s16_4x8(d_e[0], d_e[1], d_e[2], d_e[3], d_e[4], d_e[5], 675 d_e[6], d_e[7], &e_tr[0], &e_tr[1], &e_tr[2], 676 &e_tr[3]); 677 transpose_elems_s16_4x8(d_e[8], d_e[9], d_e[10], d_e[11], zeros, zeros, 678 zeros, zeros, &e_tr[4], &e_tr[5], &e_tr[6], 679 &e_tr[7]); 680 681 int16x8_t start_col0[5], start_col1[5], start_col2[5], start_col3[5]; 682 start_col0[0] = vandq_s16(s_tr[0], mask_h); 683 start_col0[1] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 1), mask_h); 684 start_col0[2] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 2), mask_h); 685 start_col0[3] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 3), mask_h); 686 start_col0[4] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 4), mask_h); 687 688 start_col1[0] = vandq_s16(s_tr[1], mask_h); 689 start_col1[1] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 1), mask_h); 690 start_col1[2] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 2), mask_h); 691 start_col1[3] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 3), mask_h); 692 start_col1[4] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 4), mask_h); 693 694 start_col2[0] = vandq_s16(s_tr[2], mask_h); 695 start_col2[1] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 1), mask_h); 696 start_col2[2] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 2), mask_h); 697 start_col2[3] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 3), mask_h); 698 start_col2[4] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 4), mask_h); 699 700 start_col3[0] = vandq_s16(s_tr[3], mask_h); 701 start_col3[1] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 1), mask_h); 702 start_col3[2] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 2), mask_h); 703 start_col3[3] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 3), mask_h); 704 start_col3[4] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 4), mask_h); 705 706 // i = 1, j = 2; 707 sub_deltas_step4(start_col0, start_col1, deltas[0]); 708 709 // i = 1, j = 3; 710 sub_deltas_step4(start_col0, start_col2, deltas[1]); 711 712 // i = 1, j = 4 713 sub_deltas_step4(start_col0, start_col3, deltas[2]); 714 715 // i = 2, j = 3 716 sub_deltas_step4(start_col1, start_col2, deltas[3]); 717 718 // i = 2, j = 4 719 sub_deltas_step4(start_col1, start_col3, deltas[4]); 720 721 // i = 3, j = 4 722 sub_deltas_step4(start_col2, start_col3, deltas[5]); 723 724 int16x8_t end_col0[5], end_col1[5], end_col2[5], end_col3[5]; 725 end_col0[0] = vandq_s16(e_tr[0], mask_h); 726 end_col0[1] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 1), mask_h); 727 end_col0[2] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 2), mask_h); 728 end_col0[3] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 3), mask_h); 729 end_col0[4] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 4), mask_h); 730 731 end_col1[0] = vandq_s16(e_tr[1], mask_h); 732 end_col1[1] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 1), mask_h); 733 end_col1[2] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 2), mask_h); 734 end_col1[3] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 3), mask_h); 735 end_col1[4] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 4), mask_h); 736 737 end_col2[0] = vandq_s16(e_tr[2], mask_h); 738 end_col2[1] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 1), mask_h); 739 end_col2[2] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 2), mask_h); 740 end_col2[3] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 3), mask_h); 741 end_col2[4] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 4), mask_h); 742 743 end_col3[0] = vandq_s16(e_tr[3], mask_h); 744 end_col3[1] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 1), mask_h); 745 end_col3[2] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 2), mask_h); 746 end_col3[3] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 3), mask_h); 747 end_col3[4] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 4), mask_h); 748 749 // i = 1, j = 2; 750 add_deltas_step4(end_col0, end_col1, deltas[0]); 751 752 // i = 1, j = 3; 753 add_deltas_step4(end_col0, end_col2, deltas[1]); 754 755 // i = 1, j = 4 756 add_deltas_step4(end_col0, end_col3, deltas[2]); 757 758 // i = 2, j =3 759 add_deltas_step4(end_col1, end_col2, deltas[3]); 760 761 // i = 2, j = 4 762 add_deltas_step4(end_col1, end_col3, deltas[4]); 763 764 // i = 3, j = 4 765 add_deltas_step4(end_col2, end_col3, deltas[5]); 766 } 767 768 int32x4_t delta[6][2]; 769 int32_t single_delta[6]; 770 771 delta[0][0] = horizontal_add_4d_s32x4(&deltas[0][0]); 772 delta[1][0] = horizontal_add_4d_s32x4(&deltas[1][0]); 773 delta[2][0] = horizontal_add_4d_s32x4(&deltas[2][0]); 774 delta[3][0] = horizontal_add_4d_s32x4(&deltas[3][0]); 775 delta[4][0] = horizontal_add_4d_s32x4(&deltas[4][0]); 776 delta[5][0] = horizontal_add_4d_s32x4(&deltas[5][0]); 777 778 delta[0][1] = horizontal_add_4d_s32x4(&deltas[0][5]); 779 delta[1][1] = horizontal_add_4d_s32x4(&deltas[1][5]); 780 delta[2][1] = horizontal_add_4d_s32x4(&deltas[2][5]); 781 delta[3][1] = horizontal_add_4d_s32x4(&deltas[3][5]); 782 delta[4][1] = horizontal_add_4d_s32x4(&deltas[4][5]); 783 delta[5][1] = horizontal_add_4d_s32x4(&deltas[5][5]); 784 785 single_delta[0] = horizontal_add_s32x4(deltas[0][4]); 786 single_delta[1] = horizontal_add_s32x4(deltas[1][4]); 787 single_delta[2] = horizontal_add_s32x4(deltas[2][4]); 788 single_delta[3] = horizontal_add_s32x4(deltas[3][4]); 789 single_delta[4] = horizontal_add_s32x4(deltas[4][4]); 790 single_delta[5] = horizontal_add_s32x4(deltas[5][4]); 791 792 int idx = 0; 793 for (i = 1; i < wiener_win - 1; i++) { 794 for (j = i + 1; j < wiener_win; j++) { 795 update_4_stats_neon( 796 H + (i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win, 797 delta[idx][0], H + i * wiener_win * wiener_win2 + j * wiener_win); 798 H[i * wiener_win * wiener_win2 + j * wiener_win + 4] = 799 H[(i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win + 4] + 800 single_delta[idx]; 801 802 H[(i * wiener_win + 1) * wiener_win2 + j * wiener_win] = 803 H[((i - 1) * wiener_win + 1) * wiener_win2 + (j - 1) * wiener_win] + 804 vgetq_lane_s32(delta[idx][1], 0); 805 H[(i * wiener_win + 2) * wiener_win2 + j * wiener_win] = 806 H[((i - 1) * wiener_win + 2) * wiener_win2 + (j - 1) * wiener_win] + 807 vgetq_lane_s32(delta[idx][1], 1); 808 H[(i * wiener_win + 3) * wiener_win2 + j * wiener_win] = 809 H[((i - 1) * wiener_win + 3) * wiener_win2 + (j - 1) * wiener_win] + 810 vgetq_lane_s32(delta[idx][1], 2); 811 H[(i * wiener_win + 4) * wiener_win2 + j * wiener_win] = 812 H[((i - 1) * wiener_win + 4) * wiener_win2 + (j - 1) * wiener_win] + 813 vgetq_lane_s32(delta[idx][1], 3); 814 815 idx++; 816 } 817 } 818 } 819 820 // Step 5: Derive other points of each square. No square in bottom row. 821 i = 0; 822 do { 823 const int16_t *const di = d + i; 824 825 j = i + 1; 826 do { 827 const int16_t *const dj = d + j; 828 int32x4_t deltas[WIENER_WIN_CHROMA - 1][WIENER_WIN_CHROMA - 1] = { 829 { vdupq_n_s32(0) }, { vdupq_n_s32(0) } 830 }; 831 int16x8_t d_is[WIN_CHROMA], d_ie[WIN_CHROMA]; 832 int16x8_t d_js[WIN_CHROMA], d_je[WIN_CHROMA]; 833 834 x = 0; 835 while (x < w16) { 836 load_square_win5_neon(di + x, dj + x, d_stride, height, d_is, d_ie, 837 d_js, d_je); 838 derive_square_win5_neon(d_is, d_ie, d_js, d_je, deltas); 839 x += 16; 840 } 841 842 if (w16 != width) { 843 load_square_win5_neon(di + x, dj + x, d_stride, height, d_is, d_ie, 844 d_js, d_je); 845 d_is[0] = vandq_s16(d_is[0], mask[0]); 846 d_is[1] = vandq_s16(d_is[1], mask[1]); 847 d_is[2] = vandq_s16(d_is[2], mask[0]); 848 d_is[3] = vandq_s16(d_is[3], mask[1]); 849 d_is[4] = vandq_s16(d_is[4], mask[0]); 850 d_is[5] = vandq_s16(d_is[5], mask[1]); 851 d_is[6] = vandq_s16(d_is[6], mask[0]); 852 d_is[7] = vandq_s16(d_is[7], mask[1]); 853 d_ie[0] = vandq_s16(d_ie[0], mask[0]); 854 d_ie[1] = vandq_s16(d_ie[1], mask[1]); 855 d_ie[2] = vandq_s16(d_ie[2], mask[0]); 856 d_ie[3] = vandq_s16(d_ie[3], mask[1]); 857 d_ie[4] = vandq_s16(d_ie[4], mask[0]); 858 d_ie[5] = vandq_s16(d_ie[5], mask[1]); 859 d_ie[6] = vandq_s16(d_ie[6], mask[0]); 860 d_ie[7] = vandq_s16(d_ie[7], mask[1]); 861 derive_square_win5_neon(d_is, d_ie, d_js, d_je, deltas); 862 } 863 864 hadd_update_4_stats_neon( 865 H + (i * wiener_win + 0) * wiener_win2 + j * wiener_win, deltas[0], 866 H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win + 1); 867 hadd_update_4_stats_neon( 868 H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win, deltas[1], 869 H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win + 1); 870 hadd_update_4_stats_neon( 871 H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win, deltas[2], 872 H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win + 1); 873 hadd_update_4_stats_neon( 874 H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win, deltas[3], 875 H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win + 1); 876 } while (++j < wiener_win); 877 } while (++i < wiener_win - 1); 878 879 // Step 6: Derive other points of each upper triangle along the diagonal. 880 i = 0; 881 do { 882 const int16_t *const di = d + i; 883 int32x4_t deltas[WIENER_WIN_CHROMA * 2 + 1] = { vdupq_n_s32(0) }; 884 int16x8_t d_is[WIN_CHROMA], d_ie[WIN_CHROMA]; 885 886 x = 0; 887 while (x < w16) { 888 load_triangle_win5_neon(di + x, d_stride, height, d_is, d_ie); 889 derive_triangle_win5_neon(d_is, d_ie, deltas); 890 x += 16; 891 } 892 893 if (w16 != width) { 894 load_triangle_win5_neon(di + x, d_stride, height, d_is, d_ie); 895 d_is[0] = vandq_s16(d_is[0], mask[0]); 896 d_is[1] = vandq_s16(d_is[1], mask[1]); 897 d_is[2] = vandq_s16(d_is[2], mask[0]); 898 d_is[3] = vandq_s16(d_is[3], mask[1]); 899 d_is[4] = vandq_s16(d_is[4], mask[0]); 900 d_is[5] = vandq_s16(d_is[5], mask[1]); 901 d_is[6] = vandq_s16(d_is[6], mask[0]); 902 d_is[7] = vandq_s16(d_is[7], mask[1]); 903 d_ie[0] = vandq_s16(d_ie[0], mask[0]); 904 d_ie[1] = vandq_s16(d_ie[1], mask[1]); 905 d_ie[2] = vandq_s16(d_ie[2], mask[0]); 906 d_ie[3] = vandq_s16(d_ie[3], mask[1]); 907 d_ie[4] = vandq_s16(d_ie[4], mask[0]); 908 d_ie[5] = vandq_s16(d_ie[5], mask[1]); 909 d_ie[6] = vandq_s16(d_ie[6], mask[0]); 910 d_ie[7] = vandq_s16(d_ie[7], mask[1]); 911 derive_triangle_win5_neon(d_is, d_ie, deltas); 912 } 913 914 // Row 1: 4 points 915 hadd_update_4_stats_neon( 916 H + (i * wiener_win + 0) * wiener_win2 + i * wiener_win, deltas, 917 H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1); 918 919 // Row 2: 3 points 920 int64x2_t delta4_s64 = vpaddlq_s32(deltas[4]); 921 int64x2_t delta5_s64 = vpaddlq_s32(deltas[5]); 922 923 #if AOM_ARCH_AARCH64 924 int64x2_t deltas45 = vpaddq_s64(delta4_s64, delta5_s64); 925 int64x2_t src = 926 vld1q_s64(H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1); 927 int64x2_t dst = vaddq_s64(src, deltas45); 928 vst1q_s64(H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2, dst); 929 #else 930 H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2 + 0] = 931 H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1 + 0] + 932 horizontal_add_s64x2(delta4_s64); 933 H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2 + 1] = 934 H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1 + 1] + 935 horizontal_add_s64x2(delta5_s64); 936 #endif // AOM_ARCH_AARCH64 937 938 H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 4] = 939 H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 3] + 940 horizontal_long_add_s32x4(deltas[6]); 941 942 // Row 3: 2 points 943 int64x2_t delta7_s64 = vpaddlq_s32(deltas[7]); 944 int64x2_t delta8_s64 = vpaddlq_s32(deltas[8]); 945 946 #if AOM_ARCH_AARCH64 947 int64x2_t deltas78 = vpaddq_s64(delta7_s64, delta8_s64); 948 vst1q_s64(H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3, 949 vaddq_s64(dst, deltas78)); 950 #else 951 H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3 + 0] = 952 H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2 + 0] + 953 horizontal_add_s64x2(delta7_s64); 954 H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3 + 1] = 955 H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2 + 1] + 956 horizontal_add_s64x2(delta8_s64); 957 #endif // AOM_ARCH_AARCH64 958 959 // Row 4: 1 point 960 H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4] = 961 H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3] + 962 horizontal_long_add_s32x4(deltas[9]); 963 } while (++i < wiener_win); 964 } 965 966 static inline void hadd_update_6_stats_neon(const int64_t *const src, 967 const int32x4_t *deltas, 968 int64_t *const dst) { 969 int64x2_t delta0_s64 = vpaddlq_s32(deltas[0]); 970 int64x2_t delta1_s64 = vpaddlq_s32(deltas[1]); 971 int64x2_t delta2_s64 = vpaddlq_s32(deltas[2]); 972 int64x2_t delta3_s64 = vpaddlq_s32(deltas[3]); 973 int64x2_t delta4_s64 = vpaddlq_s32(deltas[4]); 974 int64x2_t delta5_s64 = vpaddlq_s32(deltas[5]); 975 976 #if AOM_ARCH_AARCH64 977 int64x2_t delta01 = vpaddq_s64(delta0_s64, delta1_s64); 978 int64x2_t delta23 = vpaddq_s64(delta2_s64, delta3_s64); 979 int64x2_t delta45 = vpaddq_s64(delta4_s64, delta5_s64); 980 981 int64x2_t src0 = vld1q_s64(src); 982 int64x2_t src1 = vld1q_s64(src + 2); 983 int64x2_t src2 = vld1q_s64(src + 4); 984 985 vst1q_s64(dst, vaddq_s64(src0, delta01)); 986 vst1q_s64(dst + 2, vaddq_s64(src1, delta23)); 987 vst1q_s64(dst + 4, vaddq_s64(src2, delta45)); 988 #else 989 dst[0] = src[0] + horizontal_add_s64x2(delta0_s64); 990 dst[1] = src[1] + horizontal_add_s64x2(delta1_s64); 991 dst[2] = src[2] + horizontal_add_s64x2(delta2_s64); 992 dst[3] = src[3] + horizontal_add_s64x2(delta3_s64); 993 dst[4] = src[4] + horizontal_add_s64x2(delta4_s64); 994 dst[5] = src[5] + horizontal_add_s64x2(delta5_s64); 995 #endif 996 } 997 998 static inline void compute_stats_win7_highbd_neon( 999 const int16_t *const d, const int32_t d_stride, const int16_t *const s, 1000 const int32_t s_stride, const int32_t width, const int32_t height, 1001 int64_t *const M, int64_t *const H, aom_bit_depth_t bit_depth) { 1002 const int32_t wiener_win = WIENER_WIN; 1003 const int32_t wiener_win2 = wiener_win * wiener_win; 1004 const int32_t w16 = width & ~15; 1005 const int32_t h8 = height & ~7; 1006 int16x8_t mask[2]; 1007 mask[0] = vld1q_s16(&(mask_16bit[16]) - width % 16); 1008 mask[1] = vld1q_s16(&(mask_16bit[16]) - width % 16 + 8); 1009 int32_t i, j, x, y; 1010 1011 const int32_t num_bit_left = 1012 32 - 1 /* sign */ - 2 * bit_depth /* energy */ + 2 /* SIMD */; 1013 const int32_t h_allowed = 1014 (1 << num_bit_left) / (w16 + ((w16 != width) ? 16 : 0)); 1015 1016 // Step 1: Calculate the top edge of the whole matrix, i.e., the top 1017 // edge of each triangle and square on the top row. 1018 j = 0; 1019 do { 1020 const int16_t *s_t = s; 1021 const int16_t *d_t = d; 1022 int32_t height_t = 0; 1023 int64x2_t sum_m[WIENER_WIN] = { vdupq_n_s64(0) }; 1024 int64x2_t sum_h[WIENER_WIN] = { vdupq_n_s64(0) }; 1025 int16x8_t src[2], dgd[2]; 1026 1027 do { 1028 const int32_t h_t = 1029 ((height - height_t) < h_allowed) ? (height - height_t) : h_allowed; 1030 int32x4_t row_m[WIENER_WIN * 2] = { vdupq_n_s32(0) }; 1031 int32x4_t row_h[WIENER_WIN * 2] = { vdupq_n_s32(0) }; 1032 1033 y = h_t; 1034 do { 1035 x = 0; 1036 while (x < w16) { 1037 src[0] = vld1q_s16(s_t + x); 1038 src[1] = vld1q_s16(s_t + x + 8); 1039 dgd[0] = vld1q_s16(d_t + x); 1040 dgd[1] = vld1q_s16(d_t + x + 8); 1041 stats_top_win7_neon(src, dgd, d_t + j + x, d_stride, row_m, row_h); 1042 x += 16; 1043 } 1044 1045 if (w16 != width) { 1046 src[0] = vld1q_s16(s_t + w16); 1047 src[1] = vld1q_s16(s_t + w16 + 8); 1048 dgd[0] = vld1q_s16(d_t + w16); 1049 dgd[1] = vld1q_s16(d_t + w16 + 8); 1050 src[0] = vandq_s16(src[0], mask[0]); 1051 src[1] = vandq_s16(src[1], mask[1]); 1052 dgd[0] = vandq_s16(dgd[0], mask[0]); 1053 dgd[1] = vandq_s16(dgd[1], mask[1]); 1054 stats_top_win7_neon(src, dgd, d_t + j + w16, d_stride, row_m, row_h); 1055 } 1056 1057 s_t += s_stride; 1058 d_t += d_stride; 1059 } while (--y); 1060 1061 sum_m[0] = vpadalq_s32(sum_m[0], row_m[0]); 1062 sum_m[1] = vpadalq_s32(sum_m[1], row_m[1]); 1063 sum_m[2] = vpadalq_s32(sum_m[2], row_m[2]); 1064 sum_m[3] = vpadalq_s32(sum_m[3], row_m[3]); 1065 sum_m[4] = vpadalq_s32(sum_m[4], row_m[4]); 1066 sum_m[5] = vpadalq_s32(sum_m[5], row_m[5]); 1067 sum_m[6] = vpadalq_s32(sum_m[6], row_m[6]); 1068 1069 sum_h[0] = vpadalq_s32(sum_h[0], row_h[0]); 1070 sum_h[1] = vpadalq_s32(sum_h[1], row_h[1]); 1071 sum_h[2] = vpadalq_s32(sum_h[2], row_h[2]); 1072 sum_h[3] = vpadalq_s32(sum_h[3], row_h[3]); 1073 sum_h[4] = vpadalq_s32(sum_h[4], row_h[4]); 1074 sum_h[5] = vpadalq_s32(sum_h[5], row_h[5]); 1075 sum_h[6] = vpadalq_s32(sum_h[6], row_h[6]); 1076 1077 height_t += h_t; 1078 } while (height_t < height); 1079 1080 #if AOM_ARCH_AARCH64 1081 vst1q_s64(M + wiener_win * j + 0, vpaddq_s64(sum_m[0], sum_m[1])); 1082 vst1q_s64(M + wiener_win * j + 2, vpaddq_s64(sum_m[2], sum_m[3])); 1083 vst1q_s64(M + wiener_win * j + 4, vpaddq_s64(sum_m[4], sum_m[5])); 1084 M[wiener_win * j + 6] = vaddvq_s64(sum_m[6]); 1085 1086 vst1q_s64(H + wiener_win * j + 0, vpaddq_s64(sum_h[0], sum_h[1])); 1087 vst1q_s64(H + wiener_win * j + 2, vpaddq_s64(sum_h[2], sum_h[3])); 1088 vst1q_s64(H + wiener_win * j + 4, vpaddq_s64(sum_h[4], sum_h[5])); 1089 H[wiener_win * j + 6] = vaddvq_s64(sum_h[6]); 1090 #else 1091 M[wiener_win * j + 0] = horizontal_add_s64x2(sum_m[0]); 1092 M[wiener_win * j + 1] = horizontal_add_s64x2(sum_m[1]); 1093 M[wiener_win * j + 2] = horizontal_add_s64x2(sum_m[2]); 1094 M[wiener_win * j + 3] = horizontal_add_s64x2(sum_m[3]); 1095 M[wiener_win * j + 4] = horizontal_add_s64x2(sum_m[4]); 1096 M[wiener_win * j + 5] = horizontal_add_s64x2(sum_m[5]); 1097 M[wiener_win * j + 6] = horizontal_add_s64x2(sum_m[6]); 1098 1099 H[wiener_win * j + 0] = horizontal_add_s64x2(sum_h[0]); 1100 H[wiener_win * j + 1] = horizontal_add_s64x2(sum_h[1]); 1101 H[wiener_win * j + 2] = horizontal_add_s64x2(sum_h[2]); 1102 H[wiener_win * j + 3] = horizontal_add_s64x2(sum_h[3]); 1103 H[wiener_win * j + 4] = horizontal_add_s64x2(sum_h[4]); 1104 H[wiener_win * j + 5] = horizontal_add_s64x2(sum_h[5]); 1105 H[wiener_win * j + 6] = horizontal_add_s64x2(sum_h[6]); 1106 #endif // AOM_ARCH_AARCH64 1107 } while (++j < wiener_win); 1108 1109 // Step 2: Calculate the left edge of each square on the top row. 1110 j = 1; 1111 do { 1112 const int16_t *d_t = d; 1113 int32_t height_t = 0; 1114 int64x2_t sum_h[WIENER_WIN - 1] = { vdupq_n_s64(0) }; 1115 int16x8_t dgd[2]; 1116 1117 do { 1118 const int32_t h_t = 1119 ((height - height_t) < h_allowed) ? (height - height_t) : h_allowed; 1120 int32x4_t row_h[WIENER_WIN - 1] = { vdupq_n_s32(0) }; 1121 1122 y = h_t; 1123 do { 1124 x = 0; 1125 while (x < w16) { 1126 dgd[0] = vld1q_s16(d_t + j + x + 0); 1127 dgd[1] = vld1q_s16(d_t + j + x + 8); 1128 stats_left_win7_neon(dgd, d_t + x, d_stride, row_h); 1129 x += 16; 1130 } 1131 1132 if (w16 != width) { 1133 dgd[0] = vld1q_s16(d_t + j + x + 0); 1134 dgd[1] = vld1q_s16(d_t + j + x + 8); 1135 dgd[0] = vandq_s16(dgd[0], mask[0]); 1136 dgd[1] = vandq_s16(dgd[1], mask[1]); 1137 stats_left_win7_neon(dgd, d_t + x, d_stride, row_h); 1138 } 1139 1140 d_t += d_stride; 1141 } while (--y); 1142 1143 sum_h[0] = vpadalq_s32(sum_h[0], row_h[0]); 1144 sum_h[1] = vpadalq_s32(sum_h[1], row_h[1]); 1145 sum_h[2] = vpadalq_s32(sum_h[2], row_h[2]); 1146 sum_h[3] = vpadalq_s32(sum_h[3], row_h[3]); 1147 sum_h[4] = vpadalq_s32(sum_h[4], row_h[4]); 1148 sum_h[5] = vpadalq_s32(sum_h[5], row_h[5]); 1149 1150 height_t += h_t; 1151 } while (height_t < height); 1152 1153 #if AOM_ARCH_AARCH64 1154 int64x2_t sum_h0 = vpaddq_s64(sum_h[0], sum_h[1]); 1155 int64x2_t sum_h2 = vpaddq_s64(sum_h[2], sum_h[3]); 1156 int64x2_t sum_h4 = vpaddq_s64(sum_h[4], sum_h[5]); 1157 vst1_s64(&H[1 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h0)); 1158 vst1_s64(&H[2 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h0)); 1159 vst1_s64(&H[3 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h2)); 1160 vst1_s64(&H[4 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h2)); 1161 vst1_s64(&H[5 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h4)); 1162 vst1_s64(&H[6 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h4)); 1163 #else 1164 H[1 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[0]); 1165 H[2 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[1]); 1166 H[3 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[2]); 1167 H[4 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[3]); 1168 H[5 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[4]); 1169 H[6 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[5]); 1170 #endif // AOM_ARCH_AARCH64 1171 1172 } while (++j < wiener_win); 1173 1174 // Step 3: Derive the top edge of each triangle along the diagonal. No 1175 // triangle in top row. 1176 { 1177 const int16_t *d_t = d; 1178 // Pad to call transpose function. 1179 int32x4_t deltas[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) }; 1180 int32x4_t deltas_tr[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) }; 1181 int16x8_t ds[WIENER_WIN * 2]; 1182 1183 load_s16_8x6(d_t, d_stride, &ds[0], &ds[2], &ds[4], &ds[6], &ds[8], 1184 &ds[10]); 1185 load_s16_8x6(d_t + width, d_stride, &ds[1], &ds[3], &ds[5], &ds[7], &ds[9], 1186 &ds[11]); 1187 1188 d_t += 6 * d_stride; 1189 1190 step3_win7_neon(d_t, d_stride, width, height, ds, deltas); 1191 transpose_arrays_s32_8x8(deltas, deltas_tr); 1192 1193 update_8_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win, 1194 deltas_tr[0], deltas_tr[4], 1195 H + 1 * wiener_win * wiener_win2 + 1 * wiener_win); 1196 update_8_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win, 1197 deltas_tr[1], deltas_tr[5], 1198 H + 2 * wiener_win * wiener_win2 + 2 * wiener_win); 1199 update_8_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win, 1200 deltas_tr[2], deltas_tr[6], 1201 H + 3 * wiener_win * wiener_win2 + 3 * wiener_win); 1202 update_8_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win, 1203 deltas_tr[3], deltas_tr[7], 1204 H + 4 * wiener_win * wiener_win2 + 4 * wiener_win); 1205 update_8_stats_neon(H + 4 * wiener_win * wiener_win2 + 4 * wiener_win, 1206 deltas_tr[8], deltas_tr[12], 1207 H + 5 * wiener_win * wiener_win2 + 5 * wiener_win); 1208 update_8_stats_neon(H + 5 * wiener_win * wiener_win2 + 5 * wiener_win, 1209 deltas_tr[9], deltas_tr[13], 1210 H + 6 * wiener_win * wiener_win2 + 6 * wiener_win); 1211 } 1212 1213 // Step 4: Derive the top and left edge of each square. No square in top and 1214 // bottom row. 1215 1216 i = 1; 1217 do { 1218 j = i + 1; 1219 do { 1220 const int16_t *di = d + i - 1; 1221 const int16_t *dj = d + j - 1; 1222 int32x4_t deltas[(2 * WIENER_WIN - 1) * 2] = { vdupq_n_s32(0) }; 1223 int16x8_t dd[WIENER_WIN * 2], ds[WIENER_WIN * 2]; 1224 1225 dd[5] = vdupq_n_s16(0); // Initialize to avoid warning. 1226 const int16_t dd0_values[] = { di[0 * d_stride], 1227 di[1 * d_stride], 1228 di[2 * d_stride], 1229 di[3 * d_stride], 1230 di[4 * d_stride], 1231 di[5 * d_stride], 1232 0, 1233 0 }; 1234 dd[0] = vld1q_s16(dd0_values); 1235 const int16_t dd1_values[] = { di[0 * d_stride + width], 1236 di[1 * d_stride + width], 1237 di[2 * d_stride + width], 1238 di[3 * d_stride + width], 1239 di[4 * d_stride + width], 1240 di[5 * d_stride + width], 1241 0, 1242 0 }; 1243 dd[1] = vld1q_s16(dd1_values); 1244 const int16_t ds0_values[] = { dj[0 * d_stride], 1245 dj[1 * d_stride], 1246 dj[2 * d_stride], 1247 dj[3 * d_stride], 1248 dj[4 * d_stride], 1249 dj[5 * d_stride], 1250 0, 1251 0 }; 1252 ds[0] = vld1q_s16(ds0_values); 1253 int16_t ds1_values[] = { dj[0 * d_stride + width], 1254 dj[1 * d_stride + width], 1255 dj[2 * d_stride + width], 1256 dj[3 * d_stride + width], 1257 dj[4 * d_stride + width], 1258 dj[5 * d_stride + width], 1259 0, 1260 0 }; 1261 ds[1] = vld1q_s16(ds1_values); 1262 1263 y = 0; 1264 while (y < h8) { 1265 // 00s 10s 20s 30s 40s 50s 60s 70s 00e 10e 20e 30e 40e 50e 60e 70e 1266 dd[0] = vsetq_lane_s16(di[6 * d_stride], dd[0], 6); 1267 dd[0] = vsetq_lane_s16(di[7 * d_stride], dd[0], 7); 1268 dd[1] = vsetq_lane_s16(di[6 * d_stride + width], dd[1], 6); 1269 dd[1] = vsetq_lane_s16(di[7 * d_stride + width], dd[1], 7); 1270 1271 // 00s 10s 20s 30s 40s 50s 60s 70s 00e 10e 20e 30e 40e 50e 60e 70e 1272 // 01s 11s 21s 31s 41s 51s 61s 71s 01e 11e 21e 31e 41e 51e 61e 71e 1273 ds[0] = vsetq_lane_s16(dj[6 * d_stride], ds[0], 6); 1274 ds[0] = vsetq_lane_s16(dj[7 * d_stride], ds[0], 7); 1275 ds[1] = vsetq_lane_s16(dj[6 * d_stride + width], ds[1], 6); 1276 ds[1] = vsetq_lane_s16(dj[7 * d_stride + width], ds[1], 7); 1277 1278 load_more_16_neon(di + 8 * d_stride, width, &dd[0], &dd[2]); 1279 load_more_16_neon(dj + 8 * d_stride, width, &ds[0], &ds[2]); 1280 load_more_16_neon(di + 9 * d_stride, width, &dd[2], &dd[4]); 1281 load_more_16_neon(dj + 9 * d_stride, width, &ds[2], &ds[4]); 1282 load_more_16_neon(di + 10 * d_stride, width, &dd[4], &dd[6]); 1283 load_more_16_neon(dj + 10 * d_stride, width, &ds[4], &ds[6]); 1284 load_more_16_neon(di + 11 * d_stride, width, &dd[6], &dd[8]); 1285 load_more_16_neon(dj + 11 * d_stride, width, &ds[6], &ds[8]); 1286 load_more_16_neon(di + 12 * d_stride, width, &dd[8], &dd[10]); 1287 load_more_16_neon(dj + 12 * d_stride, width, &ds[8], &ds[10]); 1288 load_more_16_neon(di + 13 * d_stride, width, &dd[10], &dd[12]); 1289 load_more_16_neon(dj + 13 * d_stride, width, &ds[10], &ds[12]); 1290 1291 madd_neon(&deltas[0], dd[0], ds[0]); 1292 madd_neon(&deltas[1], dd[1], ds[1]); 1293 madd_neon(&deltas[2], dd[0], ds[2]); 1294 madd_neon(&deltas[3], dd[1], ds[3]); 1295 madd_neon(&deltas[4], dd[0], ds[4]); 1296 madd_neon(&deltas[5], dd[1], ds[5]); 1297 madd_neon(&deltas[6], dd[0], ds[6]); 1298 madd_neon(&deltas[7], dd[1], ds[7]); 1299 madd_neon(&deltas[8], dd[0], ds[8]); 1300 madd_neon(&deltas[9], dd[1], ds[9]); 1301 madd_neon(&deltas[10], dd[0], ds[10]); 1302 madd_neon(&deltas[11], dd[1], ds[11]); 1303 madd_neon(&deltas[12], dd[0], ds[12]); 1304 madd_neon(&deltas[13], dd[1], ds[13]); 1305 madd_neon(&deltas[14], dd[2], ds[0]); 1306 madd_neon(&deltas[15], dd[3], ds[1]); 1307 madd_neon(&deltas[16], dd[4], ds[0]); 1308 madd_neon(&deltas[17], dd[5], ds[1]); 1309 madd_neon(&deltas[18], dd[6], ds[0]); 1310 madd_neon(&deltas[19], dd[7], ds[1]); 1311 madd_neon(&deltas[20], dd[8], ds[0]); 1312 madd_neon(&deltas[21], dd[9], ds[1]); 1313 madd_neon(&deltas[22], dd[10], ds[0]); 1314 madd_neon(&deltas[23], dd[11], ds[1]); 1315 madd_neon(&deltas[24], dd[12], ds[0]); 1316 madd_neon(&deltas[25], dd[13], ds[1]); 1317 1318 dd[0] = vextq_s16(dd[12], vdupq_n_s16(0), 2); 1319 dd[1] = vextq_s16(dd[13], vdupq_n_s16(0), 2); 1320 ds[0] = vextq_s16(ds[12], vdupq_n_s16(0), 2); 1321 ds[1] = vextq_s16(ds[13], vdupq_n_s16(0), 2); 1322 1323 di += 8 * d_stride; 1324 dj += 8 * d_stride; 1325 y += 8; 1326 } 1327 1328 deltas[0] = hadd_four_32_neon(deltas[0], deltas[2], deltas[4], deltas[6]); 1329 deltas[1] = hadd_four_32_neon(deltas[1], deltas[3], deltas[5], deltas[7]); 1330 deltas[2] = 1331 hadd_four_32_neon(deltas[8], deltas[10], deltas[12], deltas[12]); 1332 deltas[3] = 1333 hadd_four_32_neon(deltas[9], deltas[11], deltas[13], deltas[13]); 1334 deltas[4] = 1335 hadd_four_32_neon(deltas[14], deltas[16], deltas[18], deltas[20]); 1336 deltas[5] = 1337 hadd_four_32_neon(deltas[15], deltas[17], deltas[19], deltas[21]); 1338 deltas[6] = 1339 hadd_four_32_neon(deltas[22], deltas[24], deltas[22], deltas[24]); 1340 deltas[7] = 1341 hadd_four_32_neon(deltas[23], deltas[25], deltas[23], deltas[25]); 1342 deltas[0] = vsubq_s32(deltas[1], deltas[0]); 1343 deltas[1] = vsubq_s32(deltas[3], deltas[2]); 1344 deltas[2] = vsubq_s32(deltas[5], deltas[4]); 1345 deltas[3] = vsubq_s32(deltas[7], deltas[6]); 1346 1347 if (h8 != height) { 1348 const int16_t ds0_vals[] = { 1349 dj[0 * d_stride], dj[0 * d_stride + width], 1350 dj[1 * d_stride], dj[1 * d_stride + width], 1351 dj[2 * d_stride], dj[2 * d_stride + width], 1352 dj[3 * d_stride], dj[3 * d_stride + width] 1353 }; 1354 ds[0] = vld1q_s16(ds0_vals); 1355 1356 ds[1] = vsetq_lane_s16(dj[4 * d_stride], ds[1], 0); 1357 ds[1] = vsetq_lane_s16(dj[4 * d_stride + width], ds[1], 1); 1358 ds[1] = vsetq_lane_s16(dj[5 * d_stride], ds[1], 2); 1359 ds[1] = vsetq_lane_s16(dj[5 * d_stride + width], ds[1], 3); 1360 const int16_t dd4_vals[] = { 1361 -di[1 * d_stride], di[1 * d_stride + width], 1362 -di[2 * d_stride], di[2 * d_stride + width], 1363 -di[3 * d_stride], di[3 * d_stride + width], 1364 -di[4 * d_stride], di[4 * d_stride + width] 1365 }; 1366 dd[4] = vld1q_s16(dd4_vals); 1367 1368 dd[5] = vsetq_lane_s16(-di[5 * d_stride], dd[5], 0); 1369 dd[5] = vsetq_lane_s16(di[5 * d_stride + width], dd[5], 1); 1370 do { 1371 dd[0] = vdupq_n_s16(-di[0 * d_stride]); 1372 dd[2] = dd[3] = vdupq_n_s16(di[0 * d_stride + width]); 1373 dd[0] = dd[1] = vzipq_s16(dd[0], dd[2]).val[0]; 1374 1375 ds[4] = vdupq_n_s16(dj[0 * d_stride]); 1376 ds[6] = ds[7] = vdupq_n_s16(dj[0 * d_stride + width]); 1377 ds[4] = ds[5] = vzipq_s16(ds[4], ds[6]).val[0]; 1378 1379 dd[5] = vsetq_lane_s16(-di[6 * d_stride], dd[5], 2); 1380 dd[5] = vsetq_lane_s16(di[6 * d_stride + width], dd[5], 3); 1381 ds[1] = vsetq_lane_s16(dj[6 * d_stride], ds[1], 4); 1382 ds[1] = vsetq_lane_s16(dj[6 * d_stride + width], ds[1], 5); 1383 1384 madd_neon_pairwise(&deltas[0], dd[0], ds[0]); 1385 madd_neon_pairwise(&deltas[1], dd[1], ds[1]); 1386 madd_neon_pairwise(&deltas[2], dd[4], ds[4]); 1387 madd_neon_pairwise(&deltas[3], dd[5], ds[5]); 1388 1389 int32_t tmp0 = vgetq_lane_s32(vreinterpretq_s32_s16(ds[0]), 0); 1390 ds[0] = vextq_s16(ds[0], ds[1], 2); 1391 ds[1] = vextq_s16(ds[1], ds[0], 2); 1392 ds[1] = vreinterpretq_s16_s32( 1393 vsetq_lane_s32(tmp0, vreinterpretq_s32_s16(ds[1]), 3)); 1394 int32_t tmp1 = vgetq_lane_s32(vreinterpretq_s32_s16(dd[4]), 0); 1395 dd[4] = vextq_s16(dd[4], dd[5], 2); 1396 dd[5] = vextq_s16(dd[5], dd[4], 2); 1397 dd[5] = vreinterpretq_s16_s32( 1398 vsetq_lane_s32(tmp1, vreinterpretq_s32_s16(dd[5]), 3)); 1399 di += d_stride; 1400 dj += d_stride; 1401 } while (++y < height); 1402 } 1403 1404 // Writing one more element on the top edge of a square falls to 1405 // the next square in the same row or the first element in the next 1406 // row, which will just be overwritten later. 1407 update_8_stats_neon( 1408 H + (i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win, 1409 deltas[0], deltas[1], 1410 H + i * wiener_win * wiener_win2 + j * wiener_win); 1411 1412 H[(i * wiener_win + 1) * wiener_win2 + j * wiener_win] = 1413 H[((i - 1) * wiener_win + 1) * wiener_win2 + (j - 1) * wiener_win] + 1414 vgetq_lane_s32(deltas[2], 0); 1415 H[(i * wiener_win + 2) * wiener_win2 + j * wiener_win] = 1416 H[((i - 1) * wiener_win + 2) * wiener_win2 + (j - 1) * wiener_win] + 1417 vgetq_lane_s32(deltas[2], 1); 1418 H[(i * wiener_win + 3) * wiener_win2 + j * wiener_win] = 1419 H[((i - 1) * wiener_win + 3) * wiener_win2 + (j - 1) * wiener_win] + 1420 vgetq_lane_s32(deltas[2], 2); 1421 H[(i * wiener_win + 4) * wiener_win2 + j * wiener_win] = 1422 H[((i - 1) * wiener_win + 4) * wiener_win2 + (j - 1) * wiener_win] + 1423 vgetq_lane_s32(deltas[2], 3); 1424 H[(i * wiener_win + 5) * wiener_win2 + j * wiener_win] = 1425 H[((i - 1) * wiener_win + 5) * wiener_win2 + (j - 1) * wiener_win] + 1426 vgetq_lane_s32(deltas[3], 0); 1427 H[(i * wiener_win + 6) * wiener_win2 + j * wiener_win] = 1428 H[((i - 1) * wiener_win + 6) * wiener_win2 + (j - 1) * wiener_win] + 1429 vgetq_lane_s32(deltas[3], 1); 1430 } while (++j < wiener_win); 1431 } while (++i < wiener_win - 1); 1432 1433 // Step 5: Derive other points of each square. No square in bottom row. 1434 i = 0; 1435 do { 1436 const int16_t *const di = d + i; 1437 1438 j = i + 1; 1439 do { 1440 const int16_t *const dj = d + j; 1441 int32x4_t deltas[WIENER_WIN - 1][WIN_7] = { { vdupq_n_s32(0) }, 1442 { vdupq_n_s32(0) } }; 1443 int16x8_t d_is[WIN_7]; 1444 int16x8_t d_ie[WIN_7]; 1445 int16x8_t d_js[WIN_7]; 1446 int16x8_t d_je[WIN_7]; 1447 1448 x = 0; 1449 while (x < w16) { 1450 load_square_win7_neon(di + x, dj + x, d_stride, height, d_is, d_ie, 1451 d_js, d_je); 1452 derive_square_win7_neon(d_is, d_ie, d_js, d_je, deltas); 1453 x += 16; 1454 } 1455 1456 if (w16 != width) { 1457 load_square_win7_neon(di + x, dj + x, d_stride, height, d_is, d_ie, 1458 d_js, d_je); 1459 d_is[0] = vandq_s16(d_is[0], mask[0]); 1460 d_is[1] = vandq_s16(d_is[1], mask[1]); 1461 d_is[2] = vandq_s16(d_is[2], mask[0]); 1462 d_is[3] = vandq_s16(d_is[3], mask[1]); 1463 d_is[4] = vandq_s16(d_is[4], mask[0]); 1464 d_is[5] = vandq_s16(d_is[5], mask[1]); 1465 d_is[6] = vandq_s16(d_is[6], mask[0]); 1466 d_is[7] = vandq_s16(d_is[7], mask[1]); 1467 d_is[8] = vandq_s16(d_is[8], mask[0]); 1468 d_is[9] = vandq_s16(d_is[9], mask[1]); 1469 d_is[10] = vandq_s16(d_is[10], mask[0]); 1470 d_is[11] = vandq_s16(d_is[11], mask[1]); 1471 d_ie[0] = vandq_s16(d_ie[0], mask[0]); 1472 d_ie[1] = vandq_s16(d_ie[1], mask[1]); 1473 d_ie[2] = vandq_s16(d_ie[2], mask[0]); 1474 d_ie[3] = vandq_s16(d_ie[3], mask[1]); 1475 d_ie[4] = vandq_s16(d_ie[4], mask[0]); 1476 d_ie[5] = vandq_s16(d_ie[5], mask[1]); 1477 d_ie[6] = vandq_s16(d_ie[6], mask[0]); 1478 d_ie[7] = vandq_s16(d_ie[7], mask[1]); 1479 d_ie[8] = vandq_s16(d_ie[8], mask[0]); 1480 d_ie[9] = vandq_s16(d_ie[9], mask[1]); 1481 d_ie[10] = vandq_s16(d_ie[10], mask[0]); 1482 d_ie[11] = vandq_s16(d_ie[11], mask[1]); 1483 derive_square_win7_neon(d_is, d_ie, d_js, d_je, deltas); 1484 } 1485 1486 hadd_update_6_stats_neon( 1487 H + (i * wiener_win + 0) * wiener_win2 + j * wiener_win, deltas[0], 1488 H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win + 1); 1489 hadd_update_6_stats_neon( 1490 H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win, deltas[1], 1491 H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win + 1); 1492 hadd_update_6_stats_neon( 1493 H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win, deltas[2], 1494 H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win + 1); 1495 hadd_update_6_stats_neon( 1496 H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win, deltas[3], 1497 H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win + 1); 1498 hadd_update_6_stats_neon( 1499 H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win, deltas[4], 1500 H + (i * wiener_win + 5) * wiener_win2 + j * wiener_win + 1); 1501 hadd_update_6_stats_neon( 1502 H + (i * wiener_win + 5) * wiener_win2 + j * wiener_win, deltas[5], 1503 H + (i * wiener_win + 6) * wiener_win2 + j * wiener_win + 1); 1504 } while (++j < wiener_win); 1505 } while (++i < wiener_win - 1); 1506 1507 // Step 6: Derive other points of each upper triangle along the diagonal. 1508 i = 0; 1509 do { 1510 const int16_t *const di = d + i; 1511 int32x4_t deltas[WIENER_WIN * (WIENER_WIN - 1)] = { vdupq_n_s32(0) }; 1512 int16x8_t d_is[WIN_7], d_ie[WIN_7]; 1513 1514 x = 0; 1515 while (x < w16) { 1516 load_triangle_win7_neon(di + x, d_stride, height, d_is, d_ie); 1517 derive_triangle_win7_neon(d_is, d_ie, deltas); 1518 x += 16; 1519 } 1520 1521 if (w16 != width) { 1522 load_triangle_win7_neon(di + x, d_stride, height, d_is, d_ie); 1523 d_is[0] = vandq_s16(d_is[0], mask[0]); 1524 d_is[1] = vandq_s16(d_is[1], mask[1]); 1525 d_is[2] = vandq_s16(d_is[2], mask[0]); 1526 d_is[3] = vandq_s16(d_is[3], mask[1]); 1527 d_is[4] = vandq_s16(d_is[4], mask[0]); 1528 d_is[5] = vandq_s16(d_is[5], mask[1]); 1529 d_is[6] = vandq_s16(d_is[6], mask[0]); 1530 d_is[7] = vandq_s16(d_is[7], mask[1]); 1531 d_is[8] = vandq_s16(d_is[8], mask[0]); 1532 d_is[9] = vandq_s16(d_is[9], mask[1]); 1533 d_is[10] = vandq_s16(d_is[10], mask[0]); 1534 d_is[11] = vandq_s16(d_is[11], mask[1]); 1535 d_ie[0] = vandq_s16(d_ie[0], mask[0]); 1536 d_ie[1] = vandq_s16(d_ie[1], mask[1]); 1537 d_ie[2] = vandq_s16(d_ie[2], mask[0]); 1538 d_ie[3] = vandq_s16(d_ie[3], mask[1]); 1539 d_ie[4] = vandq_s16(d_ie[4], mask[0]); 1540 d_ie[5] = vandq_s16(d_ie[5], mask[1]); 1541 d_ie[6] = vandq_s16(d_ie[6], mask[0]); 1542 d_ie[7] = vandq_s16(d_ie[7], mask[1]); 1543 d_ie[8] = vandq_s16(d_ie[8], mask[0]); 1544 d_ie[9] = vandq_s16(d_ie[9], mask[1]); 1545 d_ie[10] = vandq_s16(d_ie[10], mask[0]); 1546 d_ie[11] = vandq_s16(d_ie[11], mask[1]); 1547 derive_triangle_win7_neon(d_is, d_ie, deltas); 1548 } 1549 1550 // Row 1: 6 points 1551 hadd_update_6_stats_neon( 1552 H + (i * wiener_win + 0) * wiener_win2 + i * wiener_win, deltas, 1553 H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1); 1554 1555 // Row 2: 5 points 1556 hadd_update_4_stats_neon( 1557 H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1, deltas + 6, 1558 H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2); 1559 H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 6] = 1560 H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 5] + 1561 horizontal_long_add_s32x4(deltas[10]); 1562 1563 // Row 3: 4 points 1564 hadd_update_4_stats_neon( 1565 H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2, 1566 deltas + 11, 1567 H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3); 1568 1569 // Row 4: 3 points 1570 #if AOM_ARCH_AARCH64 1571 int64x2_t delta15_s64 = vpaddlq_s32(deltas[15]); 1572 int64x2_t delta16_s64 = vpaddlq_s32(deltas[16]); 1573 int64x2_t delta1516 = vpaddq_s64(delta15_s64, delta16_s64); 1574 1575 int64x2_t h0 = 1576 vld1q_s64(H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3); 1577 vst1q_s64(H + (i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4, 1578 vaddq_s64(h0, delta1516)); 1579 #else 1580 H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4 + 0] = 1581 H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3 + 0] + 1582 horizontal_long_add_s32x4(deltas[15]); 1583 H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4 + 1] = 1584 H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3 + 1] + 1585 horizontal_long_add_s32x4(deltas[16]); 1586 #endif // AOM_ARCH_AARCH64 1587 1588 H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 6] = 1589 H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 5] + 1590 horizontal_long_add_s32x4(deltas[17]); 1591 1592 // Row 5: 2 points 1593 int64x2_t delta18_s64 = vpaddlq_s32(deltas[18]); 1594 int64x2_t delta19_s64 = vpaddlq_s32(deltas[19]); 1595 1596 #if AOM_ARCH_AARCH64 1597 int64x2_t delta1819 = vpaddq_s64(delta18_s64, delta19_s64); 1598 1599 int64x2_t h1 = 1600 vld1q_s64(H + (i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4); 1601 vst1q_s64(H + (i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5, 1602 vaddq_s64(h1, delta1819)); 1603 #else 1604 H[(i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5] = 1605 H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4] + 1606 horizontal_add_s64x2(delta18_s64); 1607 H[(i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5 + 1] = 1608 H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4 + 1] + 1609 horizontal_add_s64x2(delta19_s64); 1610 #endif // AOM_ARCH_AARCH64 1611 1612 // Row 6: 1 points 1613 H[(i * wiener_win + 6) * wiener_win2 + i * wiener_win + 6] = 1614 H[(i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5] + 1615 horizontal_long_add_s32x4(deltas[20]); 1616 } while (++i < wiener_win); 1617 } 1618 1619 static inline void sub_avg_block_highbd_neon(const uint16_t *src, 1620 const int32_t src_stride, 1621 const uint16_t avg, 1622 const int32_t width, 1623 const int32_t height, int16_t *dst, 1624 const int32_t dst_stride) { 1625 const uint16x8_t a = vdupq_n_u16(avg); 1626 1627 int32_t i = height + 1; 1628 do { 1629 int32_t j = 0; 1630 while (j < width) { 1631 const uint16x8_t s = vld1q_u16(src + j); 1632 const uint16x8_t d = vsubq_u16(s, a); 1633 vst1q_s16(dst + j, vreinterpretq_s16_u16(d)); 1634 j += 8; 1635 } 1636 1637 src += src_stride; 1638 dst += dst_stride; 1639 } while (--i); 1640 } 1641 1642 static inline uint16_t highbd_find_average_neon(const uint16_t *src, 1643 int src_stride, int width, 1644 int height) { 1645 assert(width > 0); 1646 assert(height > 0); 1647 1648 uint64x2_t sum_u64 = vdupq_n_u64(0); 1649 uint64_t sum = 0; 1650 const uint16x8_t mask = 1651 vreinterpretq_u16_s16(vld1q_s16(&mask_16bit[16] - (width % 8))); 1652 1653 int h = height; 1654 do { 1655 uint32x4_t sum_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; 1656 1657 int w = width; 1658 const uint16_t *row = src; 1659 while (w >= 32) { 1660 uint16x8_t s0 = vld1q_u16(row + 0); 1661 uint16x8_t s1 = vld1q_u16(row + 8); 1662 uint16x8_t s2 = vld1q_u16(row + 16); 1663 uint16x8_t s3 = vld1q_u16(row + 24); 1664 1665 s0 = vaddq_u16(s0, s1); 1666 s2 = vaddq_u16(s2, s3); 1667 sum_u32[0] = vpadalq_u16(sum_u32[0], s0); 1668 sum_u32[1] = vpadalq_u16(sum_u32[1], s2); 1669 1670 row += 32; 1671 w -= 32; 1672 } 1673 1674 if (w >= 16) { 1675 uint16x8_t s0 = vld1q_u16(row + 0); 1676 uint16x8_t s1 = vld1q_u16(row + 8); 1677 1678 s0 = vaddq_u16(s0, s1); 1679 sum_u32[0] = vpadalq_u16(sum_u32[0], s0); 1680 1681 row += 16; 1682 w -= 16; 1683 } 1684 1685 if (w >= 8) { 1686 uint16x8_t s0 = vld1q_u16(row); 1687 sum_u32[1] = vpadalq_u16(sum_u32[1], s0); 1688 1689 row += 8; 1690 w -= 8; 1691 } 1692 1693 if (w) { 1694 uint16x8_t s0 = vandq_u16(vld1q_u16(row), mask); 1695 sum_u32[1] = vpadalq_u16(sum_u32[1], s0); 1696 1697 row += 8; 1698 w -= 8; 1699 } 1700 1701 sum_u64 = vpadalq_u32(sum_u64, vaddq_u32(sum_u32[0], sum_u32[1])); 1702 1703 src += src_stride; 1704 } while (--h != 0); 1705 1706 return (uint16_t)((horizontal_add_u64x2(sum_u64) + sum) / (height * width)); 1707 } 1708 1709 void av1_compute_stats_highbd_neon(int32_t wiener_win, const uint8_t *dgd8, 1710 const uint8_t *src8, int16_t *dgd_avg, 1711 int16_t *src_avg, int32_t h_start, 1712 int32_t h_end, int32_t v_start, 1713 int32_t v_end, int32_t dgd_stride, 1714 int32_t src_stride, int64_t *M, int64_t *H, 1715 aom_bit_depth_t bit_depth) { 1716 const int32_t wiener_win2 = wiener_win * wiener_win; 1717 const int32_t wiener_halfwin = (wiener_win >> 1); 1718 const uint16_t *src = CONVERT_TO_SHORTPTR(src8); 1719 const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8); 1720 const int32_t width = h_end - h_start; 1721 const int32_t height = v_end - v_start; 1722 const uint16_t *dgd_start = dgd + h_start + v_start * dgd_stride; 1723 const uint16_t avg = 1724 highbd_find_average_neon(dgd_start, dgd_stride, width, height); 1725 const int32_t d_stride = (width + 2 * wiener_halfwin + 15) & ~15; 1726 const int32_t s_stride = (width + 15) & ~15; 1727 1728 sub_avg_block_highbd_neon(src + v_start * src_stride + h_start, src_stride, 1729 avg, width, height, src_avg, s_stride); 1730 sub_avg_block_highbd_neon( 1731 dgd + (v_start - wiener_halfwin) * dgd_stride + h_start - wiener_halfwin, 1732 dgd_stride, avg, width + 2 * wiener_halfwin, height + 2 * wiener_halfwin, 1733 dgd_avg, d_stride); 1734 1735 if (wiener_win == WIENER_WIN) { 1736 compute_stats_win7_highbd_neon(dgd_avg, d_stride, src_avg, s_stride, width, 1737 height, M, H, bit_depth); 1738 } else if (wiener_win == WIENER_WIN_CHROMA) { 1739 compute_stats_win5_highbd_neon(dgd_avg, d_stride, src_avg, s_stride, width, 1740 height, M, H, bit_depth); 1741 } 1742 1743 // H is a symmetric matrix, so we only need to fill out the upper triangle. 1744 // We can copy it down to the lower triangle outside the (i, j) loops. 1745 if (bit_depth == AOM_BITS_8) { 1746 diagonal_copy_stats_neon(wiener_win2, H); 1747 } else if (bit_depth == AOM_BITS_10) { // bit_depth == AOM_BITS_10 1748 const int32_t k4 = wiener_win2 & ~3; 1749 1750 int32_t k = 0; 1751 do { 1752 int64x2_t dst = div4_neon(vld1q_s64(M + k)); 1753 vst1q_s64(M + k, dst); 1754 dst = div4_neon(vld1q_s64(M + k + 2)); 1755 vst1q_s64(M + k + 2, dst); 1756 H[k * wiener_win2 + k] /= 4; 1757 k += 4; 1758 } while (k < k4); 1759 1760 H[k * wiener_win2 + k] /= 4; 1761 1762 for (; k < wiener_win2; ++k) { 1763 M[k] /= 4; 1764 } 1765 1766 div4_diagonal_copy_stats_neon(wiener_win2, H); 1767 } else { // bit_depth == AOM_BITS_12 1768 const int32_t k4 = wiener_win2 & ~3; 1769 1770 int32_t k = 0; 1771 do { 1772 int64x2_t dst = div16_neon(vld1q_s64(M + k)); 1773 vst1q_s64(M + k, dst); 1774 dst = div16_neon(vld1q_s64(M + k + 2)); 1775 vst1q_s64(M + k + 2, dst); 1776 H[k * wiener_win2 + k] /= 16; 1777 k += 4; 1778 } while (k < k4); 1779 1780 H[k * wiener_win2 + k] /= 16; 1781 1782 for (; k < wiener_win2; ++k) { 1783 M[k] /= 16; 1784 } 1785 1786 div16_diagonal_copy_stats_neon(wiener_win2, H); 1787 } 1788 } 1789 int64_t av1_highbd_pixel_proj_error_neon( 1790 const uint8_t *src8, int width, int height, int src_stride, 1791 const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, 1792 int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { 1793 const uint16_t *src = CONVERT_TO_SHORTPTR(src8); 1794 const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); 1795 int64_t sse = 0; 1796 int64x2_t sse_s64 = vdupq_n_s64(0); 1797 1798 if (params->r[0] > 0 && params->r[1] > 0) { 1799 int32x2_t xq_v = vld1_s32(xq); 1800 int32x2_t xq_sum_v = vshl_n_s32(vpadd_s32(xq_v, xq_v), 4); 1801 1802 do { 1803 int j = 0; 1804 int32x4_t sse_s32 = vdupq_n_s32(0); 1805 1806 do { 1807 const uint16x8_t d = vld1q_u16(&dat[j]); 1808 const uint16x8_t s = vld1q_u16(&src[j]); 1809 int32x4_t flt0_0 = vld1q_s32(&flt0[j]); 1810 int32x4_t flt0_1 = vld1q_s32(&flt0[j + 4]); 1811 int32x4_t flt1_0 = vld1q_s32(&flt1[j]); 1812 int32x4_t flt1_1 = vld1q_s32(&flt1[j + 4]); 1813 1814 int32x4_t d_s32_lo = vreinterpretq_s32_u32( 1815 vmull_lane_u16(vget_low_u16(d), vreinterpret_u16_s32(xq_sum_v), 0)); 1816 int32x4_t d_s32_hi = vreinterpretq_s32_u32(vmull_lane_u16( 1817 vget_high_u16(d), vreinterpret_u16_s32(xq_sum_v), 0)); 1818 1819 int32x4_t v0 = vsubq_s32( 1820 vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)), 1821 d_s32_lo); 1822 int32x4_t v1 = vsubq_s32( 1823 vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)), 1824 d_s32_hi); 1825 1826 v0 = vmlaq_lane_s32(v0, flt0_0, xq_v, 0); 1827 v1 = vmlaq_lane_s32(v1, flt0_1, xq_v, 0); 1828 v0 = vmlaq_lane_s32(v0, flt1_0, xq_v, 1); 1829 v1 = vmlaq_lane_s32(v1, flt1_1, xq_v, 1); 1830 1831 int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); 1832 int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); 1833 1834 int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1), 1835 vreinterpretq_s16_u16(vsubq_u16(d, s))); 1836 int16x4_t e_lo = vget_low_s16(e); 1837 int16x4_t e_hi = vget_high_s16(e); 1838 1839 sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo); 1840 sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi); 1841 1842 j += 8; 1843 } while (j <= width - 8); 1844 1845 for (int k = j; k < width; ++k) { 1846 int32_t v = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1); 1847 v += xq[0] * (flt0[k]) + xq[1] * (flt1[k]); 1848 v -= (xq[1] + xq[0]) * (int32_t)(dat[k] << 4); 1849 int32_t e = 1850 (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[k] - src[k]; 1851 sse += ((int64_t)e * e); 1852 } 1853 1854 sse_s64 = vpadalq_s32(sse_s64, sse_s32); 1855 1856 dat += dat_stride; 1857 src += src_stride; 1858 flt0 += flt0_stride; 1859 flt1 += flt1_stride; 1860 } while (--height != 0); 1861 } else if (params->r[0] > 0 || params->r[1] > 0) { 1862 int xq_active = (params->r[0] > 0) ? xq[0] : xq[1]; 1863 int32_t *flt = (params->r[0] > 0) ? flt0 : flt1; 1864 int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride; 1865 int32x4_t xq_v = vdupq_n_s32(xq_active); 1866 1867 do { 1868 int j = 0; 1869 int32x4_t sse_s32 = vdupq_n_s32(0); 1870 do { 1871 const uint16x8_t d0 = vld1q_u16(&dat[j]); 1872 const uint16x8_t s0 = vld1q_u16(&src[j]); 1873 int32x4_t flt0_0 = vld1q_s32(&flt[j]); 1874 int32x4_t flt0_1 = vld1q_s32(&flt[j + 4]); 1875 1876 uint16x8_t d_u16 = vshlq_n_u16(d0, 4); 1877 int32x4_t sub0 = vreinterpretq_s32_u32( 1878 vsubw_u16(vreinterpretq_u32_s32(flt0_0), vget_low_u16(d_u16))); 1879 int32x4_t sub1 = vreinterpretq_s32_u32( 1880 vsubw_u16(vreinterpretq_u32_s32(flt0_1), vget_high_u16(d_u16))); 1881 1882 int32x4_t v0 = vmlaq_s32( 1883 vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)), sub0, 1884 xq_v); 1885 int32x4_t v1 = vmlaq_s32( 1886 vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)), sub1, 1887 xq_v); 1888 1889 int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); 1890 int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); 1891 1892 int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1), 1893 vreinterpretq_s16_u16(vsubq_u16(d0, s0))); 1894 int16x4_t e_lo = vget_low_s16(e); 1895 int16x4_t e_hi = vget_high_s16(e); 1896 1897 sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo); 1898 sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi); 1899 1900 j += 8; 1901 } while (j <= width - 8); 1902 1903 for (int k = j; k < width; ++k) { 1904 int32_t v = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1); 1905 v += xq_active * (int32_t)((uint32_t)flt[k] - (uint16_t)(dat[k] << 4)); 1906 const int32_t e = 1907 (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[k] - src[k]; 1908 sse += ((int64_t)e * e); 1909 } 1910 1911 sse_s64 = vpadalq_s32(sse_s64, sse_s32); 1912 1913 dat += dat_stride; 1914 flt += flt_stride; 1915 src += src_stride; 1916 } while (--height != 0); 1917 } else { 1918 do { 1919 int j = 0; 1920 1921 do { 1922 const uint16x8_t d = vld1q_u16(&dat[j]); 1923 const uint16x8_t s = vld1q_u16(&src[j]); 1924 1925 uint16x8_t diff = vabdq_u16(d, s); 1926 uint16x4_t diff_lo = vget_low_u16(diff); 1927 uint16x4_t diff_hi = vget_high_u16(diff); 1928 1929 uint32x4_t sqr_lo = vmull_u16(diff_lo, diff_lo); 1930 uint32x4_t sqr_hi = vmull_u16(diff_hi, diff_hi); 1931 1932 sse_s64 = vpadalq_s32(sse_s64, vreinterpretq_s32_u32(sqr_lo)); 1933 sse_s64 = vpadalq_s32(sse_s64, vreinterpretq_s32_u32(sqr_hi)); 1934 1935 j += 8; 1936 } while (j <= width - 8); 1937 1938 for (int k = j; k < width; ++k) { 1939 int32_t e = dat[k] - src[k]; 1940 sse += e * e; 1941 } 1942 1943 dat += dat_stride; 1944 src += src_stride; 1945 } while (--height != 0); 1946 } 1947 1948 sse += horizontal_add_s64x2(sse_s64); 1949 return sse; 1950 }