wedge_utils_neon.c (4813B)
/*
 * Copyright (c) 2022, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "aom_dsp/arm/sum_neon.h"
#include "av1/common/reconinter.h"

#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)

/**
 * See av1_wedge_sse_from_residuals_c for details of the parameters and
 * computation.
 */
uint64_t av1_wedge_sse_from_residuals_neon(const int16_t *r1, const int16_t *d,
                                           const uint8_t *m, int N) {
  assert(N % 64 == 0);

  // Two independent 64-bit accumulators; the final reduction is then a
  // single vector add plus one horizontal add.
  uint64x2_t sse_acc[2] = { vdupq_n_u64(0), vdupq_n_u64(0) };

  int i = 0;
  do {
    const int16x8_t r1_lo = vld1q_s16(r1 + i);
    const int16x8_t r1_hi = vld1q_s16(r1 + i + 8);
    const int16x8_t d_lo = vld1q_s16(d + i);
    const int16x8_t d_hi = vld1q_s16(d + i + 8);

    // Widen the mask bytes by interleaving with zero instead of using a pair
    // of vmovl_u8(). This is a bit inelegant, but it forces the compiler to
    // emit a ZIP1/ZIP2 pair, which can execute on all Neon pipes in modern
    // Arm cores and therefore runs in parallel with the SSHL instructions
    // below (SSHL is restricted to half of the pipes).
    const uint8x16x2_t m_zip = vzipq_u8(vld1q_u8(m + i), vdupq_n_u8(0));
    const int16x8_t m_lo = vreinterpretq_s16_u8(m_zip.val[0]);
    const int16x8_t m_hi = vreinterpretq_s16_u8(m_zip.val[1]);

    // t = (r1 << WEDGE_WEIGHT_BITS) + m * d, computed in 32-bit lanes.
    int32x4_t t[4];
    t[0] = vshll_n_s16(vget_low_s16(r1_lo), WEDGE_WEIGHT_BITS);
    t[1] = vshll_n_s16(vget_high_s16(r1_lo), WEDGE_WEIGHT_BITS);
    t[2] = vshll_n_s16(vget_low_s16(r1_hi), WEDGE_WEIGHT_BITS);
    t[3] = vshll_n_s16(vget_high_s16(r1_hi), WEDGE_WEIGHT_BITS);

    t[0] = vmlal_s16(t[0], vget_low_s16(m_lo), vget_low_s16(d_lo));
    t[1] = vmlal_s16(t[1], vget_high_s16(m_lo), vget_high_s16(d_lo));
    t[2] = vmlal_s16(t[2], vget_low_s16(m_hi), vget_low_s16(d_hi));
    t[3] = vmlal_s16(t[3], vget_high_s16(m_hi), vget_high_s16(d_hi));

    // Saturating narrow back to 16 bits before squaring, matching the C
    // reference's clamping behavior.
    int16x4_t t_s16[4];
    t_s16[0] = vqmovn_s32(t[0]);
    t_s16[1] = vqmovn_s32(t[1]);
    t_s16[2] = vqmovn_s32(t[2]);
    t_s16[3] = vqmovn_s32(t[3]);

    int32x4_t sq[2];
    sq[0] = vmull_s16(t_s16[0], t_s16[0]);
    sq[1] = vmull_s16(t_s16[2], t_s16[2]);
    sq[0] = vmlal_s16(sq[0], t_s16[1], t_s16[1]);
    sq[1] = vmlal_s16(sq[1], t_s16[3], t_s16[3]);

    // Squares are non-negative, so accumulating them as unsigned is exact.
    sse_acc[0] = vpadalq_u32(sse_acc[0], vreinterpretq_u32_s32(sq[0]));
    sse_acc[1] = vpadalq_u32(sse_acc[1], vreinterpretq_u32_s32(sq[1]));

    i += 16;
  } while (i < N);

  const uint64_t csse =
      horizontal_add_u64x2(vaddq_u64(sse_acc[0], sse_acc[1]));
  return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
}

// Returns 1 when the mask-weighted sum of ds exceeds limit, otherwise 0.
// See av1_wedge_sign_from_residuals_c for the reference computation.
int8_t av1_wedge_sign_from_residuals_neon(const int16_t *ds, const uint8_t *m,
                                          int N, int64_t limit) {
  // Four independent accumulators to break the dependency chain across the
  // multiply-accumulate instructions.
  int32x4_t dot[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
                       vdupq_n_s32(0) };

  do {
    const int16x8_t ds_lo = vld1q_s16(ds);
    const int16x8_t ds_hi = vld1q_s16(ds + 8);

    // NOTE(review): widening the mask as signed bytes is only exact if
    // m[i] <= 127 — presumably m[i] <= MAX_MASK_VALUE here; confirm against
    // the mask generation code.
    const int8x16_t m_s8 = vreinterpretq_s8_u8(vld1q_u8(m));
    const int16x8_t m_lo = vmovl_s8(vget_low_s8(m_s8));
    const int16x8_t m_hi = vmovl_s8(vget_high_s8(m_s8));

    dot[0] = vmlal_s16(dot[0], vget_low_s16(ds_lo), vget_low_s16(m_lo));
    dot[1] = vmlal_s16(dot[1], vget_high_s16(ds_lo), vget_high_s16(m_lo));
    dot[2] = vmlal_s16(dot[2], vget_low_s16(ds_hi), vget_low_s16(m_hi));
    dot[3] = vmlal_s16(dot[3], vget_high_s16(ds_hi), vget_high_s16(m_hi));

    ds += 16;
    m += 16;
    N -= 16;
  } while (N != 0);

  // Widen to 64 bits while folding the four accumulators together.
  int64x2_t total = vpaddlq_s32(dot[0]);
  total = vpadalq_s32(total, dot[1]);
  total = vpadalq_s32(total, dot[2]);
  total = vpadalq_s32(total, dot[3]);

  return (horizontal_add_s64x2(total) > limit);
}

// Computes d[i] = a[i]*a[i] - b[i]*b[i], saturated to int16_t.
// See av1_wedge_compute_delta_squares_c for the reference computation.
void av1_wedge_compute_delta_squares_neon(int16_t *d_ptr, const int16_t *a_ptr,
                                          const int16_t *b_ptr, int N) {
  do {
    const int16x8_t a = vld1q_s16(a_ptr);
    const int16x8_t b = vld1q_s16(b_ptr);

    // a*a - b*b in 32-bit lanes; vmlsl subtracts the product of b with
    // itself from the running value.
    int32x4_t diff_lo = vmull_s16(vget_low_s16(a), vget_low_s16(a));
    int32x4_t diff_hi = vmull_s16(vget_high_s16(a), vget_high_s16(a));
    diff_lo = vmlsl_s16(diff_lo, vget_low_s16(b), vget_low_s16(b));
    diff_hi = vmlsl_s16(diff_hi, vget_high_s16(b), vget_high_s16(b));

    // Saturating narrow back to 16 bits and store.
    vst1q_s16(d_ptr, vcombine_s16(vqmovn_s32(diff_lo), vqmovn_s32(diff_hi)));

    d_ptr += 8;
    a_ptr += 8;
    b_ptr += 8;
    N -= 8;
  } while (N != 0);
}