sadxd_neon_dotprod.c (11202B)
1 /* 2 * Copyright (c) 2023, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <arm_neon.h> 13 14 #include "config/aom_config.h" 15 #include "config/aom_dsp_rtcd.h" 16 17 #include "aom/aom_integer.h" 18 #include "aom_dsp/arm/mem_neon.h" 19 #include "aom_dsp/arm/sum_neon.h" 20 21 static inline void sad16_neon(uint8x16_t src, uint8x16_t ref, 22 uint32x4_t *const sad_sum) { 23 uint8x16_t abs_diff = vabdq_u8(src, ref); 24 *sad_sum = vdotq_u32(*sad_sum, abs_diff, vdupq_n_u8(1)); 25 } 26 27 static inline void sadwxhx3d_large_neon_dotprod(const uint8_t *src, 28 int src_stride, 29 const uint8_t *const ref[4], 30 int ref_stride, uint32_t res[4], 31 int w, int h) { 32 uint32x4_t sum_lo[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; 33 uint32x4_t sum_hi[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; 34 35 int ref_offset = 0; 36 int i = h; 37 do { 38 int j = 0; 39 do { 40 const uint8x16_t s0 = vld1q_u8(src + j); 41 sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]); 42 sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]); 43 sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]); 44 45 const uint8x16_t s1 = vld1q_u8(src + j + 16); 46 sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]); 47 sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]); 48 sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]); 49 50 j += 32; 51 } while (j < w); 52 53 src += src_stride; 54 ref_offset += ref_stride; 55 } while (--i != 0); 56 57 res[0] = horizontal_add_u32x4(vaddq_u32(sum_lo[0], sum_hi[0])); 58 res[1] = horizontal_add_u32x4(vaddq_u32(sum_lo[1], sum_hi[1])); 59 res[2] = horizontal_add_u32x4(vaddq_u32(sum_lo[2], sum_hi[2])); 60 } 61 62 static inline void sad128xhx3d_neon_dotprod(const uint8_t *src, int src_stride, 63 const uint8_t *const ref[4], 64 int ref_stride, uint32_t res[4], 65 int h) { 66 sadwxhx3d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 128, h); 67 } 68 69 static inline void sad64xhx3d_neon_dotprod(const uint8_t *src, int src_stride, 70 const uint8_t *const ref[4], 71 int ref_stride, uint32_t res[4], 72 int h) { 73 sadwxhx3d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 64, h); 74 } 75 76 static inline void sad32xhx3d_neon_dotprod(const uint8_t *src, int src_stride, 77 const uint8_t *const ref[4], 78 int ref_stride, uint32_t res[4], 79 int h) { 80 sadwxhx3d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 32, h); 81 } 82 83 static inline void sad16xhx3d_neon_dotprod(const uint8_t *src, int src_stride, 84 const uint8_t *const ref[4], 85 int ref_stride, uint32_t res[4], 86 int h) { 87 uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; 88 89 int ref_offset = 0; 90 int i = h; 91 do { 92 const uint8x16_t s = vld1q_u8(src); 93 sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum[0]); 94 sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum[1]); 95 sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum[2]); 96 97 src += src_stride; 98 ref_offset += ref_stride; 99 } while (--i != 0); 100 101 res[0] = horizontal_add_u32x4(sum[0]); 102 res[1] = horizontal_add_u32x4(sum[1]); 103 res[2] = horizontal_add_u32x4(sum[2]); 104 } 105 106 #define SAD_WXH_3D_NEON_DOTPROD(w, h) \ 107 void aom_sad##w##x##h##x3d_neon_dotprod(const uint8_t *src, int src_stride, \ 108 const uint8_t *const ref[4], \ 109 int ref_stride, uint32_t res[4]) { \ 110 sad##w##xhx3d_neon_dotprod(src, src_stride, ref, ref_stride, res, (h)); \ 111 } 112 113 SAD_WXH_3D_NEON_DOTPROD(16, 8) 114 SAD_WXH_3D_NEON_DOTPROD(16, 16) 115 SAD_WXH_3D_NEON_DOTPROD(16, 32) 116 117 SAD_WXH_3D_NEON_DOTPROD(32, 16) 118 SAD_WXH_3D_NEON_DOTPROD(32, 32) 119 SAD_WXH_3D_NEON_DOTPROD(32, 64) 120 121 SAD_WXH_3D_NEON_DOTPROD(64, 32) 122 SAD_WXH_3D_NEON_DOTPROD(64, 64) 123 SAD_WXH_3D_NEON_DOTPROD(64, 128) 124 125 SAD_WXH_3D_NEON_DOTPROD(128, 64) 126 SAD_WXH_3D_NEON_DOTPROD(128, 128) 127 128 #if !CONFIG_REALTIME_ONLY 129 SAD_WXH_3D_NEON_DOTPROD(16, 4) 130 SAD_WXH_3D_NEON_DOTPROD(16, 64) 131 SAD_WXH_3D_NEON_DOTPROD(32, 8) 132 SAD_WXH_3D_NEON_DOTPROD(64, 16) 133 #endif // !CONFIG_REALTIME_ONLY 134 135 #undef SAD_WXH_3D_NEON_DOTPROD 136 137 static inline void sadwxhx4d_large_neon_dotprod(const uint8_t *src, 138 int src_stride, 139 const uint8_t *const ref[4], 140 int ref_stride, uint32_t res[4], 141 int w, int h) { 142 uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), 143 vdupq_n_u32(0) }; 144 uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), 145 vdupq_n_u32(0) }; 146 uint32x4_t sum[4]; 147 148 int ref_offset = 0; 149 int i = h; 150 do { 151 int j = 0; 152 do { 153 const uint8x16_t s0 = vld1q_u8(src + j); 154 sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]); 155 sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]); 156 sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]); 157 sad16_neon(s0, vld1q_u8(ref[3] + ref_offset + j), &sum_lo[3]); 158 159 const uint8x16_t s1 = vld1q_u8(src + j + 16); 160 sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]); 161 sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]); 162 sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]); 163 sad16_neon(s1, vld1q_u8(ref[3] + ref_offset + j + 16), &sum_hi[3]); 164 165 j += 32; 166 } while (j < w); 167 168 src += src_stride; 169 ref_offset += ref_stride; 170 } while (--i != 0); 171 172 sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); 173 sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); 174 sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); 175 sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); 176 177 vst1q_u32(res, horizontal_add_4d_u32x4(sum)); 178 } 179 180 static inline void sad128xhx4d_neon_dotprod(const uint8_t *src, int src_stride, 181 const uint8_t *const ref[4], 182 int ref_stride, uint32_t res[4], 183 int h) { 184 sadwxhx4d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 128, h); 185 } 186 187 static inline void sad64xhx4d_neon_dotprod(const uint8_t *src, int src_stride, 188 const uint8_t *const ref[4], 189 int ref_stride, uint32_t res[4], 190 int h) { 191 sadwxhx4d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 64, h); 192 } 193 194 static inline void sad32xhx4d_neon_dotprod(const uint8_t *src, int src_stride, 195 const uint8_t *const ref[4], 196 int ref_stride, uint32_t res[4], 197 int h) { 198 sadwxhx4d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 32, h); 199 } 200 201 static inline void sad16xhx4d_neon_dotprod(const uint8_t *src, int src_stride, 202 const uint8_t *const ref[4], 203 int ref_stride, uint32_t res[4], 204 int h) { 205 uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), 206 vdupq_n_u32(0) }; 207 208 int ref_offset = 0; 209 int i = h; 210 do { 211 const uint8x16_t s = vld1q_u8(src); 212 sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum[0]); 213 sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum[1]); 214 sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum[2]); 215 sad16_neon(s, vld1q_u8(ref[3] + ref_offset), &sum[3]); 216 217 src += src_stride; 218 ref_offset += ref_stride; 219 } while (--i != 0); 220 221 vst1q_u32(res, horizontal_add_4d_u32x4(sum)); 222 } 223 224 #define SAD_WXH_4D_NEON_DOTPROD(w, h) \ 225 void aom_sad##w##x##h##x4d_neon_dotprod(const uint8_t *src, int src_stride, \ 226 const uint8_t *const ref[4], \ 227 int ref_stride, uint32_t res[4]) { \ 228 sad##w##xhx4d_neon_dotprod(src, src_stride, ref, ref_stride, res, (h)); \ 229 } 230 231 SAD_WXH_4D_NEON_DOTPROD(16, 8) 232 SAD_WXH_4D_NEON_DOTPROD(16, 16) 233 SAD_WXH_4D_NEON_DOTPROD(16, 32) 234 235 SAD_WXH_4D_NEON_DOTPROD(32, 16) 236 SAD_WXH_4D_NEON_DOTPROD(32, 32) 237 SAD_WXH_4D_NEON_DOTPROD(32, 64) 238 239 SAD_WXH_4D_NEON_DOTPROD(64, 32) 240 SAD_WXH_4D_NEON_DOTPROD(64, 64) 241 SAD_WXH_4D_NEON_DOTPROD(64, 128) 242 243 SAD_WXH_4D_NEON_DOTPROD(128, 64) 244 SAD_WXH_4D_NEON_DOTPROD(128, 128) 245 246 #if !CONFIG_REALTIME_ONLY 247 SAD_WXH_4D_NEON_DOTPROD(16, 4) 248 SAD_WXH_4D_NEON_DOTPROD(16, 64) 249 SAD_WXH_4D_NEON_DOTPROD(32, 8) 250 SAD_WXH_4D_NEON_DOTPROD(64, 16) 251 #endif // !CONFIG_REALTIME_ONLY 252 253 #undef SAD_WXH_4D_NEON_DOTPROD 254 255 #define SAD_SKIP_WXH_4D_NEON_DOTPROD(w, h) \ 256 void aom_sad_skip_##w##x##h##x4d_neon_dotprod( \ 257 const uint8_t *src, int src_stride, const uint8_t *const ref[4], \ 258 int ref_stride, uint32_t res[4]) { \ 259 sad##w##xhx4d_neon_dotprod(src, 2 * src_stride, ref, 2 * ref_stride, res, \ 260 ((h) >> 1)); \ 261 res[0] <<= 1; \ 262 res[1] <<= 1; \ 263 res[2] <<= 1; \ 264 res[3] <<= 1; \ 265 } 266 267 SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 16) 268 SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 32) 269 270 SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 16) 271 SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 32) 272 SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 64) 273 274 SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 32) 275 SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 64) 276 SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 128) 277 278 SAD_SKIP_WXH_4D_NEON_DOTPROD(128, 64) 279 SAD_SKIP_WXH_4D_NEON_DOTPROD(128, 128) 280 281 #if !CONFIG_REALTIME_ONLY 282 SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 64) 283 SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 16) 284 #endif // !CONFIG_REALTIME_ONLY 285 286 #undef SAD_SKIP_WXH_4D_NEON_DOTPROD