sad_neon_dotprod.c
/*
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"

static inline unsigned int sadwxh_neon_dotprod(const uint8_t *src_ptr,
                                               int src_stride,
                                               const uint8_t *ref_ptr,
                                               int ref_stride, int w, int h) {
  // Only two accumulators are required for optimal instruction throughput of
  // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };

  int i = h;
  do {
    int j = 0;
    do {
      uint8x16_t s0, s1, r0, r1, diff0, diff1;

      s0 = vld1q_u8(src_ptr + j);
      r0 = vld1q_u8(ref_ptr + j);
      diff0 = vabdq_u8(s0, r0);
      // Dotting the absolute differences with a vector of ones sums each
      // group of four bytes directly into a 32-bit accumulator lane.
      sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));

      s1 = vld1q_u8(src_ptr + j + 16);
      r1 = vld1q_u8(ref_ptr + j + 16);
      diff1 = vabdq_u8(s1, r1);
      sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));

      j += 32;
    } while (j < w);

    src_ptr += src_stride;
    ref_ptr += ref_stride;
  } while (--i != 0);

  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
}

static inline unsigned int sad128xh_neon_dotprod(const uint8_t *src_ptr,
                                                 int src_stride,
                                                 const uint8_t *ref_ptr,
                                                 int ref_stride, int h) {
  return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 128, h);
}

static inline unsigned int sad64xh_neon_dotprod(const uint8_t *src_ptr,
                                                int src_stride,
                                                const uint8_t *ref_ptr,
                                                int ref_stride, int h) {
  return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64, h);
}

static inline unsigned int sad32xh_neon_dotprod(const uint8_t *src_ptr,
                                                int src_stride,
                                                const uint8_t *ref_ptr,
                                                int ref_stride, int h) {
  return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32, h);
}

static inline unsigned int sad16xh_neon_dotprod(const uint8_t *src_ptr,
                                                int src_stride,
                                                const uint8_t *ref_ptr,
                                                int ref_stride, int h) {
  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };

  // Process two rows per iteration, one per accumulator.
  int i = h / 2;
  do {
    uint8x16_t s0, s1, r0, r1, diff0, diff1;

    s0 = vld1q_u8(src_ptr);
    r0 = vld1q_u8(ref_ptr);
    diff0 = vabdq_u8(s0, r0);
    sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));

    src_ptr += src_stride;
    ref_ptr += ref_stride;

    s1 = vld1q_u8(src_ptr);
    r1 = vld1q_u8(ref_ptr);
    diff1 = vabdq_u8(s1, r1);
    sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));

    src_ptr += src_stride;
    ref_ptr += ref_stride;
  } while (--i != 0);

  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
}

#define SAD_WXH_NEON_DOTPROD(w, h)                                         \
  unsigned int aom_sad##w##x##h##_neon_dotprod(                            \
      const uint8_t *src, int src_stride, const uint8_t *ref,              \
      int ref_stride) {                                                    \
    return sad##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, (h)); \
  }

SAD_WXH_NEON_DOTPROD(16, 8)
SAD_WXH_NEON_DOTPROD(16, 16)
SAD_WXH_NEON_DOTPROD(16, 32)

SAD_WXH_NEON_DOTPROD(32, 16)
SAD_WXH_NEON_DOTPROD(32, 32)
SAD_WXH_NEON_DOTPROD(32, 64)

SAD_WXH_NEON_DOTPROD(64, 32)
SAD_WXH_NEON_DOTPROD(64, 64)
SAD_WXH_NEON_DOTPROD(64, 128)

SAD_WXH_NEON_DOTPROD(128, 64)
SAD_WXH_NEON_DOTPROD(128, 128)

#if !CONFIG_REALTIME_ONLY
SAD_WXH_NEON_DOTPROD(16, 4)
SAD_WXH_NEON_DOTPROD(16, 64)
SAD_WXH_NEON_DOTPROD(32, 8)
SAD_WXH_NEON_DOTPROD(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_WXH_NEON_DOTPROD

// The 'skip' variants compute the SAD of every other row (doubling both
// strides and halving the row count), then scale the result by 2 to
// approximate the full-block SAD.
#define SAD_SKIP_WXH_NEON_DOTPROD(w, h)                          \
  unsigned int aom_sad_skip_##w##x##h##_neon_dotprod(            \
      const uint8_t *src, int src_stride, const uint8_t *ref,    \
      int ref_stride) {                                          \
    return 2 * sad##w##xh_neon_dotprod(src, 2 * src_stride, ref, \
                                       2 * ref_stride, (h) / 2); \
  }

SAD_SKIP_WXH_NEON_DOTPROD(16, 16)
SAD_SKIP_WXH_NEON_DOTPROD(16, 32)

SAD_SKIP_WXH_NEON_DOTPROD(32, 16)
SAD_SKIP_WXH_NEON_DOTPROD(32, 32)
SAD_SKIP_WXH_NEON_DOTPROD(32, 64)

SAD_SKIP_WXH_NEON_DOTPROD(64, 32)
SAD_SKIP_WXH_NEON_DOTPROD(64, 64)
SAD_SKIP_WXH_NEON_DOTPROD(64, 128)

SAD_SKIP_WXH_NEON_DOTPROD(128, 64)
SAD_SKIP_WXH_NEON_DOTPROD(128, 128)

#if !CONFIG_REALTIME_ONLY
SAD_SKIP_WXH_NEON_DOTPROD(16, 64)
SAD_SKIP_WXH_NEON_DOTPROD(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_SKIP_WXH_NEON_DOTPROD

// The 'avg' (compound) variants first combine the reference with second_pred
// using a rounding halving add, then compute the SAD of the source against
// that average.
static inline unsigned int sadwxh_avg_neon_dotprod(const uint8_t *src_ptr,
                                                   int src_stride,
                                                   const uint8_t *ref_ptr,
                                                   int ref_stride, int w, int h,
                                                   const uint8_t *second_pred) {
  // Only two accumulators are required for optimal instruction throughput of
  // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };

  int i = h;
  do {
    int j = 0;
    do {
      uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;

      s0 = vld1q_u8(src_ptr + j);
      r0 = vld1q_u8(ref_ptr + j);
      p0 = vld1q_u8(second_pred);
      avg0 = vrhaddq_u8(r0, p0);
      diff0 = vabdq_u8(s0, avg0);
      sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));

      s1 = vld1q_u8(src_ptr + j + 16);
      r1 = vld1q_u8(ref_ptr + j + 16);
      p1 = vld1q_u8(second_pred + 16);
      avg1 = vrhaddq_u8(r1, p1);
      diff1 = vabdq_u8(s1, avg1);
      sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));

      j += 32;
      second_pred += 32;
    } while (j < w);

    src_ptr += src_stride;
    ref_ptr += ref_stride;
  } while (--i != 0);

  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
}

static inline unsigned int sad128xh_avg_neon_dotprod(
    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
    int ref_stride, int h, const uint8_t *second_pred) {
  return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 128,
                                 h, second_pred);
}

static inline unsigned int sad64xh_avg_neon_dotprod(
    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
    int ref_stride, int h, const uint8_t *second_pred) {
  return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64,
                                 h, second_pred);
}

static inline unsigned int sad32xh_avg_neon_dotprod(
    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
    int ref_stride, int h, const uint8_t *second_pred) {
  return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32,
                                 h, second_pred);
}

static inline unsigned int sad16xh_avg_neon_dotprod(
    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
    int ref_stride, int h, const uint8_t *second_pred) {
  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };

  // Process two rows per iteration, one per accumulator.
  int i = h / 2;
  do {
    uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;

    s0 = vld1q_u8(src_ptr);
    r0 = vld1q_u8(ref_ptr);
    p0 = vld1q_u8(second_pred);
    avg0 = vrhaddq_u8(r0, p0);
    diff0 = vabdq_u8(s0, avg0);
    sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));

    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 16;

    s1 = vld1q_u8(src_ptr);
    r1 = vld1q_u8(ref_ptr);
    p1 = vld1q_u8(second_pred);
    avg1 = vrhaddq_u8(r1, p1);
    diff1 = vabdq_u8(s1, avg1);
    sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));

    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 16;
  } while (--i != 0);

  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
}

#define SAD_WXH_AVG_NEON_DOTPROD(w, h)                                        \
  unsigned int aom_sad##w##x##h##_avg_neon_dotprod(                           \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *second_pred) {                                           \
    return sad##w##xh_avg_neon_dotprod(src, src_stride, ref, ref_stride, (h), \
                                       second_pred);                          \
  }

SAD_WXH_AVG_NEON_DOTPROD(16, 8)
SAD_WXH_AVG_NEON_DOTPROD(16, 16)
SAD_WXH_AVG_NEON_DOTPROD(16, 32)

SAD_WXH_AVG_NEON_DOTPROD(32, 16)
SAD_WXH_AVG_NEON_DOTPROD(32, 32)
SAD_WXH_AVG_NEON_DOTPROD(32, 64)

SAD_WXH_AVG_NEON_DOTPROD(64, 32)
SAD_WXH_AVG_NEON_DOTPROD(64, 64)
SAD_WXH_AVG_NEON_DOTPROD(64, 128)

SAD_WXH_AVG_NEON_DOTPROD(128, 64)
SAD_WXH_AVG_NEON_DOTPROD(128, 128)

#if !CONFIG_REALTIME_ONLY
SAD_WXH_AVG_NEON_DOTPROD(16, 64)
SAD_WXH_AVG_NEON_DOTPROD(32, 8)
SAD_WXH_AVG_NEON_DOTPROD(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_WXH_AVG_NEON_DOTPROD
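
// Illustrative addition (not part of the upstream libaom file): a minimal
// scalar sketch of what the Neon kernels above compute, i.e. the sum of
// absolute differences between a w x h source block and a reference block.
// The name sad_wxh_scalar_ref is hypothetical and is included only as a
// cross-check for the vector code.
static inline unsigned int sad_wxh_scalar_ref(const uint8_t *src,
                                              int src_stride,
                                              const uint8_t *ref,
                                              int ref_stride, int w, int h) {
  unsigned int sad = 0;
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      // Absolute difference of unsigned bytes without relying on abs().
      sad += (src[j] > ref[j]) ? (unsigned)(src[j] - ref[j])
                               : (unsigned)(ref[j] - src[j]);
    }
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}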