sse_neon.c (6719B)
1 /* 2 * Copyright (c) 2020, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <arm_neon.h> 13 14 #include "config/aom_dsp_rtcd.h" 15 #include "aom_dsp/arm/mem_neon.h" 16 #include "aom_dsp/arm/sum_neon.h" 17 18 static inline void sse_16x1_neon(const uint8_t *src, const uint8_t *ref, 19 uint32x4_t *sse) { 20 uint8x16_t s = vld1q_u8(src); 21 uint8x16_t r = vld1q_u8(ref); 22 23 uint8x16_t abs_diff = vabdq_u8(s, r); 24 uint8x8_t abs_diff_lo = vget_low_u8(abs_diff); 25 uint8x8_t abs_diff_hi = vget_high_u8(abs_diff); 26 27 *sse = vpadalq_u16(*sse, vmull_u8(abs_diff_lo, abs_diff_lo)); 28 *sse = vpadalq_u16(*sse, vmull_u8(abs_diff_hi, abs_diff_hi)); 29 } 30 31 static inline void sse_8x1_neon(const uint8_t *src, const uint8_t *ref, 32 uint32x4_t *sse) { 33 uint8x8_t s = vld1_u8(src); 34 uint8x8_t r = vld1_u8(ref); 35 36 uint8x8_t abs_diff = vabd_u8(s, r); 37 38 *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff)); 39 } 40 41 static inline void sse_4x2_neon(const uint8_t *src, int src_stride, 42 const uint8_t *ref, int ref_stride, 43 uint32x4_t *sse) { 44 uint8x8_t s = load_unaligned_u8(src, src_stride); 45 uint8x8_t r = load_unaligned_u8(ref, ref_stride); 46 47 uint8x8_t abs_diff = vabd_u8(s, r); 48 49 *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff)); 50 } 51 52 static inline uint32_t sse_wxh_neon(const uint8_t *src, int src_stride, 53 const uint8_t *ref, int ref_stride, 54 int width, int height) { 55 uint32x4_t sse = vdupq_n_u32(0); 56 57 if ((width & 0x07) && ((width & 0x07) < 5)) { 58 int i = height; 59 do { 60 int j = 0; 61 do { 62 sse_8x1_neon(src + j, ref + j, &sse); 63 sse_8x1_neon(src + j + src_stride, ref + j + ref_stride, &sse); 64 j += 8; 65 } while (j + 4 < width); 66 67 sse_4x2_neon(src + j, src_stride, ref + j, ref_stride, &sse); 68 src += 2 * src_stride; 69 ref += 2 * ref_stride; 70 i -= 2; 71 } while (i != 0); 72 } else { 73 int i = height; 74 do { 75 int j = 0; 76 do { 77 sse_8x1_neon(src + j, ref + j, &sse); 78 j += 8; 79 } while (j < width); 80 81 src += src_stride; 82 ref += ref_stride; 83 } while (--i != 0); 84 } 85 return horizontal_add_u32x4(sse); 86 } 87 88 static inline uint32_t sse_128xh_neon(const uint8_t *src, int src_stride, 89 const uint8_t *ref, int ref_stride, 90 int height) { 91 uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; 92 93 int i = height; 94 do { 95 sse_16x1_neon(src, ref, &sse[0]); 96 sse_16x1_neon(src + 16, ref + 16, &sse[1]); 97 sse_16x1_neon(src + 32, ref + 32, &sse[0]); 98 sse_16x1_neon(src + 48, ref + 48, &sse[1]); 99 sse_16x1_neon(src + 64, ref + 64, &sse[0]); 100 sse_16x1_neon(src + 80, ref + 80, &sse[1]); 101 sse_16x1_neon(src + 96, ref + 96, &sse[0]); 102 sse_16x1_neon(src + 112, ref + 112, &sse[1]); 103 104 src += src_stride; 105 ref += ref_stride; 106 } while (--i != 0); 107 108 return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1])); 109 } 110 111 static inline uint32_t sse_64xh_neon(const uint8_t *src, int src_stride, 112 const uint8_t *ref, int ref_stride, 113 int height) { 114 uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; 115 116 int i = height; 117 do { 118 sse_16x1_neon(src, ref, &sse[0]); 119 sse_16x1_neon(src + 16, ref + 16, &sse[1]); 120 sse_16x1_neon(src + 32, ref + 32, &sse[0]); 121 sse_16x1_neon(src + 48, ref + 48, &sse[1]); 122 123 src += src_stride; 124 ref += ref_stride; 125 } while (--i != 0); 126 127 return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1])); 128 } 129 130 static inline uint32_t sse_32xh_neon(const uint8_t *src, int src_stride, 131 const uint8_t *ref, int ref_stride, 132 int height) { 133 uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; 134 135 int i = height; 136 do { 137 sse_16x1_neon(src, ref, &sse[0]); 138 sse_16x1_neon(src + 16, ref + 16, &sse[1]); 139 140 src += src_stride; 141 ref += ref_stride; 142 } while (--i != 0); 143 144 return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1])); 145 } 146 147 static inline uint32_t sse_16xh_neon(const uint8_t *src, int src_stride, 148 const uint8_t *ref, int ref_stride, 149 int height) { 150 uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; 151 152 int i = height; 153 do { 154 sse_16x1_neon(src, ref, &sse[0]); 155 src += src_stride; 156 ref += ref_stride; 157 sse_16x1_neon(src, ref, &sse[1]); 158 src += src_stride; 159 ref += ref_stride; 160 i -= 2; 161 } while (i != 0); 162 163 return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1])); 164 } 165 166 static inline uint32_t sse_8xh_neon(const uint8_t *src, int src_stride, 167 const uint8_t *ref, int ref_stride, 168 int height) { 169 uint32x4_t sse = vdupq_n_u32(0); 170 171 int i = height; 172 do { 173 sse_8x1_neon(src, ref, &sse); 174 175 src += src_stride; 176 ref += ref_stride; 177 } while (--i != 0); 178 179 return horizontal_add_u32x4(sse); 180 } 181 182 static inline uint32_t sse_4xh_neon(const uint8_t *src, int src_stride, 183 const uint8_t *ref, int ref_stride, 184 int height) { 185 uint32x4_t sse = vdupq_n_u32(0); 186 187 int i = height; 188 do { 189 sse_4x2_neon(src, src_stride, ref, ref_stride, &sse); 190 191 src += 2 * src_stride; 192 ref += 2 * ref_stride; 193 i -= 2; 194 } while (i != 0); 195 196 return horizontal_add_u32x4(sse); 197 } 198 199 int64_t aom_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref, 200 int ref_stride, int width, int height) { 201 switch (width) { 202 case 4: return sse_4xh_neon(src, src_stride, ref, ref_stride, height); 203 case 8: return sse_8xh_neon(src, src_stride, ref, ref_stride, height); 204 case 16: return sse_16xh_neon(src, src_stride, ref, ref_stride, height); 205 case 32: return sse_32xh_neon(src, src_stride, ref, ref_stride, height); 206 case 64: return sse_64xh_neon(src, src_stride, ref, ref_stride, height); 207 case 128: return sse_128xh_neon(src, src_stride, ref, ref_stride, height); 208 default: 209 return sse_wxh_neon(src, src_stride, ref, ref_stride, width, height); 210 } 211 }