sad_avx2.c (11176B)
1 /* 2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 #include <immintrin.h> 12 #include <stdint.h> 13 14 #include "config/aom_dsp_rtcd.h" 15 16 #include "aom_ports/mem.h" 17 18 // SAD, SAD_SKIP and SAD_AVG for 64xh blocks 19 #if !CONFIG_HIGHWAY 20 static inline unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride, 21 const uint8_t *ref_ptr, int ref_stride, 22 int h) { 23 int i; 24 __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; 25 __m256i sum_sad = _mm256_setzero_si256(); 26 __m256i sum_sad_h; 27 __m128i sum_sad128; 28 for (i = 0; i < h; i++) { 29 ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); 30 ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); 31 sad1_reg = 32 _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); 33 sad2_reg = _mm256_sad_epu8( 34 ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); 35 sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); 36 ref_ptr += ref_stride; 37 src_ptr += src_stride; 38 } 39 sum_sad_h = _mm256_srli_si256(sum_sad, 8); 40 sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); 41 sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); 42 sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); 43 unsigned int res = (unsigned int)_mm_cvtsi128_si32(sum_sad128); 44 _mm256_zeroupper(); 45 return res; 46 } 47 48 #define FSAD64_H(h) \ 49 unsigned int aom_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ 50 const uint8_t *ref_ptr, int ref_stride) { \ 51 return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \ 52 } 53 54 #define FSADS64_H(h) \ 55 unsigned int aom_sad_skip_64x##h##_avx2( \ 56 const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ 57 int ref_stride) { \ 58 return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \ 59 h / 2); \ 60 } 61 62 #define FSAD64 \ 63 FSAD64_H(64) \ 64 FSAD64_H(32) \ 65 FSADS64_H(64) \ 66 FSADS64_H(32) 67 68 /* clang-format off */ 69 FSAD64 70 /* clang-format on */ 71 72 #undef FSAD64 73 #undef FSAD64_H 74 75 #define FSADAVG64_H(h) \ 76 unsigned int aom_sad64x##h##_avg_avx2( \ 77 const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ 78 int ref_stride, const uint8_t *second_pred) { \ 79 int i; \ 80 __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ 81 __m256i sum_sad = _mm256_setzero_si256(); \ 82 __m256i sum_sad_h; \ 83 __m128i sum_sad128; \ 84 for (i = 0; i < h; i++) { \ 85 ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ 86 ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ 87 ref1_reg = _mm256_avg_epu8( \ 88 ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \ 89 ref2_reg = _mm256_avg_epu8( \ 90 ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \ 91 sad1_reg = _mm256_sad_epu8( \ 92 ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ 93 sad2_reg = _mm256_sad_epu8( \ 94 ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ 95 sum_sad = \ 96 _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ 97 ref_ptr += ref_stride; \ 98 src_ptr += src_stride; \ 99 second_pred += 64; \ 100 } \ 101 sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ 102 sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ 103 sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ 104 sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ 105 unsigned int res = (unsigned int)_mm_cvtsi128_si32(sum_sad128); \ 106 _mm256_zeroupper(); \ 107 return res; \ 108 } 109 110 #define FSADAVG64 \ 111 FSADAVG64_H(64) \ 112 FSADAVG64_H(32) 113 114 /* clang-format off */ 115 FSADAVG64 116 /* clang-format on */ 117 118 #undef FSADAVG64 119 #undef FSADAVG64_H 120 #endif // !CONFIG_HIGHWAY 121 122 // SAD, SAD_SKIP and SAD_AVG for 32xh blocks 123 static inline unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride, 124 const uint8_t *ref_ptr, int ref_stride, 125 int h) { 126 int i; 127 __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; 128 __m256i sum_sad = _mm256_setzero_si256(); 129 __m256i sum_sad_h; 130 __m128i sum_sad128; 131 int ref2_stride = ref_stride << 1; 132 int src2_stride = src_stride << 1; 133 int max = h >> 1; 134 for (i = 0; i < max; i++) { 135 ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); 136 ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); 137 sad1_reg = 138 _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); 139 sad2_reg = _mm256_sad_epu8( 140 ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); 141 sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); 142 ref_ptr += ref2_stride; 143 src_ptr += src2_stride; 144 } 145 sum_sad_h = _mm256_srli_si256(sum_sad, 8); 146 sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); 147 sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); 148 sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); 149 unsigned int res = (unsigned int)_mm_cvtsi128_si32(sum_sad128); 150 _mm256_zeroupper(); 151 return res; 152 } 153 154 #define FSAD32_H(h) \ 155 unsigned int aom_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ 156 const uint8_t *ref_ptr, int ref_stride) { \ 157 return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \ 158 } 159 160 #define FSADS32_H(h) \ 161 unsigned int aom_sad_skip_32x##h##_avx2( \ 162 const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ 163 int ref_stride) { \ 164 return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \ 165 h / 2); \ 166 } 167 168 #define FSAD32 \ 169 FSAD32_H(64) \ 170 FSAD32_H(32) \ 171 FSAD32_H(16) \ 172 FSADS32_H(64) \ 173 FSADS32_H(32) \ 174 FSADS32_H(16) 175 176 /* clang-format off */ 177 FSAD32 178 /* clang-format on */ 179 180 #undef FSAD32 181 #undef FSAD32_H 182 183 #define FSADAVG32_H(h) \ 184 unsigned int aom_sad32x##h##_avg_avx2( \ 185 const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ 186 int ref_stride, const uint8_t *second_pred) { \ 187 int i; \ 188 __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ 189 __m256i sum_sad = _mm256_setzero_si256(); \ 190 __m256i sum_sad_h; \ 191 __m128i sum_sad128; \ 192 int ref2_stride = ref_stride << 1; \ 193 int src2_stride = src_stride << 1; \ 194 int max = h >> 1; \ 195 for (i = 0; i < max; i++) { \ 196 ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ 197 ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ 198 ref1_reg = _mm256_avg_epu8( \ 199 ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \ 200 ref2_reg = _mm256_avg_epu8( \ 201 ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \ 202 sad1_reg = _mm256_sad_epu8( \ 203 ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ 204 sad2_reg = _mm256_sad_epu8( \ 205 ref2_reg, \ 206 _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ 207 sum_sad = \ 208 _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ 209 ref_ptr += ref2_stride; \ 210 src_ptr += src2_stride; \ 211 second_pred += 64; \ 212 } \ 213 sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ 214 sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ 215 sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ 216 sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ 217 unsigned int res = (unsigned int)_mm_cvtsi128_si32(sum_sad128); \ 218 _mm256_zeroupper(); \ 219 return res; \ 220 } 221 222 #define FSADAVG32 \ 223 FSADAVG32_H(64) \ 224 FSADAVG32_H(32) \ 225 FSADAVG32_H(16) 226 227 /* clang-format off */ 228 FSADAVG32 229 /* clang-format on */ 230 231 #undef FSADAVG32 232 #undef FSADAVG32_H