error_intrin_sse2.c (2607B)
1 /* 2 * Copyright (c) 2021, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <emmintrin.h> // SSE2 13 14 #include "config/av1_rtcd.h" 15 16 #include "aom/aom_integer.h" 17 18 static inline __m128i reduce_sum_epi64(__m128i reg) { 19 __m128i reg_hi = _mm_srli_si128(reg, 8); 20 reg = _mm_add_epi64(reg, reg_hi); 21 22 return reg; 23 } 24 25 int64_t av1_block_error_lp_sse2(const int16_t *coeff, const int16_t *dqcoeff, 26 intptr_t block_size) { 27 assert(block_size % 16 == 0); 28 assert(block_size >= 16); 29 30 const __m128i zero = _mm_setzero_si128(); 31 __m128i accum_0 = zero; 32 __m128i accum_1 = zero; 33 34 for (int i = 0; i < block_size; i += 16) { 35 // Load 8 elements for coeff and dqcoeff. 36 const __m128i _coeff_0 = _mm_loadu_si128((const __m128i *)coeff); 37 const __m128i _coeff_1 = _mm_loadu_si128((const __m128i *)(coeff + 8)); 38 const __m128i _dqcoeff_0 = _mm_loadu_si128((const __m128i *)dqcoeff); 39 const __m128i _dqcoeff_1 = _mm_loadu_si128((const __m128i *)(dqcoeff + 8)); 40 // Compute the diff 41 const __m128i diff_0 = _mm_sub_epi16(_dqcoeff_0, _coeff_0); 42 const __m128i diff_1 = _mm_sub_epi16(_dqcoeff_1, _coeff_1); 43 // Compute the error 44 const __m128i error_0 = _mm_madd_epi16(diff_0, diff_0); 45 const __m128i error_1 = _mm_madd_epi16(diff_1, diff_1); 46 47 const __m128i error_lo_0 = _mm_unpacklo_epi32(error_0, zero); 48 const __m128i error_lo_1 = _mm_unpacklo_epi32(error_1, zero); 49 const __m128i error_hi_0 = _mm_unpackhi_epi32(error_0, zero); 50 const __m128i error_hi_1 = _mm_unpackhi_epi32(error_1, zero); 51 52 // Accumulate 53 accum_0 = _mm_add_epi64(accum_0, error_lo_0); 54 accum_1 = _mm_add_epi64(accum_1, error_lo_1); 55 accum_0 = _mm_add_epi64(accum_0, error_hi_0); 56 accum_1 = _mm_add_epi64(accum_1, error_hi_1); 57 58 // Advance 59 coeff += 16; 60 dqcoeff += 16; 61 } 62 63 __m128i accum = _mm_add_epi64(accum_0, accum_1); 64 // Reduce sum the register 65 accum = reduce_sum_epi64(accum); 66 67 // Store the results. 68 #if AOM_ARCH_X86_64 69 return _mm_cvtsi128_si64(accum); 70 #else 71 int64_t result; 72 _mm_storel_epi64((__m128i *)&result, accum); 73 return result; 74 #endif // AOM_ARCH_X86_64 75 }