ssim_sse2.c (5842B)
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 version of distortion calculation
//
// Author: Skal (pascal.massimino@gmail.com)

#include "src/dsp/dsp.h"

#if defined(WEBP_USE_SSE2)
#include <emmintrin.h>

#include <assert.h>

#include "src/dsp/common_sse2.h"
#include "src/dsp/cpu.h"
#include "src/webp/types.h"

#if !defined(WEBP_DISABLE_STATS)

// Helper function: computes (a[i] - b[i])^2 for 16 byte pairs and leaves the
// result as four 32-bit partial sums in *sum (each lane accumulates four of
// the squared differences, via _mm_madd_epi16).
static WEBP_INLINE void SubtractAndSquare_SSE2(const __m128i a, const __m128i b,
                                               __m128i* const sum) {
  // take abs(a-b) in 8b: saturating subtractions in both directions leave
  // the true difference in one operand and zero in the other, so OR-ing
  // them yields |a - b| without needing a wider type.
  const __m128i a_b = _mm_subs_epu8(a, b);
  const __m128i b_a = _mm_subs_epu8(b, a);
  const __m128i abs_a_b = _mm_or_si128(a_b, b_a);
  // zero-extend to 16b
  const __m128i zero = _mm_setzero_si128();
  const __m128i C0 = _mm_unpacklo_epi8(abs_a_b, zero);
  const __m128i C1 = _mm_unpackhi_epi8(abs_a_b, zero);
  // multiply with self; madd pairs adjacent 16b products into 32b lanes,
  // which cannot overflow since each product is at most 255^2.
  const __m128i sum1 = _mm_madd_epi16(C0, C0);
  const __m128i sum2 = _mm_madd_epi16(C1, C1);
  *sum = _mm_add_epi32(sum1, sum2);
}

//------------------------------------------------------------------------------
// SSIM / PSNR entry point

// Returns the sum of squared differences between src1[0..len-1] and
// src2[0..len-1]. The accumulator is 32-bit, matching the scalar C version;
// assumes 'len' is small enough that the total fits in a uint32_t
// (NOTE(review): caller's responsibility — TODO confirm against callers).
static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
                                   const uint8_t* src2, int len) {
  int i = 0;
  uint32_t sse2 = 0;
  if (len >= 16) {
    // 'limit' is len - 32 so that each loop iteration can safely consume
    // two 16-byte chunks (one processed, one pre-loaded for the next round).
    const int limit = len - 32;
    int32_t tmp[4];
    __m128i sum1;
    __m128i sum = _mm_setzero_si128();
    // Pre-load the first chunk; the loop is software-pipelined: it squares
    // the previously loaded pair while issuing the next loads.
    __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
    __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
    i += 16;
    while (i <= limit) {
      const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
      const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
      __m128i sum2;
      i += 16;
      SubtractAndSquare_SSE2(a0, b0, &sum1);
      sum = _mm_add_epi32(sum, sum1);
      a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
      b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
      i += 16;
      SubtractAndSquare_SSE2(a1, b1, &sum2);
      sum = _mm_add_epi32(sum, sum2);
    }
    // Flush the chunk left pre-loaded when the loop exited.
    SubtractAndSquare_SSE2(a0, b0, &sum1);
    sum = _mm_add_epi32(sum, sum1);
    // Horizontal add of the four 32-bit partial sums.
    _mm_storeu_si128((__m128i*)tmp, sum);
    sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
  }

  // Scalar tail for the remaining (< 16, or < 32 after the unrolled loop)
  // bytes.
  for (; i < len; ++i) {
    const int32_t diff = src1[i] - src2[i];
    sse2 += diff * diff;
  }
  return sse2;
}
#endif  // !defined(WEBP_DISABLE_STATS)

#if !defined(WEBP_REDUCE_SIZE)

// Sums all eight 16-bit lanes of *m into a uint32_t.
// Folding the high 64 bits onto the low 64 first keeps each 16-bit lane sum
// from needing more than one vector add; the final four lanes are summed in
// scalar code after a store.
static uint32_t HorizontalAdd16b_SSE2(const __m128i* const m) {
  uint16_t tmp[8];
  const __m128i a = _mm_srli_si128(*m, 8);
  const __m128i b = _mm_add_epi16(*m, a);
  _mm_storeu_si128((__m128i*)tmp, b);
  return (uint32_t)tmp[3] + tmp[2] + tmp[1] + tmp[0];
}

// Sums all four 32-bit lanes of *m into a uint32_t using two fold-and-add
// steps (8-byte shift, then 4-byte shift), leaving the total in lane 0.
static uint32_t HorizontalAdd32b_SSE2(const __m128i* const m) {
  const __m128i a = _mm_srli_si128(*m, 8);
  const __m128i b = _mm_add_epi32(*m, a);
  const __m128i c = _mm_add_epi32(b, _mm_srli_si128(b, 4));
  return (uint32_t)_mm_cvtsi128_si32(c);
}

// 7-tap tent-filter weights for the SSIM window (see the assert on
// VP8_SSIM_KERNEL below); the trailing 0 pads to 8 lanes so the 8th byte
// loaded by ACCUMULATE_ROW contributes nothing.
static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };

// Accumulates one row of the weighted SSIM window into the caller's
// accumulators. Expects the following names in the enclosing scope:
//   src1/src2 (advanced by stride1/stride2 as a side effect), Wx (the
//   horizontal weight vector), zero, and the accumulators xm, ym (16b sums
//   of weighted pixels) and xxm, xym, yym (32b sums of weighted products).
#define ACCUMULATE_ROW(WEIGHT) do {                         \
  /* compute row weight (Wx * Wy) */                        \
  const __m128i Wy = _mm_set1_epi16((WEIGHT));              \
  const __m128i W = _mm_mullo_epi16(Wx, Wy);                \
  /* process 8 bytes at a time (7 bytes, actually) */       \
  const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \
  const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \
  /* convert to 16b and multiply by weight */               \
  const __m128i a1 = _mm_unpacklo_epi8(a0, zero);           \
  const __m128i b1 = _mm_unpacklo_epi8(b0, zero);           \
  const __m128i wa1 = _mm_mullo_epi16(a1, W);               \
  const __m128i wb1 = _mm_mullo_epi16(b1, W);               \
  /* accumulate */                                          \
  xm = _mm_add_epi16(xm, wa1);                              \
  ym = _mm_add_epi16(ym, wb1);                              \
  xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1));        \
  xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1));        \
  yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1));        \
  src1 += stride1;                                          \
  src2 += stride2;                                          \
} while (0)

// Computes the SSIM score for one 7x7 window (src1 vs src2), by gathering
// the weighted first- and second-order moments row by row and handing them
// to VP8SSIMFromStats.
static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
                           const uint8_t* src2, int stride2) {
  VP8DistoStats stats;
  const __m128i zero = _mm_setzero_si128();
  __m128i xm = zero, ym = zero;                // 16b accums
  __m128i xxm = zero, yym = zero, xym = zero;  // 32b accum
  const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
  // The unrolled rows below hard-code a 7-line kernel.
  assert(2 * VP8_SSIM_KERNEL + 1 == 7);
  // Vertical tent weights 1,2,3,4,3,2,1 — one macro expansion per row;
  // each expansion also advances src1/src2 to the next row.
  ACCUMULATE_ROW(1);
  ACCUMULATE_ROW(2);
  ACCUMULATE_ROW(3);
  ACCUMULATE_ROW(4);
  ACCUMULATE_ROW(3);
  ACCUMULATE_ROW(2);
  ACCUMULATE_ROW(1);
  // Reduce the per-lane accumulators to scalars for the stats struct.
  stats.xm = HorizontalAdd16b_SSE2(&xm);
  stats.ym = HorizontalAdd16b_SSE2(&ym);
  stats.xxm = HorizontalAdd32b_SSE2(&xxm);
  stats.xym = HorizontalAdd32b_SSE2(&xym);
  stats.yym = HorizontalAdd32b_SSE2(&yym);
  return VP8SSIMFromStats(&stats);
}

#endif  // !defined(WEBP_REDUCE_SIZE)

extern void VP8SSIMDspInitSSE2(void);

// Entry point: installs the SSE2 implementations into the dsp function
// pointers. Only the variants compiled in (per WEBP_DISABLE_STATS /
// WEBP_REDUCE_SIZE) are hooked up.
WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
#if !defined(WEBP_DISABLE_STATS)
  VP8AccumulateSSE = AccumulateSSE_SSE2;
#endif
#if !defined(WEBP_REDUCE_SIZE)
  VP8SSIMGet = SSIMGet_SSE2;
#endif
}

#else  // !WEBP_USE_SSE2

WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2)

#endif  // WEBP_USE_SSE2