sharpyuv_sse2.c (8765B)
1 // Copyright 2022 Google Inc. All Rights Reserved. 2 // 3 // Use of this source code is governed by a BSD-style license 4 // that can be found in the COPYING file in the root of the source 5 // tree. An additional intellectual property rights grant can be found 6 // in the file PATENTS. All contributing project authors may 7 // be found in the AUTHORS file in the root of the source tree. 8 // ----------------------------------------------------------------------------- 9 // 10 // Speed-critical functions for Sharp YUV. 11 // 12 // Author: Skal (pascal.massimino@gmail.com) 13 14 #include "sharpyuv/sharpyuv_dsp.h" 15 16 #if defined(WEBP_USE_SSE2) 17 #include <emmintrin.h> 18 19 #include <stdlib.h> 20 21 #include "src/dsp/cpu.h" 22 #include "src/webp/types.h" 23 24 static uint16_t clip_SSE2(int v, int max) { 25 return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v; 26 } 27 28 static uint64_t SharpYuvUpdateY_SSE2(const uint16_t* ref, const uint16_t* src, 29 uint16_t* dst, int len, int bit_depth) { 30 const int max_y = (1 << bit_depth) - 1; 31 uint64_t diff = 0; 32 uint32_t tmp[4]; 33 int i; 34 const __m128i zero = _mm_setzero_si128(); 35 const __m128i max = _mm_set1_epi16(max_y); 36 const __m128i one = _mm_set1_epi16(1); 37 __m128i sum = zero; 38 39 for (i = 0; i + 8 <= len; i += 8) { 40 const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i)); 41 const __m128i B = _mm_loadu_si128((const __m128i*)(src + i)); 42 const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i)); 43 const __m128i D = _mm_sub_epi16(A, B); // diff_y 44 const __m128i E = _mm_cmpgt_epi16(zero, D); // sign (-1 or 0) 45 const __m128i F = _mm_add_epi16(C, D); // new_y 46 const __m128i G = _mm_or_si128(E, one); // -1 or 1 47 const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero); 48 const __m128i I = _mm_madd_epi16(D, G); // sum(abs(...)) 49 _mm_storeu_si128((__m128i*)(dst + i), H); 50 sum = _mm_add_epi32(sum, I); 51 } 52 _mm_storeu_si128((__m128i*)tmp, sum); 53 diff = tmp[3] + tmp[2] + tmp[1] + tmp[0]; 54 for (; i < len; ++i) { 55 const int diff_y = ref[i] - src[i]; 56 const int new_y = (int)dst[i] + diff_y; 57 dst[i] = clip_SSE2(new_y, max_y); 58 diff += (uint64_t)abs(diff_y); 59 } 60 return diff; 61 } 62 63 static void SharpYuvUpdateRGB_SSE2(const int16_t* ref, const int16_t* src, 64 int16_t* dst, int len) { 65 int i = 0; 66 for (i = 0; i + 8 <= len; i += 8) { 67 const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i)); 68 const __m128i B = _mm_loadu_si128((const __m128i*)(src + i)); 69 const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i)); 70 const __m128i D = _mm_sub_epi16(A, B); // diff_uv 71 const __m128i E = _mm_add_epi16(C, D); // new_uv 72 _mm_storeu_si128((__m128i*)(dst + i), E); 73 } 74 for (; i < len; ++i) { 75 const int diff_uv = ref[i] - src[i]; 76 dst[i] += diff_uv; 77 } 78 } 79 80 static void SharpYuvFilterRow16_SSE2(const int16_t* A, const int16_t* B, 81 int len, const uint16_t* best_y, 82 uint16_t* out, int bit_depth) { 83 const int max_y = (1 << bit_depth) - 1; 84 int i; 85 const __m128i kCst8 = _mm_set1_epi16(8); 86 const __m128i max = _mm_set1_epi16(max_y); 87 const __m128i zero = _mm_setzero_si128(); 88 for (i = 0; i + 8 <= len; i += 8) { 89 const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0)); 90 const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + 1)); 91 const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + 0)); 92 const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + 1)); 93 const __m128i a0b1 = _mm_add_epi16(a0, b1); 94 const __m128i a1b0 = _mm_add_epi16(a1, b0); 95 const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0); // A0+A1+B0+B1 96 const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8); 97 const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1); // 2*(A0+B1) 98 const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0); // 2*(A1+B0) 99 const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3); 100 const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3); 101 const __m128i d0 = _mm_add_epi16(c1, a0); 102 const __m128i d1 = _mm_add_epi16(c0, a1); 103 const __m128i e0 = _mm_srai_epi16(d0, 1); 104 const __m128i e1 = _mm_srai_epi16(d1, 1); 105 const __m128i f0 = _mm_unpacklo_epi16(e0, e1); 106 const __m128i f1 = _mm_unpackhi_epi16(e0, e1); 107 const __m128i g0 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0)); 108 const __m128i g1 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 8)); 109 const __m128i h0 = _mm_add_epi16(g0, f0); 110 const __m128i h1 = _mm_add_epi16(g1, f1); 111 const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero); 112 const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero); 113 _mm_storeu_si128((__m128i*)(out + 2 * i + 0), i0); 114 _mm_storeu_si128((__m128i*)(out + 2 * i + 8), i1); 115 } 116 for (; i < len; ++i) { 117 // (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 = 118 // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4 119 // We reuse the common sub-expressions. 120 const int a0b1 = A[i + 0] + B[i + 1]; 121 const int a1b0 = A[i + 1] + B[i + 0]; 122 const int a0a1b0b1 = a0b1 + a1b0 + 8; 123 const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4; 124 const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4; 125 out[2 * i + 0] = clip_SSE2(best_y[2 * i + 0] + v0, max_y); 126 out[2 * i + 1] = clip_SSE2(best_y[2 * i + 1] + v1, max_y); 127 } 128 } 129 130 static WEBP_INLINE __m128i s16_to_s32(__m128i in) { 131 return _mm_srai_epi32(_mm_unpacklo_epi16(in, in), 16); 132 } 133 134 static void SharpYuvFilterRow32_SSE2(const int16_t* A, const int16_t* B, 135 int len, const uint16_t* best_y, 136 uint16_t* out, int bit_depth) { 137 const int max_y = (1 << bit_depth) - 1; 138 int i; 139 const __m128i kCst8 = _mm_set1_epi32(8); 140 const __m128i max = _mm_set1_epi16(max_y); 141 const __m128i zero = _mm_setzero_si128(); 142 for (i = 0; i + 4 <= len; i += 4) { 143 const __m128i a0 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(A + i + 0))); 144 const __m128i a1 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(A + i + 1))); 145 const __m128i b0 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(B + i + 0))); 146 const __m128i b1 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(B + i + 1))); 147 const __m128i a0b1 = _mm_add_epi32(a0, b1); 148 const __m128i a1b0 = _mm_add_epi32(a1, b0); 149 const __m128i a0a1b0b1 = _mm_add_epi32(a0b1, a1b0); // A0+A1+B0+B1 150 const __m128i a0a1b0b1_8 = _mm_add_epi32(a0a1b0b1, kCst8); 151 const __m128i a0b1_2 = _mm_add_epi32(a0b1, a0b1); // 2*(A0+B1) 152 const __m128i a1b0_2 = _mm_add_epi32(a1b0, a1b0); // 2*(A1+B0) 153 const __m128i c0 = _mm_srai_epi32(_mm_add_epi32(a0b1_2, a0a1b0b1_8), 3); 154 const __m128i c1 = _mm_srai_epi32(_mm_add_epi32(a1b0_2, a0a1b0b1_8), 3); 155 const __m128i d0 = _mm_add_epi32(c1, a0); 156 const __m128i d1 = _mm_add_epi32(c0, a1); 157 const __m128i e0 = _mm_srai_epi32(d0, 1); 158 const __m128i e1 = _mm_srai_epi32(d1, 1); 159 const __m128i f0 = _mm_unpacklo_epi32(e0, e1); 160 const __m128i f1 = _mm_unpackhi_epi32(e0, e1); 161 const __m128i g = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0)); 162 const __m128i h_16 = _mm_add_epi16(g, _mm_packs_epi32(f0, f1)); 163 const __m128i final = _mm_max_epi16(_mm_min_epi16(h_16, max), zero); 164 _mm_storeu_si128((__m128i*)(out + 2 * i + 0), final); 165 } 166 for (; i < len; ++i) { 167 // (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 = 168 // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4 169 // We reuse the common sub-expressions. 170 const int a0b1 = A[i + 0] + B[i + 1]; 171 const int a1b0 = A[i + 1] + B[i + 0]; 172 const int a0a1b0b1 = a0b1 + a1b0 + 8; 173 const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4; 174 const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4; 175 out[2 * i + 0] = clip_SSE2(best_y[2 * i + 0] + v0, max_y); 176 out[2 * i + 1] = clip_SSE2(best_y[2 * i + 1] + v1, max_y); 177 } 178 } 179 180 static void SharpYuvFilterRow_SSE2(const int16_t* A, const int16_t* B, int len, 181 const uint16_t* best_y, uint16_t* out, 182 int bit_depth) { 183 if (bit_depth <= 10) { 184 SharpYuvFilterRow16_SSE2(A, B, len, best_y, out, bit_depth); 185 } else { 186 SharpYuvFilterRow32_SSE2(A, B, len, best_y, out, bit_depth); 187 } 188 } 189 190 //------------------------------------------------------------------------------ 191 192 extern void InitSharpYuvSSE2(void); 193 194 WEBP_TSAN_IGNORE_FUNCTION void InitSharpYuvSSE2(void) { 195 SharpYuvUpdateY = SharpYuvUpdateY_SSE2; 196 SharpYuvUpdateRGB = SharpYuvUpdateRGB_SSE2; 197 SharpYuvFilterRow = SharpYuvFilterRow_SSE2; 198 } 199 #else // !WEBP_USE_SSE2 200 201 extern void InitSharpYuvSSE2(void); 202 203 void InitSharpYuvSSE2(void) {} 204 205 #endif // WEBP_USE_SSE2