lossless_sse41.c (5561B)
1 // Copyright 2021 Google Inc. All Rights Reserved. 2 // 3 // Use of this source code is governed by a BSD-style license 4 // that can be found in the COPYING file in the root of the source 5 // tree. An additional intellectual property rights grant can be found 6 // in the file PATENTS. All contributing project authors may 7 // be found in the AUTHORS file in the root of the source tree. 8 // ----------------------------------------------------------------------------- 9 // 10 // SSE41 variant of methods for lossless decoder 11 12 #include "src/dsp/dsp.h" 13 14 #if defined(WEBP_USE_SSE41) 15 #include <emmintrin.h> 16 #include <smmintrin.h> 17 18 #include "src/webp/types.h" 19 #include "src/dsp/cpu.h" 20 #include "src/dsp/lossless.h" 21 22 //------------------------------------------------------------------------------ 23 // Color-space conversion functions 24 25 static void TransformColorInverse_SSE41(const VP8LMultipliers* const m, 26 const uint32_t* const src, 27 int num_pixels, uint32_t* dst) { 28 // sign-extended multiplying constants, pre-shifted by 5. 29 #define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend 30 const __m128i mults_rb = 31 _mm_set1_epi32((int)((uint32_t)CST(green_to_red) << 16 | 32 (CST(green_to_blue) & 0xffff))); 33 const __m128i mults_b2 = _mm_set1_epi32(CST(red_to_blue)); 34 #undef CST 35 const __m128i mask_ag = _mm_set1_epi32((int)0xff00ff00); 36 const __m128i perm1 = _mm_setr_epi8(-1, 1, -1, 1, -1, 5, -1, 5, 37 -1, 9, -1, 9, -1, 13, -1, 13); 38 const __m128i perm2 = _mm_setr_epi8(-1, 2, -1, -1, -1, 6, -1, -1, 39 -1, 10, -1, -1, -1, 14, -1, -1); 40 int i; 41 for (i = 0; i + 4 <= num_pixels; i += 4) { 42 const __m128i A = _mm_loadu_si128((const __m128i*)(src + i)); 43 const __m128i B = _mm_shuffle_epi8(A, perm1); // argb -> g0g0 44 const __m128i C = _mm_mulhi_epi16(B, mults_rb); 45 const __m128i D = _mm_add_epi8(A, C); 46 const __m128i E = _mm_shuffle_epi8(D, perm2); 47 const __m128i F = _mm_mulhi_epi16(E, mults_b2); 48 const __m128i G = _mm_add_epi8(D, F); 49 const __m128i out = _mm_blendv_epi8(G, A, mask_ag); 50 _mm_storeu_si128((__m128i*)&dst[i], out); 51 } 52 // Fall-back to C-version for left-overs. 53 if (i != num_pixels) { 54 VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i); 55 } 56 } 57 58 //------------------------------------------------------------------------------ 59 60 #define ARGB_TO_RGB_SSE41 do { \ 61 while (num_pixels >= 16) { \ 62 const __m128i in0 = _mm_loadu_si128(in + 0); \ 63 const __m128i in1 = _mm_loadu_si128(in + 1); \ 64 const __m128i in2 = _mm_loadu_si128(in + 2); \ 65 const __m128i in3 = _mm_loadu_si128(in + 3); \ 66 const __m128i a0 = _mm_shuffle_epi8(in0, perm0); \ 67 const __m128i a1 = _mm_shuffle_epi8(in1, perm1); \ 68 const __m128i a2 = _mm_shuffle_epi8(in2, perm2); \ 69 const __m128i a3 = _mm_shuffle_epi8(in3, perm3); \ 70 const __m128i b0 = _mm_blend_epi16(a0, a1, 0xc0); \ 71 const __m128i b1 = _mm_blend_epi16(a1, a2, 0xf0); \ 72 const __m128i b2 = _mm_blend_epi16(a2, a3, 0xfc); \ 73 _mm_storeu_si128(out + 0, b0); \ 74 _mm_storeu_si128(out + 1, b1); \ 75 _mm_storeu_si128(out + 2, b2); \ 76 in += 4; \ 77 out += 3; \ 78 num_pixels -= 16; \ 79 } \ 80 } while (0) 81 82 static void ConvertBGRAToRGB_SSE41(const uint32_t* WEBP_RESTRICT src, 83 int num_pixels, uint8_t* WEBP_RESTRICT dst) { 84 const __m128i* in = (const __m128i*)src; 85 __m128i* out = (__m128i*)dst; 86 const __m128i perm0 = _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 87 8, 14, 13, 12, -1, -1, -1, -1); 88 const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39); 89 const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e); 90 const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93); 91 92 ARGB_TO_RGB_SSE41; 93 94 // left-overs 95 if (num_pixels > 0) { 96 VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out); 97 } 98 } 99 100 static void ConvertBGRAToBGR_SSE41(const uint32_t* WEBP_RESTRICT src, 101 int num_pixels, uint8_t* WEBP_RESTRICT dst) { 102 const __m128i* in = (const __m128i*)src; 103 __m128i* out = (__m128i*)dst; 104 const __m128i perm0 = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 105 12, 13, 14, -1, -1, -1, -1); 106 const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39); 107 const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e); 108 const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93); 109 110 ARGB_TO_RGB_SSE41; 111 112 // left-overs 113 if (num_pixels > 0) { 114 VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, (uint8_t*)out); 115 } 116 } 117 118 #undef ARGB_TO_RGB_SSE41 119 120 //------------------------------------------------------------------------------ 121 // Entry point 122 123 extern void VP8LDspInitSSE41(void); 124 125 WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE41(void) { 126 VP8LTransformColorInverse = TransformColorInverse_SSE41; 127 VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE41; 128 VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE41; 129 130 // SSE exports for AVX and above. 131 VP8LTransformColorInverse_SSE = TransformColorInverse_SSE41; 132 VP8LConvertBGRAToRGB_SSE = ConvertBGRAToRGB_SSE41; 133 } 134 135 #else // !WEBP_USE_SSE41 136 137 WEBP_DSP_INIT_STUB(VP8LDspInitSSE41) 138 139 #endif // WEBP_USE_SSE41