lossless_enc_sse41.c (7267B)
1 // Copyright 2015 Google Inc. All Rights Reserved. 2 // 3 // Use of this source code is governed by a BSD-style license 4 // that can be found in the COPYING file in the root of the source 5 // tree. An additional intellectual property rights grant can be found 6 // in the file PATENTS. All contributing project authors may 7 // be found in the AUTHORS file in the root of the source tree. 8 // ----------------------------------------------------------------------------- 9 // 10 // SSE4.1 variant of methods for lossless encoder 11 // 12 // Author: Skal (pascal.massimino@gmail.com) 13 14 #include "src/dsp/dsp.h" 15 16 #if defined(WEBP_USE_SSE41) 17 #include <emmintrin.h> 18 #include <smmintrin.h> 19 20 #include <assert.h> 21 22 #include "src/dsp/cpu.h" 23 #include "src/dsp/lossless.h" 24 #include "src/webp/types.h" 25 26 //------------------------------------------------------------------------------ 27 // Cost operations. 28 29 static WEBP_INLINE uint32_t HorizontalSum_SSE41(__m128i cost) { 30 cost = _mm_add_epi32(cost, _mm_srli_si128(cost, 8)); 31 cost = _mm_add_epi32(cost, _mm_srli_si128(cost, 4)); 32 return _mm_cvtsi128_si32(cost); 33 } 34 35 static uint32_t ExtraCost_SSE41(const uint32_t* const a, int length) { 36 int i; 37 __m128i cost = _mm_set_epi32(2 * a[7], 2 * a[6], a[5], a[4]); 38 assert(length % 8 == 0); 39 40 for (i = 8; i + 8 <= length; i += 8) { 41 const int j = (i - 2) >> 1; 42 const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]); 43 const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]); 44 const __m128i w = _mm_set_epi32(j + 3, j + 2, j + 1, j); 45 const __m128i a2 = _mm_hadd_epi32(a0, a1); 46 const __m128i mul = _mm_mullo_epi32(a2, w); 47 cost = _mm_add_epi32(mul, cost); 48 } 49 return HorizontalSum_SSE41(cost); 50 } 51 52 //------------------------------------------------------------------------------ 53 // Subtract-Green Transform 54 55 static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data, 56 int num_pixels) { 57 int i; 58 const __m128i kCstShuffle = _mm_set_epi8(-1, 13, -1, 13, -1, 9, -1, 9, 59 -1, 5, -1, 5, -1, 1, -1, 1); 60 for (i = 0; i + 4 <= num_pixels; i += 4) { 61 const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); 62 const __m128i in_0g0g = _mm_shuffle_epi8(in, kCstShuffle); 63 const __m128i out = _mm_sub_epi8(in, in_0g0g); 64 _mm_storeu_si128((__m128i*)&argb_data[i], out); 65 } 66 // fallthrough and finish off with plain-C 67 if (i != num_pixels) { 68 VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i); 69 } 70 } 71 72 //------------------------------------------------------------------------------ 73 // Color Transform 74 75 // For sign-extended multiplying constants, pre-shifted by 5: 76 #define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5) 77 78 #define MK_CST_16(HI, LO) \ 79 _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff))) 80 81 static void CollectColorBlueTransforms_SSE41(const uint32_t* WEBP_RESTRICT argb, 82 int stride, 83 int tile_width, int tile_height, 84 int green_to_blue, int red_to_blue, 85 uint32_t histo[]) { 86 const __m128i mult = 87 MK_CST_16(CST_5b(red_to_blue) + 256,CST_5b(green_to_blue)); 88 const __m128i perm = 89 _mm_setr_epi8(-1, 1, -1, 2, -1, 5, -1, 6, -1, 9, -1, 10, -1, 13, -1, 14); 90 if (tile_width >= 4) { 91 int y; 92 for (y = 0; y < tile_height; ++y) { 93 const uint32_t* const src = argb + y * stride; 94 const __m128i A1 = _mm_loadu_si128((const __m128i*)src); 95 const __m128i B1 = _mm_shuffle_epi8(A1, perm); 96 const __m128i C1 = _mm_mulhi_epi16(B1, mult); 97 const __m128i D1 = _mm_sub_epi16(A1, C1); 98 __m128i E = _mm_add_epi16(_mm_srli_epi32(D1, 16), D1); 99 int x; 100 for (x = 4; x + 4 <= tile_width; x += 4) { 101 const __m128i A2 = _mm_loadu_si128((const __m128i*)(src + x)); 102 __m128i B2, C2, D2; 103 ++histo[_mm_extract_epi8(E, 0)]; 104 B2 = _mm_shuffle_epi8(A2, perm); 105 ++histo[_mm_extract_epi8(E, 4)]; 106 C2 = _mm_mulhi_epi16(B2, mult); 107 ++histo[_mm_extract_epi8(E, 8)]; 108 D2 = _mm_sub_epi16(A2, C2); 109 ++histo[_mm_extract_epi8(E, 12)]; 110 E = _mm_add_epi16(_mm_srli_epi32(D2, 16), D2); 111 } 112 ++histo[_mm_extract_epi8(E, 0)]; 113 ++histo[_mm_extract_epi8(E, 4)]; 114 ++histo[_mm_extract_epi8(E, 8)]; 115 ++histo[_mm_extract_epi8(E, 12)]; 116 } 117 } 118 { 119 const int left_over = tile_width & 3; 120 if (left_over > 0) { 121 VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride, 122 left_over, tile_height, 123 green_to_blue, red_to_blue, histo); 124 } 125 } 126 } 127 128 static void CollectColorRedTransforms_SSE41(const uint32_t* WEBP_RESTRICT argb, 129 int stride, 130 int tile_width, int tile_height, 131 int green_to_red, 132 uint32_t histo[]) { 133 const __m128i mult = MK_CST_16(0, CST_5b(green_to_red)); 134 const __m128i mask_g = _mm_set1_epi32(0x0000ff00); 135 if (tile_width >= 4) { 136 int y; 137 for (y = 0; y < tile_height; ++y) { 138 const uint32_t* const src = argb + y * stride; 139 const __m128i A1 = _mm_loadu_si128((const __m128i*)src); 140 const __m128i B1 = _mm_and_si128(A1, mask_g); 141 const __m128i C1 = _mm_madd_epi16(B1, mult); 142 __m128i D = _mm_sub_epi16(A1, C1); 143 int x; 144 for (x = 4; x + 4 <= tile_width; x += 4) { 145 const __m128i A2 = _mm_loadu_si128((const __m128i*)(src + x)); 146 __m128i B2, C2; 147 ++histo[_mm_extract_epi8(D, 2)]; 148 B2 = _mm_and_si128(A2, mask_g); 149 ++histo[_mm_extract_epi8(D, 6)]; 150 C2 = _mm_madd_epi16(B2, mult); 151 ++histo[_mm_extract_epi8(D, 10)]; 152 ++histo[_mm_extract_epi8(D, 14)]; 153 D = _mm_sub_epi16(A2, C2); 154 } 155 ++histo[_mm_extract_epi8(D, 2)]; 156 ++histo[_mm_extract_epi8(D, 6)]; 157 ++histo[_mm_extract_epi8(D, 10)]; 158 ++histo[_mm_extract_epi8(D, 14)]; 159 } 160 } 161 { 162 const int left_over = tile_width & 3; 163 if (left_over > 0) { 164 VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride, 165 left_over, tile_height, green_to_red, 166 histo); 167 } 168 } 169 } 170 171 #undef MK_CST_16 172 173 //------------------------------------------------------------------------------ 174 // Entry point 175 176 extern void VP8LEncDspInitSSE41(void); 177 178 WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE41(void) { 179 VP8LExtraCost = ExtraCost_SSE41; 180 VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE41; 181 VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE41; 182 VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE41; 183 184 // SSE exports for AVX and above. 185 VP8LSubtractGreenFromBlueAndRed_SSE = SubtractGreenFromBlueAndRed_SSE41; 186 VP8LCollectColorBlueTransforms_SSE = CollectColorBlueTransforms_SSE41; 187 VP8LCollectColorRedTransforms_SSE = CollectColorRedTransforms_SSE41; 188 } 189 190 #else // !WEBP_USE_SSE41 191 192 WEBP_DSP_INIT_STUB(VP8LEncDspInitSSE41) 193 194 #endif // WEBP_USE_SSE41