enc_sse2.c (62456B)
1 // Copyright 2011 Google Inc. All Rights Reserved. 2 // 3 // Use of this source code is governed by a BSD-style license 4 // that can be found in the COPYING file in the root of the source 5 // tree. An additional intellectual property rights grant can be found 6 // in the file PATENTS. All contributing project authors may 7 // be found in the AUTHORS file in the root of the source tree. 8 // ----------------------------------------------------------------------------- 9 // 10 // SSE2 version of speed-critical encoding functions. 11 // 12 // Author: Christian Duvivier (cduvivier@google.com) 13 14 #include "src/dsp/dsp.h" 15 16 #if defined(WEBP_USE_SSE2) 17 #include <emmintrin.h> 18 19 #include <assert.h> 20 #include <stdlib.h> // for abs() 21 #include <string.h> 22 23 #include "src/dsp/common_sse2.h" 24 #include "src/dsp/cpu.h" 25 #include "src/enc/cost_enc.h" 26 #include "src/enc/vp8i_enc.h" 27 #include "src/utils/utils.h" 28 #include "src/webp/types.h" 29 30 //------------------------------------------------------------------------------ 31 // Transforms (Paragraph 14.4) 32 33 // Does one inverse transform. 
// Reads the 16 coefficients in[0..15], applies the 4x4 inverse transform, adds
// the result to the 4x4 pixels read from 'ref' (four bytes per row, rows BPS
// apart) and stores the result, saturated to 8 bits, to 'dst' (same layout).
static void ITransform_One_SSE2(const uint8_t* WEBP_RESTRICT ref,
                                const int16_t* WEBP_RESTRICT in,
                                uint8_t* WEBP_RESTRICT dst) {
  // This implementation makes use of 16-bit fixed point versions of two
  // multiply constants:
  //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
  //    K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16
  //
  // To be able to use signed 16-bit integers, we use the following trick to
  // have constants within range:
  // - Associated constants are obtained by subtracting the 16-bit fixed point
  //   version of one:
  //      k = K - (1 << 16)  =>  K = k + (1 << 16)
  //      K1 = 85627  =>  k1 =  20091
  //      K2 = 35468  =>  k2 = -30068
  // - The multiplication of a variable by a constant become the sum of the
  //   variable and the multiplication of that variable by the associated
  //   constant:
  //      (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x
  // Low four lanes hold k1 and high four lanes hold k2 (and the reverse for
  // k2k1), so one _mm_mulhi_epi16 yields both the 'c' and 'd' partial products.
  const __m128i k1k2 = _mm_set_epi16(-30068, -30068, -30068, -30068,
                                     20091, 20091, 20091, 20091);
  const __m128i k2k1 = _mm_set_epi16(20091, 20091, 20091, 20091,
                                     -30068, -30068, -30068, -30068);
  const __m128i zero = _mm_setzero_si128();
  // Rounding constant (4) for the final '>> 3', applied to the low four lanes
  // only.
  const __m128i zero_four = _mm_set_epi16(0, 0, 0, 0, 4, 4, 4, 4);
  __m128i T01, T23;

  // Load and concatenate the transform coefficients.
  const __m128i in01 = _mm_loadu_si128((const __m128i*)&in[0]);
  const __m128i in23 = _mm_loadu_si128((const __m128i*)&in[8]);
  // a00 a10 a20 a30   a01 a11 a21 a31
  // a02 a12 a22 a32   a03 a13 a23 a33

  // Vertical pass and subsequent transpose.
  {
    const __m128i in1 = _mm_unpackhi_epi64(in01, in01);
    const __m128i in3 = _mm_unpackhi_epi64(in23, in23);

    // First pass, c and d calculations are longer because of the "trick"
    // multiplications.
    // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
    // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
    const __m128i a_d3 = _mm_add_epi16(in01, in23);
    const __m128i b_c3 = _mm_sub_epi16(in01, in23);
    const __m128i c1d1 = _mm_mulhi_epi16(in1, k2k1);
    const __m128i c2d2 = _mm_mulhi_epi16(in3, k1k2);
    const __m128i c3 = _mm_unpackhi_epi64(b_c3, b_c3);
    const __m128i c4 = _mm_sub_epi16(c1d1, c2d2);
    const __m128i c = _mm_add_epi16(c3, c4);
    const __m128i d4u = _mm_add_epi16(c1d1, c2d2);
    const __m128i du = _mm_add_epi16(a_d3, d4u);
    const __m128i d = _mm_unpackhi_epi64(du, du);

    // Second pass.
    const __m128i comb_ab = _mm_unpacklo_epi64(a_d3, b_c3);
    const __m128i comb_dc = _mm_unpacklo_epi64(d, c);

    const __m128i tmp01 = _mm_add_epi16(comb_ab, comb_dc);
    const __m128i tmp32 = _mm_sub_epi16(comb_ab, comb_dc);
    // Swap the two 64-bit halves so the rows come out in 2, 3 order.
    const __m128i tmp23 = _mm_shuffle_epi32(tmp32, _MM_SHUFFLE(1, 0, 3, 2));

    const __m128i transpose_0 = _mm_unpacklo_epi16(tmp01, tmp23);
    const __m128i transpose_1 = _mm_unpackhi_epi16(tmp01, tmp23);
    // a00 a20 a01 a21   a02 a22 a03 a23
    // a10 a30 a11 a31   a12 a32 a13 a33

    T01 = _mm_unpacklo_epi16(transpose_0, transpose_1);
    T23 = _mm_unpackhi_epi16(transpose_0, transpose_1);
    // a00 a10 a20 a30   a01 a11 a21 a31
    // a02 a12 a22 a32   a03 a13 a23 a33
  }

  // Horizontal pass and subsequent transpose.
  {
    const __m128i T1 = _mm_unpackhi_epi64(T01, T01);
    const __m128i T3 = _mm_unpackhi_epi64(T23, T23);

    // First pass, c and d calculations are longer because of the "trick"
    // multiplications.
    const __m128i dc = _mm_add_epi16(T01, zero_four);

    // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
    // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
    const __m128i a_d3 = _mm_add_epi16(dc, T23);
    const __m128i b_c3 = _mm_sub_epi16(dc, T23);
    const __m128i c1d1 = _mm_mulhi_epi16(T1, k2k1);
    const __m128i c2d2 = _mm_mulhi_epi16(T3, k1k2);
    const __m128i c3 = _mm_unpackhi_epi64(b_c3, b_c3);
    const __m128i c4 = _mm_sub_epi16(c1d1, c2d2);
    const __m128i c = _mm_add_epi16(c3, c4);
    const __m128i d4u = _mm_add_epi16(c1d1, c2d2);
    const __m128i du = _mm_add_epi16(a_d3, d4u);
    const __m128i d = _mm_unpackhi_epi64(du, du);

    // Second pass.
    const __m128i comb_ab = _mm_unpacklo_epi64(a_d3, b_c3);
    const __m128i comb_dc = _mm_unpacklo_epi64(d, c);

    const __m128i tmp01 = _mm_add_epi16(comb_ab, comb_dc);
    const __m128i tmp32 = _mm_sub_epi16(comb_ab, comb_dc);
    const __m128i tmp23 = _mm_shuffle_epi32(tmp32, _MM_SHUFFLE(1, 0, 3, 2));

    const __m128i shifted01 = _mm_srai_epi16(tmp01, 3);
    const __m128i shifted23 = _mm_srai_epi16(tmp23, 3);
    // a00 a01 a02 a03   a10 a11 a12 a13
    // a20 a21 a22 a23   a30 a31 a32 a33

    const __m128i transpose_0 = _mm_unpacklo_epi16(shifted01, shifted23);
    const __m128i transpose_1 = _mm_unpackhi_epi16(shifted01, shifted23);
    // a00 a20 a01 a21   a02 a22 a03 a23
    // a10 a30 a11 a31   a12 a32 a13 a33

    T01 = _mm_unpacklo_epi16(transpose_0, transpose_1);
    T23 = _mm_unpackhi_epi16(transpose_0, transpose_1);
    // a00 a10 a20 a30   a01 a11 a21 a31
    // a02 a12 a22 a32   a03 a13 a23 a33
  }

  // Add inverse transform to 'ref' and store.
  {
    // Load the reference(s).
    __m128i ref01, ref23, ref0123;
    // Staging buffer: the packed result is spilled here so that each 32-bit
    // row can be written out individually with WebPInt32ToMem().
    int32_t buf[4];

    // Load four bytes/pixels per line.
    const __m128i ref0 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[0 * BPS]));
    const __m128i ref1 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[1 * BPS]));
    const __m128i ref2 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[2 * BPS]));
    const __m128i ref3 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[3 * BPS]));
    ref01 = _mm_unpacklo_epi32(ref0, ref1);
    ref23 = _mm_unpacklo_epi32(ref2, ref3);

    // Convert to 16b.
    ref01 = _mm_unpacklo_epi8(ref01, zero);
    ref23 = _mm_unpacklo_epi8(ref23, zero);
    // Add the inverse transform(s).
    ref01 = _mm_add_epi16(ref01, T01);
    ref23 = _mm_add_epi16(ref23, T23);
    // Unsigned saturate to 8b.
    ref0123 = _mm_packus_epi16(ref01, ref23);

    _mm_storeu_si128((__m128i *)buf, ref0123);

    // Store four bytes/pixels per line.
    WebPInt32ToMem(&dst[0 * BPS], buf[0]);
    WebPInt32ToMem(&dst[1 * BPS], buf[1]);
    WebPInt32ToMem(&dst[2 * BPS], buf[2]);
    WebPInt32ToMem(&dst[3 * BPS], buf[3]);
  }
}

// Does two inverse transforms.
186 static void ITransform_Two_SSE2(const uint8_t* WEBP_RESTRICT ref, 187 const int16_t* WEBP_RESTRICT in, 188 uint8_t* WEBP_RESTRICT dst) { 189 // This implementation makes use of 16-bit fixed point versions of two 190 // multiply constants: 191 // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 192 // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16 193 // 194 // To be able to use signed 16-bit integers, we use the following trick to 195 // have constants within range: 196 // - Associated constants are obtained by subtracting the 16-bit fixed point 197 // version of one: 198 // k = K - (1 << 16) => K = k + (1 << 16) 199 // K1 = 85267 => k1 = 20091 200 // K2 = 35468 => k2 = -30068 201 // - The multiplication of a variable by a constant become the sum of the 202 // variable and the multiplication of that variable by the associated 203 // constant: 204 // (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x 205 const __m128i k1 = _mm_set1_epi16(20091); 206 const __m128i k2 = _mm_set1_epi16(-30068); 207 __m128i T0, T1, T2, T3; 208 209 // Load and concatenate the transform coefficients (we'll do two inverse 210 // transforms in parallel). 211 __m128i in0, in1, in2, in3; 212 { 213 const __m128i tmp0 = _mm_loadu_si128((const __m128i*)&in[0]); 214 const __m128i tmp1 = _mm_loadu_si128((const __m128i*)&in[8]); 215 const __m128i tmp2 = _mm_loadu_si128((const __m128i*)&in[16]); 216 const __m128i tmp3 = _mm_loadu_si128((const __m128i*)&in[24]); 217 in0 = _mm_unpacklo_epi64(tmp0, tmp2); 218 in1 = _mm_unpackhi_epi64(tmp0, tmp2); 219 in2 = _mm_unpacklo_epi64(tmp1, tmp3); 220 in3 = _mm_unpackhi_epi64(tmp1, tmp3); 221 // a00 a10 a20 a30 b00 b10 b20 b30 222 // a01 a11 a21 a31 b01 b11 b21 b31 223 // a02 a12 a22 a32 b02 b12 b22 b32 224 // a03 a13 a23 a33 b03 b13 b23 b33 225 } 226 227 // Vertical pass and subsequent transpose. 228 { 229 // First pass, c and d calculations are longer because of the "trick" 230 // multiplications. 
231 const __m128i a = _mm_add_epi16(in0, in2); 232 const __m128i b = _mm_sub_epi16(in0, in2); 233 // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 234 const __m128i c1 = _mm_mulhi_epi16(in1, k2); 235 const __m128i c2 = _mm_mulhi_epi16(in3, k1); 236 const __m128i c3 = _mm_sub_epi16(in1, in3); 237 const __m128i c4 = _mm_sub_epi16(c1, c2); 238 const __m128i c = _mm_add_epi16(c3, c4); 239 // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 240 const __m128i d1 = _mm_mulhi_epi16(in1, k1); 241 const __m128i d2 = _mm_mulhi_epi16(in3, k2); 242 const __m128i d3 = _mm_add_epi16(in1, in3); 243 const __m128i d4 = _mm_add_epi16(d1, d2); 244 const __m128i d = _mm_add_epi16(d3, d4); 245 246 // Second pass. 247 const __m128i tmp0 = _mm_add_epi16(a, d); 248 const __m128i tmp1 = _mm_add_epi16(b, c); 249 const __m128i tmp2 = _mm_sub_epi16(b, c); 250 const __m128i tmp3 = _mm_sub_epi16(a, d); 251 252 // Transpose the two 4x4. 253 VP8Transpose_2_4x4_16b(&tmp0, &tmp1, &tmp2, &tmp3, &T0, &T1, &T2, &T3); 254 } 255 256 // Horizontal pass and subsequent transpose. 257 { 258 // First pass, c and d calculations are longer because of the "trick" 259 // multiplications. 
260 const __m128i four = _mm_set1_epi16(4); 261 const __m128i dc = _mm_add_epi16(T0, four); 262 const __m128i a = _mm_add_epi16(dc, T2); 263 const __m128i b = _mm_sub_epi16(dc, T2); 264 // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 265 const __m128i c1 = _mm_mulhi_epi16(T1, k2); 266 const __m128i c2 = _mm_mulhi_epi16(T3, k1); 267 const __m128i c3 = _mm_sub_epi16(T1, T3); 268 const __m128i c4 = _mm_sub_epi16(c1, c2); 269 const __m128i c = _mm_add_epi16(c3, c4); 270 // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 271 const __m128i d1 = _mm_mulhi_epi16(T1, k1); 272 const __m128i d2 = _mm_mulhi_epi16(T3, k2); 273 const __m128i d3 = _mm_add_epi16(T1, T3); 274 const __m128i d4 = _mm_add_epi16(d1, d2); 275 const __m128i d = _mm_add_epi16(d3, d4); 276 277 // Second pass. 278 const __m128i tmp0 = _mm_add_epi16(a, d); 279 const __m128i tmp1 = _mm_add_epi16(b, c); 280 const __m128i tmp2 = _mm_sub_epi16(b, c); 281 const __m128i tmp3 = _mm_sub_epi16(a, d); 282 const __m128i shifted0 = _mm_srai_epi16(tmp0, 3); 283 const __m128i shifted1 = _mm_srai_epi16(tmp1, 3); 284 const __m128i shifted2 = _mm_srai_epi16(tmp2, 3); 285 const __m128i shifted3 = _mm_srai_epi16(tmp3, 3); 286 287 // Transpose the two 4x4. 288 VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1, 289 &T2, &T3); 290 } 291 292 // Add inverse transform to 'ref' and store. 293 { 294 const __m128i zero = _mm_setzero_si128(); 295 // Load the reference(s). 296 __m128i ref0, ref1, ref2, ref3; 297 // Load eight bytes/pixels per line. 298 ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]); 299 ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]); 300 ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]); 301 ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]); 302 // Convert to 16b. 
303 ref0 = _mm_unpacklo_epi8(ref0, zero); 304 ref1 = _mm_unpacklo_epi8(ref1, zero); 305 ref2 = _mm_unpacklo_epi8(ref2, zero); 306 ref3 = _mm_unpacklo_epi8(ref3, zero); 307 // Add the inverse transform(s). 308 ref0 = _mm_add_epi16(ref0, T0); 309 ref1 = _mm_add_epi16(ref1, T1); 310 ref2 = _mm_add_epi16(ref2, T2); 311 ref3 = _mm_add_epi16(ref3, T3); 312 // Unsigned saturate to 8b. 313 ref0 = _mm_packus_epi16(ref0, ref0); 314 ref1 = _mm_packus_epi16(ref1, ref1); 315 ref2 = _mm_packus_epi16(ref2, ref2); 316 ref3 = _mm_packus_epi16(ref3, ref3); 317 // Store eight bytes/pixels per line. 318 _mm_storel_epi64((__m128i*)&dst[0 * BPS], ref0); 319 _mm_storel_epi64((__m128i*)&dst[1 * BPS], ref1); 320 _mm_storel_epi64((__m128i*)&dst[2 * BPS], ref2); 321 _mm_storel_epi64((__m128i*)&dst[3 * BPS], ref3); 322 } 323 } 324 325 // Does one or two inverse transforms. 326 static void ITransform_SSE2(const uint8_t* WEBP_RESTRICT ref, 327 const int16_t* WEBP_RESTRICT in, 328 uint8_t* WEBP_RESTRICT dst, 329 int do_two) { 330 if (do_two) { 331 ITransform_Two_SSE2(ref, in, dst); 332 } else { 333 ITransform_One_SSE2(ref, in, dst); 334 } 335 } 336 337 static void FTransformPass1_SSE2(const __m128i* const in01, 338 const __m128i* const in23, 339 __m128i* const out01, 340 __m128i* const out32) { 341 const __m128i k937 = _mm_set1_epi32(937); 342 const __m128i k1812 = _mm_set1_epi32(1812); 343 344 const __m128i k88p = _mm_set_epi16(8, 8, 8, 8, 8, 8, 8, 8); 345 const __m128i k88m = _mm_set_epi16(-8, 8, -8, 8, -8, 8, -8, 8); 346 const __m128i k5352_2217p = _mm_set_epi16(2217, 5352, 2217, 5352, 347 2217, 5352, 2217, 5352); 348 const __m128i k5352_2217m = _mm_set_epi16(-5352, 2217, -5352, 2217, 349 -5352, 2217, -5352, 2217); 350 351 // *in01 = 00 01 10 11 02 03 12 13 352 // *in23 = 20 21 30 31 22 23 32 33 353 const __m128i shuf01_p = _mm_shufflehi_epi16(*in01, _MM_SHUFFLE(2, 3, 0, 1)); 354 const __m128i shuf23_p = _mm_shufflehi_epi16(*in23, _MM_SHUFFLE(2, 3, 0, 1)); 355 // 00 01 10 11 03 02 13 12 356 
// 20 21 30 31 23 22 33 32 357 const __m128i s01 = _mm_unpacklo_epi64(shuf01_p, shuf23_p); 358 const __m128i s32 = _mm_unpackhi_epi64(shuf01_p, shuf23_p); 359 // 00 01 10 11 20 21 30 31 360 // 03 02 13 12 23 22 33 32 361 const __m128i a01 = _mm_add_epi16(s01, s32); 362 const __m128i a32 = _mm_sub_epi16(s01, s32); 363 // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ] 364 // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ] 365 366 const __m128i tmp0 = _mm_madd_epi16(a01, k88p); // [ (a0 + a1) << 3, ... ] 367 const __m128i tmp2 = _mm_madd_epi16(a01, k88m); // [ (a0 - a1) << 3, ... ] 368 const __m128i tmp1_1 = _mm_madd_epi16(a32, k5352_2217p); 369 const __m128i tmp3_1 = _mm_madd_epi16(a32, k5352_2217m); 370 const __m128i tmp1_2 = _mm_add_epi32(tmp1_1, k1812); 371 const __m128i tmp3_2 = _mm_add_epi32(tmp3_1, k937); 372 const __m128i tmp1 = _mm_srai_epi32(tmp1_2, 9); 373 const __m128i tmp3 = _mm_srai_epi32(tmp3_2, 9); 374 const __m128i s03 = _mm_packs_epi32(tmp0, tmp2); 375 const __m128i s12 = _mm_packs_epi32(tmp1, tmp3); 376 const __m128i s_lo = _mm_unpacklo_epi16(s03, s12); // 0 1 0 1 0 1... 377 const __m128i s_hi = _mm_unpackhi_epi16(s03, s12); // 2 3 2 3 2 3 378 const __m128i v23 = _mm_unpackhi_epi32(s_lo, s_hi); 379 *out01 = _mm_unpacklo_epi32(s_lo, s_hi); 380 *out32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // 3 2 3 2 3 2.. 381 } 382 383 static void FTransformPass2_SSE2(const __m128i* const v01, 384 const __m128i* const v32, 385 int16_t* WEBP_RESTRICT out) { 386 const __m128i zero = _mm_setzero_si128(); 387 const __m128i seven = _mm_set1_epi16(7); 388 const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217, 389 5352, 2217, 5352, 2217); 390 const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352, 391 2217, -5352, 2217, -5352); 392 const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16)); 393 const __m128i k51000 = _mm_set1_epi32(51000); 394 395 // Same operations are done on the (0,3) and (1,2) pairs. 
396 // a3 = v0 - v3 397 // a2 = v1 - v2 398 const __m128i a32 = _mm_sub_epi16(*v01, *v32); 399 const __m128i a22 = _mm_unpackhi_epi64(a32, a32); 400 401 const __m128i b23 = _mm_unpacklo_epi16(a22, a32); 402 const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); 403 const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); 404 const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one); 405 const __m128i d3 = _mm_add_epi32(c3, k51000); 406 const __m128i e1 = _mm_srai_epi32(d1, 16); 407 const __m128i e3 = _mm_srai_epi32(d3, 16); 408 // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16) 409 // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16) 410 const __m128i f1 = _mm_packs_epi32(e1, e1); 411 const __m128i f3 = _mm_packs_epi32(e3, e3); 412 // g1 = f1 + (a3 != 0); 413 // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the 414 // desired (0, 1), we add one earlier through k12000_plus_one. 415 // -> g1 = f1 + 1 - (a3 == 0) 416 const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); 417 418 // a0 = v0 + v3 419 // a1 = v1 + v2 420 const __m128i a01 = _mm_add_epi16(*v01, *v32); 421 const __m128i a01_plus_7 = _mm_add_epi16(a01, seven); 422 const __m128i a11 = _mm_unpackhi_epi64(a01, a01); 423 const __m128i c0 = _mm_add_epi16(a01_plus_7, a11); 424 const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11); 425 // d0 = (a0 + a1 + 7) >> 4; 426 // d2 = (a0 - a1 + 7) >> 4; 427 const __m128i d0 = _mm_srai_epi16(c0, 4); 428 const __m128i d2 = _mm_srai_epi16(c2, 4); 429 430 const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1); 431 const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3); 432 _mm_storeu_si128((__m128i*)&out[0], d0_g1); 433 _mm_storeu_si128((__m128i*)&out[8], d2_f3); 434 } 435 436 static void FTransform_SSE2(const uint8_t* WEBP_RESTRICT src, 437 const uint8_t* WEBP_RESTRICT ref, 438 int16_t* WEBP_RESTRICT out) { 439 const __m128i zero = _mm_setzero_si128(); 440 // Load src. 
441 const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]); 442 const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]); 443 const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]); 444 const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]); 445 // 00 01 02 03 * 446 // 10 11 12 13 * 447 // 20 21 22 23 * 448 // 30 31 32 33 * 449 // Shuffle. 450 const __m128i src_0 = _mm_unpacklo_epi16(src0, src1); 451 const __m128i src_1 = _mm_unpacklo_epi16(src2, src3); 452 // 00 01 10 11 02 03 12 13 * * ... 453 // 20 21 30 31 22 22 32 33 * * ... 454 455 // Load ref. 456 const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]); 457 const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]); 458 const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]); 459 const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]); 460 const __m128i ref_0 = _mm_unpacklo_epi16(ref0, ref1); 461 const __m128i ref_1 = _mm_unpacklo_epi16(ref2, ref3); 462 463 // Convert both to 16 bit. 464 const __m128i src_0_16b = _mm_unpacklo_epi8(src_0, zero); 465 const __m128i src_1_16b = _mm_unpacklo_epi8(src_1, zero); 466 const __m128i ref_0_16b = _mm_unpacklo_epi8(ref_0, zero); 467 const __m128i ref_1_16b = _mm_unpacklo_epi8(ref_1, zero); 468 469 // Compute the difference. 470 const __m128i row01 = _mm_sub_epi16(src_0_16b, ref_0_16b); 471 const __m128i row23 = _mm_sub_epi16(src_1_16b, ref_1_16b); 472 __m128i v01, v32; 473 474 // First pass 475 FTransformPass1_SSE2(&row01, &row23, &v01, &v32); 476 477 // Second pass 478 FTransformPass2_SSE2(&v01, &v32, out); 479 } 480 481 static void FTransform2_SSE2(const uint8_t* WEBP_RESTRICT src, 482 const uint8_t* WEBP_RESTRICT ref, 483 int16_t* WEBP_RESTRICT out) { 484 const __m128i zero = _mm_setzero_si128(); 485 486 // Load src and convert to 16b. 
487 const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]); 488 const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]); 489 const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]); 490 const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]); 491 const __m128i src_0 = _mm_unpacklo_epi8(src0, zero); 492 const __m128i src_1 = _mm_unpacklo_epi8(src1, zero); 493 const __m128i src_2 = _mm_unpacklo_epi8(src2, zero); 494 const __m128i src_3 = _mm_unpacklo_epi8(src3, zero); 495 // Load ref and convert to 16b. 496 const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]); 497 const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]); 498 const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]); 499 const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]); 500 const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero); 501 const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero); 502 const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero); 503 const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero); 504 // Compute difference. 
-> 00 01 02 03 00' 01' 02' 03' 505 const __m128i diff0 = _mm_sub_epi16(src_0, ref_0); 506 const __m128i diff1 = _mm_sub_epi16(src_1, ref_1); 507 const __m128i diff2 = _mm_sub_epi16(src_2, ref_2); 508 const __m128i diff3 = _mm_sub_epi16(src_3, ref_3); 509 510 // Unpack and shuffle 511 // 00 01 02 03 0 0 0 0 512 // 10 11 12 13 0 0 0 0 513 // 20 21 22 23 0 0 0 0 514 // 30 31 32 33 0 0 0 0 515 const __m128i shuf01l = _mm_unpacklo_epi32(diff0, diff1); 516 const __m128i shuf23l = _mm_unpacklo_epi32(diff2, diff3); 517 const __m128i shuf01h = _mm_unpackhi_epi32(diff0, diff1); 518 const __m128i shuf23h = _mm_unpackhi_epi32(diff2, diff3); 519 __m128i v01l, v32l; 520 __m128i v01h, v32h; 521 522 // First pass 523 FTransformPass1_SSE2(&shuf01l, &shuf23l, &v01l, &v32l); 524 FTransformPass1_SSE2(&shuf01h, &shuf23h, &v01h, &v32h); 525 526 // Second pass 527 FTransformPass2_SSE2(&v01l, &v32l, out + 0); 528 FTransformPass2_SSE2(&v01h, &v32h, out + 16); 529 } 530 531 static void FTransformWHTRow_SSE2(const int16_t* WEBP_RESTRICT const in, 532 __m128i* const out) { 533 const __m128i kMult = _mm_set_epi16(-1, 1, -1, 1, 1, 1, 1, 1); 534 const __m128i src0 = _mm_loadl_epi64((__m128i*)&in[0 * 16]); 535 const __m128i src1 = _mm_loadl_epi64((__m128i*)&in[1 * 16]); 536 const __m128i src2 = _mm_loadl_epi64((__m128i*)&in[2 * 16]); 537 const __m128i src3 = _mm_loadl_epi64((__m128i*)&in[3 * 16]); 538 const __m128i A01 = _mm_unpacklo_epi16(src0, src1); // A0 A1 | ... 539 const __m128i A23 = _mm_unpacklo_epi16(src2, src3); // A2 A3 | ... 540 const __m128i B0 = _mm_adds_epi16(A01, A23); // a0 | a1 | ... 541 const __m128i B1 = _mm_subs_epi16(A01, A23); // a3 | a2 | ... 542 const __m128i C0 = _mm_unpacklo_epi32(B0, B1); // a0 | a1 | a3 | a2 | ... 543 const __m128i C1 = _mm_unpacklo_epi32(B1, B0); // a3 | a2 | a0 | a1 | ... 
544 const __m128i D = _mm_unpacklo_epi64(C0, C1); // a0 a1 a3 a2 a3 a2 a0 a1 545 *out = _mm_madd_epi16(D, kMult); 546 } 547 548 static void FTransformWHT_SSE2(const int16_t* WEBP_RESTRICT in, 549 int16_t* WEBP_RESTRICT out) { 550 // Input is 12b signed. 551 __m128i row0, row1, row2, row3; 552 // Rows are 14b signed. 553 FTransformWHTRow_SSE2(in + 0 * 64, &row0); 554 FTransformWHTRow_SSE2(in + 1 * 64, &row1); 555 FTransformWHTRow_SSE2(in + 2 * 64, &row2); 556 FTransformWHTRow_SSE2(in + 3 * 64, &row3); 557 558 { 559 // The a* are 15b signed. 560 const __m128i a0 = _mm_add_epi32(row0, row2); 561 const __m128i a1 = _mm_add_epi32(row1, row3); 562 const __m128i a2 = _mm_sub_epi32(row1, row3); 563 const __m128i a3 = _mm_sub_epi32(row0, row2); 564 const __m128i a0a3 = _mm_packs_epi32(a0, a3); 565 const __m128i a1a2 = _mm_packs_epi32(a1, a2); 566 567 // The b* are 16b signed. 568 const __m128i b0b1 = _mm_add_epi16(a0a3, a1a2); 569 const __m128i b3b2 = _mm_sub_epi16(a0a3, a1a2); 570 const __m128i tmp_b2b3 = _mm_unpackhi_epi64(b3b2, b3b2); 571 const __m128i b2b3 = _mm_unpacklo_epi64(tmp_b2b3, b3b2); 572 573 _mm_storeu_si128((__m128i*)&out[0], _mm_srai_epi16(b0b1, 1)); 574 _mm_storeu_si128((__m128i*)&out[8], _mm_srai_epi16(b2b3, 1)); 575 } 576 } 577 578 //------------------------------------------------------------------------------ 579 // Compute susceptibility based on DCT-coeff histograms: 580 // the higher, the "easier" the macroblock is to compress. 
581 582 static void CollectHistogram_SSE2(const uint8_t* WEBP_RESTRICT ref, 583 const uint8_t* WEBP_RESTRICT pred, 584 int start_block, int end_block, 585 VP8Histogram* WEBP_RESTRICT const histo) { 586 const __m128i zero = _mm_setzero_si128(); 587 const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH); 588 int j; 589 int distribution[MAX_COEFF_THRESH + 1] = { 0 }; 590 for (j = start_block; j < end_block; ++j) { 591 int16_t out[16]; 592 int k; 593 594 FTransform_SSE2(ref + VP8DspScan[j], pred + VP8DspScan[j], out); 595 596 // Convert coefficients to bin (within out[]). 597 { 598 // Load. 599 const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]); 600 const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]); 601 const __m128i d0 = _mm_sub_epi16(zero, out0); 602 const __m128i d1 = _mm_sub_epi16(zero, out1); 603 const __m128i abs0 = _mm_max_epi16(out0, d0); // abs(v), 16b 604 const __m128i abs1 = _mm_max_epi16(out1, d1); 605 // v = abs(out) >> 3 606 const __m128i v0 = _mm_srai_epi16(abs0, 3); 607 const __m128i v1 = _mm_srai_epi16(abs1, 3); 608 // bin = min(v, MAX_COEFF_THRESH) 609 const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh); 610 const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh); 611 // Store. 612 _mm_storeu_si128((__m128i*)&out[0], bin0); 613 _mm_storeu_si128((__m128i*)&out[8], bin1); 614 } 615 616 // Convert coefficients to bin. 
617 for (k = 0; k < 16; ++k) { 618 ++distribution[out[k]]; 619 } 620 } 621 VP8SetHistogramData(distribution, histo); 622 } 623 624 //------------------------------------------------------------------------------ 625 // Intra predictions 626 627 // helper for chroma-DC predictions 628 static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) { 629 int j; 630 const __m128i values = _mm_set1_epi8((char)v); 631 for (j = 0; j < 8; ++j) { 632 _mm_storel_epi64((__m128i*)(dst + j * BPS), values); 633 } 634 } 635 636 static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) { 637 int j; 638 const __m128i values = _mm_set1_epi8((char)v); 639 for (j = 0; j < 16; ++j) { 640 _mm_store_si128((__m128i*)(dst + j * BPS), values); 641 } 642 } 643 644 static WEBP_INLINE void Fill_SSE2(uint8_t* dst, int value, int size) { 645 if (size == 4) { 646 int j; 647 for (j = 0; j < 4; ++j) { 648 memset(dst + j * BPS, value, 4); 649 } 650 } else if (size == 8) { 651 Put8x8uv_SSE2(value, dst); 652 } else { 653 Put16_SSE2(value, dst); 654 } 655 } 656 657 static WEBP_INLINE void VE8uv_SSE2(uint8_t* WEBP_RESTRICT dst, 658 const uint8_t* WEBP_RESTRICT top) { 659 int j; 660 const __m128i top_values = _mm_loadl_epi64((const __m128i*)top); 661 for (j = 0; j < 8; ++j) { 662 _mm_storel_epi64((__m128i*)(dst + j * BPS), top_values); 663 } 664 } 665 666 static WEBP_INLINE void VE16_SSE2(uint8_t* WEBP_RESTRICT dst, 667 const uint8_t* WEBP_RESTRICT top) { 668 const __m128i top_values = _mm_load_si128((const __m128i*)top); 669 int j; 670 for (j = 0; j < 16; ++j) { 671 _mm_store_si128((__m128i*)(dst + j * BPS), top_values); 672 } 673 } 674 675 static WEBP_INLINE void VerticalPred_SSE2(uint8_t* WEBP_RESTRICT dst, 676 const uint8_t* WEBP_RESTRICT top, 677 int size) { 678 if (top != NULL) { 679 if (size == 8) { 680 VE8uv_SSE2(dst, top); 681 } else { 682 VE16_SSE2(dst, top); 683 } 684 } else { 685 Fill_SSE2(dst, 127, size); 686 } 687 } 688 689 static WEBP_INLINE void HE8uv_SSE2(uint8_t* WEBP_RESTRICT dst, 
690 const uint8_t* WEBP_RESTRICT left) { 691 int j; 692 for (j = 0; j < 8; ++j) { 693 const __m128i values = _mm_set1_epi8((char)left[j]); 694 _mm_storel_epi64((__m128i*)dst, values); 695 dst += BPS; 696 } 697 } 698 699 static WEBP_INLINE void HE16_SSE2(uint8_t* WEBP_RESTRICT dst, 700 const uint8_t* WEBP_RESTRICT left) { 701 int j; 702 for (j = 0; j < 16; ++j) { 703 const __m128i values = _mm_set1_epi8((char)left[j]); 704 _mm_store_si128((__m128i*)dst, values); 705 dst += BPS; 706 } 707 } 708 709 static WEBP_INLINE void HorizontalPred_SSE2(uint8_t* WEBP_RESTRICT dst, 710 const uint8_t* WEBP_RESTRICT left, 711 int size) { 712 if (left != NULL) { 713 if (size == 8) { 714 HE8uv_SSE2(dst, left); 715 } else { 716 HE16_SSE2(dst, left); 717 } 718 } else { 719 Fill_SSE2(dst, 129, size); 720 } 721 } 722 723 static WEBP_INLINE void TM_SSE2(uint8_t* WEBP_RESTRICT dst, 724 const uint8_t* WEBP_RESTRICT left, 725 const uint8_t* WEBP_RESTRICT top, int size) { 726 const __m128i zero = _mm_setzero_si128(); 727 int y; 728 if (size == 8) { 729 const __m128i top_values = _mm_loadl_epi64((const __m128i*)top); 730 const __m128i top_base = _mm_unpacklo_epi8(top_values, zero); 731 for (y = 0; y < 8; ++y, dst += BPS) { 732 const int val = left[y] - left[-1]; 733 const __m128i base = _mm_set1_epi16(val); 734 const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero); 735 _mm_storel_epi64((__m128i*)dst, out); 736 } 737 } else { 738 const __m128i top_values = _mm_load_si128((const __m128i*)top); 739 const __m128i top_base_0 = _mm_unpacklo_epi8(top_values, zero); 740 const __m128i top_base_1 = _mm_unpackhi_epi8(top_values, zero); 741 for (y = 0; y < 16; ++y, dst += BPS) { 742 const int val = left[y] - left[-1]; 743 const __m128i base = _mm_set1_epi16(val); 744 const __m128i out_0 = _mm_add_epi16(base, top_base_0); 745 const __m128i out_1 = _mm_add_epi16(base, top_base_1); 746 const __m128i out = _mm_packus_epi16(out_0, out_1); 747 _mm_store_si128((__m128i*)dst, out); 748 } 749 } 
750 } 751 752 static WEBP_INLINE void TrueMotion_SSE2(uint8_t* WEBP_RESTRICT dst, 753 const uint8_t* WEBP_RESTRICT left, 754 const uint8_t* WEBP_RESTRICT top, 755 int size) { 756 if (left != NULL) { 757 if (top != NULL) { 758 TM_SSE2(dst, left, top, size); 759 } else { 760 HorizontalPred_SSE2(dst, left, size); 761 } 762 } else { 763 // true motion without left samples (hence: with default 129 value) 764 // is equivalent to VE prediction where you just copy the top samples. 765 // Note that if top samples are not available, the default value is 766 // then 129, and not 127 as in the VerticalPred case. 767 if (top != NULL) { 768 VerticalPred_SSE2(dst, top, size); 769 } else { 770 Fill_SSE2(dst, 129, size); 771 } 772 } 773 } 774 775 static WEBP_INLINE void DC8uv_SSE2(uint8_t* WEBP_RESTRICT dst, 776 const uint8_t* WEBP_RESTRICT left, 777 const uint8_t* WEBP_RESTRICT top) { 778 const __m128i top_values = _mm_loadl_epi64((const __m128i*)top); 779 const __m128i left_values = _mm_loadl_epi64((const __m128i*)left); 780 const __m128i combined = _mm_unpacklo_epi64(top_values, left_values); 781 const int DC = VP8HorizontalAdd8b(&combined) + 8; 782 Put8x8uv_SSE2(DC >> 4, dst); 783 } 784 785 static WEBP_INLINE void DC8uvNoLeft_SSE2(uint8_t* WEBP_RESTRICT dst, 786 const uint8_t* WEBP_RESTRICT top) { 787 const __m128i zero = _mm_setzero_si128(); 788 const __m128i top_values = _mm_loadl_epi64((const __m128i*)top); 789 const __m128i sum = _mm_sad_epu8(top_values, zero); 790 const int DC = _mm_cvtsi128_si32(sum) + 4; 791 Put8x8uv_SSE2(DC >> 3, dst); 792 } 793 794 static WEBP_INLINE void DC8uvNoTop_SSE2(uint8_t* WEBP_RESTRICT dst, 795 const uint8_t* WEBP_RESTRICT left) { 796 // 'left' is contiguous so we can reuse the top summation. 
797 DC8uvNoLeft_SSE2(dst, left); 798 } 799 800 static WEBP_INLINE void DC8uvNoTopLeft_SSE2(uint8_t* dst) { 801 Put8x8uv_SSE2(0x80, dst); 802 } 803 804 static WEBP_INLINE void DC8uvMode_SSE2(uint8_t* WEBP_RESTRICT dst, 805 const uint8_t* WEBP_RESTRICT left, 806 const uint8_t* WEBP_RESTRICT top) { 807 if (top != NULL) { 808 if (left != NULL) { // top and left present 809 DC8uv_SSE2(dst, left, top); 810 } else { // top, but no left 811 DC8uvNoLeft_SSE2(dst, top); 812 } 813 } else if (left != NULL) { // left but no top 814 DC8uvNoTop_SSE2(dst, left); 815 } else { // no top, no left, nothing. 816 DC8uvNoTopLeft_SSE2(dst); 817 } 818 } 819 820 static WEBP_INLINE void DC16_SSE2(uint8_t* WEBP_RESTRICT dst, 821 const uint8_t* WEBP_RESTRICT left, 822 const uint8_t* WEBP_RESTRICT top) { 823 const __m128i top_row = _mm_load_si128((const __m128i*)top); 824 const __m128i left_row = _mm_load_si128((const __m128i*)left); 825 const int DC = 826 VP8HorizontalAdd8b(&top_row) + VP8HorizontalAdd8b(&left_row) + 16; 827 Put16_SSE2(DC >> 5, dst); 828 } 829 830 static WEBP_INLINE void DC16NoLeft_SSE2(uint8_t* WEBP_RESTRICT dst, 831 const uint8_t* WEBP_RESTRICT top) { 832 const __m128i top_row = _mm_load_si128((const __m128i*)top); 833 const int DC = VP8HorizontalAdd8b(&top_row) + 8; 834 Put16_SSE2(DC >> 4, dst); 835 } 836 837 static WEBP_INLINE void DC16NoTop_SSE2(uint8_t* WEBP_RESTRICT dst, 838 const uint8_t* WEBP_RESTRICT left) { 839 // 'left' is contiguous so we can reuse the top summation. 
840 DC16NoLeft_SSE2(dst, left); 841 } 842 843 static WEBP_INLINE void DC16NoTopLeft_SSE2(uint8_t* dst) { 844 Put16_SSE2(0x80, dst); 845 } 846 847 static WEBP_INLINE void DC16Mode_SSE2(uint8_t* WEBP_RESTRICT dst, 848 const uint8_t* WEBP_RESTRICT left, 849 const uint8_t* WEBP_RESTRICT top) { 850 if (top != NULL) { 851 if (left != NULL) { // top and left present 852 DC16_SSE2(dst, left, top); 853 } else { // top, but no left 854 DC16NoLeft_SSE2(dst, top); 855 } 856 } else if (left != NULL) { // left but no top 857 DC16NoTop_SSE2(dst, left); 858 } else { // no top, no left, nothing. 859 DC16NoTopLeft_SSE2(dst); 860 } 861 } 862 863 //------------------------------------------------------------------------------ 864 // 4x4 predictions 865 866 #define DST(x, y) dst[(x) + (y) * BPS] 867 #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2) 868 #define AVG2(a, b) (((a) + (b) + 1) >> 1) 869 870 // We use the following 8b-arithmetic tricks: 871 // (a + 2 * b + c + 2) >> 2 = (AC + b + 1) >> 1 872 // where: AC = (a + c) >> 1 = [(a + c + 1) >> 1] - [(a^c) & 1] 873 // and: 874 // (a + 2 * b + c + 2) >> 2 = (AB + BC + 1) >> 1 - (ab|bc)&lsb 875 // where: AC = (a + b + 1) >> 1, BC = (b + c + 1) >> 1 876 // and ab = a ^ b, bc = b ^ c, lsb = (AC^BC)&1 877 878 // vertical 879 static WEBP_INLINE void VE4_SSE2(uint8_t* WEBP_RESTRICT dst, 880 const uint8_t* WEBP_RESTRICT top) { 881 const __m128i one = _mm_set1_epi8(1); 882 const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(top - 1)); 883 const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1); 884 const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 2); 885 const __m128i a = _mm_avg_epu8(ABCDEFGH, CDEFGH00); 886 const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGH00), one); 887 const __m128i b = _mm_subs_epu8(a, lsb); 888 const __m128i avg = _mm_avg_epu8(b, BCDEFGH0); 889 const int vals = _mm_cvtsi128_si32(avg); 890 int i; 891 for (i = 0; i < 4; ++i) { 892 WebPInt32ToMem(dst + i * BPS, vals); 893 } 894 } 895 896 // horizontal 897 
static WEBP_INLINE void HE4_SSE2(uint8_t* WEBP_RESTRICT dst, 898 const uint8_t* WEBP_RESTRICT top) { 899 const int X = top[-1]; 900 const int I = top[-2]; 901 const int J = top[-3]; 902 const int K = top[-4]; 903 const int L = top[-5]; 904 WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J)); 905 WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K)); 906 WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L)); 907 WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L)); 908 } 909 910 static WEBP_INLINE void DC4_SSE2(uint8_t* WEBP_RESTRICT dst, 911 const uint8_t* WEBP_RESTRICT top) { 912 uint32_t dc = 4; 913 int i; 914 for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i]; 915 Fill_SSE2(dst, dc >> 3, 4); 916 } 917 918 // Down-Left 919 static WEBP_INLINE void LD4_SSE2(uint8_t* WEBP_RESTRICT dst, 920 const uint8_t* WEBP_RESTRICT top) { 921 const __m128i one = _mm_set1_epi8(1); 922 const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top); 923 const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1); 924 const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 2); 925 const __m128i CDEFGHH0 = _mm_insert_epi16(CDEFGH00, top[7], 3); 926 const __m128i avg1 = _mm_avg_epu8(ABCDEFGH, CDEFGHH0); 927 const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGHH0), one); 928 const __m128i avg2 = _mm_subs_epu8(avg1, lsb); 929 const __m128i abcdefg = _mm_avg_epu8(avg2, BCDEFGH0); 930 WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcdefg )); 931 WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1))); 932 WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2))); 933 WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); 934 } 935 936 // Vertical-Right 937 static WEBP_INLINE void VR4_SSE2(uint8_t* WEBP_RESTRICT dst, 938 const uint8_t* WEBP_RESTRICT top) { 939 const __m128i one = _mm_set1_epi8(1); 940 const int I = top[-2]; 941 const int J = top[-3]; 942 const int K = top[-4]; 943 const int X = 
top[-1]; 944 const __m128i XABCD = _mm_loadl_epi64((const __m128i*)(top - 1)); 945 const __m128i ABCD0 = _mm_srli_si128(XABCD, 1); 946 const __m128i abcd = _mm_avg_epu8(XABCD, ABCD0); 947 const __m128i _XABCD = _mm_slli_si128(XABCD, 1); 948 const __m128i IXABCD = _mm_insert_epi16(_XABCD, (short)(I | (X << 8)), 0); 949 const __m128i avg1 = _mm_avg_epu8(IXABCD, ABCD0); 950 const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one); 951 const __m128i avg2 = _mm_subs_epu8(avg1, lsb); 952 const __m128i efgh = _mm_avg_epu8(avg2, XABCD); 953 WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcd )); 954 WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( efgh )); 955 WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1))); 956 WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1))); 957 958 // these two are hard to implement in SSE2, so we keep the C-version: 959 DST(0, 2) = AVG3(J, I, X); 960 DST(0, 3) = AVG3(K, J, I); 961 } 962 963 // Vertical-Left 964 static WEBP_INLINE void VL4_SSE2(uint8_t* WEBP_RESTRICT dst, 965 const uint8_t* WEBP_RESTRICT top) { 966 const __m128i one = _mm_set1_epi8(1); 967 const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top); 968 const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1); 969 const __m128i CDEFGH__ = _mm_srli_si128(ABCDEFGH, 2); 970 const __m128i avg1 = _mm_avg_epu8(ABCDEFGH, BCDEFGH_); 971 const __m128i avg2 = _mm_avg_epu8(CDEFGH__, BCDEFGH_); 972 const __m128i avg3 = _mm_avg_epu8(avg1, avg2); 973 const __m128i lsb1 = _mm_and_si128(_mm_xor_si128(avg1, avg2), one); 974 const __m128i ab = _mm_xor_si128(ABCDEFGH, BCDEFGH_); 975 const __m128i bc = _mm_xor_si128(CDEFGH__, BCDEFGH_); 976 const __m128i abbc = _mm_or_si128(ab, bc); 977 const __m128i lsb2 = _mm_and_si128(abbc, lsb1); 978 const __m128i avg4 = _mm_subs_epu8(avg3, lsb2); 979 const uint32_t extra_out = 980 (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(avg4, 4)); 981 WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( avg1 )); 982 
WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( avg4 )); 983 WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1))); 984 WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1))); 985 986 // these two are hard to get and irregular 987 DST(3, 2) = (extra_out >> 0) & 0xff; 988 DST(3, 3) = (extra_out >> 8) & 0xff; 989 } 990 991 // Down-right 992 static WEBP_INLINE void RD4_SSE2(uint8_t* WEBP_RESTRICT dst, 993 const uint8_t* WEBP_RESTRICT top) { 994 const __m128i one = _mm_set1_epi8(1); 995 const __m128i LKJIXABC = _mm_loadl_epi64((const __m128i*)(top - 5)); 996 const __m128i LKJIXABCD = _mm_insert_epi16(LKJIXABC, top[3], 4); 997 const __m128i KJIXABCD_ = _mm_srli_si128(LKJIXABCD, 1); 998 const __m128i JIXABCD__ = _mm_srli_si128(LKJIXABCD, 2); 999 const __m128i avg1 = _mm_avg_epu8(JIXABCD__, LKJIXABCD); 1000 const __m128i lsb = _mm_and_si128(_mm_xor_si128(JIXABCD__, LKJIXABCD), one); 1001 const __m128i avg2 = _mm_subs_epu8(avg1, lsb); 1002 const __m128i abcdefg = _mm_avg_epu8(avg2, KJIXABCD_); 1003 WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32( abcdefg )); 1004 WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1))); 1005 WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2))); 1006 WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); 1007 } 1008 1009 static WEBP_INLINE void HU4_SSE2(uint8_t* WEBP_RESTRICT dst, 1010 const uint8_t* WEBP_RESTRICT top) { 1011 const int I = top[-2]; 1012 const int J = top[-3]; 1013 const int K = top[-4]; 1014 const int L = top[-5]; 1015 DST(0, 0) = AVG2(I, J); 1016 DST(2, 0) = DST(0, 1) = AVG2(J, K); 1017 DST(2, 1) = DST(0, 2) = AVG2(K, L); 1018 DST(1, 0) = AVG3(I, J, K); 1019 DST(3, 0) = DST(1, 1) = AVG3(J, K, L); 1020 DST(3, 1) = DST(1, 2) = AVG3(K, L, L); 1021 DST(3, 2) = DST(2, 2) = 1022 DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L; 1023 } 1024 1025 static WEBP_INLINE void HD4_SSE2(uint8_t* WEBP_RESTRICT dst, 1026 const 
uint8_t* WEBP_RESTRICT top) { 1027 const int X = top[-1]; 1028 const int I = top[-2]; 1029 const int J = top[-3]; 1030 const int K = top[-4]; 1031 const int L = top[-5]; 1032 const int A = top[0]; 1033 const int B = top[1]; 1034 const int C = top[2]; 1035 1036 DST(0, 0) = DST(2, 1) = AVG2(I, X); 1037 DST(0, 1) = DST(2, 2) = AVG2(J, I); 1038 DST(0, 2) = DST(2, 3) = AVG2(K, J); 1039 DST(0, 3) = AVG2(L, K); 1040 1041 DST(3, 0) = AVG3(A, B, C); 1042 DST(2, 0) = AVG3(X, A, B); 1043 DST(1, 0) = DST(3, 1) = AVG3(I, X, A); 1044 DST(1, 1) = DST(3, 2) = AVG3(J, I, X); 1045 DST(1, 2) = DST(3, 3) = AVG3(K, J, I); 1046 DST(1, 3) = AVG3(L, K, J); 1047 } 1048 1049 static WEBP_INLINE void TM4_SSE2(uint8_t* WEBP_RESTRICT dst, 1050 const uint8_t* WEBP_RESTRICT top) { 1051 const __m128i zero = _mm_setzero_si128(); 1052 const __m128i top_values = _mm_cvtsi32_si128(WebPMemToInt32(top)); 1053 const __m128i top_base = _mm_unpacklo_epi8(top_values, zero); 1054 int y; 1055 for (y = 0; y < 4; ++y, dst += BPS) { 1056 const int val = top[-2 - y] - top[-1]; 1057 const __m128i base = _mm_set1_epi16(val); 1058 const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero); 1059 WebPInt32ToMem(dst, _mm_cvtsi128_si32(out)); 1060 } 1061 } 1062 1063 #undef DST 1064 #undef AVG3 1065 #undef AVG2 1066 1067 //------------------------------------------------------------------------------ 1068 // luma 4x4 prediction 1069 1070 // Left samples are top[-5 .. 
// -2], top_left is top[-1], top are
// located at top[0..3], and top right is top[4..7]
// Runs the ten 4x4 luma predictors on the same 'top' border; each one writes
// its result at its own offset (I4DC4, I4TM4, ...) from 'dst'.
static void Intra4Preds_SSE2(uint8_t* WEBP_RESTRICT dst,
                             const uint8_t* WEBP_RESTRICT top) {
  DC4_SSE2(I4DC4 + dst, top);
  TM4_SSE2(I4TM4 + dst, top);
  VE4_SSE2(I4VE4 + dst, top);
  HE4_SSE2(I4HE4 + dst, top);
  RD4_SSE2(I4RD4 + dst, top);
  VR4_SSE2(I4VR4 + dst, top);
  LD4_SSE2(I4LD4 + dst, top);
  VL4_SSE2(I4VL4 + dst, top);
  HD4_SSE2(I4HD4 + dst, top);
  HU4_SSE2(I4HU4 + dst, top);
}

//------------------------------------------------------------------------------
// Chroma 8x8 prediction (paragraph 12.2)

// Runs the four 8x8 chroma predictors (DC, vertical, horizontal, TrueMotion)
// for the U block, then again for the V block. 'left' and 'top' may be NULL;
// they are only advanced for the V block when non-NULL.
static void IntraChromaPreds_SSE2(uint8_t* WEBP_RESTRICT dst,
                                  const uint8_t* WEBP_RESTRICT left,
                                  const uint8_t* WEBP_RESTRICT top) {
  // U block
  DC8uvMode_SSE2(C8DC8 + dst, left, top);
  VerticalPred_SSE2(C8VE8 + dst, top, 8);
  HorizontalPred_SSE2(C8HE8 + dst, left, 8);
  TrueMotion_SSE2(C8TM8 + dst, left, top, 8);
  // V block
  // The V output sits 8 bytes after the U output; its borders are read 8
  // bytes further in 'top' and 16 bytes further in 'left'.
  dst += 8;
  if (top != NULL) top += 8;
  if (left != NULL) left += 16;
  DC8uvMode_SSE2(C8DC8 + dst, left, top);
  VerticalPred_SSE2(C8VE8 + dst, top, 8);
  HorizontalPred_SSE2(C8HE8 + dst, left, 8);
  TrueMotion_SSE2(C8TM8 + dst, left, top, 8);
}

//------------------------------------------------------------------------------
// luma 16x16 prediction (paragraph 12.3)

// Runs the four 16x16 luma predictors (DC, vertical, horizontal, TrueMotion),
// each writing at its own offset from 'dst'.
static void Intra16Preds_SSE2(uint8_t* WEBP_RESTRICT dst,
                              const uint8_t* WEBP_RESTRICT left,
                              const uint8_t* WEBP_RESTRICT top) {
  DC16Mode_SSE2(I16DC16 + dst, left, top);
  VerticalPred_SSE2(I16VE16 + dst, top, 16);
  HorizontalPred_SSE2(I16HE16 + dst, left, 16);
  TrueMotion_SSE2(I16TM16 + dst, left, top, 16);
}

//------------------------------------------------------------------------------
// Metric

// Stores in '*sum' four 32-bit partial sums of the squared differences between
// the 16 bytes of 'a' and 'b'. Note that '*sum' is overwritten, not added to:
// the caller does the accumulation.
static WEBP_INLINE void SubtractAndAccumulate_SSE2(const __m128i a,
                                                   const __m128i b,
                                                   __m128i* const sum) {
  // take abs(a-b) in 8b
  // (one of the two saturating differences is always zero, so OR-ing them
  // yields the absolute difference)
  const __m128i a_b = _mm_subs_epu8(a, b);
  const __m128i b_a = _mm_subs_epu8(b, a);
  const __m128i abs_a_b = _mm_or_si128(a_b, b_a);
  // zero-extend to 16b
  const __m128i zero = _mm_setzero_si128();
  const __m128i C0 = _mm_unpacklo_epi8(abs_a_b, zero);
  const __m128i C1 = _mm_unpackhi_epi8(abs_a_b, zero);
  // multiply with self
  const __m128i sum1 = _mm_madd_epi16(C0, C0);
  const __m128i sum2 = _mm_madd_epi16(C1, C1);
  *sum = _mm_add_epi32(sum1, sum2);
}

// Returns the sum of squared differences between 'a' and 'b' over
// 2 * num_pairs rows of 16 pixels; consecutive rows are BPS bytes apart.
static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* WEBP_RESTRICT a,
                                     const uint8_t* WEBP_RESTRICT b,
                                     int num_pairs) {
  __m128i sum = _mm_setzero_si128();
  int32_t tmp[4];
  int i;

  // Two rows are processed per iteration.
  for (i = 0; i < num_pairs; ++i) {
    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[BPS * 0]);
    const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[BPS * 0]);
    const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[BPS * 1]);
    const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[BPS * 1]);
    __m128i sum1, sum2;
    SubtractAndAccumulate_SSE2(a0, b0, &sum1);
    SubtractAndAccumulate_SSE2(a1, b1, &sum2);
    sum = _mm_add_epi32(sum, _mm_add_epi32(sum1, sum2));
    a += 2 * BPS;
    b += 2 * BPS;
  }
  // Horizontal reduction of the four 32-bit partial sums.
  _mm_storeu_si128((__m128i*)tmp, sum);
  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}

// Sum of squared differences over a 16x16 block (8 row pairs).
static int SSE16x16_SSE2(const uint8_t* WEBP_RESTRICT a,
                         const uint8_t* WEBP_RESTRICT b) {
  return SSE_16xN_SSE2(a, b, 8);
}

// Sum of squared differences over a 16x8 block (4 row pairs).
static int SSE16x8_SSE2(const uint8_t* WEBP_RESTRICT a,
                        const uint8_t* WEBP_RESTRICT b) {
  return SSE_16xN_SSE2(a, b, 4);
}

// Loads 8 bytes from 'ptr' and zero-extends them to eight 16-bit lanes.
// Expects a variable named 'zero' (an all-zero __m128i) in the calling scope.
#define LOAD_8x16b(ptr) \
  _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr)), zero)

// Sum of squared differences over an 8x8 block; rows are BPS bytes apart.
static int SSE8x8_SSE2(const uint8_t* WEBP_RESTRICT a,
                       const uint8_t* WEBP_RESTRICT b) {
  const __m128i zero = _mm_setzero_si128();
  int num_pairs = 4;  // 4 pairs of rows = 8 rows
  __m128i sum = zero;
  int32_t tmp[4];
  while (num_pairs-- > 0) {
    const __m128i a0 = LOAD_8x16b(&a[BPS * 0]);
    const __m128i a1 = LOAD_8x16b(&a[BPS * 1]);
    const __m128i b0 = LOAD_8x16b(&b[BPS * 0]);
    const __m128i b1 = LOAD_8x16b(&b[BPS * 1]);
    // subtract
    const __m128i c0 = _mm_subs_epi16(a0, b0);
    const __m128i c1 = _mm_subs_epi16(a1, b1);
    // multiply/accumulate with self
    const __m128i d0 = _mm_madd_epi16(c0, c0);
    const __m128i d1 = _mm_madd_epi16(c1, c1);
    // collect
    const __m128i sum01 = _mm_add_epi32(d0, d1);
    sum = _mm_add_epi32(sum, sum01);
    a += 2 * BPS;
    b += 2 * BPS;
  }
  // Horizontal reduction of the four 32-bit partial sums.
  _mm_storeu_si128((__m128i*)tmp, sum);
  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}
#undef LOAD_8x16b

// Sum of squared differences over a 4x4 block; rows are BPS bytes apart.
static int SSE4x4_SSE2(const uint8_t* WEBP_RESTRICT a,
                       const uint8_t* WEBP_RESTRICT b) {
  const __m128i zero = _mm_setzero_si128();

  // Load values. Note that we read 8 pixels instead of 4,
  // but the a/b buffers are over-allocated to that effect.
  const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[BPS * 0]);
  const __m128i a1 = _mm_loadl_epi64((const __m128i*)&a[BPS * 1]);
  const __m128i a2 = _mm_loadl_epi64((const __m128i*)&a[BPS * 2]);
  const __m128i a3 = _mm_loadl_epi64((const __m128i*)&a[BPS * 3]);
  const __m128i b0 = _mm_loadl_epi64((const __m128i*)&b[BPS * 0]);
  const __m128i b1 = _mm_loadl_epi64((const __m128i*)&b[BPS * 1]);
  const __m128i b2 = _mm_loadl_epi64((const __m128i*)&b[BPS * 2]);
  const __m128i b3 = _mm_loadl_epi64((const __m128i*)&b[BPS * 3]);
  // Combine pair of lines.
  // (only the low 4 bytes of each row end up in the low 8 bytes used below)
  const __m128i a01 = _mm_unpacklo_epi32(a0, a1);
  const __m128i a23 = _mm_unpacklo_epi32(a2, a3);
  const __m128i b01 = _mm_unpacklo_epi32(b0, b1);
  const __m128i b23 = _mm_unpacklo_epi32(b2, b3);
  // Convert to 16b.
  const __m128i a01s = _mm_unpacklo_epi8(a01, zero);
  const __m128i a23s = _mm_unpacklo_epi8(a23, zero);
  const __m128i b01s = _mm_unpacklo_epi8(b01, zero);
  const __m128i b23s = _mm_unpacklo_epi8(b23, zero);
  // subtract, square and accumulate
  const __m128i d0 = _mm_subs_epi16(a01s, b01s);
  const __m128i d1 = _mm_subs_epi16(a23s, b23s);
  const __m128i e0 = _mm_madd_epi16(d0, d0);
  const __m128i e1 = _mm_madd_epi16(d1, d1);
  const __m128i sum = _mm_add_epi32(e0, e1);

  int32_t tmp[4];
  _mm_storeu_si128((__m128i*)tmp, sum);
  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}

//------------------------------------------------------------------------------

// For a 16x4 area (rows BPS bytes apart), stores in dc[i] the sum of the 16
// pixels of the i-th 4x4 sub-block (columns 4*i .. 4*i+3). The values written
// are sums, not divided means.
static void Mean16x4_SSE2(const uint8_t* WEBP_RESTRICT ref, uint32_t dc[4]) {
  const __m128i mask = _mm_set1_epi16(0x00ff);
  const __m128i a0 = _mm_loadu_si128((const __m128i*)&ref[BPS * 0]);
  const __m128i a1 = _mm_loadu_si128((const __m128i*)&ref[BPS * 1]);
  const __m128i a2 = _mm_loadu_si128((const __m128i*)&ref[BPS * 2]);
  const __m128i a3 = _mm_loadu_si128((const __m128i*)&ref[BPS * 3]);
  const __m128i b0 = _mm_srli_epi16(a0, 8);  // hi byte
  const __m128i b1 = _mm_srli_epi16(a1, 8);
  const __m128i b2 = _mm_srli_epi16(a2, 8);
  const __m128i b3 = _mm_srli_epi16(a3, 8);
  const __m128i c0 = _mm_and_si128(a0, mask);  // lo byte
  const __m128i c1 = _mm_and_si128(a1, mask);
  const __m128i c2 = _mm_and_si128(a2, mask);
  const __m128i c3 = _mm_and_si128(a3, mask);
  // The data lives in 16-bit lanes, each holding at most 255 here. Adding
  // eight such values stays below 65536, so the 32-bit additions never carry
  // from one 16-bit lane into the next.
  const __m128i d0 = _mm_add_epi32(b0, c0);
  const __m128i d1 = _mm_add_epi32(b1, c1);
  const __m128i d2 = _mm_add_epi32(b2, c2);
  const __m128i d3 = _mm_add_epi32(b3, c3);
  const __m128i e0 = _mm_add_epi32(d0, d1);
  const __m128i e1 = _mm_add_epi32(d2, d3);
  const __m128i f0 = _mm_add_epi32(e0, e1);
  uint16_t tmp[8];
  _mm_storeu_si128((__m128i*)tmp, f0);
  // Each 16-bit lane covers two adjacent columns: two lanes per 4x4 block.
  dc[0] = tmp[0] + tmp[1];
  dc[1] = tmp[2] + tmp[3];
  dc[2] = tmp[4] + tmp[5];
  dc[3] = tmp[6] + tmp[7];
}

//------------------------------------------------------------------------------
// Texture distortion
//
// We try to match the spectral content (weighted) between source and
// reconstructed samples.

// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
// More precisely, the value returned is the weighted sum computed for inA
// minus the weighted sum computed for inB (both 4x4 blocks, rows BPS apart).
static int TTransform_SSE2(const uint8_t* WEBP_RESTRICT inA,
                           const uint8_t* WEBP_RESTRICT inB,
                           const uint16_t* WEBP_RESTRICT const w) {
  int32_t sum[4];
  __m128i tmp_0, tmp_1, tmp_2, tmp_3;
  const __m128i zero = _mm_setzero_si128();

  // Load and combine inputs.
  {
    // 8 bytes are loaded per row; only the first 4 of each are used.
    const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]);
    const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]);
    const __m128i inA_2 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 2]);
    const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]);
    const __m128i inB_0 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 0]);
    const __m128i inB_1 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 1]);
    const __m128i inB_2 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 2]);
    const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]);

    // Combine inA and inB (we'll do two transforms in parallel).
    const __m128i inAB_0 = _mm_unpacklo_epi32(inA_0, inB_0);
    const __m128i inAB_1 = _mm_unpacklo_epi32(inA_1, inB_1);
    const __m128i inAB_2 = _mm_unpacklo_epi32(inA_2, inB_2);
    const __m128i inAB_3 = _mm_unpacklo_epi32(inA_3, inB_3);
    tmp_0 = _mm_unpacklo_epi8(inAB_0, zero);
    tmp_1 = _mm_unpacklo_epi8(inAB_1, zero);
    tmp_2 = _mm_unpacklo_epi8(inAB_2, zero);
    tmp_3 = _mm_unpacklo_epi8(inAB_3, zero);
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33
  }

  // Vertical pass first to avoid a transpose (vertical and horizontal passes
  // are commutative because w/kWeightY is symmetric) and subsequent transpose.
  {
    // Calculate a and b (two 4x4 at once).
    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
    const __m128i b0 = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33

    // Transpose the two 4x4.
    VP8Transpose_2_4x4_16b(&b0, &b1, &b2, &b3, &tmp_0, &tmp_1, &tmp_2, &tmp_3);
  }

  // Horizontal pass and difference of weighted sums.
  {
    // Load all inputs.
    const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);
    const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]);

    // Calculate a and b (two 4x4 at once).
    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
    const __m128i b0 = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);

    // Separate the transforms of inA and inB.
    __m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
    __m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
    __m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
    __m128i B_b2 = _mm_unpackhi_epi64(b2, b3);

    {
      // abs(v) is computed as max(v, -v) on signed 16-bit lanes.
      const __m128i d0 = _mm_sub_epi16(zero, A_b0);
      const __m128i d1 = _mm_sub_epi16(zero, A_b2);
      const __m128i d2 = _mm_sub_epi16(zero, B_b0);
      const __m128i d3 = _mm_sub_epi16(zero, B_b2);
      A_b0 = _mm_max_epi16(A_b0, d0);  // abs(v), 16b
      A_b2 = _mm_max_epi16(A_b2, d1);
      B_b0 = _mm_max_epi16(B_b0, d2);
      B_b2 = _mm_max_epi16(B_b2, d3);
    }

    // weighted sums
    A_b0 = _mm_madd_epi16(A_b0, w_0);
    A_b2 = _mm_madd_epi16(A_b2, w_8);
    B_b0 = _mm_madd_epi16(B_b0, w_0);
    B_b2 = _mm_madd_epi16(B_b2, w_8);
    A_b0 = _mm_add_epi32(A_b0, A_b2);
    B_b0 = _mm_add_epi32(B_b0, B_b2);

    // difference of weighted sums
    A_b0 = _mm_sub_epi32(A_b0, B_b0);
    _mm_storeu_si128((__m128i*)&sum[0], A_b0);
  }
  return sum[0] + sum[1] + sum[2] + sum[3];
}

// Texture distortion between two 4x4 blocks: absolute value of the
// TTransform_SSE2() difference, scaled down by 32 (>> 5).
static int Disto4x4_SSE2(const uint8_t* WEBP_RESTRICT const a,
                         const uint8_t* WEBP_RESTRICT const b,
                         const uint16_t* WEBP_RESTRICT const w) {
  const int diff_sum = TTransform_SSE2(a, b, w);
  return abs(diff_sum) >> 5;
}

// Texture distortion over a 16x16 area: sum of Disto4x4_SSE2() over its
// sixteen 4x4 sub-blocks.
static int Disto16x16_SSE2(const uint8_t* WEBP_RESTRICT const a,
                           const uint8_t* WEBP_RESTRICT const b,
                           const uint16_t* WEBP_RESTRICT const w) {
  int D = 0;
  int x, y;
  // 'y' is a byte offset (row index times BPS), 'x' a column offset.
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
      D += Disto4x4_SSE2(a + x + y, b + x + y, w);
    }
  }
  return D;
}

//------------------------------------------------------------------------------
// Quantization
//

// Quantizes the 16 coefficients of 'in' with the matrix 'mtx'.
//  - 'sharpen' (may be NULL) is added to the absolute coefficients first.
//  - 'out' receives the quantized levels in zigzag order.
//  - 'in' is overwritten with the dequantized values (level * q), in the
//    original (non-zigzag) order.
// Returns 1 if at least one quantized level is non-zero, 0 otherwise.
static WEBP_INLINE int DoQuantizeBlock_SSE2(
    int16_t in[16], int16_t out[16],
    const uint16_t* WEBP_RESTRICT const sharpen,
    const VP8Matrix* WEBP_RESTRICT const mtx) {
  const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
  const __m128i zero = _mm_setzero_si128();
  __m128i coeff0, coeff8;
  __m128i out0, out8;
  __m128i packed_out;

  // Load all inputs.
  __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
  __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
  const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq[0]);
  const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq[8]);
  const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q[0]);
  const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q[8]);

  // extract sign(in)  (0x0000 if positive, 0xffff if negative)
  const __m128i sign0 = _mm_cmpgt_epi16(zero, in0);
  const __m128i sign8 = _mm_cmpgt_epi16(zero, in8);

  // coeff = abs(in) = (in ^ sign) - sign
  coeff0 = _mm_xor_si128(in0, sign0);
  coeff8 = _mm_xor_si128(in8, sign8);
  coeff0 = _mm_sub_epi16(coeff0, sign0);
  coeff8 = _mm_sub_epi16(coeff8, sign8);

  // coeff = abs(in) + sharpen
  if (sharpen != NULL) {
    const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]);
    const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]);
    coeff0 = _mm_add_epi16(coeff0, sharpen0);
    coeff8 = _mm_add_epi16(coeff8, sharpen8);
  }

  // out = (coeff * iQ + B) >> QFIX
  {
    // doing calculations with 32b precision (QFIX=17)
    // out = (coeff * iQ)
    // The high and low 16-bit halves of each product are computed separately
    // and interleaved to rebuild the full 32-bit products.
    const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
    const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
    const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
    const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
    __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
    __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
    // out = (coeff * iQ + B)
    const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias[0]);
    const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias[4]);
    const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias[8]);
    const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias[12]);
    out_00 = _mm_add_epi32(out_00, bias_00);
    out_04 = _mm_add_epi32(out_04, bias_04);
    out_08 = _mm_add_epi32(out_08, bias_08);
    out_12 = _mm_add_epi32(out_12, bias_12);
    // out = QUANTDIV(coeff, iQ, B, QFIX)
    out_00 = _mm_srai_epi32(out_00, QFIX);
    out_04 = _mm_srai_epi32(out_04, QFIX);
    out_08 = _mm_srai_epi32(out_08, QFIX);
    out_12 = _mm_srai_epi32(out_12, QFIX);

    // pack result as 16b
    out0 = _mm_packs_epi32(out_00, out_04);
    out8 = _mm_packs_epi32(out_08, out_12);

    // if (coeff > 2047) coeff = 2047
    out0 = _mm_min_epi16(out0, max_coeff_2047);
    out8 = _mm_min_epi16(out8, max_coeff_2047);
  }

  // get sign back (if (sign[j]) out_n = -out_n)
  out0 = _mm_xor_si128(out0, sign0);
  out8 = _mm_xor_si128(out8, sign8);
  out0 = _mm_sub_epi16(out0, sign0);
  out8 = _mm_sub_epi16(out8, sign8);

  // in = out * Q
  in0 = _mm_mullo_epi16(out0, q0);
  in8 = _mm_mullo_epi16(out8, q8);

  _mm_storeu_si128((__m128i*)&in[0], in0);
  _mm_storeu_si128((__m128i*)&in[8], in8);

  // zigzag the output before storing it.
  //
  // The zigzag pattern can almost be reproduced with a small sequence of
  // shuffles. After it, we only need to swap the 7th (ending up in third
  // position instead of twelfth) and 8th values.
  {
    __m128i outZ0, outZ8;
    outZ0 = _mm_shufflehi_epi16(out0, _MM_SHUFFLE(2, 1, 3, 0));
    outZ0 = _mm_shuffle_epi32(outZ0, _MM_SHUFFLE(3, 1, 2, 0));
    outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2));
    outZ8 = _mm_shufflelo_epi16(out8, _MM_SHUFFLE(3, 0, 2, 1));
    outZ8 = _mm_shuffle_epi32(outZ8, _MM_SHUFFLE(3, 1, 2, 0));
    outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0));
    _mm_storeu_si128((__m128i*)&out[0], outZ0);
    _mm_storeu_si128((__m128i*)&out[8], outZ8);
    // Taken before the swap below: the set of values is the same, which is
    // all the zero test at the end needs.
    packed_out = _mm_packs_epi16(outZ0, outZ8);
  }
  {
    // Finish the zigzag: exchange the entries at positions 3 and 12.
    const int16_t outZ_12 = out[12];
    const int16_t outZ_3 = out[3];
    out[3] = outZ_12;
    out[12] = outZ_3;
  }

  // detect if all 'out' values are zeroes or not
  return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
}

// Quantizes one block, using the sharpening table stored in 'mtx'.
static int QuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
                              const VP8Matrix* WEBP_RESTRICT const mtx) {
  return DoQuantizeBlock_SSE2(in, out, &mtx->sharpen[0], mtx);
}

// Quantizes one block without sharpening (NULL sharpen table).
static int QuantizeBlockWHT_SSE2(int16_t in[16], int16_t out[16],
                                 const VP8Matrix* WEBP_RESTRICT const mtx) {
  return DoQuantizeBlock_SSE2(in, out, NULL, mtx);
}

// Quantizes two consecutive blocks of 16 coefficients. Bit 0 of the result is
// the non-zero flag of the first block, bit 1 that of the second.
static int Quantize2Blocks_SSE2(int16_t in[32], int16_t out[32],
                                const VP8Matrix* WEBP_RESTRICT const mtx) {
  int nz;
  const uint16_t* const sharpen = &mtx->sharpen[0];
  nz = DoQuantizeBlock_SSE2(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
  nz |= DoQuantizeBlock_SSE2(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
  return nz;
}

//------------------------------------------------------------------------------
// Entry point

extern void VP8EncDspInitSSE2(void);

// Points the encoder's DSP function pointers at the SSE2 implementations
// defined in this file.
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) {
  VP8CollectHistogram = CollectHistogram_SSE2;
  VP8EncPredLuma16 = Intra16Preds_SSE2;
  VP8EncPredChroma8 = IntraChromaPreds_SSE2;
  VP8EncPredLuma4 = Intra4Preds_SSE2;
  VP8EncQuantizeBlock = QuantizeBlock_SSE2;
  VP8EncQuantize2Blocks = Quantize2Blocks_SSE2;
  VP8EncQuantizeBlockWHT = QuantizeBlockWHT_SSE2;
  VP8ITransform = ITransform_SSE2;
  VP8FTransform = FTransform_SSE2;
  VP8FTransform2 = FTransform2_SSE2;
  VP8FTransformWHT = FTransformWHT_SSE2;
  VP8SSE16x16 = SSE16x16_SSE2;
  VP8SSE16x8 = SSE16x8_SSE2;
  VP8SSE8x8 = SSE8x8_SSE2;
  VP8SSE4x4 = SSE4x4_SSE2;
  VP8TDisto4x4 = Disto4x4_SSE2;
  VP8TDisto16x16 = Disto16x16_SSE2;
  VP8Mean16x4 = Mean16x4_SSE2;
}

#else  // !WEBP_USE_SSE2

WEBP_DSP_INIT_STUB(VP8EncDspInitSSE2)

#endif  // WEBP_USE_SSE2