enc.c (27006B)
1 // Copyright 2011 Google Inc. All Rights Reserved. 2 // 3 // Use of this source code is governed by a BSD-style license 4 // that can be found in the COPYING file in the root of the source 5 // tree. An additional intellectual property rights grant can be found 6 // in the file PATENTS. All contributing project authors may 7 // be found in the AUTHORS file in the root of the source tree. 8 // ----------------------------------------------------------------------------- 9 // 10 // Speed-critical encoding functions. 11 // 12 // Author: Skal (pascal.massimino@gmail.com) 13 14 #include <assert.h> 15 #include <stdlib.h> // for abs() 16 #include <string.h> 17 18 #include "src/dsp/cpu.h" 19 #include "src/dsp/dsp.h" 20 #include "src/enc/vp8i_enc.h" 21 #include "src/utils/utils.h" 22 #include "src/webp/types.h" 23 24 static WEBP_INLINE uint8_t clip_8b(int v) { 25 return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255; 26 } 27 28 #if !WEBP_NEON_OMIT_C_CODE 29 static WEBP_INLINE int clip_max(int v, int max) { 30 return (v > max) ? max : v; 31 } 32 #endif // !WEBP_NEON_OMIT_C_CODE 33 34 //------------------------------------------------------------------------------ 35 // Compute susceptibility based on DCT-coeff histograms: 36 // the higher, the "easier" the macroblock is to compress. 

// Offset, inside a buffer whose row stride is BPS, of the top-left sample of
// each 4x4 sub-block: 16 luma entries in raster order, followed by 4 entries
// for U and 4 entries for V (the V entries are the U ones shifted by 8
// columns).
const int VP8DspScan[16 + 4 + 4] = {
  // Luma
  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,

  0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
  8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
};

// general-purpose util function
// Summarizes 'distribution' (MAX_COEFF_THRESH + 1 bins) into 'histo':
//  - max_value: the largest bin count (0 if no bin is positive),
//  - last_non_zero: index of the last bin with a positive count. It keeps its
//    initial value of 1 when no bin is positive.
void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
                         VP8Histogram* const histo) {
  int max_value = 0, last_non_zero = 1;
  int k;
  for (k = 0; k <= MAX_COEFF_THRESH; ++k) {
    const int value = distribution[k];
    if (value > 0) {
      if (value > max_value) max_value = value;
      last_non_zero = k;
    }
  }
  histo->max_value = max_value;
  histo->last_non_zero = last_non_zero;
}

#if !WEBP_NEON_OMIT_C_CODE
// For each 4x4 block j in [start_block, end_block), runs VP8FTransform() on
// the samples of 'ref' and 'pred' located at VP8DspScan[j], then bins the 16
// resulting coefficients by magnitude. The accumulated bins are summarized
// into 'histo' by VP8SetHistogramData().
static void CollectHistogram_C(const uint8_t* WEBP_RESTRICT ref,
                               const uint8_t* WEBP_RESTRICT pred,
                               int start_block, int end_block,
                               VP8Histogram* WEBP_RESTRICT const histo) {
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
    int k;
    int16_t out[16];

    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);

    // Convert coefficients to bin.
    // The bin index is |coeff| / 8, saturated to the last bin.
    for (k = 0; k < 16; ++k) {
      const int v = abs(out[k]) >> 3;
      const int clipped_value = clip_max(v, MAX_COEFF_THRESH);
      ++distribution[clipped_value];
    }
  }
  VP8SetHistogramData(distribution, histo);
}
#endif  // !WEBP_NEON_OMIT_C_CODE

//------------------------------------------------------------------------------
// run-time tables (~4k)

// Lookup table such that clip1[255 + i] == clip_8b(i) for i in [-255, 510].
static uint8_t clip1[255 + 510 + 1];    // clips [-255,510] to [0,255]

// We declare this variable 'volatile' to prevent instruction reordering
// and make sure it's set to true _last_ (so as to be thread-safe)
// NOTE(review): 'volatile' by itself gives no inter-thread ordering guarantee
// in C11; presumably the caller serializes initialization -- confirm.
static volatile int tables_ok = 0;

// Fills clip1[] once; later calls return immediately once tables_ok is set.
static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
  if (!tables_ok) {
    int i;
    for (i = -255; i <= 255 + 255; ++i) {
      clip1[255 + i] = clip_8b(i);
    }
    tables_ok = 1;
  }
}


//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)

#if !WEBP_NEON_OMIT_C_CODE

// Writes sample (x, y) of 'dst': the matching 'ref' sample plus the transform
// output 'v' scaled down by 8, saturated to [0, 255].
#define STORE(x, y, v) \
  dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))

// Inverse-transforms the 16 coefficients of 'in', adds the result to the 4x4
// samples of 'ref' and stores the saturated sum into 'dst'. Both 'ref' and
// 'dst' are addressed with a row stride of BPS.
static WEBP_INLINE void ITransformOne(const uint8_t* WEBP_RESTRICT ref,
                                      const int16_t* WEBP_RESTRICT in,
                                      uint8_t* WEBP_RESTRICT dst) {
  int C[4 * 4], *tmp;
  int i;
  tmp = C;
  // Each iteration consumes in[0], in[4], in[8], in[12] (then 'in' advances by
  // one) and writes four consecutive entries of C[]: the intermediate result
  // is stored transposed.
  for (i = 0; i < 4; ++i) {    // vertical pass
    const int a = in[0] + in[8];
    const int b = in[0] - in[8];
    const int c =
        WEBP_TRANSFORM_AC3_MUL2(in[4]) - WEBP_TRANSFORM_AC3_MUL1(in[12]);
    const int d =
        WEBP_TRANSFORM_AC3_MUL1(in[4]) + WEBP_TRANSFORM_AC3_MUL2(in[12]);
    tmp[0] = a + d;
    tmp[1] = b + c;
    tmp[2] = b - c;
    tmp[3] = a - d;
    tmp += 4;
    in++;
  }

  tmp = C;
  for (i = 0; i < 4; ++i) {    // horizontal pass
    // '+ 4' is the rounding term for the '>> 3' performed inside STORE().
    const int dc = tmp[0] + 4;
    const int a =  dc +  tmp[8];
    const int b =  dc -  tmp[8];
    const int c =
        WEBP_TRANSFORM_AC3_MUL2(tmp[4]) - WEBP_TRANSFORM_AC3_MUL1(tmp[12]);
    const int d =
        WEBP_TRANSFORM_AC3_MUL1(tmp[4]) + WEBP_TRANSFORM_AC3_MUL2(tmp[12]);
    STORE(0, i, a + d);
    STORE(1, i, b + c);
    STORE(2, i, b - c);
    STORE(3, i, a - d);
    tmp++;
  }
}

// Inverse-transforms one 4x4 block, or two when 'do_two' is non-zero. The
// second block uses the next 16 coefficients and the samples 4 columns to the
// right.
static void ITransform_C(const uint8_t* WEBP_RESTRICT ref,
                         const int16_t* WEBP_RESTRICT in,
                         uint8_t* WEBP_RESTRICT dst,
                         int do_two) {
  ITransformOne(ref, in, dst);
  if (do_two) {
    ITransformOne(ref + 4, in + 16, dst + 4);
  }
}

// Forward transform of the 4x4 difference 'src - ref' (both read with a row
// stride of BPS) into the 16 coefficients of 'out'. The first loop processes
// each row into tmp[]; the second loop combines tmp[] column-wise.
static void FTransform_C(const uint8_t* WEBP_RESTRICT src,
                         const uint8_t* WEBP_RESTRICT ref,
                         int16_t* WEBP_RESTRICT out) {
  int i;
  int tmp[16];
  for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
    const int d0 = src[0] - ref[0];   // 9bit dynamic range ([-255,255])
    const int d1 = src[1] - ref[1];
    const int d2 = src[2] - ref[2];
    const int d3 = src[3] - ref[3];
    const int a0 = (d0 + d3);         // 10b                      [-510,510]
    const int a1 = (d1 + d2);
    const int a2 = (d1 - d2);
    const int a3 = (d0 - d3);
    tmp[0 + i * 4] = (a0 + a1) * 8;   // 14b                      [-8160,8160]
    tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9;      // [-7536,7542]
    tmp[2 + i * 4] = (a0 - a1) * 8;
    tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 +  937) >> 9;
  }
  for (i = 0; i < 4; ++i) {
    const int a0 = (tmp[0 + i] + tmp[12 + i]);  // 15b
    const int a1 = (tmp[4 + i] + tmp[ 8 + i]);
    const int a2 = (tmp[4 + i] - tmp[ 8 + i]);
    const int a3 = (tmp[0 + i] - tmp[12 + i]);
    out[0 + i] = (a0 + a1 + 7) >> 4;            // 12b
    out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
    out[8 + i] = (a0 - a1 + 7) >> 4;
    out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
  }
}
#endif  // !WEBP_NEON_OMIT_C_CODE

// Forward-transforms two horizontally adjacent 4x4 blocks through the
// VP8FTransform function pointer; the second block's coefficients are written
// at out + 16.
static void FTransform2_C(const uint8_t* WEBP_RESTRICT src,
                          const uint8_t* WEBP_RESTRICT ref,
                          int16_t* WEBP_RESTRICT out) {
  VP8FTransform(src, ref, out);
  VP8FTransform(src + 4, ref + 4, out + 16);
}

#if !WEBP_NEON_OMIT_C_CODE
// Two-pass transform built only from additions and subtractions, producing 16
// values in 'out' (each halved by the final '>> 1').
// The input is sampled with a stride of 16 int16_t values: in[0], in[16],
// in[32], in[48], then 'in' advances by 64 for the next row.
// NOTE(review): presumably these are the first coefficients of 16 consecutive
// 16-coefficient blocks -- confirm against the caller.
static void FTransformWHT_C(const int16_t* WEBP_RESTRICT in,
                            int16_t* WEBP_RESTRICT out) {
  // input is 12b signed
  int32_t tmp[16];
  int i;
  for (i = 0; i < 4; ++i, in += 64) {
    const int a0 = (in[0 * 16] + in[2 * 16]);  // 13b
    const int a1 = (in[1 * 16] + in[3 * 16]);
    const int a2 = (in[1 * 16] - in[3 * 16]);
    const int a3 = (in[0 * 16] - in[2 * 16]);
    tmp[0 + i * 4] = a0 + a1;   // 14b
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  for (i = 0; i < 4; ++i) {
    const int a0 = (tmp[0 + i] + tmp[8 + i]);  // 15b
    const int a1 = (tmp[4 + i] + tmp[12+ i]);
    const int a2 = (tmp[4 + i] - tmp[12+ i]);
    const int a3 = (tmp[0 + i] - tmp[8 + i]);
    const int b0 = a0 + a1;    // 16b
    const int b1 = a3 + a2;
    const int b2 = a3 - a2;
    const int b3 = a0 - a1;
    out[ 0 + i] = b0 >> 1;     // 15b
    out[ 4 + i] = b1 >> 1;
    out[ 8 + i] = b2 >> 1;
    out[12 + i] = b3 >> 1;
  }
}
#endif  // !WEBP_NEON_OMIT_C_CODE

#undef STORE

//------------------------------------------------------------------------------
// Intra predictions

// Sets a size x size square of 'dst' (row stride BPS) to the byte 'value'.
static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
  int j;
  for (j = 0; j < size; ++j) {
    memset(dst + j * BPS, value, size);
  }
}

// Vertical prediction: every row of the size x size block is a copy of the
// 'size' samples at 'top'. When 'top' is NULL the block is filled with 127.
static WEBP_INLINE void VerticalPred(uint8_t* WEBP_RESTRICT dst,
                                     const uint8_t* WEBP_RESTRICT top,
                                     int size) {
  int j;
  if (top != NULL) {
    for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size);
  } else {
    Fill(dst, 127, size);
  }
}

// Horizontal prediction: row j of the size x size block is filled with
// left[j]. When 'left' is NULL the block is filled with 129.
static WEBP_INLINE void HorizontalPred(uint8_t* WEBP_RESTRICT dst,
                                       const uint8_t* WEBP_RESTRICT left,
                                       int size) {
  if (left != NULL) {
    int j;
    for (j = 0; j < size; ++j) {
      memset(dst + j * BPS, left[j], size);
    }
  } else {
    Fill(dst, 129, size);
  }
}

// TrueMotion prediction. With both neighbours available, sample (x, y) is
// top[x] + left[y] - left[-1] saturated to [0, 255]; the saturation is done
// through the clip1[] table, so InitTables() must have run beforehand.
// With only one neighbour (or none) it falls back to simpler predictors, see
// the comments in the body.
static WEBP_INLINE void TrueMotion(uint8_t* WEBP_RESTRICT dst,
                                   const uint8_t* WEBP_RESTRICT left,
                                   const uint8_t* WEBP_RESTRICT top, int size) {
  int y;
  if (left != NULL) {
    if (top != NULL) {
      // left[-1] is read as the top-left sample. 'clip' is positioned so that
      // clip[left[y] + top[x]] == clip1[255 + top[x] + left[y] - left[-1]].
      const uint8_t* const clip = clip1 + 255 - left[-1];
      for (y = 0; y < size; ++y) {
        const uint8_t* const clip_table = clip + left[y];
        int x;
        for (x = 0; x < size; ++x) {
          dst[x] = clip_table[top[x]];
        }
        dst += BPS;
      }
    } else {
      HorizontalPred(dst, left, size);
    }
  } else {
    // true motion without left samples (hence: with default 129 value)
    // is equivalent to VE prediction where you just copy the top samples.
    // Note that if top samples are not available, the default value is
    // then 129, and not 127 as in the VerticalPred case.
    if (top != NULL) {
      VerticalPred(dst, top, size);
    } else {
      Fill(dst, 129, size);
    }
  }
}

// DC prediction: fills the size x size block with a single value computed as
// (sum + round) >> shift, where 'sum' adds the 'size' top samples and the
// 'size' left samples. If only one of the two neighbours is available, its sum
// is counted twice. If neither is available, the value is 0x80.
static WEBP_INLINE void DCMode(uint8_t* WEBP_RESTRICT dst,
                               const uint8_t* WEBP_RESTRICT left,
                               const uint8_t* WEBP_RESTRICT top,
                               int size, int round, int shift) {
  int DC = 0;
  int j;
  if (top != NULL) {
    for (j = 0; j < size; ++j) DC += top[j];
    if (left != NULL) {   // top and left present
      for (j = 0; j < size; ++j) DC += left[j];
    } else {      // top, but no left
      DC += DC;
    }
    DC = (DC + round) >> shift;
  } else if (left != NULL) {   // left but no top
    for (j = 0; j < size; ++j) DC += left[j];
    DC += DC;
    DC = (DC + round) >> shift;
  } else {   // no top, no left, nothing.
    DC = 0x80;
  }
  Fill(dst, DC, size);
}

//------------------------------------------------------------------------------
// Chroma 8x8 prediction (paragraph 12.2)

// Writes the four 8x8 predictions (DC, vertical, horizontal, TrueMotion) for
// the U block and then for the V block, at offsets C8DC8, C8VE8, C8HE8 and
// C8TM8 from 'dst'. For V, 'dst' and 'top' advance by 8 and 'left' by 16.
// 'left' and/or 'top' may be NULL.
static void IntraChromaPreds_C(uint8_t* WEBP_RESTRICT dst,
                               const uint8_t* WEBP_RESTRICT left,
                               const uint8_t* WEBP_RESTRICT top) {
  // U block
  DCMode(C8DC8 + dst, left, top, 8, 8, 4);
  VerticalPred(C8VE8 + dst, top, 8);
  HorizontalPred(C8HE8 + dst, left, 8);
  TrueMotion(C8TM8 + dst, left, top, 8);
  // V block
  dst += 8;
  if (top != NULL) top += 8;
  if (left != NULL) left += 16;
  DCMode(C8DC8 + dst, left, top, 8, 8, 4);
  VerticalPred(C8VE8 + dst, top, 8);
  HorizontalPred(C8HE8 + dst, left, 8);
  TrueMotion(C8TM8 + dst, left, top, 8);
}

//------------------------------------------------------------------------------
// luma 16x16 prediction (paragraph 12.3)

#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64
// Writes the four 16x16 predictions (DC, vertical, horizontal, TrueMotion) at
// offsets I16DC16, I16VE16, I16HE16 and I16TM16 from 'dst'.
// 'left' and/or 'top' may be NULL.
static void Intra16Preds_C(uint8_t* WEBP_RESTRICT dst,
                           const uint8_t* WEBP_RESTRICT left,
                           const uint8_t* WEBP_RESTRICT top) {
  DCMode(I16DC16 + dst, left, top, 16, 16, 5);
  VerticalPred(I16VE16 + dst, top, 16);
  HorizontalPred(I16HE16 + dst, left, 16);
  TrueMotion(I16TM16 + dst, left, top, 16);
}
#endif  // !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64

//------------------------------------------------------------------------------
// luma 4x4 prediction

#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 || BPS != 32

// DST(x, y): sample at column x, row y of the 4x4 destination block.
// AVG3 / AVG2: rounded 3-tap (1,2,1)/4 and 2-tap (1,1)/2 averages.
// In the predictors below, the local names follow the sample layout:
// A..H = top[0..7], X = top[-1], and I, J, K, L = top[-2], top[-3], top[-4],
// top[-5].
#define DST(x, y) dst[(x) + (y) * BPS]
#define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
#define AVG2(a, b) (((a) + (b) + 1) >> 1)

// vertical
// Every row is the same 4 values: the 3-tap average of top[-1..4].
static void VE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const uint8_t vals[4] = {
    AVG3(top[-1], top[0], top[1]),
    AVG3(top[ 0], top[1], top[2]),
    AVG3(top[ 1], top[2], top[3]),
    AVG3(top[ 2], top[3], top[4])
  };
  int i;
  for (i = 0; i < 4; ++i) {
    memcpy(dst + i * BPS, vals, 4);
  }
}

// horizontal
// Each row is one 3-tap averaged value replicated over 4 bytes (the
// multiplication by 0x01010101U copies the byte into all four lanes).
static void HE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
  const int L = top[-5];
  WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J));
  WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K));
  WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L));
  WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
}

// Fills the block with the rounded mean of the 8 samples top[0..3] and
// top[-5..-2] ('dc' starts at 4, the rounding term for '>> 3').
static void DC4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  uint32_t dc = 4;
  int i;
  for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
  Fill(dst, dc >> 3, 4);
}

// All samples on a same down-right diagonal (x - y constant) share one 3-tap
// average taken from the L..I, X, A..D sequence.
static void RD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
  const int L = top[-5];
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];
  const int D = top[3];
  DST(0, 3)                                     = AVG3(J, K, L);
  DST(0, 2) = DST(1, 3)                         = AVG3(I, J, K);
  DST(0, 1) = DST(1, 2) = DST(2, 3)             = AVG3(X, I, J);
  DST(0, 0) = DST(1, 1) = DST(2, 2) = DST(3, 3) = AVG3(A, X, I);
  DST(1, 0) = DST(2, 1) = DST(3, 2)             = AVG3(B, A, X);
  DST(2, 0) = DST(3, 1)                         = AVG3(C, B, A);
  DST(3, 0)                                     = AVG3(D, C, B);
}

// All samples on a same anti-diagonal (x + y constant) share one 3-tap average
// taken from top[0..7].
static void LD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];
  const int D = top[3];
  const int E = top[4];
  const int F = top[5];
  const int G = top[6];
  const int H = top[7];
  DST(0, 0)                                     = AVG3(A, B, C);
  DST(1, 0) = DST(0, 1)                         = AVG3(B, C, D);
  DST(2, 0) = DST(1, 1) = DST(0, 2)             = AVG3(C, D, E);
  DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
  DST(3, 1) = DST(2, 2) = DST(1, 3)             = AVG3(E, F, G);
  DST(3, 2) = DST(2, 3)                         = AVG3(F, G, H);
  DST(3, 3)                                     = AVG3(G, H, H);
}

// Mixes 2-tap averages (rows 0 and 2) and 3-tap averages (rows 1 and 3) of the
// K, J, I, X, A, B, C, D samples.
static void VR4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];
  const int D = top[3];
  DST(0, 0) = DST(1, 2) = AVG2(X, A);
  DST(1, 0) = DST(2, 2) = AVG2(A, B);
  DST(2, 0) = DST(3, 2) = AVG2(B, C);
  DST(3, 0)             = AVG2(C, D);

  DST(0, 3) =             AVG3(K, J, I);
  DST(0, 2) =             AVG3(J, I, X);
  DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
  DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
  DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
  DST(3, 1) =             AVG3(B, C, D);
}

// Mixes 2-tap and 3-tap averages of top[0..7] only (H is read solely by the
// last assignment).
static void VL4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];
  const int D = top[3];
  const int E = top[4];
  const int F = top[5];
  const int G = top[6];
  const int H = top[7];
  DST(0, 0) =             AVG2(A, B);
  DST(1, 0) = DST(0, 2) = AVG2(B, C);
  DST(2, 0) = DST(1, 2) = AVG2(C, D);
  DST(3, 0) = DST(2, 2) = AVG2(D, E);

  DST(0, 1) =             AVG3(A, B, C);
  DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
  DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
  DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
              DST(3, 2) = AVG3(E, F, G);
              DST(3, 3) = AVG3(F, G, H);
}

// Uses only the I, J, K, L samples; the last row and the right half of row 2
// are set to L unfiltered.
static void HU4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
  const int L = top[-5];
  DST(0, 0) =             AVG2(I, J);
  DST(2, 0) = DST(0, 1) = AVG2(J, K);
  DST(2, 1) = DST(0, 2) = AVG2(K, L);
  DST(1, 0) =             AVG3(I, J, K);
  DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
  DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
  DST(3, 2) = DST(2, 2) =
    DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
}

// Mixes 2-tap averages (column 0, reused in column 2) and 3-tap averages of
// the L, K, J, I, X, A, B, C samples.
static void HD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
  const int L = top[-5];
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];

  DST(0, 0) = DST(2, 1) = AVG2(I, X);
  DST(0, 1) = DST(2, 2) = AVG2(J, I);
  DST(0, 2) = DST(2, 3) = AVG2(K, J);
  DST(0, 3)             = AVG2(L, K);

  DST(3, 0)             = AVG3(A, B, C);
  DST(2, 0)             = AVG3(X, A, B);
  DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
  DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
  DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
  DST(1, 3)             = AVG3(L, K, J);
}

// 4x4 TrueMotion: sample (x, y) is top[x] + top[-2 - y] - top[-1], saturated
// to [0, 255] through the clip1[] table (InitTables() must have run).
static void TM4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int x, y;
  const uint8_t* const clip = clip1 + 255 - top[-1];
  for (y = 0; y < 4; ++y) {
    const uint8_t* const clip_table = clip + top[-2 - y];
    for (x = 0; x < 4; ++x) {
      dst[x] = clip_table[top[x]];
    }
    dst += BPS;
  }
}

#undef DST
#undef AVG3
#undef AVG2

// Left samples are top[-5 ..
// -2], top_left is top[-1], top are
// located at top[0..3], and top right is top[4..7]
// Writes the ten 4x4 luma predictions, each at its own offset (I4DC4, I4TM4,
// I4VE4, ...) from 'dst', all computed from the same 'top' context.
static void Intra4Preds_C(uint8_t* WEBP_RESTRICT dst,
                          const uint8_t* WEBP_RESTRICT top) {
  DC4(I4DC4 + dst, top);
  TM4(I4TM4 + dst, top);
  VE4(I4VE4 + dst, top);
  HE4(I4HE4 + dst, top);
  RD4(I4RD4 + dst, top);
  VR4(I4VR4 + dst, top);
  LD4(I4LD4 + dst, top);
  VL4(I4VL4 + dst, top);
  HD4(I4HD4 + dst, top);
  HU4(I4HU4 + dst, top);
}

#endif  // !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 || BPS != 32

//------------------------------------------------------------------------------
// Metric

#if !WEBP_NEON_OMIT_C_CODE
// Returns the sum of squared differences between the w x h areas of 'a' and
// 'b', both read with a row stride of BPS.
static WEBP_INLINE int GetSSE(const uint8_t* WEBP_RESTRICT a,
                              const uint8_t* WEBP_RESTRICT b,
                              int w, int h) {
  int count = 0;
  int y, x;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      const int diff = (int)a[x] - b[x];
      count += diff * diff;
    }
    a += BPS;
    b += BPS;
  }
  return count;
}

// Fixed-size wrappers around GetSSE(); the name gives width x height.
static int SSE16x16_C(const uint8_t* WEBP_RESTRICT a,
                      const uint8_t* WEBP_RESTRICT b) {
  return GetSSE(a, b, 16, 16);
}
static int SSE16x8_C(const uint8_t* WEBP_RESTRICT a,
                     const uint8_t* WEBP_RESTRICT b) {
  return GetSSE(a, b, 16, 8);
}
static int SSE8x8_C(const uint8_t* WEBP_RESTRICT a,
                    const uint8_t* WEBP_RESTRICT b) {
  return GetSSE(a, b, 8, 8);
}
static int SSE4x4_C(const uint8_t* WEBP_RESTRICT a,
                    const uint8_t* WEBP_RESTRICT b) {
  return GetSSE(a, b, 4, 4);
}
#endif  // !WEBP_NEON_OMIT_C_CODE

// For each of the four horizontally adjacent 4x4 blocks starting at 'ref',
// stores into dc[k] the sum of the block's 16 samples. Note that the value
// stored is the plain sum: no division is performed here.
static void Mean16x4_C(const uint8_t* WEBP_RESTRICT ref, uint32_t dc[4]) {
  int k, x, y;
  for (k = 0; k < 4; ++k) {
    uint32_t avg = 0;
    for (y = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x) {
        avg += ref[x + y * BPS];
      }
    }
    dc[k] = avg;
    ref += 4;   // go to next 4x4 block.
  }
}

//------------------------------------------------------------------------------
// Texture distortion
//
// We try to match the spectral content (weighted) between source and
// reconstructed samples.

#if !WEBP_NEON_OMIT_C_CODE
// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
// 'in' is a 4x4 block of samples read with a row stride of BPS.
static int TTransform(const uint8_t* WEBP_RESTRICT in,
                      const uint16_t* WEBP_RESTRICT w) {
  int sum = 0;
  int tmp[16];
  int i;
  // horizontal pass
  for (i = 0; i < 4; ++i, in += BPS) {
    const int a0 = in[0] + in[2];
    const int a1 = in[1] + in[3];
    const int a2 = in[1] - in[3];
    const int a3 = in[0] - in[2];
    tmp[0 + i * 4] = a0 + a1;
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  // vertical pass
  // 'w' advances by one per column, so w[0], w[4], w[8], w[12] are the four
  // weights of column i.
  for (i = 0; i < 4; ++i, ++w) {
    const int a0 = tmp[0 + i] + tmp[8 + i];
    const int a1 = tmp[4 + i] + tmp[12+ i];
    const int a2 = tmp[4 + i] - tmp[12+ i];
    const int a3 = tmp[0 + i] - tmp[8 + i];
    const int b0 = a0 + a1;
    const int b1 = a3 + a2;
    const int b2 = a3 - a2;
    const int b3 = a0 - a1;

    sum += w[ 0] * abs(b0);
    sum += w[ 4] * abs(b1);
    sum += w[ 8] * abs(b2);
    sum += w[12] * abs(b3);
  }
  return sum;
}

// Returns the absolute difference of the TTransform() scores of 4x4 blocks 'a'
// and 'b', scaled down by 32.
static int Disto4x4_C(const uint8_t* WEBP_RESTRICT const a,
                      const uint8_t* WEBP_RESTRICT const b,
                      const uint16_t* WEBP_RESTRICT const w) {
  const int sum1 = TTransform(a, w);
  const int sum2 = TTransform(b, w);
  return abs(sum2 - sum1) >> 5;
}

// Returns the sum of Disto4x4_C() over the sixteen 4x4 blocks of a 16x16 area.
static int Disto16x16_C(const uint8_t* WEBP_RESTRICT const a,
                        const uint8_t* WEBP_RESTRICT const b,
                        const uint16_t* WEBP_RESTRICT const w) {
  int D = 0;
  int x, y;
  // 'y' is a byte offset (multiple of 4 rows), 'x' a column offset.
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
      D += Disto4x4_C(a + x + y, b + x + y, w);
    }
  }
  return D;
}
#endif  // !WEBP_NEON_OMIT_C_CODE

//------------------------------------------------------------------------------
// Quantization
//

#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
// kZigzag[n] is the position inside in[] of the n-th coefficient in zigzag
// order.
static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};

// Simple quantization
// Visits the 16 coefficients of 'in' in zigzag order. For each one, the
// magnitude plus mtx->sharpen[j] is compared against mtx->zthresh[j]:
//  - above the threshold: the level is QUANTDIV(coeff, iq, bias), capped to
//    MAX_LEVEL, with the original sign restored. out[n] receives the level and
//    in[j] is overwritten with level * q (the dequantized value).
//  - otherwise both out[n] and in[j] are set to 0.
// Returns 1 if at least one level is non-zero, 0 otherwise.
static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
                           const VP8Matrix* WEBP_RESTRICT const mtx) {
  int last = -1;
  int n;
  for (n = 0; n < 16; ++n) {
    const int j = kZigzag[n];
    const int sign = (in[j] < 0);
    const uint32_t coeff = (sign ? -in[j] : in[j]) + mtx->sharpen[j];
    if (coeff > mtx->zthresh[j]) {
      const uint32_t Q = mtx->q[j];
      const uint32_t iQ = mtx->iq[j];
      const uint32_t B = mtx->bias[j];
      int level = QUANTDIV(coeff, iQ, B);
      if (level > MAX_LEVEL) level = MAX_LEVEL;
      if (sign) level = -level;
      in[j] = level * (int)Q;
      out[n] = level;
      if (level) last = n;
    } else {
      out[n] = 0;
      in[j] = 0;
    }
  }
  return (last >= 0);
}

// Quantizes two consecutive 16-coefficient blocks through the
// VP8EncQuantizeBlock function pointer. Bit 0 of the result is the non-zero
// flag of the first block, bit 1 the flag of the second one.
static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
                             const VP8Matrix* WEBP_RESTRICT const mtx) {
  int nz;
  nz  = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
  nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
  return nz;
}
#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC

//------------------------------------------------------------------------------
// Block copy

// Copies a w x h area from 'src' to 'dst'; both use a row stride of BPS.
static WEBP_INLINE void Copy(const uint8_t* WEBP_RESTRICT src,
                             uint8_t* WEBP_RESTRICT dst, int w, int h) {
  int y;
  for (y = 0; y < h; ++y) {
    memcpy(dst, src, w);
    src += BPS;
    dst += BPS;
  }
}

// Fixed-size wrappers around Copy(); the name gives width x height.
static void Copy4x4_C(const uint8_t* WEBP_RESTRICT src,
                      uint8_t* WEBP_RESTRICT dst) {
  Copy(src, dst, 4, 4);
}

static void Copy16x8_C(const uint8_t* WEBP_RESTRICT src,
                       uint8_t* WEBP_RESTRICT dst) {
  Copy(src, dst, 16, 8);
}

//------------------------------------------------------------------------------
// Initialization

// Speed-critical function pointers. We have to initialize them to the default
// implementations within VP8EncDspInit().
VP8CHisto VP8CollectHistogram;
VP8Idct VP8ITransform;
VP8Fdct VP8FTransform;
VP8Fdct VP8FTransform2;
VP8WHT VP8FTransformWHT;
VP8Intra4Preds VP8EncPredLuma4;
VP8IntraPreds VP8EncPredLuma16;
VP8IntraPreds VP8EncPredChroma8;
VP8Metric VP8SSE16x16;
VP8Metric VP8SSE8x8;
VP8Metric VP8SSE16x8;
VP8Metric VP8SSE4x4;
VP8WMetric VP8TDisto4x4;
VP8WMetric VP8TDisto16x16;
VP8MeanMetric VP8Mean16x4;
VP8QuantizeBlock VP8EncQuantizeBlock;
VP8Quantize2Blocks VP8EncQuantize2Blocks;
VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
VP8BlockCopy VP8Copy4x4;
VP8BlockCopy VP8Copy16x8;

// CPU-specific initializers, defined in other translation units.
extern VP8CPUInfo VP8GetCPUInfo;
extern void VP8EncDspInitSSE2(void);
extern void VP8EncDspInitSSE41(void);
extern void VP8EncDspInitNEON(void);
extern void VP8EncDspInitMIPS32(void);
extern void VP8EncDspInitMIPSdspR2(void);
extern void VP8EncDspInitMSA(void);

// Sets every encoder DSP function pointer above. The order matters:
//  1. shared setup (VP8DspInit(), then the clip1[] table),
//  2. the C implementations that are compiled in,
//  3. CPU-specific initializers, which may overwrite some of the pointers,
//  4. assertions checking that no pointer was left NULL.
WEBP_DSP_INIT_FUNC(VP8EncDspInit) {
  VP8DspInit();  // common inverse transforms
  InitTables();

  // default C implementations
  // Each group below is guarded by the same condition as the definition of
  // the corresponding *_C functions earlier in this file.
#if !WEBP_NEON_OMIT_C_CODE
  VP8ITransform = ITransform_C;
  VP8FTransform = FTransform_C;
  VP8FTransformWHT = FTransformWHT_C;
  VP8TDisto4x4 = Disto4x4_C;
  VP8TDisto16x16 = Disto16x16_C;
  VP8CollectHistogram = CollectHistogram_C;
  VP8SSE16x16 = SSE16x16_C;
  VP8SSE16x8 = SSE16x8_C;
  VP8SSE8x8 = SSE8x8_C;
  VP8SSE4x4 = SSE4x4_C;
#endif

#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
  VP8EncQuantizeBlock = QuantizeBlock_C;
  VP8EncQuantize2Blocks = Quantize2Blocks_C;
  // The same C routine serves both the regular and the WHT quantizer.
  VP8EncQuantizeBlockWHT = QuantizeBlock_C;
#endif

#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 || BPS != 32
  VP8EncPredLuma4 = Intra4Preds_C;
#endif
#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64
  VP8EncPredLuma16 = Intra16Preds_C;
#endif

  // These C versions are always compiled in.
  VP8FTransform2 = FTransform2_C;
  VP8EncPredChroma8 = IntraChromaPreds_C;
  VP8Mean16x4 = Mean16x4_C;
  VP8Copy4x4 = Copy4x4_C;
  VP8Copy16x8 = Copy16x8_C;

  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_HAVE_SSE2)
    if (VP8GetCPUInfo(kSSE2)) {
      VP8EncDspInitSSE2();
      // SSE4.1 is only probed when SSE2 is available, and runs after it.
#if defined(WEBP_HAVE_SSE41)
      if (VP8GetCPUInfo(kSSE4_1)) {
        VP8EncDspInitSSE41();
      }
#endif
    }
#endif
#if defined(WEBP_USE_MIPS32)
    if (VP8GetCPUInfo(kMIPS32)) {
      VP8EncDspInitMIPS32();
    }
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
    if (VP8GetCPUInfo(kMIPSdspR2)) {
      VP8EncDspInitMIPSdspR2();
    }
#endif
#if defined(WEBP_USE_MSA)
    if (VP8GetCPUInfo(kMSA)) {
      VP8EncDspInitMSA();
    }
#endif
  }

#if defined(WEBP_HAVE_NEON)
  // When the C code is omitted, the NEON initializer runs unconditionally;
  // otherwise it runs only if the CPU reports NEON support.
  if (WEBP_NEON_OMIT_C_CODE ||
      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
    VP8EncDspInitNEON();
  }
#endif

  // Sanity checks: every pointer must have been set by one of the steps above.
  assert(VP8ITransform != NULL);
  assert(VP8FTransform != NULL);
  assert(VP8FTransformWHT != NULL);
  assert(VP8TDisto4x4 != NULL);
  assert(VP8TDisto16x16 != NULL);
  assert(VP8CollectHistogram != NULL);
  assert(VP8SSE16x16 != NULL);
  assert(VP8SSE16x8 != NULL);
  assert(VP8SSE8x8 != NULL);
  assert(VP8SSE4x4 != NULL);
  assert(VP8EncQuantizeBlock != NULL);
  assert(VP8EncQuantize2Blocks != NULL);
  assert(VP8FTransform2 != NULL);
  assert(VP8EncPredLuma4 != NULL);
  assert(VP8EncPredLuma16 != NULL);
  assert(VP8EncPredChroma8 != NULL);
  assert(VP8Mean16x4 != NULL);
  assert(VP8EncQuantizeBlockWHT != NULL);
  assert(VP8Copy4x4 != NULL);
  assert(VP8Copy16x8 != NULL);
}