// lossless_neon.c (26687B)
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// NEON variant of methods for lossless decoder
//
// Author: Skal (pascal.massimino@gmail.com)

#include "src/dsp/dsp.h"

#if defined(WEBP_USE_NEON)

#include <arm_neon.h>

#include "src/dsp/lossless.h"
#include "src/dsp/neon.h"
#include "src/webp/format_constants.h"

//------------------------------------------------------------------------------
// Colorspace conversion functions

#if !defined(WORK_AROUND_GCC)
// gcc 4.6.0 had some trouble (NDK-r9) with this code. We only use it for
// gcc-4.8.x at least.
// Converts BGRA to RGBA, 16 pixels per iteration; the remaining
// (num_pixels % 16) pixels are handled by the plain-C fallback.
static void ConvertBGRAToRGBA_NEON(const uint32_t* WEBP_RESTRICT src,
                                   int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const uint32_t* const end = src + (num_pixels & ~15);
  for (; src < end; src += 16) {
    // vld4q de-interleaves the channels: val[0..3] = B, G, R, A planes.
    uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
    // swap B and R. (VSWP d0,d2 has no intrinsics equivalent!)
    const uint8x16_t tmp = pixel.val[0];
    pixel.val[0] = pixel.val[2];
    pixel.val[2] = tmp;
    vst4q_u8(dst, pixel);
    dst += 64;  // 16 pixels * 4 bytes
  }
  VP8LConvertBGRAToRGBA_C(src, num_pixels & 15, dst);  // left-overs
}

// Converts BGRA to 24-bit BGR by dropping the alpha plane, 16 pixels at a
// time; left-overs go through the plain-C fallback.
static void ConvertBGRAToBGR_NEON(const uint32_t* WEBP_RESTRICT src,
                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const uint32_t* const end = src + (num_pixels & ~15);
  for (; src < end; src += 16) {
    const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
    // Re-interleave only the B, G, R planes (3-byte store drops alpha).
    const uint8x16x3_t tmp = { { pixel.val[0], pixel.val[1], pixel.val[2] } };
    vst3q_u8(dst, tmp);
    dst += 48;  // 16 pixels * 3 bytes
  }
  VP8LConvertBGRAToBGR_C(src, num_pixels & 15, dst);  // left-overs
}

// Converts BGRA to 24-bit RGB (channel order reversed vs. BGR above).
static void ConvertBGRAToRGB_NEON(const uint32_t* WEBP_RESTRICT src,
                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const uint32_t* const end = src + (num_pixels & ~15);
  for (; src < end; src += 16) {
    const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
    // Swap R and B while re-interleaving; the 3-byte store drops alpha.
    const uint8x16x3_t tmp = { { pixel.val[2], pixel.val[1], pixel.val[0] } };
    vst3q_u8(dst, tmp);
    dst += 48;  // 16 pixels * 3 bytes
  }
  VP8LConvertBGRAToRGB_C(src, num_pixels & 15, dst);  // left-overs
}

#else  // WORK_AROUND_GCC

// gcc-4.6.0 fallback

// Byte-permutation table used with vtbl1_u8 to swap B and R within each of
// two consecutive BGRA pixels.
static const uint8_t kRGBAShuffle[8] = { 2, 1, 0, 3, 6, 5, 4, 7 };

// Table-lookup variant: 2 pixels per iteration.
static void ConvertBGRAToRGBA_NEON(const uint32_t* WEBP_RESTRICT src,
                                   int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const uint32_t* const end = src + (num_pixels & ~1);
  const uint8x8_t shuffle = vld1_u8(kRGBAShuffle);
  for (; src < end; src += 2) {
    const uint8x8_t pixels = vld1_u8((uint8_t*)src);
    vst1_u8(dst, vtbl1_u8(pixels, shuffle));
    dst += 8;  // 2 pixels * 4 bytes
  }
  VP8LConvertBGRAToRGBA_C(src, num_pixels & 1, dst);  // left-overs
}

// Indices into a 32-byte (8-pixel BGRA) window that select the 24 B,G,R
// bytes, skipping every 4th (alpha) byte.
static const uint8_t kBGRShuffle[3][8] = {
  {  0,  1,  2,  4,  5,  6,  8,  9 },
  { 10, 12, 13, 14, 16, 17, 18, 20 },
  { 21, 22, 24, 25, 26, 28, 29, 30 }
};

// Table-lookup variant: 8 pixels per iteration, emitting 24 bytes of BGR.
static void ConvertBGRAToBGR_NEON(const uint32_t* WEBP_RESTRICT src,
                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const uint32_t* const end = src + (num_pixels & ~7);
  const uint8x8_t shuffle0 = vld1_u8(kBGRShuffle[0]);
  const uint8x8_t shuffle1 = vld1_u8(kBGRShuffle[1]);
  const uint8x8_t shuffle2 = vld1_u8(kBGRShuffle[2]);
  for (; src < end; src += 8) {
    uint8x8x4_t pixels;
    INIT_VECTOR4(pixels,
                 vld1_u8((const uint8_t*)(src + 0)),
                 vld1_u8((const uint8_t*)(src + 2)),
                 vld1_u8((const uint8_t*)(src + 4)),
                 vld1_u8((const uint8_t*)(src + 6)));
    vst1_u8(dst + 0, vtbl4_u8(pixels, shuffle0));
    vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1));
    vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2));
    dst += 8 * 3;
  }
  VP8LConvertBGRAToBGR_C(src, num_pixels & 7, dst);  // left-overs
}

// Same window indices as kBGRShuffle, but with R and B swapped per pixel.
static const uint8_t kRGBShuffle[3][8] = {
  {  2,  1,  0,  6,  5,  4, 10,  9 },
  {  8, 14, 13, 12, 18, 17, 16, 22 },
  { 21, 20, 26, 25, 24, 30, 29, 28 }
};

// Table-lookup variant: 8 pixels per iteration, emitting 24 bytes of RGB.
static void ConvertBGRAToRGB_NEON(const uint32_t* WEBP_RESTRICT src,
                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const uint32_t* const end = src + (num_pixels & ~7);
  const uint8x8_t shuffle0 = vld1_u8(kRGBShuffle[0]);
  const uint8x8_t shuffle1 = vld1_u8(kRGBShuffle[1]);
  const uint8x8_t shuffle2 = vld1_u8(kRGBShuffle[2]);
  for (; src < end; src += 8) {
    uint8x8x4_t pixels;
    INIT_VECTOR4(pixels,
                 vld1_u8((const uint8_t*)(src + 0)),
                 vld1_u8((const uint8_t*)(src + 2)),
                 vld1_u8((const uint8_t*)(src + 4)),
                 vld1_u8((const uint8_t*)(src + 6)));
    vst1_u8(dst + 0, vtbl4_u8(pixels, shuffle0));
    vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1));
    vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2));
    dst += 8 * 3;
  }
  VP8LConvertBGRAToRGB_C(src, num_pixels & 7, dst);  // left-overs
}

#endif  // !WORK_AROUND_GCC

//------------------------------------------------------------------------------
// Predictor Transform

// Helpers reinterpreting 32-bit ARGB pixels as vectors of 8-bit channels:
//  - LOAD*_U32_AS_U8: broadcast one pixel to all lanes.
//  - LOAD*_U32P_AS_U8: load 2 (d-reg) or 4 (q-reg) consecutive pixels.
//  - GET*/STOREQ: move lane 0 (or 4 pixels) back to uint32_t form.
#define LOAD_U32_AS_U8(IN) vreinterpret_u8_u32(vdup_n_u32((IN)))
#define LOAD_U32P_AS_U8(IN) vreinterpret_u8_u32(vld1_u32((IN)))
#define LOADQ_U32_AS_U8(IN) vreinterpretq_u8_u32(vdupq_n_u32((IN)))
#define LOADQ_U32P_AS_U8(IN) vreinterpretq_u8_u32(vld1q_u32((IN)))
#define GET_U8_AS_U32(IN) vget_lane_u32(vreinterpret_u32_u8((IN)), 0)
#define GETQ_U8_AS_U32(IN) vgetq_lane_u32(vreinterpretq_u32_u8((IN)), 0)
#define STOREQ_U8_AS_U32P(OUT, IN) vst1q_u32((OUT), vreinterpretq_u32_u8((IN)))
#define ROTATE32_LEFT(L) vextq_u8((L), (L), 12)  // D|C|B|A -> C|B|A|D

// Per-channel truncating average (a0 + a1) >> 1, on a pixel broadcast to all
// lanes (only lane 0's 4 bytes are meaningful to callers).
static WEBP_INLINE uint8x8_t Average2_u8_NEON(uint32_t a0, uint32_t a1) {
  const uint8x8_t A0 = LOAD_U32_AS_U8(a0);
  const uint8x8_t A1 = LOAD_U32_AS_U8(a1);
  return vhadd_u8(A0, A1);
}

// Per-channel clamp(avg(c0, c1) + (avg(c0, c1) - c2) / 2), saturated to
// [0, 255]. Used by Predictor13.
static WEBP_INLINE uint32_t ClampedAddSubtractHalf_NEON(uint32_t c0,
                                                        uint32_t c1,
                                                        uint32_t c2) {
  const uint8x8_t avg = Average2_u8_NEON(c0, c1);
  // Remove one to c2 when bigger than avg.
  const uint8x8_t C2 = LOAD_U32_AS_U8(c2);
  const uint8x8_t cmp = vcgt_u8(C2, avg);  // all-ones (-1) where C2 > avg
  const uint8x8_t C2_1 = vadd_u8(C2, cmp);
  // Compute half of the difference between avg and c2.
  const int8x8_t diff_avg = vreinterpret_s8_u8(vhsub_u8(avg, C2_1));
  // Compute the sum with avg and saturate.
  const int16x8_t avg_16 = vreinterpretq_s16_u16(vmovl_u8(avg));
  const uint8x8_t res = vqmovun_s16(vaddw_s8(avg_16, diff_avg));
  const uint32_t output = GET_U8_AS_U32(res);
  return output;
}

// Per-channel average of two pixels, returned as a packed uint32_t.
static WEBP_INLINE uint32_t Average2_NEON(uint32_t a0, uint32_t a1) {
  const uint8x8_t avg_u8x8 = Average2_u8_NEON(a0, a1);
  const uint32_t avg = GET_U8_AS_U32(avg_u8x8);
  return avg;
}

// Per-channel avg(avg(a0, a2), a1), returned as a packed uint32_t.
static WEBP_INLINE uint32_t Average3_NEON(uint32_t a0, uint32_t a1,
                                          uint32_t a2) {
  const uint8x8_t avg0 = Average2_u8_NEON(a0, a2);
  const uint8x8_t A1 = LOAD_U32_AS_U8(a1);
  const uint32_t avg = GET_U8_AS_U32(vhadd_u8(avg0, A1));
  return avg;
}

// Scalar (single-pixel) predictors; `top` points at the pixel directly above,
// so top[-1] is top-left and top[1] is top-right.
static uint32_t Predictor5_NEON(const uint32_t* const left,
                                const uint32_t* const top) {
  return Average3_NEON(*left, top[0], top[1]);
}
static uint32_t Predictor6_NEON(const uint32_t* const left,
                                const uint32_t* const top) {
  return Average2_NEON(*left, top[-1]);
}
static uint32_t Predictor7_NEON(const uint32_t* const left,
                                const uint32_t* const top) {
  return Average2_NEON(*left, top[0]);
}
static uint32_t Predictor13_NEON(const uint32_t* const left,
                                 const uint32_t* const top) {
  return ClampedAddSubtractHalf_NEON(*left, top[0], top[-1]);
}

// Batch versions of those functions.

// Predictor0: ARGB_BLACK.
static void PredictorAdd0_NEON(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  const uint8x16_t black = vreinterpretq_u8_u32(vdupq_n_u32(ARGB_BLACK));
  // 4 pixels per iteration; residuals are added mod-256 per channel.
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
    const uint8x16_t res = vaddq_u8(src, black);
    STOREQ_U8_AS_U32P(&out[i], res);
  }
  VP8LPredictorsAdd_C[0](in + i, upper + i, num_pixels - i, out + i);
}

// Predictor1: left.
// Adds the left-neighbor prediction to 4 residuals at once using a
// prefix-sum built from two shift-and-add steps (mod-256 per channel).
// Note: out[i - 1] is valid because out[-1] holds the previously decoded
// pixel when i == 0.
static void PredictorAdd1_NEON(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  const uint8x16_t zero = LOADQ_U32_AS_U8(0);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    // a | b | c | d
    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
    // 0 | a | b | c
    const uint8x16_t shift0 = vextq_u8(zero, src, 12);
    // a | a + b | b + c | c + d
    const uint8x16_t sum0 = vaddq_u8(src, shift0);
    // 0 | 0 | a | a + b
    const uint8x16_t shift1 = vextq_u8(zero, sum0, 8);
    // a | a + b | a + b + c | a + b + c + d
    const uint8x16_t sum1 = vaddq_u8(sum0, shift1);
    // Broadcast the last decoded pixel and add it to every prefix sum.
    const uint8x16_t prev = LOADQ_U32_AS_U8(out[i - 1]);
    const uint8x16_t res = vaddq_u8(sum1, prev);
    STOREQ_U8_AS_U32P(&out[i], res);
  }
  VP8LPredictorsAdd_C[1](in + i, upper + i, num_pixels - i, out + i);
}

// Macro that adds 32-bit integers from IN using mod 256 arithmetic
// per 8 bit channel.
#define GENERATE_PREDICTOR_1(X, IN)                                         \
  static void PredictorAdd##X##_NEON(const uint32_t* in,                    \
                                     const uint32_t* upper, int num_pixels, \
                                     uint32_t* WEBP_RESTRICT out) {         \
    int i;                                                                  \
    for (i = 0; i + 4 <= num_pixels; i += 4) {                              \
      const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);                      \
      const uint8x16_t other = LOADQ_U32P_AS_U8(&(IN));                     \
      const uint8x16_t res = vaddq_u8(src, other);                          \
      STOREQ_U8_AS_U32P(&out[i], res);                                      \
    }                                                                       \
    VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i);   \
  }
// Predictor2: Top.
GENERATE_PREDICTOR_1(2, upper[i])
// Predictor3: Top-right.
GENERATE_PREDICTOR_1(3, upper[i + 1])
// Predictor4: Top-left.
GENERATE_PREDICTOR_1(4, upper[i - 1])
#undef GENERATE_PREDICTOR_1

// Predictor5: average(average(left, TR), T)
// Each DO_PRED5 decodes one of the 4 pixels: lane LANE is stored, then the
// result is rotated into L so the next lane sees the just-decoded left pixel.
#define DO_PRED5(LANE) do {                                            \
  const uint8x16_t avgLTR = vhaddq_u8(L, TR);                          \
  const uint8x16_t avg = vhaddq_u8(avgLTR, T);                         \
  const uint8x16_t res = vaddq_u8(avg, src);                           \
  vst1q_lane_u32(&out[i + (LANE)], vreinterpretq_u32_u8(res), (LANE)); \
  L = ROTATE32_LEFT(res);                                              \
} while (0)

static void PredictorAdd5_NEON(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  // out[-1] is the previously decoded pixel, broadcast to all lanes.
  uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
    const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i + 0]);
    const uint8x16_t TR = LOADQ_U32P_AS_U8(&upper[i + 1]);
    DO_PRED5(0);
    DO_PRED5(1);
    DO_PRED5(2);
    DO_PRED5(3);
  }
  VP8LPredictorsAdd_C[5](in + i, upper + i, num_pixels - i, out + i);
}
#undef DO_PRED5

// Shared lane step for Predictor6/7: average(L, top-row pixel) + residual,
// with the serial left-pixel carry handled by the store-and-rotate.
#define DO_PRED67(LANE) do {                                           \
  const uint8x16_t avg = vhaddq_u8(L, top);                            \
  const uint8x16_t res = vaddq_u8(avg, src);                           \
  vst1q_lane_u32(&out[i + (LANE)], vreinterpretq_u32_u8(res), (LANE)); \
  L = ROTATE32_LEFT(res);                                              \
} while (0)

// Predictor6: average(left, TL)
static void PredictorAdd6_NEON(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
    const uint8x16_t top = LOADQ_U32P_AS_U8(&upper[i - 1]);  // top-left
    DO_PRED67(0);
    DO_PRED67(1);
    DO_PRED67(2);
    DO_PRED67(3);
  }
  VP8LPredictorsAdd_C[6](in + i, upper + i, num_pixels - i, out + i);
}

// Predictor7: average(left, T)
static void PredictorAdd7_NEON(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
    const uint8x16_t top = LOADQ_U32P_AS_U8(&upper[i]);
    DO_PRED67(0);
    DO_PRED67(1);
    DO_PRED67(2);
    DO_PRED67(3);
  }
  VP8LPredictorsAdd_C[7](in + i, upper + i, num_pixels - i, out + i);
}
#undef DO_PRED67

// Predictors that only read the top row have no serial dependency, so all
// 4 pixels are computed and stored in one shot.
#define GENERATE_PREDICTOR_2(X, IN)                                         \
  static void PredictorAdd##X##_NEON(const uint32_t* in,                    \
                                     const uint32_t* upper, int num_pixels, \
                                     uint32_t* WEBP_RESTRICT out) {         \
    int i;                                                                  \
    for (i = 0; i + 4 <= num_pixels; i += 4) {                              \
      const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);                      \
      const uint8x16_t Tother = LOADQ_U32P_AS_U8(&(IN));                    \
      const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]);                     \
      const uint8x16_t avg = vhaddq_u8(T, Tother);                          \
      const uint8x16_t res = vaddq_u8(avg, src);                            \
      STOREQ_U8_AS_U32P(&out[i], res);                                      \
    }                                                                       \
    VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i);   \
  }
// Predictor8: average TL T.
GENERATE_PREDICTOR_2(8, upper[i - 1])
// Predictor9: average T TR.
GENERATE_PREDICTOR_2(9, upper[i + 1])
#undef GENERATE_PREDICTOR_2

// Predictor10: average of (average of (L,TL), average of (T, TR)).
// Lane step: avg(avg(T, TR), avg(L, TL)) + residual; avgTTR is hoisted out
// of the per-lane macro since it does not depend on L.
#define DO_PRED10(LANE) do {                                           \
  const uint8x16_t avgLTL = vhaddq_u8(L, TL);                          \
  const uint8x16_t avg = vhaddq_u8(avgTTR, avgLTL);                    \
  const uint8x16_t res = vaddq_u8(avg, src);                           \
  vst1q_lane_u32(&out[i + (LANE)], vreinterpretq_u32_u8(res), (LANE)); \
  L = ROTATE32_LEFT(res);                                              \
} while (0)

static void PredictorAdd10_NEON(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  // out[-1] is the previously decoded pixel, broadcast to all lanes.
  uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
    const uint8x16_t TL = LOADQ_U32P_AS_U8(&upper[i - 1]);
    const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]);
    const uint8x16_t TR = LOADQ_U32P_AS_U8(&upper[i + 1]);
    const uint8x16_t avgTTR = vhaddq_u8(T, TR);  // loop-invariant for 4 lanes
    DO_PRED10(0);
    DO_PRED10(1);
    DO_PRED10(2);
    DO_PRED10(3);
  }
  VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i);
}
#undef DO_PRED10

// Predictor11: select.
// Lane step for "select": picks T or L as predictor per pixel, choosing the
// one whose neighbor-gradient cost is smaller. pa = sum|L - TL| (per pixel,
// via two pairwise widening adds); pb = sum|T - TL| is precomputed in the
// caller since it does not depend on L.
#define DO_PRED11(LANE) do {                                                 \
  const uint8x16_t sumLin = vaddq_u8(L, src);  /* in + L */                  \
  const uint8x16_t pLTL = vabdq_u8(L, TL);     /* |L - TL| */                \
  const uint16x8_t sum_LTL = vpaddlq_u8(pLTL);                               \
  const uint32x4_t pa = vpaddlq_u16(sum_LTL);                                \
  const uint32x4_t mask = vcleq_u32(pa, pb);                                 \
  const uint8x16_t res = vbslq_u8(vreinterpretq_u8_u32(mask), sumTin,        \
                                  sumLin);                                   \
  vst1q_lane_u32(&out[i + (LANE)], vreinterpretq_u32_u8(res), (LANE));       \
  L = ROTATE32_LEFT(res);                                                    \
} while (0)

static void PredictorAdd11_NEON(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]);
    const uint8x16_t TL = LOADQ_U32P_AS_U8(&upper[i - 1]);
    const uint8x16_t pTTL = vabdq_u8(T, TL);  // |T - TL|
    const uint16x8_t sum_TTL = vpaddlq_u8(pTTL);
    const uint32x4_t pb = vpaddlq_u16(sum_TTL);
    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
    const uint8x16_t sumTin = vaddq_u8(T, src);  // in + T
    DO_PRED11(0);
    DO_PRED11(1);
    DO_PRED11(2);
    DO_PRED11(3);
  }
  VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
}
#undef DO_PRED11

// Predictor12: ClampedAddSubtractFull.
// Lane step: clamp(L + (T - TL)) per channel, in 16-bit arithmetic. L is
// kept widened to u16 across lanes; DIFF holds T - TL as s16 for the low
// (lanes 0/1) or high (lanes 2/3) pair of pixels.
#define DO_PRED12(DIFF, LANE) do {                                         \
  const uint8x8_t pred =                                                   \
      vqmovun_s16(vaddq_s16(vreinterpretq_s16_u16(L), (DIFF)));            \
  const uint8x8_t res =                                                    \
      vadd_u8(pred, (LANE <= 1) ? vget_low_u8(src) : vget_high_u8(src));   \
  const uint16x8_t res16 = vmovl_u8(res);                                  \
  vst1_lane_u32(&out[i + (LANE)], vreinterpret_u32_u8(res), (LANE) & 1);   \
  /* rotate in the left predictor for next iteration */                    \
  L = vextq_u16(res16, res16, 4);                                          \
} while (0)

static void PredictorAdd12_NEON(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  // Keep the running left pixel widened to 16 bits per channel.
  uint16x8_t L = vmovl_u8(LOAD_U32_AS_U8(out[-1]));
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    // load four pixels of source
    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
    // precompute the difference T - TL once for all, stored as s16
    const uint8x16_t TL = LOADQ_U32P_AS_U8(&upper[i - 1]);
    const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]);
    const int16x8_t diff_lo =
        vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), vget_low_u8(TL)));
    const int16x8_t diff_hi =
        vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), vget_high_u8(TL)));
    // loop over the four reconstructed pixels
    DO_PRED12(diff_lo, 0);
    DO_PRED12(diff_lo, 1);
    DO_PRED12(diff_hi, 2);
    DO_PRED12(diff_hi, 3);
  }
  VP8LPredictorsAdd_C[12](in + i, upper + i, num_pixels - i, out + i);
}
#undef DO_PRED12

// Predictor13: ClampedAddSubtractHalf
// Lane step mirroring ClampedAddSubtractHalf_NEON above, vectorized over the
// low (LOW_OR_HI = vget_low_u8) or high (vget_high_u8) half of the q-regs.
#define DO_PRED13(LANE, LOW_OR_HI) do {                                       \
  const uint8x16_t avg = vhaddq_u8(L, T);                                     \
  const uint8x16_t cmp = vcgtq_u8(TL, avg);                                   \
  const uint8x16_t TL_1 = vaddq_u8(TL, cmp);                                  \
  /* Compute half of the difference between avg and TL'. */                   \
  const int8x8_t diff_avg =                                                   \
      vreinterpret_s8_u8(LOW_OR_HI(vhsubq_u8(avg, TL_1)));                    \
  /* Compute the sum with avg and saturate. */                                \
  const int16x8_t avg_16 = vreinterpretq_s16_u16(vmovl_u8(LOW_OR_HI(avg)));   \
  const uint8x8_t delta = vqmovun_s16(vaddw_s8(avg_16, diff_avg));            \
  const uint8x8_t res = vadd_u8(LOW_OR_HI(src), delta);                       \
  const uint8x16_t res2 = vcombine_u8(res, res);                              \
  vst1_lane_u32(&out[i + (LANE)], vreinterpret_u32_u8(res), (LANE) & 1);      \
  L = ROTATE32_LEFT(res2);                                                    \
} while (0)

static void PredictorAdd13_NEON(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
    const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]);
    const uint8x16_t TL = LOADQ_U32P_AS_U8(&upper[i - 1]);
    DO_PRED13(0, vget_low_u8);
    DO_PRED13(1, vget_low_u8);
    DO_PRED13(2, vget_high_u8);
    DO_PRED13(3, vget_high_u8);
  }
  VP8LPredictorsAdd_C[13](in + i, upper + i, num_pixels - i, out + i);
}
#undef DO_PRED13

#undef LOAD_U32_AS_U8
#undef LOAD_U32P_AS_U8
#undef LOADQ_U32_AS_U8
#undef LOADQ_U32P_AS_U8
#undef GET_U8_AS_U32
#undef GETQ_U8_AS_U32
#undef STOREQ_U8_AS_U32P
#undef ROTATE32_LEFT

//------------------------------------------------------------------------------
// Subtract-Green Transform

// vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
// non-standard versions there.
#if defined(__APPLE__) && WEBP_AARCH64 && \
    defined(__apple_build_version__) && (__apple_build_version__ < 6020037)
#define USE_VTBLQ
#endif

#ifdef USE_VTBLQ
// 255 = byte will be zeroed
static const uint8_t kGreenShuffle[16] = {
  1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255
};

// Duplicates each pixel's green byte into the blue and red byte positions
// (other bytes zeroed), using the non-standard q-register vtbl.
static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
                                                  const uint8x16_t shuffle) {
  return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)),
                     vtbl1q_u8(argb, vget_high_u8(shuffle)));
}
#else  // !USE_VTBLQ
// 255 = byte will be zeroed
static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 };

// Same as above, built from two standard d-register table lookups.
static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
                                                  const uint8x8_t shuffle) {
  return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
                     vtbl1_u8(vget_high_u8(argb), shuffle));
}
#endif  // USE_VTBLQ

// Undoes the subtract-green transform: adds each pixel's green channel back
// to its blue and red channels (mod 256), 4 pixels per iteration.
static void AddGreenToBlueAndRed_NEON(const uint32_t* src, int num_pixels,
                                      uint32_t* dst) {
  const uint32_t* const end = src + (num_pixels & ~3);
#ifdef USE_VTBLQ
  const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
#else
  const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
#endif
  for (; src < end; src += 4, dst += 4) {
    const uint8x16_t argb = vld1q_u8((const uint8_t*)src);
    const uint8x16_t greens = DoGreenShuffle_NEON(argb, shuffle);
    vst1q_u8((uint8_t*)dst, vaddq_u8(argb, greens));
  }
  // fallthrough and finish off with plain-C
  VP8LAddGreenToBlueAndRed_C(src, num_pixels & 3, dst);
}

//------------------------------------------------------------------------------
// Color Transform

// Inverse cross-color transform: reconstructs red from green, and blue from
// green and the reconstructed red, using fixed-point multiplies
// (vqdmulhq_s16 with constants pre-shifted so the doubling-high-multiply
// yields the intended >> 5 scaling). 4 pixels per iteration; left-overs go
// through the plain-C version.
static void TransformColorInverse_NEON(const VP8LMultipliers* const m,
                                       const uint32_t* const src,
                                       int num_pixels, uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 6.
#define CST(X) (((int16_t)(m->X << 8)) >> 6)
  const int16_t rb[8] = {
    CST(green_to_blue), CST(green_to_red),
    CST(green_to_blue), CST(green_to_red),
    CST(green_to_blue), CST(green_to_red),
    CST(green_to_blue), CST(green_to_red)
  };
  const int16x8_t mults_rb = vld1q_s16(rb);
  const int16_t b2[8] = {
    0, CST(red_to_blue), 0, CST(red_to_blue),
    0, CST(red_to_blue), 0, CST(red_to_blue),
  };
  const int16x8_t mults_b2 = vld1q_s16(b2);
#undef CST
#ifdef USE_VTBLQ
  static const uint8_t kg0g0[16] = {
    255, 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13
  };
  const uint8x16_t shuffle = vld1q_u8(kg0g0);
#else
  static const uint8_t k0g0g[8] = { 255, 1, 255, 1, 255, 5, 255, 5 };
  const uint8x8_t shuffle = vld1_u8(k0g0g);
#endif
  const uint32x4_t mask_ag = vdupq_n_u32(0xff00ff00u);  // keep alpha + green
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const uint8x16_t in = vld1q_u8((const uint8_t*)(src + i));
    const uint32x4_t a0g0 = vandq_u32(vreinterpretq_u32_u8(in), mask_ag);
    // 0 g 0 g
    const uint8x16_t greens = DoGreenShuffle_NEON(in, shuffle);
    // x dr  x db1
    const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb);
    // x r'  x   b'
    const int8x16_t B = vaddq_s8(vreinterpretq_s8_u8(in),
                                 vreinterpretq_s8_s16(A));
    // r' 0   b' 0
    const int16x8_t C = vshlq_n_s16(vreinterpretq_s16_s8(B), 8);
    // x db2  0  0
    const int16x8_t D = vqdmulhq_s16(C, mults_b2);
    // 0  x db2  0
    const uint32x4_t E = vshrq_n_u32(vreinterpretq_u32_s16(D), 8);
    // r' x  b'' 0
    const int8x16_t F = vaddq_s8(vreinterpretq_s8_u32(E),
                                 vreinterpretq_s8_s16(C));
    // 0 r'  0 b''
    const uint16x8_t G = vshrq_n_u16(vreinterpretq_u16_s8(F), 8);
    const uint32x4_t out = vorrq_u32(vreinterpretq_u32_u16(G), a0g0);
    vst1q_u32(dst + i, out);
  }
  // Fall-back to C-version for left-overs.
  VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
}

#undef USE_VTBLQ

//------------------------------------------------------------------------------
// Entry point

extern void VP8LDspInitNEON(void);

// Installs the NEON implementations into the shared function-pointer tables.
WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitNEON(void) {
  VP8LPredictors[5] = Predictor5_NEON;
  VP8LPredictors[6] = Predictor6_NEON;
  VP8LPredictors[7] = Predictor7_NEON;
  VP8LPredictors[13] = Predictor13_NEON;

  VP8LPredictorsAdd[0] = PredictorAdd0_NEON;
  VP8LPredictorsAdd[1] = PredictorAdd1_NEON;
  VP8LPredictorsAdd[2] = PredictorAdd2_NEON;
  VP8LPredictorsAdd[3] = PredictorAdd3_NEON;
  VP8LPredictorsAdd[4] = PredictorAdd4_NEON;
  VP8LPredictorsAdd[5] = PredictorAdd5_NEON;
  VP8LPredictorsAdd[6] = PredictorAdd6_NEON;
  VP8LPredictorsAdd[7] = PredictorAdd7_NEON;
  VP8LPredictorsAdd[8] = PredictorAdd8_NEON;
  VP8LPredictorsAdd[9] = PredictorAdd9_NEON;
  VP8LPredictorsAdd[10] = PredictorAdd10_NEON;
  VP8LPredictorsAdd[11] = PredictorAdd11_NEON;
  VP8LPredictorsAdd[12] = PredictorAdd12_NEON;
  VP8LPredictorsAdd[13] = PredictorAdd13_NEON;

  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_NEON;
  VP8LConvertBGRAToBGR = ConvertBGRAToBGR_NEON;
  VP8LConvertBGRAToRGB = ConvertBGRAToRGB_NEON;

  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_NEON;
  VP8LTransformColorInverse = TransformColorInverse_NEON;
}

#else  // !WEBP_USE_NEON

WEBP_DSP_INIT_STUB(VP8LDspInitNEON)

#endif  // WEBP_USE_NEON