tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

lossless_neon.c (26687B)


      1 // Copyright 2014 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 // NEON variant of methods for lossless decoder
     11 //
     12 // Author: Skal (pascal.massimino@gmail.com)
     13 
     14 #include "src/dsp/dsp.h"
     15 
     16 #if defined(WEBP_USE_NEON)
     17 
     18 #include <arm_neon.h>
     19 
     20 #include "src/dsp/lossless.h"
     21 #include "src/dsp/neon.h"
     22 #include "src/webp/format_constants.h"
     23 
     24 //------------------------------------------------------------------------------
     25 // Colorspace conversion functions
     26 
     27 #if !defined(WORK_AROUND_GCC)
     28 // gcc 4.6.0 had some trouble (NDK-r9) with this code. We only use it for
     29 // gcc-4.8.x at least.
     30 static void ConvertBGRAToRGBA_NEON(const uint32_t* WEBP_RESTRICT src,
     31                                   int num_pixels, uint8_t* WEBP_RESTRICT dst) {
     32  const uint32_t* const end = src + (num_pixels & ~15);
     33  for (; src < end; src += 16) {
     34    uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
     35    // swap B and R. (VSWP d0,d2 has no intrinsics equivalent!)
     36    const uint8x16_t tmp = pixel.val[0];
     37    pixel.val[0] = pixel.val[2];
     38    pixel.val[2] = tmp;
     39    vst4q_u8(dst, pixel);
     40    dst += 64;
     41  }
     42  VP8LConvertBGRAToRGBA_C(src, num_pixels & 15, dst);  // left-overs
     43 }
     44 
     45 static void ConvertBGRAToBGR_NEON(const uint32_t* WEBP_RESTRICT src,
     46                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
     47  const uint32_t* const end = src + (num_pixels & ~15);
     48  for (; src < end; src += 16) {
     49    const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
     50    const uint8x16x3_t tmp = { { pixel.val[0], pixel.val[1], pixel.val[2] } };
     51    vst3q_u8(dst, tmp);
     52    dst += 48;
     53  }
     54  VP8LConvertBGRAToBGR_C(src, num_pixels & 15, dst);  // left-overs
     55 }
     56 
     57 static void ConvertBGRAToRGB_NEON(const uint32_t* WEBP_RESTRICT src,
     58                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
     59  const uint32_t* const end = src + (num_pixels & ~15);
     60  for (; src < end; src += 16) {
     61    const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
     62    const uint8x16x3_t tmp = { { pixel.val[2], pixel.val[1], pixel.val[0] } };
     63    vst3q_u8(dst, tmp);
     64    dst += 48;
     65  }
     66  VP8LConvertBGRAToRGB_C(src, num_pixels & 15, dst);  // left-overs
     67 }
     68 
     69 #else  // WORK_AROUND_GCC
     70 
     71 // gcc-4.6.0 fallback
     72 
     73 static const uint8_t kRGBAShuffle[8] = { 2, 1, 0, 3, 6, 5, 4, 7 };
     74 
     75 static void ConvertBGRAToRGBA_NEON(const uint32_t* WEBP_RESTRICT src,
     76                                   int num_pixels, uint8_t* WEBP_RESTRICT dst) {
     77  const uint32_t* const end = src + (num_pixels & ~1);
     78  const uint8x8_t shuffle = vld1_u8(kRGBAShuffle);
     79  for (; src < end; src += 2) {
     80    const uint8x8_t pixels = vld1_u8((uint8_t*)src);
     81    vst1_u8(dst, vtbl1_u8(pixels, shuffle));
     82    dst += 8;
     83  }
     84  VP8LConvertBGRAToRGBA_C(src, num_pixels & 1, dst);  // left-overs
     85 }
     86 
        // VTBL4 indices into the 32 loaded bytes (8 BGRA pixels) selecting the
        // 24 B,G,R bytes in order and skipping every alpha (bytes 3,7,11,...).
     87 static const uint8_t kBGRShuffle[3][8] = {
     88  {  0,  1,  2,  4,  5,  6,  8,  9 },
     89  { 10, 12, 13, 14, 16, 17, 18, 20 },
     90  { 21, 22, 24, 25, 26, 28, 29, 30 }
     91 };
     92 
        // gcc-4.6.0-safe variant of BGRA->BGR: 8 pixels per iteration using
        // three table-lookup shuffles; the tail falls back to plain C.
     93 static void ConvertBGRAToBGR_NEON(const uint32_t* WEBP_RESTRICT src,
     94                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
     95  const uint32_t* const end = src + (num_pixels & ~7);
     96  const uint8x8_t shuffle0 = vld1_u8(kBGRShuffle[0]);
     97  const uint8x8_t shuffle1 = vld1_u8(kBGRShuffle[1]);
     98  const uint8x8_t shuffle2 = vld1_u8(kBGRShuffle[2]);
     99  for (; src < end; src += 8) {
    100    uint8x8x4_t pixels;
    101    INIT_VECTOR4(pixels,
    102                 vld1_u8((const uint8_t*)(src + 0)),
    103                 vld1_u8((const uint8_t*)(src + 2)),
    104                 vld1_u8((const uint8_t*)(src + 4)),
    105                 vld1_u8((const uint8_t*)(src + 6)));
    106    vst1_u8(dst +  0, vtbl4_u8(pixels, shuffle0));
    107    vst1_u8(dst +  8, vtbl4_u8(pixels, shuffle1));
    108    vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2));
    109    dst += 8 * 3;
    110  }
    111  VP8LConvertBGRAToBGR_C(src, num_pixels & 7, dst);  // left-overs
    112 }
    113 
        // VTBL4 indices into the 32 loaded bytes (8 BGRA pixels) selecting the
        // R,G,B bytes (B/R swapped within each pixel), skipping every alpha.
    114 static const uint8_t kRGBShuffle[3][8] = {
    115  {  2,  1,  0,  6,  5,  4, 10,  9 },
    116  {  8, 14, 13, 12, 18, 17, 16, 22 },
    117  { 21, 20, 26, 25, 24, 30, 29, 28 }
    118 };
    119 
        // gcc-4.6.0-safe variant of BGRA->RGB: 8 pixels per iteration using
        // three table-lookup shuffles; the tail falls back to plain C.
    120 static void ConvertBGRAToRGB_NEON(const uint32_t* WEBP_RESTRICT src,
    121                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
    122  const uint32_t* const end = src + (num_pixels & ~7);
    123  const uint8x8_t shuffle0 = vld1_u8(kRGBShuffle[0]);
    124  const uint8x8_t shuffle1 = vld1_u8(kRGBShuffle[1]);
    125  const uint8x8_t shuffle2 = vld1_u8(kRGBShuffle[2]);
    126  for (; src < end; src += 8) {
    127    uint8x8x4_t pixels;
    128    INIT_VECTOR4(pixels,
    129                 vld1_u8((const uint8_t*)(src + 0)),
    130                 vld1_u8((const uint8_t*)(src + 2)),
    131                 vld1_u8((const uint8_t*)(src + 4)),
    132                 vld1_u8((const uint8_t*)(src + 6)));
    133    vst1_u8(dst +  0, vtbl4_u8(pixels, shuffle0));
    134    vst1_u8(dst +  8, vtbl4_u8(pixels, shuffle1));
    135    vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2));
    136    dst += 8 * 3;
    137  }
    138  VP8LConvertBGRAToRGB_C(src, num_pixels & 7, dst);  // left-overs
    139 }
    140 
    141 #endif   // !WORK_AROUND_GCC
    142 
    143 //------------------------------------------------------------------------------
    144 // Predictor Transform
    145 
        // Cost-free reinterpretation helpers between packed 32-bit ARGB pixels
        // and per-byte vectors. LOAD* broadcast or load pixels as u8 lanes,
        // GET*/STORE* go back to u32, and ROTATE32_LEFT shifts the vector by
        // one whole pixel (4 bytes).
    146 #define LOAD_U32_AS_U8(IN) vreinterpret_u8_u32(vdup_n_u32((IN)))
    147 #define LOAD_U32P_AS_U8(IN) vreinterpret_u8_u32(vld1_u32((IN)))
    148 #define LOADQ_U32_AS_U8(IN) vreinterpretq_u8_u32(vdupq_n_u32((IN)))
    149 #define LOADQ_U32P_AS_U8(IN) vreinterpretq_u8_u32(vld1q_u32((IN)))
    150 #define GET_U8_AS_U32(IN) vget_lane_u32(vreinterpret_u32_u8((IN)), 0)
    151 #define GETQ_U8_AS_U32(IN) vgetq_lane_u32(vreinterpretq_u32_u8((IN)), 0)
    152 #define STOREQ_U8_AS_U32P(OUT, IN) vst1q_u32((OUT), vreinterpretq_u32_u8((IN)))
    153 #define ROTATE32_LEFT(L) vextq_u8((L), (L), 12)    // D|C|B|A -> C|B|A|D
    154 
        // Per-channel average of two packed pixels: (a0 + a1) >> 1 on each
        // byte, using the halving add so no intermediate overflow can occur.
    155 static WEBP_INLINE uint8x8_t Average2_u8_NEON(uint32_t a0, uint32_t a1) {
    156  const uint8x8_t A0 = LOAD_U32_AS_U8(a0);
    157  const uint8x8_t A1 = LOAD_U32_AS_U8(a1);
    158  return vhadd_u8(A0, A1);
    159 }
    160 
        // Per-channel clamp(avg(c0, c1) + (avg(c0, c1) - c2) / 2), the core of
        // predictor 13. The conditional decrement of c2 compensates for the
        // truncation of the halving subtract when c2 > avg.
    161 static WEBP_INLINE uint32_t ClampedAddSubtractHalf_NEON(uint32_t c0,
    162                                                        uint32_t c1,
    163                                                        uint32_t c2) {
    164  const uint8x8_t avg = Average2_u8_NEON(c0, c1);
    165  // Remove one to c2 when bigger than avg.
    166  const uint8x8_t C2 = LOAD_U32_AS_U8(c2);
        // vcgt_u8 yields 0xff (== -1 as u8) where C2 > avg, so this subtracts 1
        // exactly in those lanes.
    167  const uint8x8_t cmp = vcgt_u8(C2, avg);
    168  const uint8x8_t C2_1 = vadd_u8(C2, cmp);
    169  // Compute half of the difference between avg and c2.
    170  const int8x8_t diff_avg = vreinterpret_s8_u8(vhsub_u8(avg, C2_1));
    171  // Compute the sum with avg and saturate.
    172  const int16x8_t avg_16 = vreinterpretq_s16_u16(vmovl_u8(avg));
    173  const uint8x8_t res = vqmovun_s16(vaddw_s8(avg_16, diff_avg));
    174  const uint32_t output = GET_U8_AS_U32(res);
    175  return output;
    176 }
    177 
        // Scalar wrapper: per-channel average of two packed pixels as uint32.
    178 static WEBP_INLINE uint32_t Average2_NEON(uint32_t a0, uint32_t a1) {
    179  const uint8x8_t avg_u8x8 = Average2_u8_NEON(a0, a1);
    180  const uint32_t avg = GET_U8_AS_U32(avg_u8x8);
    181  return avg;
    182 }
    183 
        // Per-channel average(average(a0, a2), a1), matching the C predictor's
        // nested-average definition (note a1 is averaged in second).
    184 static WEBP_INLINE uint32_t Average3_NEON(uint32_t a0, uint32_t a1,
    185                                          uint32_t a2) {
    186  const uint8x8_t avg0 = Average2_u8_NEON(a0, a2);
    187  const uint8x8_t A1 = LOAD_U32_AS_U8(a1);
    188  const uint32_t avg = GET_U8_AS_U32(vhadd_u8(avg0, A1));
    189  return avg;
    190 }
    191 
        // Single-pixel predictors. 'left' points at the reconstructed pixel to
        // the left; 'top' at the pixel directly above (so top[-1] is the
        // top-left and top[1] the top-right neighbor).
        // Predictor5: average(average(left, top-right), top).
    192 static uint32_t Predictor5_NEON(const uint32_t* const left,
    193                                const uint32_t* const top) {
    194  return Average3_NEON(*left, top[0], top[1]);
    195 }
        // Predictor6: average(left, top-left).
    196 static uint32_t Predictor6_NEON(const uint32_t* const left,
    197                                const uint32_t* const top) {
    198  return Average2_NEON(*left, top[-1]);
    199 }
        // Predictor7: average(left, top).
    200 static uint32_t Predictor7_NEON(const uint32_t* const left,
    201                                const uint32_t* const top) {
    202  return Average2_NEON(*left, top[0]);
    203 }
        // Predictor13: clamped-add-subtract-half of (left, top, top-left).
    204 static uint32_t Predictor13_NEON(const uint32_t* const left,
    205                                 const uint32_t* const top) {
    206  return ClampedAddSubtractHalf_NEON(*left, top[0], top[-1]);
    207 }
    208 
    209 // Batch versions of those functions.
    210 
    211 // Predictor0: ARGB_BLACK.
        // Adds the constant ARGB_BLACK prediction to 4 residual pixels at a
        // time (mod-256 per channel); leftovers go through the C fallback.
    212 static void PredictorAdd0_NEON(const uint32_t* in, const uint32_t* upper,
    213                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
    214  int i;
    215  const uint8x16_t black = vreinterpretq_u8_u32(vdupq_n_u32(ARGB_BLACK));
    216  for (i = 0; i + 4 <= num_pixels; i += 4) {
    217    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
    218    const uint8x16_t res = vaddq_u8(src, black);
    219    STOREQ_U8_AS_U32P(&out[i], res);
    220  }
    221  VP8LPredictorsAdd_C[0](in + i, upper + i, num_pixels - i, out + i);
    222 }
    223 
    224 // Predictor1: left.
    225 static void PredictorAdd1_NEON(const uint32_t* in, const uint32_t* upper,
    226                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
    227  int i;
    228  const uint8x16_t zero = LOADQ_U32_AS_U8(0);
    229  for (i = 0; i + 4 <= num_pixels; i += 4) {
    230    // a | b | c | d
    231    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
    232    // 0 | a | b | c
    233    const uint8x16_t shift0 = vextq_u8(zero, src, 12);
    234    // a | a + b | b + c | c + d
    235    const uint8x16_t sum0 = vaddq_u8(src, shift0);
    236    // 0 | 0 | a | a + b
    237    const uint8x16_t shift1 = vextq_u8(zero, sum0, 8);
    238    // a | a + b | a + b + c | a + b + c + d
    239    const uint8x16_t sum1 = vaddq_u8(sum0, shift1);
    240    const uint8x16_t prev = LOADQ_U32_AS_U8(out[i - 1]);
    241    const uint8x16_t res = vaddq_u8(sum1, prev);
    242    STOREQ_U8_AS_U32P(&out[i], res);
    243  }
    244  VP8LPredictorsAdd_C[1](in + i, upper + i, num_pixels - i, out + i);
    245 }
    246 
    247 // Macro that adds 32-bit integers from IN using mod 256 arithmetic
    248 // per 8 bit channel.
        // Generates PredictorAdd{2,3,4}_NEON: out[i] = in[i] + IN, where IN is
        // a pixel from the row above. These have no left dependency, so four
        // pixels are processed fully in parallel; leftovers use the C version.
    249 #define GENERATE_PREDICTOR_1(X, IN)                                       \
    250 static void PredictorAdd##X##_NEON(const uint32_t* in,                    \
    251                                   const uint32_t* upper, int num_pixels, \
    252                                   uint32_t* WEBP_RESTRICT out) {         \
    253  int i;                                                                  \
    254  for (i = 0; i + 4 <= num_pixels; i += 4) {                              \
    255    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);                      \
    256    const uint8x16_t other = LOADQ_U32P_AS_U8(&(IN));                     \
    257    const uint8x16_t res = vaddq_u8(src, other);                          \
    258    STOREQ_U8_AS_U32P(&out[i], res);                                      \
    259  }                                                                       \
    260  VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i);   \
    261 }
    262 // Predictor2: Top.
    263 GENERATE_PREDICTOR_1(2, upper[i])
    264 // Predictor3: Top-right.
    265 GENERATE_PREDICTOR_1(3, upper[i + 1])
    266 // Predictor4: Top-left.
    267 GENERATE_PREDICTOR_1(4, upper[i - 1])
    268 #undef GENERATE_PREDICTOR_1
    269 
    270 // Predictor5: average(average(left, TR), T)
        // One lane of predictor 5: res = in + avg(avg(L, TR), T), stored to
        // out[i+LANE]; L is then rotated so the lane just produced becomes the
        // left neighbor for the next lane (the serial left dependency).
    271 #define DO_PRED5(LANE) do {                                              \
    272  const uint8x16_t avgLTR = vhaddq_u8(L, TR);                            \
    273  const uint8x16_t avg = vhaddq_u8(avgLTR, T);                           \
    274  const uint8x16_t res = vaddq_u8(avg, src);                             \
    275  vst1q_lane_u32(&out[i + (LANE)], vreinterpretq_u32_u8(res), (LANE));   \
    276  L = ROTATE32_LEFT(res);                                                \
    277 } while (0)
    278 
        // Predictor5 batch add: 4 pixels per iteration, lanes resolved in
        // order because each depends on the previous output.
    279 static void PredictorAdd5_NEON(const uint32_t* in, const uint32_t* upper,
    280                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
    281  int i;
    282  uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
    283  for (i = 0; i + 4 <= num_pixels; i += 4) {
    284    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
    285    const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i + 0]);
    286    const uint8x16_t TR = LOADQ_U32P_AS_U8(&upper[i + 1]);
    287    DO_PRED5(0);
    288    DO_PRED5(1);
    289    DO_PRED5(2);
    290    DO_PRED5(3);
    291  }
    292  VP8LPredictorsAdd_C[5](in + i, upper + i, num_pixels - i, out + i);
    293 }
    294 #undef DO_PRED5
    295 
        // Shared lane step for predictors 6 and 7: res = in + avg(L, top),
        // where 'top' is TL (pred 6) or T (pred 7); L rotates to carry the
        // just-computed pixel into the next lane's left position.
    296 #define DO_PRED67(LANE) do {                                             \
    297  const uint8x16_t avg = vhaddq_u8(L, top);                              \
    298  const uint8x16_t res = vaddq_u8(avg, src);                             \
    299  vst1q_lane_u32(&out[i + (LANE)], vreinterpretq_u32_u8(res), (LANE));   \
    300  L = ROTATE32_LEFT(res);                                                \
    301 } while (0)
    302 
    303 // Predictor6: average(left, TL)
    304 static void PredictorAdd6_NEON(const uint32_t* in, const uint32_t* upper,
    305                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
    306  int i;
    307  uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
    308  for (i = 0; i + 4 <= num_pixels; i += 4) {
    309    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
    310    const uint8x16_t top = LOADQ_U32P_AS_U8(&upper[i - 1]);
    311    DO_PRED67(0);
    312    DO_PRED67(1);
    313    DO_PRED67(2);
    314    DO_PRED67(3);
    315  }
    316  VP8LPredictorsAdd_C[6](in + i, upper + i, num_pixels - i, out + i);
    317 }
    318 
    319 // Predictor7: average(left, T)
    320 static void PredictorAdd7_NEON(const uint32_t* in, const uint32_t* upper,
    321                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
    322  int i;
    323  uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
    324  for (i = 0; i + 4 <= num_pixels; i += 4) {
    325    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
    326    const uint8x16_t top = LOADQ_U32P_AS_U8(&upper[i]);
    327    DO_PRED67(0);
    328    DO_PRED67(1);
    329    DO_PRED67(2);
    330    DO_PRED67(3);
    331  }
    332  VP8LPredictorsAdd_C[7](in + i, upper + i, num_pixels - i, out + i);
    333 }
    334 #undef DO_PRED67
    335 
        // Generates PredictorAdd{8,9}_NEON: out[i] = in[i] + avg(T, IN), with
        // IN being TL or TR. Only top-row pixels are involved, so all four
        // lanes are independent; leftovers use the C version.
    336 #define GENERATE_PREDICTOR_2(X, IN)                                       \
    337 static void PredictorAdd##X##_NEON(const uint32_t* in,                    \
    338                                   const uint32_t* upper, int num_pixels, \
    339                                   uint32_t* WEBP_RESTRICT out) {         \
    340  int i;                                                                  \
    341  for (i = 0; i + 4 <= num_pixels; i += 4) {                              \
    342    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);                      \
    343    const uint8x16_t Tother = LOADQ_U32P_AS_U8(&(IN));                    \
    344    const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]);                     \
    345    const uint8x16_t avg = vhaddq_u8(T, Tother);                          \
    346    const uint8x16_t res = vaddq_u8(avg, src);                            \
    347    STOREQ_U8_AS_U32P(&out[i], res);                                      \
    348  }                                                                       \
    349  VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i);   \
    350 }
    351 // Predictor8: average TL T.
    352 GENERATE_PREDICTOR_2(8, upper[i - 1])
    353 // Predictor9: average T TR.
    354 GENERATE_PREDICTOR_2(9, upper[i + 1])
    355 #undef GENERATE_PREDICTOR_2
    356 
    357 // Predictor10: average of (average of (L,TL), average of (T, TR)).
        // One lane of predictor 10: res = in + avg(avg(T, TR), avg(L, TL)).
        // avgTTR is loop-invariant and hoisted; only avg(L, TL) depends on the
        // serially-updated left pixel, which is rotated in after each lane.
    358 #define DO_PRED10(LANE) do {                                             \
    359  const uint8x16_t avgLTL = vhaddq_u8(L, TL);                            \
    360  const uint8x16_t avg = vhaddq_u8(avgTTR, avgLTL);                      \
    361  const uint8x16_t res = vaddq_u8(avg, src);                             \
    362  vst1q_lane_u32(&out[i + (LANE)], vreinterpretq_u32_u8(res), (LANE));   \
    363  L = ROTATE32_LEFT(res);                                                \
    364 } while (0)
    365 
    366 static void PredictorAdd10_NEON(const uint32_t* in, const uint32_t* upper,
    367                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
    368  int i;
    369  uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
    370  for (i = 0; i + 4 <= num_pixels; i += 4) {
    371    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
    372    const uint8x16_t TL = LOADQ_U32P_AS_U8(&upper[i - 1]);
    373    const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]);
    374    const uint8x16_t TR = LOADQ_U32P_AS_U8(&upper[i + 1]);
    375    const uint8x16_t avgTTR = vhaddq_u8(T, TR);
    376    DO_PRED10(0);
    377    DO_PRED10(1);
    378    DO_PRED10(2);
    379    DO_PRED10(3);
    380  }
    381  VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i);
    382 }
    383 #undef DO_PRED10
    384 
    385 // Predictor11: select.
        // One lane of predictor 11 (select): choose T or L as predictor per
        // pixel by comparing pa = sum|L - TL| against pb = sum|T - TL| (sums
        // over the 4 channels via pairwise-add widening); pick T when
        // pa <= pb, else L, then add the residual. pb and in+T are
        // loop-invariant and computed once per 4-pixel group.
    386 #define DO_PRED11(LANE) do {                                                   \
    387  const uint8x16_t sumLin = vaddq_u8(L, src);  /* in + L */                    \
    388  const uint8x16_t pLTL = vabdq_u8(L, TL);  /* |L - TL| */                     \
    389  const uint16x8_t sum_LTL = vpaddlq_u8(pLTL);                                 \
    390  const uint32x4_t pa = vpaddlq_u16(sum_LTL);                                  \
    391  const uint32x4_t mask = vcleq_u32(pa, pb);                                   \
    392  const uint8x16_t res = vbslq_u8(vreinterpretq_u8_u32(mask), sumTin, sumLin); \
    393  vst1q_lane_u32(&out[i + (LANE)], vreinterpretq_u32_u8(res), (LANE));         \
    394  L = ROTATE32_LEFT(res);                                                      \
    395 } while (0)
    396 
    397 static void PredictorAdd11_NEON(const uint32_t* in, const uint32_t* upper,
    398                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
    399  int i;
    400  uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
    401  for (i = 0; i + 4 <= num_pixels; i += 4) {
    402    const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]);
    403    const uint8x16_t TL = LOADQ_U32P_AS_U8(&upper[i - 1]);
    404    const uint8x16_t pTTL = vabdq_u8(T, TL);   // |T - TL|
    405    const uint16x8_t sum_TTL = vpaddlq_u8(pTTL);
    406    const uint32x4_t pb = vpaddlq_u16(sum_TTL);
    407    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
    408    const uint8x16_t sumTin = vaddq_u8(T, src);   // in + T
    409    DO_PRED11(0);
    410    DO_PRED11(1);
    411    DO_PRED11(2);
    412    DO_PRED11(3);
    413  }
    414  VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
    415 }
    416 #undef DO_PRED11
    417 
    418 // Predictor12: ClampedAddSubtractFull.
        // One lane of predictor 12 (ClampedAddSubtractFull): the left pixel is
        // kept widened to 16 bits in L so clamp(L + (T - TL)) can be computed
        // with a single saturating narrow; two pixels (8 channels) share each
        // precomputed s16 difference vector, hence (LANE) & 1 selects the
        // 32-bit half to store.
    419 #define DO_PRED12(DIFF, LANE) do {                                       \
    420  const uint8x8_t pred =                                                 \
    421      vqmovun_s16(vaddq_s16(vreinterpretq_s16_u16(L), (DIFF)));          \
    422  const uint8x8_t res =                                                  \
    423      vadd_u8(pred, (LANE <= 1) ? vget_low_u8(src) : vget_high_u8(src)); \
    424  const uint16x8_t res16 = vmovl_u8(res);                                \
    425  vst1_lane_u32(&out[i + (LANE)], vreinterpret_u32_u8(res), (LANE) & 1); \
    426  /* rotate in the left predictor for next iteration */                  \
    427  L = vextq_u16(res16, res16, 4);                                        \
    428 } while (0)
    429 
    430 static void PredictorAdd12_NEON(const uint32_t* in, const uint32_t* upper,
    431                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
    432  int i;
    433  uint16x8_t L = vmovl_u8(LOAD_U32_AS_U8(out[-1]));
    434  for (i = 0; i + 4 <= num_pixels; i += 4) {
    435    // load four pixels of source
    436    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
    437    // precompute the difference T - TL once for all, stored as s16
    438    const uint8x16_t TL = LOADQ_U32P_AS_U8(&upper[i - 1]);
    439    const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]);
    440    const int16x8_t diff_lo =
    441        vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), vget_low_u8(TL)));
    442    const int16x8_t diff_hi =
    443        vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), vget_high_u8(TL)));
    444    // loop over the four reconstructed pixels
    445    DO_PRED12(diff_lo, 0);
    446    DO_PRED12(diff_lo, 1);
    447    DO_PRED12(diff_hi, 2);
    448    DO_PRED12(diff_hi, 3);
    449  }
    450  VP8LPredictorsAdd_C[12](in + i, upper + i, num_pixels - i, out + i);
    451 }
    452 #undef DO_PRED12
    453 
    454 // Predictor13: ClampedAddSubtractHalf
        // One lane of predictor 13 (ClampedAddSubtractHalf), the vectorized
        // form of ClampedAddSubtractHalf_NEON above: avg = (L + T) / 2, TL is
        // decremented where TL > avg to fix halving truncation, then
        // clamp(avg + (avg - TL') / 2) is added to the residual. LOW_OR_HI
        // picks vget_low_u8/vget_high_u8 so two lanes share each q-register.
    455 #define DO_PRED13(LANE, LOW_OR_HI) do {                                        \
    456  const uint8x16_t avg = vhaddq_u8(L, T);                                      \
    457  const uint8x16_t cmp = vcgtq_u8(TL, avg);                                    \
    458  const uint8x16_t TL_1 = vaddq_u8(TL, cmp);                                   \
    459  /* Compute half of the difference between avg and TL'. */                    \
    460  const int8x8_t diff_avg =                                                    \
    461      vreinterpret_s8_u8(LOW_OR_HI(vhsubq_u8(avg, TL_1)));                     \
    462  /* Compute the sum with avg and saturate. */                                 \
    463  const int16x8_t avg_16 = vreinterpretq_s16_u16(vmovl_u8(LOW_OR_HI(avg)));    \
    464  const uint8x8_t delta = vqmovun_s16(vaddw_s8(avg_16, diff_avg));             \
    465  const uint8x8_t res = vadd_u8(LOW_OR_HI(src), delta);                        \
    466  const uint8x16_t res2 = vcombine_u8(res, res);                               \
    467  vst1_lane_u32(&out[i + (LANE)], vreinterpret_u32_u8(res), (LANE) & 1);       \
    468  L = ROTATE32_LEFT(res2);                                                     \
    469 } while (0)
    470 
    471 static void PredictorAdd13_NEON(const uint32_t* in, const uint32_t* upper,
    472                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
    473  int i;
    474  uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
    475  for (i = 0; i + 4 <= num_pixels; i += 4) {
    476    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
    477    const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]);
    478    const uint8x16_t TL = LOADQ_U32P_AS_U8(&upper[i - 1]);
    479    DO_PRED13(0, vget_low_u8);
    480    DO_PRED13(1, vget_low_u8);
    481    DO_PRED13(2, vget_high_u8);
    482    DO_PRED13(3, vget_high_u8);
    483  }
    484  VP8LPredictorsAdd_C[13](in + i, upper + i, num_pixels - i, out + i);
    485 }
    486 #undef DO_PRED13
    487 
    488 #undef LOAD_U32_AS_U8
    489 #undef LOAD_U32P_AS_U8
    490 #undef LOADQ_U32_AS_U8
    491 #undef LOADQ_U32P_AS_U8
    492 #undef GET_U8_AS_U32
    493 #undef GETQ_U8_AS_U32
    494 #undef STOREQ_U8_AS_U32P
    495 #undef ROTATE32_LEFT
    496 
    497 //------------------------------------------------------------------------------
    498 // Subtract-Green Transform
    499 
    500 // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
    501 // non-standard versions there.
    502 #if defined(__APPLE__) && WEBP_AARCH64 && \
    503    defined(__apple_build_version__) && (__apple_build_version__< 6020037)
    504 #define USE_VTBLQ
    505 #endif
    506 
    507 #ifdef USE_VTBLQ
    508 // 255 = byte will be zeroed
    509 static const uint8_t kGreenShuffle[16] = {
    510  1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255
    511 };
    512 
        // Builds, for each ARGB pixel, a word with the green byte duplicated
        // into the blue and red positions and the other lanes zeroed
        // (out-of-range VTBL indices such as 255 read as zero).
    513 static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
    514                                                  const uint8x16_t shuffle) {
    515  return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)),
    516                     vtbl1q_u8(argb, vget_high_u8(shuffle)));
    517 }
    518 #else  // !USE_VTBLQ
    519 // 255 = byte will be zeroed
    520 static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255  };
    521 
        // Same green-duplication shuffle, built from two d-register lookups
        // (the 8-entry table is reused for both halves of the q-register).
    522 static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
    523                                                  const uint8x8_t shuffle) {
    524  return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
    525                     vtbl1_u8(vget_high_u8(argb), shuffle));
    526 }
    527 #endif  // USE_VTBLQ
    528 
        // Inverse of the subtract-green transform: adds each pixel's green
        // channel back onto its blue and red channels (mod 256), 4 pixels per
        // iteration; the tail is handled by the plain-C version.
    529 static void AddGreenToBlueAndRed_NEON(const uint32_t* src, int num_pixels,
    530                                      uint32_t* dst) {
    531  const uint32_t* const end = src + (num_pixels & ~3);
    532 #ifdef USE_VTBLQ
    533  const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
    534 #else
    535  const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
    536 #endif
    537  for (; src < end; src += 4, dst += 4) {
    538    const uint8x16_t argb = vld1q_u8((const uint8_t*)src);
    539    const uint8x16_t greens = DoGreenShuffle_NEON(argb, shuffle);
    540    vst1q_u8((uint8_t*)dst, vaddq_u8(argb, greens));
    541  }
    542  // fallthrough and finish off with plain-C
    543  VP8LAddGreenToBlueAndRed_C(src, num_pixels & 3, dst);
    544 }
    545 
    546 //------------------------------------------------------------------------------
    547 // Color Transform
    548 
        // Inverse color transform: undoes the cross-channel decorrelation by
        // adding (green * green_to_red) to red, then (green * green_to_blue +
        // red' * red_to_blue) to blue, all in 8.6 fixed point via vqdmulhq_s16
        // (the doubling multiply-high cancels the pre-shift by 6 plus sign
        // extension baked into CST). Alpha and green pass through via mask_ag.
        // 4 pixels per iteration; leftovers use the C version.
    549 static void TransformColorInverse_NEON(const VP8LMultipliers* const m,
    550                                       const uint32_t* const src,
    551                                       int num_pixels, uint32_t* dst) {
    552 // sign-extended multiplying constants, pre-shifted by 6.
    553 #define CST(X)  (((int16_t)(m->X << 8)) >> 6)
    554  const int16_t rb[8] = {
    555    CST(green_to_blue), CST(green_to_red),
    556    CST(green_to_blue), CST(green_to_red),
    557    CST(green_to_blue), CST(green_to_red),
    558    CST(green_to_blue), CST(green_to_red)
    559  };
    560  const int16x8_t mults_rb = vld1q_s16(rb);
    561  const int16_t b2[8] = {
    562    0, CST(red_to_blue), 0, CST(red_to_blue),
    563    0, CST(red_to_blue), 0, CST(red_to_blue),
    564  };
    565  const int16x8_t mults_b2 = vld1q_s16(b2);
    566 #undef CST
    567 #ifdef USE_VTBLQ
    568  static const uint8_t kg0g0[16] = {
    569    255, 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13
    570  };
    571  const uint8x16_t shuffle = vld1q_u8(kg0g0);
    572 #else
    573  static const uint8_t k0g0g[8] = { 255, 1, 255, 1, 255, 5, 255, 5 };
    574  const uint8x8_t shuffle = vld1_u8(k0g0g);
    575 #endif
    576  const uint32x4_t mask_ag = vdupq_n_u32(0xff00ff00u);
    577  int i;
    578  for (i = 0; i + 4 <= num_pixels; i += 4) {
    579    const uint8x16_t in = vld1q_u8((const uint8_t*)(src + i));
    580    const uint32x4_t a0g0 = vandq_u32(vreinterpretq_u32_u8(in), mask_ag);
    581    // 0 g 0 g
    582    const uint8x16_t greens = DoGreenShuffle_NEON(in, shuffle);
    583    // x dr  x db1
    584    const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb);
    585    // x r'  x   b'
    586    const int8x16_t B = vaddq_s8(vreinterpretq_s8_u8(in),
    587                                 vreinterpretq_s8_s16(A));
    588    // r' 0   b' 0
    589    const int16x8_t C = vshlq_n_s16(vreinterpretq_s16_s8(B), 8);
    590    // x db2  0  0
    591    const int16x8_t D = vqdmulhq_s16(C, mults_b2);
    592    // 0  x db2  0
    593    const uint32x4_t E = vshrq_n_u32(vreinterpretq_u32_s16(D), 8);
    594    // r' x  b'' 0
    595    const int8x16_t F = vaddq_s8(vreinterpretq_s8_u32(E),
    596                                 vreinterpretq_s8_s16(C));
    597    // 0  r'  0  b''
    598    const uint16x8_t G = vshrq_n_u16(vreinterpretq_u16_s8(F), 8);
    599    const uint32x4_t out = vorrq_u32(vreinterpretq_u32_u16(G), a0g0);
    600    vst1q_u32(dst + i, out);
    601  }
    602  // Fall-back to C-version for left-overs.
    603  VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
    604 }
    605 
    606 #undef USE_VTBLQ
    607 
    608 //------------------------------------------------------------------------------
    609 // Entry point
    610 
    611 extern void VP8LDspInitNEON(void);
    612 
        // Entry point: installs the NEON implementations into the global VP8L
        // function-pointer tables, overriding the C defaults. Slots without a
        // NEON version (predictors 0-4 and 8-12 of VP8LPredictors) are left
        // untouched.
    613 WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitNEON(void) {
    614  VP8LPredictors[5] = Predictor5_NEON;
    615  VP8LPredictors[6] = Predictor6_NEON;
    616  VP8LPredictors[7] = Predictor7_NEON;
    617  VP8LPredictors[13] = Predictor13_NEON;
    618 
    619  VP8LPredictorsAdd[0] = PredictorAdd0_NEON;
    620  VP8LPredictorsAdd[1] = PredictorAdd1_NEON;
    621  VP8LPredictorsAdd[2] = PredictorAdd2_NEON;
    622  VP8LPredictorsAdd[3] = PredictorAdd3_NEON;
    623  VP8LPredictorsAdd[4] = PredictorAdd4_NEON;
    624  VP8LPredictorsAdd[5] = PredictorAdd5_NEON;
    625  VP8LPredictorsAdd[6] = PredictorAdd6_NEON;
    626  VP8LPredictorsAdd[7] = PredictorAdd7_NEON;
    627  VP8LPredictorsAdd[8] = PredictorAdd8_NEON;
    628  VP8LPredictorsAdd[9] = PredictorAdd9_NEON;
    629  VP8LPredictorsAdd[10] = PredictorAdd10_NEON;
    630  VP8LPredictorsAdd[11] = PredictorAdd11_NEON;
    631  VP8LPredictorsAdd[12] = PredictorAdd12_NEON;
    632  VP8LPredictorsAdd[13] = PredictorAdd13_NEON;
    633 
    634  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_NEON;
    635  VP8LConvertBGRAToBGR = ConvertBGRAToBGR_NEON;
    636  VP8LConvertBGRAToRGB = ConvertBGRAToRGB_NEON;
    637 
    638  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_NEON;
    639  VP8LTransformColorInverse = TransformColorInverse_NEON;
    640 }
    641 
    642 #else  // !WEBP_USE_NEON
    643 
    644 WEBP_DSP_INIT_STUB(VP8LDspInitNEON)
    645 
    646 #endif  // WEBP_USE_NEON