tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

enc_neon.c (48427B)


      1 // Copyright 2012 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 // ARM NEON version of speed-critical encoding functions.
     11 //
     12 // adapted from libvpx (https://www.webmproject.org/code/)
     13 
     14 #include "src/dsp/dsp.h"
     15 
     16 #if defined(WEBP_USE_NEON)
     17 
     18 #include <assert.h>
     19 
     20 #include "src/dsp/neon.h"
     21 #include "src/enc/vp8i_enc.h"
     22 
     23 //------------------------------------------------------------------------------
     24 // Transforms (Paragraph 14.4)
     25 
     26 // Inverse transform.
     27 // This code is pretty much the same as TransformOne in the dec_neon.c, except
     28 // for subtraction to *ref. See the comments there for algorithmic explanations.
     29 
// Inverse-transform multipliers (Paragraph 14.4). kC2 is stored pre-halved:
// the vqdmulh "doubling multiply high" used below (TransformPass_NEON and the
// inline-asm path) doubles the product, so halving the constant restores the
// intended scale. kC1's product is halved after the multiply instead.
static const int16_t kC1 = WEBP_TRANSFORM_AC3_C1;
static const int16_t kC2 =
    WEBP_TRANSFORM_AC3_C2 / 2;  // half of kC2, actually. See comment above.
     33 
     34 // This code works but is *slower* than the inlined-asm version below
     35 // (with gcc-4.6). So we disable it for now. Later, it'll be conditional to
     36 // WEBP_USE_INTRINSICS define.
     37 // With gcc-4.8, it's a little faster than the inlined assembly.
     38 #if defined(WEBP_USE_INTRINSICS)
     39 
     40 // Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
     41 static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint32x2_t v) {
     42  return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
     43 }
     44 
     45 // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
     46 // to the corresponding rows of 'dst'.
     47 static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
     48                                                 const int16x8_t dst01,
     49                                                 const int16x8_t dst23) {
     50  // Unsigned saturate to 8b.
     51  const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
     52  const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
     53 
     54  // Store the results.
     55  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
     56  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
     57  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
     58  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
     59 }
     60 
// Adds the descaled residual rows 'row01'/'row23' to the 4x4 block at 'ref'
// and stores the saturated 8-bit result to 'dst'.
static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
                                    const int16x8_t row23,
                                    const uint8_t* WEBP_RESTRICT const ref,
                                    uint8_t* WEBP_RESTRICT const dst) {
  uint32x2_t dst01 = vdup_n_u32(0);
  uint32x2_t dst23 = vdup_n_u32(0);

  // Load the source pixels, one 4-pixel row per 32-bit lane.
  dst01 = vld1_lane_u32((uint32_t*)(ref + 0 * BPS), dst01, 0);
  dst23 = vld1_lane_u32((uint32_t*)(ref + 2 * BPS), dst23, 0);
  dst01 = vld1_lane_u32((uint32_t*)(ref + 1 * BPS), dst01, 1);
  dst23 = vld1_lane_u32((uint32_t*)(ref + 3 * BPS), dst23, 1);

  {
    // Convert to 16b.
    const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(dst01);
    const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(dst23);

    // Descale with rounding: out = ref + ((row + 4) >> 3), done in a single
    // rounding shift-right-and-accumulate per register pair.
    const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
    const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
    // Add the inverse transform.
    SaturateAndStore4x4_NEON(dst, out01, out23);
  }
}
     86 
// Transposes the two 4x4 halves held in 'in0'/'in1' by zipping twice.
static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
                                          const int16x8_t in1,
                                          int16x8x2_t* const out) {
  // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
  // c0 c1 c2 c3 | d0 d1 d2 d3      a2 b2 c2 d2 | a3 b3 c3 d3
  const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
                                                  // b0 d0 b1 d1 b2 d2 ...
  *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
}
     96 
// One 1-D pass of the inverse transform over the four vectors in 'rows'.
// The output is left transposed (via Transpose8x2_NEON), so running this
// twice performs the full 2-D transform.
static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
  // {rows} = in0 | in4
  //          in8 | in12
  // B1 = in4 | in12
  const int16x8_t B1 =
      vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
  // C0 = kC1 * in4 | kC1 * in12
  // C1 = kC2 * in4 | kC2 * in12
  // (vqdmulh doubles the product; C0 halves it back with the accumulating
  // shift, C1 relies on kC2 being stored pre-halved.)
  const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
  const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
  const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));   // in0 + in8
  const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));   // in0 - in8
  // c = kC2 * in4 - kC1 * in12
  // d = kC1 * in4 + kC2 * in12
  const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
  const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
  const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
  const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
  const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
  const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
  // Swap halves so rows come out in natural order: b-c | a-d.
  const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
  Transpose8x2_NEON(E0, E1, rows);
}
    122 
    123 static void ITransformOne_NEON(const uint8_t* WEBP_RESTRICT ref,
    124                               const int16_t* WEBP_RESTRICT in,
    125                               uint8_t* WEBP_RESTRICT dst) {
    126  int16x8x2_t rows;
    127  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
    128  TransformPass_NEON(&rows);
    129  TransformPass_NEON(&rows);
    130  Add4x4_NEON(rows.val[0], rows.val[1], ref, dst);
    131 }
    132 
    133 #else
    134 
// Inverse transform of one 4x4 block (inline-asm variant): dst = ref +
// IDCT(in). Mirrors the intrinsics version above; see TransformPass_NEON
// for the algorithm.
static void ITransformOne_NEON(const uint8_t* WEBP_RESTRICT ref,
                               const int16_t* WEBP_RESTRICT in,
                               uint8_t* WEBP_RESTRICT dst) {
  const int kBPS = BPS;
  // d0[0] = kC1, d0[1] = kC2 (pre-halved, see comment at the top of file).
  const int16_t kC1C2[] = { kC1, kC2, 0, 0 };

  // NOTE(review): '[ref]' is declared as an input operand but the asm
  // post-increments it and then restores it with the 'sub ... lsl #2' —
  // its net value is unchanged, yet GCC's extended-asm contract says inputs
  // must not be modified. Confirm against upstream before touching this.
  __asm__ volatile (
    "vld1.16         {q1, q2}, [%[in]]           \n"
    "vld1.16         {d0}, [%[kC1C2]]            \n"

    // d2: in[0]
    // d3: in[8]
    // d4: in[4]
    // d5: in[12]
    "vswp            d3, d4                      \n"

    // q8 = {in[4], in[12]} * kC1 * 2 >> 16
    // q9 = {in[4], in[12]} * kC2 >> 16
    "vqdmulh.s16     q8, q2, d0[0]               \n"
    "vqdmulh.s16     q9, q2, d0[1]               \n"

    // d22 = a = in[0] + in[8]
    // d23 = b = in[0] - in[8]
    "vqadd.s16       d22, d2, d3                 \n"
    "vqsub.s16       d23, d2, d3                 \n"

    //  q8 = in[4]/[12] * kC1 >> 16
    "vshr.s16        q8, q8, #1                  \n"

    // Add {in[4], in[12]} back after the multiplication.
    "vqadd.s16       q8, q2, q8                  \n"

    // d20 = c = in[4]*kC2 - in[12]*kC1
    // d21 = d = in[4]*kC1 + in[12]*kC2
    "vqsub.s16       d20, d18, d17               \n"
    "vqadd.s16       d21, d19, d16               \n"

    // d2 = tmp[0] = a + d
    // d3 = tmp[1] = b + c
    // d4 = tmp[2] = b - c
    // d5 = tmp[3] = a - d
    "vqadd.s16       d2, d22, d21                \n"
    "vqadd.s16       d3, d23, d20                \n"
    "vqsub.s16       d4, d23, d20                \n"
    "vqsub.s16       d5, d22, d21                \n"

    // Transpose the 4x4 block (two zips, like Transpose8x2_NEON above).
    "vzip.16         q1, q2                      \n"
    "vzip.16         q1, q2                      \n"

    "vswp            d3, d4                      \n"

    // Second pass, identical to the first but on the transposed rows.
    // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
    // q9 = {tmp[4], tmp[12]} * kC2 >> 16
    "vqdmulh.s16     q8, q2, d0[0]               \n"
    "vqdmulh.s16     q9, q2, d0[1]               \n"

    // d22 = a = tmp[0] + tmp[8]
    // d23 = b = tmp[0] - tmp[8]
    "vqadd.s16       d22, d2, d3                 \n"
    "vqsub.s16       d23, d2, d3                 \n"

    "vshr.s16        q8, q8, #1                  \n"
    "vqadd.s16       q8, q2, q8                  \n"

    // d20 = c = in[4]*kC2 - in[12]*kC1
    // d21 = d = in[4]*kC1 + in[12]*kC2
    "vqsub.s16       d20, d18, d17               \n"
    "vqadd.s16       d21, d19, d16               \n"

    // d2 = tmp[0] = a + d
    // d3 = tmp[1] = b + c
    // d4 = tmp[2] = b - c
    // d5 = tmp[3] = a - d
    "vqadd.s16       d2, d22, d21                \n"
    "vqadd.s16       d3, d23, d20                \n"
    "vqsub.s16       d4, d23, d20                \n"
    "vqsub.s16       d5, d22, d21                \n"

    // Load the four reference rows, one per 32-bit lane.
    "vld1.32         d6[0], [%[ref]], %[kBPS]    \n"
    "vld1.32         d6[1], [%[ref]], %[kBPS]    \n"
    "vld1.32         d7[0], [%[ref]], %[kBPS]    \n"
    "vld1.32         d7[1], [%[ref]], %[kBPS]    \n"

    // Restore 'ref' to its original value.
    "sub         %[ref], %[ref], %[kBPS], lsl #2 \n"

    // (val) + 4 >> 3
    "vrshr.s16       d2, d2, #3                  \n"
    "vrshr.s16       d3, d3, #3                  \n"
    "vrshr.s16       d4, d4, #3                  \n"
    "vrshr.s16       d5, d5, #3                  \n"

    "vzip.16         q1, q2                      \n"
    "vzip.16         q1, q2                      \n"

    // Must accumulate before saturating
    "vmovl.u8        q8, d6                      \n"
    "vmovl.u8        q9, d7                      \n"

    "vqadd.s16       q1, q1, q8                  \n"
    "vqadd.s16       q2, q2, q9                  \n"

    "vqmovun.s16     d0, q1                      \n"
    "vqmovun.s16     d1, q2                      \n"

    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[1], [%[dst]]             \n"

    : [in] "+r"(in), [dst] "+r"(dst)               // modified registers
    : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref)  // constants
    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  // clobbered
  );
}
    249 
    250 #endif    // WEBP_USE_INTRINSICS
    251 
    252 static void ITransform_NEON(const uint8_t* WEBP_RESTRICT ref,
    253                            const int16_t* WEBP_RESTRICT in,
    254                            uint8_t* WEBP_RESTRICT dst, int do_two) {
    255  ITransformOne_NEON(ref, in, dst);
    256  if (do_two) {
    257    ITransformOne_NEON(ref + 4, in + 16, dst + 4);
    258  }
    259 }
    260 
    261 // Load all 4x4 pixels into a single uint8x16_t variable.
    262 static uint8x16_t Load4x4_NEON(const uint8_t* src) {
    263  uint32x4_t out = vdupq_n_u32(0);
    264  out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
    265  out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
    266  out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
    267  out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
    268  return vreinterpretq_u8_u32(out);
    269 }
    270 
    271 // Forward transform.
    272 
    273 #if defined(WEBP_USE_INTRINSICS)
    274 
// Transposes the 4x4 block held in rows A..D. The output is packed in pairs:
// '*out01' holds transposed rows 0|1 and '*out32' holds transposed rows 3|2
// (note the reversed order, which is what the FTransform passes consume).
static WEBP_INLINE void Transpose4x4_S16_NEON(const int16x4_t A,
                                              const int16x4_t B,
                                              const int16x4_t C,
                                              const int16x4_t D,
                                              int16x8_t* const out01,
                                              int16x8_t* const out32) {
  // 16-bit transpose within row pairs, then a 32-bit transpose across the
  // pairs completes the 4x4 transpose.
  const int16x4x2_t AB = vtrn_s16(A, B);
  const int16x4x2_t CD = vtrn_s16(C, D);
  const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
                                     vreinterpret_s32_s16(CD.val[0]));
  const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
                                     vreinterpret_s32_s16(CD.val[1]));
  *out01 = vreinterpretq_s16_s64(
      vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]),
                   vreinterpret_s64_s32(tmp13.val[0])));
  *out32 = vreinterpretq_s16_s64(
      vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]),
                   vreinterpret_s64_s32(tmp02.val[1])));
}
    294 
    295 static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
    296                                              const uint8x8_t b) {
    297  return vreinterpretq_s16_u16(vsubl_u8(a, b));
    298 }
    299 
// Forward transform: computes the 4x4 transform of the pixel difference
// (src - ref) and stores the 16 coefficients to 'out'.
static void FTransform_NEON(const uint8_t* WEBP_RESTRICT src,
                            const uint8_t* WEBP_RESTRICT ref,
                            int16_t* WEBP_RESTRICT out) {
  int16x8_t d0d1, d3d2;   // working 4x4 int16 variables
  {
    // Compute the 4x4 difference block and transpose it so the pass below
    // operates on columns held in vector lanes.
    const uint8x16_t S0 = Load4x4_NEON(src);
    const uint8x16_t R0 = Load4x4_NEON(ref);
    const int16x8_t D0D1 = DiffU8ToS16_NEON(vget_low_u8(S0), vget_low_u8(R0));
    const int16x8_t D2D3 = DiffU8ToS16_NEON(vget_high_u8(S0), vget_high_u8(R0));
    const int16x4_t D0 = vget_low_s16(D0D1);
    const int16x4_t D1 = vget_high_s16(D0D1);
    const int16x4_t D2 = vget_low_s16(D2D3);
    const int16x4_t D3 = vget_high_s16(D2D3);
    Transpose4x4_S16_NEON(D0, D1, D2, D3, &d0d1, &d3d2);
  }
  {    // 1st pass
    const int32x4_t kCst937 = vdupq_n_s32(937);
    const int32x4_t kCst1812 = vdupq_n_s32(1812);
    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
    const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);  // (a0|a1) << 3
    const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
                                    vget_high_s16(a0a1_2));  // (a0+a1) << 3
    const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
                                    vget_high_s16(a0a1_2));  // (a0-a1) << 3
    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
    // tmp1 = (a2*2217 + a3*5352 + 1812) >> 9
    // tmp3 = (a3*2217 - a2*5352 +  937) >> 9
    const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
    const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
    Transpose4x4_S16_NEON(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
  }
  {    // 2nd pass
    // the (1<<16) addition is for the replacement: a3!=0  <-> 1-(a3==0)
    const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
    const int32x4_t kCst51000 = vdupq_n_s32(51000);
    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
    const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
    const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
    const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
    // vaddhn keeps the high 16 bits of the 32-bit sum, i.e. a built-in >> 16.
    const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
    const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
    const int16x4_t a3_eq_0 =
        vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
    // out1 = tmp1 + (a3 != 0): vceq yields all-ones (-1) where a3 == 0,
    // cancelling the extra (1 << 16) folded into kCst12000 above.
    const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
    vst1_s16(out +  0, out0);
    vst1_s16(out +  4, out1);
    vst1_s16(out +  8, out2);
    vst1_s16(out + 12, out3);
  }
}
    357 
    358 #else
    359 
    360 // adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
// Constants for the assembly FTransform below: kCoeff16 holds the two 16-bit
// multipliers (5352, 2217) replicated across vector lanes, and kCoeff32 the
// four 32-bit rounding constants, one full vector each.
static const int16_t kCoeff16[] = {
  5352,  5352,  5352, 5352, 2217,  2217,  2217, 2217
};
static const int32_t kCoeff32[] = {
   1812,  1812,  1812,  1812,
    937,   937,   937,   937,
  12000, 12000, 12000, 12000,
  51000, 51000, 51000, 51000
};
    370 
// Forward transform (inline-asm variant): computes the 4x4 transform of
// (src - ref) into 'out'. Mirrors the intrinsics version above.
static void FTransform_NEON(const uint8_t* WEBP_RESTRICT src,
                            const uint8_t* WEBP_RESTRICT ref,
                            int16_t* WEBP_RESTRICT out) {
  const int kBPS = BPS;
  const uint8_t* src_ptr = src;
  const uint8_t* ref_ptr = ref;
  const int16_t* coeff16 = kCoeff16;
  const int32_t* coeff32 = kCoeff32;

  __asm__ volatile (
    // load src into q4, q5 in high half
    "vld1.8 {d8},  [%[src_ptr]], %[kBPS]      \n"
    "vld1.8 {d10}, [%[src_ptr]], %[kBPS]      \n"
    "vld1.8 {d9},  [%[src_ptr]], %[kBPS]      \n"
    "vld1.8 {d11}, [%[src_ptr]]               \n"

    // load ref into q6, q7 in high half
    "vld1.8 {d12}, [%[ref_ptr]], %[kBPS]      \n"
    "vld1.8 {d14}, [%[ref_ptr]], %[kBPS]      \n"
    "vld1.8 {d13}, [%[ref_ptr]], %[kBPS]      \n"
    "vld1.8 {d15}, [%[ref_ptr]]               \n"

    // Pack the high values in to q4 and q6
    "vtrn.32     q4, q5                       \n"
    "vtrn.32     q6, q7                       \n"

    // d[0-3] = src - ref
    "vsubl.u8    q0, d8, d12                  \n"
    "vsubl.u8    q1, d9, d13                  \n"

    // load coeff16 into q8(d16=5352, d17=2217)
    "vld1.16     {q8}, [%[coeff16]]           \n"

    // load coeff32 high half into q9 = 1812, q10 = 937
    "vld1.32     {q9, q10}, [%[coeff32]]!     \n"

    // load coeff32 low half into q11=12000, q12=51000
    "vld1.32     {q11,q12}, [%[coeff32]]      \n"

    // part 1
    // Transpose. Register dN is the same as dN in C
    "vtrn.32         d0, d2                   \n"
    "vtrn.32         d1, d3                   \n"
    "vtrn.16         d0, d1                   \n"
    "vtrn.16         d2, d3                   \n"

    "vadd.s16        d4, d0, d3               \n" // a0 = d0 + d3
    "vadd.s16        d5, d1, d2               \n" // a1 = d1 + d2
    "vsub.s16        d6, d1, d2               \n" // a2 = d1 - d2
    "vsub.s16        d7, d0, d3               \n" // a3 = d0 - d3

    "vadd.s16        d0, d4, d5               \n" // a0 + a1
    "vshl.s16        d0, d0, #3               \n" // temp[0+i*4] = (a0+a1) << 3
    "vsub.s16        d2, d4, d5               \n" // a0 - a1
    "vshl.s16        d2, d2, #3               \n" // (temp[2+i*4] = (a0-a1) << 3

    "vmlal.s16       q9, d7, d16              \n" // a3*5352 + 1812
    "vmlal.s16       q10, d7, d17             \n" // a3*2217 + 937
    "vmlal.s16       q9, d6, d17              \n" // a2*2217 + a3*5352 + 1812
    "vmlsl.s16       q10, d6, d16             \n" // a3*2217 + 937 - a2*5352

    // temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9
    // temp[3+i*4] = (d3*2217 + 937 - d2*5352) >> 9
    "vshrn.s32       d1, q9, #9               \n"
    "vshrn.s32       d3, q10, #9              \n"

    // part 2
    // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
    "vtrn.32         d0, d2                   \n"
    "vtrn.32         d1, d3                   \n"
    "vtrn.16         d0, d1                   \n"
    "vtrn.16         d2, d3                   \n"

    "vmov.s16        d26, #7                  \n"

    "vadd.s16        d4, d0, d3               \n" // a1 = ip[0] + ip[12]
    "vadd.s16        d5, d1, d2               \n" // b1 = ip[4] + ip[8]
    "vsub.s16        d6, d1, d2               \n" // c1 = ip[4] - ip[8]
    "vadd.s16        d4, d4, d26              \n" // a1 + 7
    "vsub.s16        d7, d0, d3               \n" // d1 = ip[0] - ip[12]

    "vadd.s16        d0, d4, d5               \n" // op[0] = a1 + b1 + 7
    "vsub.s16        d2, d4, d5               \n" // op[8] = a1 - b1 + 7

    "vmlal.s16       q11, d7, d16             \n" // d1*5352 + 12000
    "vmlal.s16       q12, d7, d17             \n" // d1*2217 + 51000

    "vceq.s16        d4, d7, #0               \n"

    "vshr.s16        d0, d0, #4               \n"
    "vshr.s16        d2, d2, #4               \n"

    "vmlal.s16       q11, d6, d17             \n" // c1*2217 + d1*5352 + 12000
    "vmlsl.s16       q12, d6, d16             \n" // d1*2217 - c1*5352 + 51000

    "vmvn            d4, d4                   \n" // !(d1 == 0)
    // op[4] = (c1*2217 + d1*5352 + 12000)>>16
    "vshrn.s32       d1, q11, #16             \n"
    // op[4] += (d1!=0)
    "vsub.s16        d1, d1, d4               \n"
    // op[12]= (d1*2217 - c1*5352 + 51000)>>16
    "vshrn.s32       d3, q12, #16             \n"

    // set result to out array
    "vst1.16         {q0, q1}, [%[out]]   \n"
    : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
      [coeff32] "+r"(coeff32)          // modified registers
    : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
      [out] "r"(out)                   // constants
    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
      "q10", "q11", "q12", "q13"       // clobbered
  );
}
    484 
    485 #endif
    486 
// Loads one int16 lane from 'src' into VALUE and advances 'src' by 'stride'
// ('src' and 'stride' come from the enclosing scope at expansion time).
#define LOAD_LANE_16b(VALUE, LANE) do {             \
  (VALUE) = vld1_lane_s16(src, (VALUE), (LANE));    \
  src += stride;                                    \
} while (0)

// Forward Walsh-Hadamard transform of the 16 values gathered from 'src'
// with a stride of 16. The second pass halves its results (vhadd/vhsub)
// before narrowing back to int16 and storing to 'out'.
static void FTransformWHT_NEON(const int16_t* WEBP_RESTRICT src,
                               int16_t* WEBP_RESTRICT out) {
  const int stride = 16;
  const int16x4_t zero = vdup_n_s16(0);
  int32x4x4_t tmp0;
  int16x4x4_t in;
  INIT_VECTOR4(in, zero, zero, zero, zero);
  // Gather the 4x4 input: lane j of in.val[i] is src[(4 * j + i) * 16],
  // i.e. in.val[i] holds column i across the four rows.
  LOAD_LANE_16b(in.val[0], 0);
  LOAD_LANE_16b(in.val[1], 0);
  LOAD_LANE_16b(in.val[2], 0);
  LOAD_LANE_16b(in.val[3], 0);
  LOAD_LANE_16b(in.val[0], 1);
  LOAD_LANE_16b(in.val[1], 1);
  LOAD_LANE_16b(in.val[2], 1);
  LOAD_LANE_16b(in.val[3], 1);
  LOAD_LANE_16b(in.val[0], 2);
  LOAD_LANE_16b(in.val[1], 2);
  LOAD_LANE_16b(in.val[2], 2);
  LOAD_LANE_16b(in.val[3], 2);
  LOAD_LANE_16b(in.val[0], 3);
  LOAD_LANE_16b(in.val[1], 3);
  LOAD_LANE_16b(in.val[2], 3);
  LOAD_LANE_16b(in.val[3], 3);

  {
    // First (vertical) pass, widened to 32 bits to avoid overflow.
    // a0 = in[0 * 16] + in[2 * 16]
    // a1 = in[1 * 16] + in[3 * 16]
    // a2 = in[1 * 16] - in[3 * 16]
    // a3 = in[0 * 16] - in[2 * 16]
    const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]);
    const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]);
    const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]);
    const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]);
    tmp0.val[0] = vaddq_s32(a0, a1);
    tmp0.val[1] = vaddq_s32(a3, a2);
    tmp0.val[2] = vsubq_s32(a3, a2);
    tmp0.val[3] = vsubq_s32(a0, a1);
  }
  {
    // Second (horizontal) pass on the transposed intermediate.
    const int32x4x4_t tmp1 = Transpose4x4_NEON(tmp0);
    // a0 = tmp[0 + i] + tmp[ 8 + i]
    // a1 = tmp[4 + i] + tmp[12 + i]
    // a2 = tmp[4 + i] - tmp[12 + i]
    // a3 = tmp[0 + i] - tmp[ 8 + i]
    const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]);
    const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]);
    const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]);
    const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]);
    const int32x4_t b0 = vhaddq_s32(a0, a1);  // (a0 + a1) >> 1
    const int32x4_t b1 = vhaddq_s32(a3, a2);  // (a3 + a2) >> 1
    const int32x4_t b2 = vhsubq_s32(a3, a2);  // (a3 - a2) >> 1
    const int32x4_t b3 = vhsubq_s32(a0, a1);  // (a0 - a1) >> 1
    const int16x4_t out0 = vmovn_s32(b0);
    const int16x4_t out1 = vmovn_s32(b1);
    const int16x4_t out2 = vmovn_s32(b2);
    const int16x4_t out3 = vmovn_s32(b3);

    vst1_s16(out +  0, out0);
    vst1_s16(out +  4, out1);
    vst1_s16(out +  8, out2);
    vst1_s16(out + 12, out3);
  }
}
#undef LOAD_LANE_16b
    556 
    557 //------------------------------------------------------------------------------
    558 // Texture distortion
    559 //
    560 // We try to match the spectral content (weighted) between source and
    561 // reconstructed samples.
    562 
    563 // a 0123, b 0123
    564 // a 4567, b 4567
    565 // a 89ab, b 89ab
    566 // a cdef, b cdef
    567 //
    568 // transpose
    569 //
    570 // a 048c, b 048c
    571 // a 159d, b 159d
    572 // a 26ae, b 26ae
    573 // a 37bf, b 37bf
    574 //
// 4x4 transpose applied to both interleaved blocks at once: lanes 0-3 of
// each register hold block 'a', lanes 4-7 block 'b', and vtrnq keeps the
// two halves independent (see the a/b diagram above).
static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16_NEON(int16x8x4_t q4_in) {
  const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
  const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
  const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
                                        vreinterpretq_s32_s16(q2_tmp1.val[0]));
  const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]),
                                        vreinterpretq_s32_s16(q2_tmp1.val[1]));
  q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]);
  q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]);
  q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]);
  q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]);
  return q4_in;
}
    588 
// Horizontal pass of the Hadamard transform. Unlike the vertical pass, the
// absolute value needed by the weighted distortion sum is folded in here
// (vabsq / vabdq), so the outputs are |tmp[i]| rather than tmp[i].
static WEBP_INLINE int16x8x4_t DistoHorizontalPass_NEON(
    const int16x8x4_t q4_in) {
  // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
  // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
  const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
  const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]);
  const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]);
  const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]);
  int16x8x4_t q4_out;
  // tmp[0] = a0 + a1
  // tmp[1] = a3 + a2
  // tmp[2] = a3 - a2   (vabd computes |a3 - a2| directly)
  // tmp[3] = a0 - a1
  INIT_VECTOR4(q4_out,
               vabsq_s16(vaddq_s16(q_a0, q_a1)),
               vabsq_s16(vaddq_s16(q_a3, q_a2)),
               vabdq_s16(q_a3, q_a2), vabdq_s16(q_a0, q_a1));
  return q4_out;
}
    608 
// Vertical pass of the Hadamard transform, on both input blocks at once:
// each 'q4_in.val[i]' holds a 4-pixel row of block 'a' in its low half and
// the matching row of block 'b' in its high half (see Disto4x4_NEON loads).
static WEBP_INLINE int16x8x4_t DistoVerticalPass_NEON(const uint8x8x4_t q4_in) {
  // Widening byte add/sub; sums fit in int16 and the difference's u16 bit
  // pattern is its two's-complement s16 value, so the reinterprets are exact.
  const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0],
                                                        q4_in.val[2]));
  const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1],
                                                        q4_in.val[3]));
  const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[1],
                                                        q4_in.val[3]));
  const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[0],
                                                        q4_in.val[2]));
  int16x8x4_t q4_out;

  INIT_VECTOR4(q4_out,
               vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2),
               vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1));
  return q4_out;
}
    625 
    626 static WEBP_INLINE int16x4x4_t DistoLoadW_NEON(const uint16_t* w) {
    627  const uint16x8_t q_w07 = vld1q_u16(&w[0]);
    628  const uint16x8_t q_w8f = vld1q_u16(&w[8]);
    629  int16x4x4_t d4_w;
    630  INIT_VECTOR4(d4_w,
    631               vget_low_s16(vreinterpretq_s16_u16(q_w07)),
    632               vget_high_s16(vreinterpretq_s16_u16(q_w07)),
    633               vget_low_s16(vreinterpretq_s16_u16(q_w8f)),
    634               vget_high_s16(vreinterpretq_s16_u16(q_w8f)));
    635  return d4_w;
    636 }
    637 
// Weighted sum over both interleaved blocks in one go: the low half of each
// 'q4_in.val[]' (block 'a') is accumulated positively (vmull) and the high
// half (block 'b') subtracted (vmlsl), yielding sum_a - sum_b directly.
// The result is reduced to a single value replicated in both output lanes.
static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
                                           const int16x4x4_t d4_w) {
  int32x2_t d_sum;
  // sum += w[ 0] * abs(b0);
  // sum += w[ 4] * abs(b1);
  // sum += w[ 8] * abs(b2);
  // sum += w[12] * abs(b3);
  int32x4_t q_sum0 = vmull_s16(d4_w.val[0], vget_low_s16(q4_in.val[0]));
  int32x4_t q_sum1 = vmull_s16(d4_w.val[1], vget_low_s16(q4_in.val[1]));
  int32x4_t q_sum2 = vmull_s16(d4_w.val[2], vget_low_s16(q4_in.val[2]));
  int32x4_t q_sum3 = vmull_s16(d4_w.val[3], vget_low_s16(q4_in.val[3]));
  q_sum0 = vmlsl_s16(q_sum0, d4_w.val[0], vget_high_s16(q4_in.val[0]));
  q_sum1 = vmlsl_s16(q_sum1, d4_w.val[1], vget_high_s16(q4_in.val[1]));
  q_sum2 = vmlsl_s16(q_sum2, d4_w.val[2], vget_high_s16(q4_in.val[2]));
  q_sum3 = vmlsl_s16(q_sum3, d4_w.val[3], vget_high_s16(q4_in.val[3]));

  // Pairwise reduction of the four partial vectors to one scalar.
  q_sum0 = vaddq_s32(q_sum0, q_sum1);
  q_sum2 = vaddq_s32(q_sum2, q_sum3);
  q_sum2 = vaddq_s32(q_sum0, q_sum2);
  d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2));
  d_sum = vpadd_s32(d_sum, d_sum);
  return d_sum;
}
    661 
// Loads 4 bytes from 'src' into the given 32-bit lane of VALUE.
// NOTE(review): the (const uint32_t*) cast assumes 'src' may be read as a
// 32-bit word; vld1_lane_u32 tolerates unaligned addresses on NEON, but the
// cast itself is a type-punning idiom used throughout this file.
#define LOAD_LANE_32b(src, VALUE, LANE) \
    (VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))

// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
static int Disto4x4_NEON(const uint8_t* WEBP_RESTRICT const a,
                         const uint8_t* WEBP_RESTRICT const b,
                         const uint16_t* WEBP_RESTRICT const w) {
  uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
  uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
  uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
  uint32x2_t d_in_ab_cdef = vdup_n_u32(0);
  uint8x8x4_t d4_in;

  // load data a, b: each 64-bit register holds one 4-pixel row of 'a' in
  // lane 0 and the matching row of 'b' in lane 1, so both blocks are
  // transformed together.
  LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0);
  LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0);
  LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0);
  LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0);
  LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1);
  LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1);
  LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1);
  LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1);
  INIT_VECTOR4(d4_in,
               vreinterpret_u8_u32(d_in_ab_0123),
               vreinterpret_u8_u32(d_in_ab_4567),
               vreinterpret_u8_u32(d_in_ab_89ab),
               vreinterpret_u8_u32(d_in_ab_cdef));

  {
    // Vertical pass first to avoid a transpose (vertical and horizontal passes
    // are commutative because w/kWeightY is symmetric) and subsequent
    // transpose.
    const int16x8x4_t q4_v = DistoVerticalPass_NEON(d4_in);
    const int16x4x4_t d4_w = DistoLoadW_NEON(w);
    // horizontal pass
    const int16x8x4_t q4_t = DistoTranspose4x4S16_NEON(q4_v);
    const int16x8x4_t q4_h = DistoHorizontalPass_NEON(q4_t);
    int32x2_t d_sum = DistoSum_NEON(q4_h, d4_w);

    // abs(sum2 - sum1) >> 5
    d_sum = vabs_s32(d_sum);
    d_sum = vshr_n_s32(d_sum, 5);
    return vget_lane_s32(d_sum, 0);
  }
}
#undef LOAD_LANE_32b
    710 
    711 static int Disto16x16_NEON(const uint8_t* WEBP_RESTRICT const a,
    712                           const uint8_t* WEBP_RESTRICT const b,
    713                           const uint16_t* WEBP_RESTRICT const w) {
    714  int D = 0;
    715  int x, y;
    716  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    717    for (x = 0; x < 16; x += 4) {
    718      D += Disto4x4_NEON(a + x + y, b + x + y, w);
    719    }
    720  }
    721  return D;
    722 }
    723 
    724 //------------------------------------------------------------------------------
    725 
// Builds the coefficient histogram for the 4x4 blocks in
// [start_block, end_block): each block is forward-transformed, then every
// coefficient is binned as min(|coeff| >> 3, MAX_COEFF_THRESH).
static void CollectHistogram_NEON(const uint8_t* WEBP_RESTRICT ref,
                                  const uint8_t* WEBP_RESTRICT pred,
                                  int start_block, int end_block,
                                  VP8Histogram* WEBP_RESTRICT const histo) {
  const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
    int16_t out[16];
    FTransform_NEON(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
    {
      int k;
      const int16x8_t a0 = vld1q_s16(out + 0);
      const int16x8_t b0 = vld1q_s16(out + 8);
      // |coeff| >> 3, clamped to MAX_COEFF_THRESH, done 8 lanes at a time.
      const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0));
      const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0));
      const uint16x8_t a2 = vshrq_n_u16(a1, 3);
      const uint16x8_t b2 = vshrq_n_u16(b1, 3);
      const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh);
      const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh);
      vst1q_s16(out + 0, vreinterpretq_s16_u16(a3));
      vst1q_s16(out + 8, vreinterpretq_s16_u16(b3));
      // Convert coefficients to bin.
      for (k = 0; k < 16; ++k) {
        ++distribution[out[k]];
      }
    }
  }
  VP8SetHistogramData(distribution, histo);
}
    756 
    757 //------------------------------------------------------------------------------
    758 
    759 static WEBP_INLINE void AccumulateSSE16_NEON(
    760    const uint8_t* WEBP_RESTRICT const a, const uint8_t* WEBP_RESTRICT const b,
    761    uint32x4_t* const sum) {
    762  const uint8x16_t a0 = vld1q_u8(a);
    763  const uint8x16_t b0 = vld1q_u8(b);
    764  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
    765  const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
    766                                    vget_low_u8(abs_diff));
    767  const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
    768                                    vget_high_u8(abs_diff));
    769  /* pair-wise adds and widen */
    770  const uint32x4_t sum1 = vpaddlq_u16(prod1);
    771  const uint32x4_t sum2 = vpaddlq_u16(prod2);
    772  *sum = vaddq_u32(*sum, vaddq_u32(sum1, sum2));
    773 }
    774 
// Horizontal sum of all four uint32_t values in 'sum'.
static int SumToInt_NEON(uint32x4_t sum) {
#if WEBP_AARCH64
  // AArch64 has a single across-vector add.
  return (int)vaddvq_u32(sum);
#else
  // ARMv7: pairwise-widen 4x32 -> 2x64, then add the two halves.
  const uint64x2_t sum2 = vpaddlq_u32(sum);
  const uint32x2_t sum3 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(sum2)),
                                   vreinterpret_u32_u64(vget_high_u64(sum2)));
  return (int)vget_lane_u32(sum3, 0);
#endif
}
    786 
    787 static int SSE16x16_NEON(const uint8_t* WEBP_RESTRICT a,
    788                         const uint8_t* WEBP_RESTRICT b) {
    789  uint32x4_t sum = vdupq_n_u32(0);
    790  int y;
    791  for (y = 0; y < 16; ++y) {
    792    AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
    793  }
    794  return SumToInt_NEON(sum);
    795 }
    796 
    797 static int SSE16x8_NEON(const uint8_t* WEBP_RESTRICT a,
    798                        const uint8_t* WEBP_RESTRICT b) {
    799  uint32x4_t sum = vdupq_n_u32(0);
    800  int y;
    801  for (y = 0; y < 8; ++y) {
    802    AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
    803  }
    804  return SumToInt_NEON(sum);
    805 }
    806 
    807 static int SSE8x8_NEON(const uint8_t* WEBP_RESTRICT a,
    808                       const uint8_t* WEBP_RESTRICT b) {
    809  uint32x4_t sum = vdupq_n_u32(0);
    810  int y;
    811  for (y = 0; y < 8; ++y) {
    812    const uint8x8_t a0 = vld1_u8(a + y * BPS);
    813    const uint8x8_t b0 = vld1_u8(b + y * BPS);
    814    const uint8x8_t abs_diff = vabd_u8(a0, b0);
    815    const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
    816    sum = vpadalq_u16(sum, prod);
    817  }
    818  return SumToInt_NEON(sum);
    819 }
    820 
    821 static int SSE4x4_NEON(const uint8_t* WEBP_RESTRICT a,
    822                       const uint8_t* WEBP_RESTRICT b) {
    823  const uint8x16_t a0 = Load4x4_NEON(a);
    824  const uint8x16_t b0 = Load4x4_NEON(b);
    825  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
    826  const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
    827                                    vget_low_u8(abs_diff));
    828  const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
    829                                    vget_high_u8(abs_diff));
    830  /* pair-wise adds and widen */
    831  const uint32x4_t sum1 = vpaddlq_u16(prod1);
    832  const uint32x4_t sum2 = vpaddlq_u16(prod2);
    833  return SumToInt_NEON(vaddq_u32(sum1, sum2));
    834 }
    835 
    836 //------------------------------------------------------------------------------
    837 
    838 // Compilation with gcc-4.6.x is problematic for now.
    839 #if !defined(WORK_AROUND_GCC)
    840 
// Quantizes the 8 coefficients of in[offset..offset+7] with matrix 'mtx':
//   level = sign(in) * min(((|in| + sharpen) * iq + bias) >> QFIX, MAX_LEVEL)
// The dequantized value (level * q) is stored back into in[], and the signed
// quantized levels are returned.
static int16x8_t Quantize_NEON(int16_t* WEBP_RESTRICT const in,
                               const VP8Matrix* WEBP_RESTRICT const mtx,
                               int offset) {
  const uint16x8_t sharp = vld1q_u16(&mtx->sharpen[offset]);
  const uint16x8_t q = vld1q_u16(&mtx->q[offset]);
  const uint16x8_t iq = vld1q_u16(&mtx->iq[offset]);
  const uint32x4_t bias0 = vld1q_u32(&mtx->bias[offset + 0]);
  const uint32x4_t bias1 = vld1q_u32(&mtx->bias[offset + 4]);

  const int16x8_t a = vld1q_s16(in + offset);                // in
  const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a));  // coeff = abs(in)
  const int16x8_t sign = vshrq_n_s16(a, 15);                 // sign
  const uint16x8_t c = vaddq_u16(b, sharp);                  // + sharpen
  const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
  const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
  // vhaddq halves the sum, supplying one bit of the QFIX=17 shift; the
  // vshrn below supplies the remaining 16 bits.
  const uint32x4_t m2 = vhaddq_u32(m0, bias0);
  const uint32x4_t m3 = vhaddq_u32(m1, bias1);     // (coeff * iQ + bias) >> 1
  const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
                                     vshrn_n_u32(m3, 16));   // QFIX=17 = 16+1
  const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
  // XOR-then-subtract with the arithmetic-shift sign mask negates the
  // negative lanes (two's-complement trick).
  const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
  const int16x8_t c3 = vsubq_s16(c2, sign);                  // restore sign
  const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
  vst1q_s16(in + offset, c4);
  assert(QFIX == 17);  // this function can't work as is if QFIX != 16+1
  return c3;
}
    868 
// Byte-shuffle indices implementing the zigzag ordering of the 16 quantized
// int16 coefficients: each consecutive pair of indices selects the two bytes
// of one coefficient from the 32-byte concatenation of out0/out1. Together
// the four tables use every index 0..31 exactly once (a pure permutation).
static const uint8_t kShuffles[4][8] = {
  { 0,   1,  2,  3,  8,  9, 16, 17 },
  { 10, 11,  4,  5,  6,  7, 12, 13 },
  { 18, 19, 24, 25, 26, 27, 20, 21 },
  { 14, 15, 22, 23, 28, 29, 30, 31 }
};
    875 
    876 static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
    877                              const VP8Matrix* WEBP_RESTRICT const mtx) {
    878  const int16x8_t out0 = Quantize_NEON(in, mtx, 0);
    879  const int16x8_t out1 = Quantize_NEON(in, mtx, 8);
    880  uint8x8x4_t shuffles;
    881  // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
    882  // non-standard versions there.
    883 #if defined(__APPLE__) && WEBP_AARCH64 && \
    884    defined(__apple_build_version__) && (__apple_build_version__< 6020037)
    885  uint8x16x2_t all_out;
    886  INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
    887  INIT_VECTOR4(shuffles,
    888               vtbl2q_u8(all_out, vld1_u8(kShuffles[0])),
    889               vtbl2q_u8(all_out, vld1_u8(kShuffles[1])),
    890               vtbl2q_u8(all_out, vld1_u8(kShuffles[2])),
    891               vtbl2q_u8(all_out, vld1_u8(kShuffles[3])));
    892 #else
    893  uint8x8x4_t all_out;
    894  INIT_VECTOR4(all_out,
    895               vreinterpret_u8_s16(vget_low_s16(out0)),
    896               vreinterpret_u8_s16(vget_high_s16(out0)),
    897               vreinterpret_u8_s16(vget_low_s16(out1)),
    898               vreinterpret_u8_s16(vget_high_s16(out1)));
    899  INIT_VECTOR4(shuffles,
    900               vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
    901               vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
    902               vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
    903               vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
    904 #endif
    905  // Zigzag reordering
    906  vst1_u8((uint8_t*)(out +  0), shuffles.val[0]);
    907  vst1_u8((uint8_t*)(out +  4), shuffles.val[1]);
    908  vst1_u8((uint8_t*)(out +  8), shuffles.val[2]);
    909  vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
    910  // test zeros
    911  if (*(uint64_t*)(out +  0) != 0) return 1;
    912  if (*(uint64_t*)(out +  4) != 0) return 1;
    913  if (*(uint64_t*)(out +  8) != 0) return 1;
    914  if (*(uint64_t*)(out + 12) != 0) return 1;
    915  return 0;
    916 }
    917 
    918 static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
    919                                const VP8Matrix* WEBP_RESTRICT const mtx) {
    920  int nz;
    921  nz  = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0;
    922  nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1;
    923  return nz;
    924 }
    925 
    926 #endif   // !WORK_AROUND_GCC
    927 
    928 #if WEBP_AARCH64
    929 
    930 #if BPS == 32
// Stores one 16-byte row combining four 4x4 predictor outputs: the bytes are
// gathered from the enclosing function's 'qcombined' table via 'tbl', then
// the second 32-bit word is replaced by the given lane of the TM4 result
// 'res'. Relies on 'qcombined' being in scope at the expansion site.
#define DC4_VE4_HE4_TM4_NEON(dst, tbl, res, lane)                              \
  do {                                                                         \
    uint8x16_t r;                                                              \
    r = vqtbl2q_u8(qcombined, tbl);                                            \
    r = vreinterpretq_u8_u32(                                                  \
        vsetq_lane_u32(vget_lane_u32(vreinterpret_u32_u8(res), lane),          \
                       vreinterpretq_u32_u8(r), 1));                           \
    vst1q_u8(dst, r);                                                          \
  } while (0)

// Stores one 16-byte row combining the RD4/VR4/LD4/VL4 predictor outputs,
// gathered from the enclosing function's 'qcombined' table via 'tbl'.
#define RD4_VR4_LD4_VL4_NEON(dst, tbl)                                         \
  do {                                                                         \
    uint8x16_t r;                                                              \
    r = vqtbl2q_u8(qcombined, tbl);                                            \
    vst1q_u8(dst, r);                                                          \
  } while (0)
    947 
// Portable vld1_u8_x2(): older toolchains lack the _x2 intrinsic, so fall
// back to two independent 8-byte loads (same result, possibly one extra
// instruction).
static WEBP_INLINE uint8x8x2_t Vld1U8x2(const uint8_t* ptr) {
#if LOCAL_CLANG_PREREQ(3, 4) || LOCAL_GCC_PREREQ(8, 5) || defined(_MSC_VER)
  return vld1_u8_x2(ptr);
#else
  uint8x8x2_t res;
  INIT_VECTOR2(res, vld1_u8(ptr + 0 * 8), vld1_u8(ptr + 1 * 8));
  return res;
#endif
}
    957 
// Portable vld1q_u8_x4(): older toolchains lack the _x4 intrinsic, so fall
// back to four independent 16-byte loads.
static WEBP_INLINE uint8x16x4_t Vld1qU8x4(const uint8_t* ptr) {
#if LOCAL_CLANG_PREREQ(3, 4) || LOCAL_GCC_PREREQ(9, 4) || defined(_MSC_VER)
  return vld1q_u8_x4(ptr);
#else
  uint8x16x4_t res;
  INIT_VECTOR4(res,
               vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
               vld1q_u8(ptr + 2 * 16), vld1q_u8(ptr + 3 * 16));
  return res;
#endif
}
    969 
// Computes all ten 4x4 intra predictors (DC4/TM4/VE4/HE4/RD4/VR4/LD4/VL4/
// HD4/HU4) for one block in a single pass, writing each at its I4* offset in
// 'dst'. 'top' points at the X sample row; top[-5..-2] hold the left
// neighbors L..I per the layout comment below. The avg2/avg3 vectors are
// computed once and the individual predictors are then assembled with byte
// shuffles from the static lookup tables.
static void Intra4Preds_NEON(uint8_t* WEBP_RESTRICT dst,
                             const uint8_t* WEBP_RESTRICT top) {
  // 0   1   2   3   4   5   6   7   8   9  10  11  12  13
  //     L   K   J   I   X   A   B   C   D   E   F   G   H
  //    -5  -4  -3  -2  -1   0   1   2   3   4   5   6   7
  // Shuffle indices; rows select sources for the averaging inputs and for
  // the HD4/HU4 outputs. Index 31 reads past the filled lanes and is used
  // where the value is a "don't care" lane.
  static const uint8_t kLookupTbl1[64] = {
    0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 12, 12,
    3,  3,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  0,  0,  0,  0,
    4, 20, 21, 22,  3, 18,  2, 17,  3, 19,  4, 20,  2, 17,  1, 16,
    2, 18,  3, 19,  1, 16, 31, 31,  1, 17,  2, 18, 31, 31, 31, 31
  };

  // Row shuffles for the RD4/VR4/LD4/VL4 predictors (one table row per
  // output row). Indices < 16 read avg2 lanes, >= 16 read avg3 lanes of
  // 'qcombined'.
  static const uint8_t kLookupTbl2[64] = {
    20, 21, 22, 23,  5,  6,  7,  8, 22, 23, 24, 25,  6,  7,  8,  9,
    19, 20, 21, 22, 20, 21, 22, 23, 23, 24, 25, 26, 22, 23, 24, 25,
    18, 19, 20, 21, 19,  5,  6,  7, 24, 25, 26, 27,  7,  8,  9, 26,
    17, 18, 19, 20, 18, 20, 21, 22, 25, 26, 27, 28, 23, 24, 25, 27
  };

  // Row shuffles for the DC4/VE4/HE4 predictors (the TM4 word is patched in
  // separately by the DC4_VE4_HE4_TM4_NEON macro).
  static const uint8_t kLookupTbl3[64] = {
    30, 30, 30, 30,  0,  0,  0,  0, 21, 22, 23, 24, 19, 19, 19, 19,
    30, 30, 30, 30,  0,  0,  0,  0, 21, 22, 23, 24, 18, 18, 18, 18,
    30, 30, 30, 30,  0,  0,  0,  0, 21, 22, 23, 24, 17, 17, 17, 17,
    30, 30, 30, 30,  0,  0,  0,  0, 21, 22, 23, 24, 16, 16, 16, 16
  };

  const uint8x16x4_t lookup_avgs1 = Vld1qU8x4(kLookupTbl1);
  const uint8x16x4_t lookup_avgs2 = Vld1qU8x4(kLookupTbl2);
  const uint8x16x4_t lookup_avgs3 = Vld1qU8x4(kLookupTbl3);

  const uint8x16_t preload = vld1q_u8(top - 5);
  uint8x16x2_t qcombined;
  uint8x16_t result0, result1;

  // a/b/c are the three taps of the (x + 2y + z + 2) >> 2 smoothing filter.
  uint8x16_t a = vqtbl1q_u8(preload, lookup_avgs1.val[0]);
  uint8x16_t b = preload;
  uint8x16_t c = vextq_u8(a, a, 2);

  uint8x16_t avg3_all = vrhaddq_u8(vhaddq_u8(a, c), b);
  uint8x16_t avg2_all = vrhaddq_u8(a, b);

  uint8x8_t preload_x8, sub_a, sub_c;
  uint8_t result_u8;
  uint8x8_t res_lo, res_hi;
  uint8x16_t full_b;
  uint16x8_t sub, sum_lo, sum_hi;

  // DC4 value: (sum of the 4 top + 4 left samples + 4) >> 3.
  preload_x8 = vget_low_u8(c);
  preload_x8 = vset_lane_u8(vgetq_lane_u8(preload, 0), preload_x8, 3);

  result_u8 = (vaddlv_u8(preload_x8) + 4) >> 3;

  avg3_all = vsetq_lane_u8(vgetq_lane_u8(preload, 0), avg3_all, 15);
  avg3_all = vsetq_lane_u8(result_u8, avg3_all, 14);

  // The combined table read by the shuffle macros: avg2 lanes 0..15,
  // avg3 lanes 16..31.
  qcombined.val[0] = avg2_all;
  qcombined.val[1] = avg3_all;

  // TM4: clip(top[x] + left[y] - top[-1]), computed as widened sums below.
  sub_a = vdup_laneq_u8(preload, 4);

  // preload = {a,b,c,d,...} => full_b = {d,d,d,d,c,c,c,c,b,b,b,b,a,a,a,a}
  full_b = vqtbl1q_u8(preload, lookup_avgs1.val[1]);
  // preload = {a,b,c,d,...} => sub_c = {a,b,c,d,a,b,c,d,a,b,c,d,a,b,c,d}
  sub_c = vreinterpret_u8_u32(vdup_n_u32(
      vgetq_lane_u32(vreinterpretq_u32_u8(vextq_u8(preload, preload, 5)), 0)));

  sub = vsubl_u8(sub_c, sub_a);
  sum_lo = vaddw_u8(sub, vget_low_u8(full_b));
  res_lo = vqmovun_s16(vreinterpretq_s16_u16(sum_lo));

  sum_hi = vaddw_u8(sub, vget_high_u8(full_b));
  res_hi = vqmovun_s16(vreinterpretq_s16_u16(sum_hi));

  // DC4, VE4, HE4, TM4
  DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 0, lookup_avgs3.val[0], res_lo, 0);
  DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 1, lookup_avgs3.val[1], res_lo, 1);
  DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 2, lookup_avgs3.val[2], res_hi, 0);
  DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 3, lookup_avgs3.val[3], res_hi, 1);

  // RD4, VR4, LD4, VL4
  RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 0, lookup_avgs2.val[0]);
  RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 1, lookup_avgs2.val[1]);
  RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 2, lookup_avgs2.val[2]);
  RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 3, lookup_avgs2.val[3]);

  // HD4, HU4
  result0 = vqtbl2q_u8(qcombined, lookup_avgs1.val[2]);
  result1 = vqtbl2q_u8(qcombined, lookup_avgs1.val[3]);

  vst1_u8(dst + I4HD4 + BPS * 0, vget_low_u8(result0));
  vst1_u8(dst + I4HD4 + BPS * 1, vget_high_u8(result0));
  vst1_u8(dst + I4HD4 + BPS * 2, vget_low_u8(result1));
  vst1_u8(dst + I4HD4 + BPS * 3, vget_high_u8(result1));
}
   1064 #endif  // BPS == 32
   1065 
   1066 static WEBP_INLINE void Fill_NEON(uint8_t* dst, const uint8_t value) {
   1067  uint8x16_t a = vdupq_n_u8(value);
   1068  int i;
   1069  for (i = 0; i < 16; i++) {
   1070    vst1q_u8(dst + BPS * i, a);
   1071  }
   1072 }
   1073 
   1074 static WEBP_INLINE void Fill16_NEON(uint8_t* dst, const uint8_t* src) {
   1075  uint8x16_t a = vld1q_u8(src);
   1076  int i;
   1077  for (i = 0; i < 16; i++) {
   1078    vst1q_u8(dst + BPS * i, a);
   1079  }
   1080 }
   1081 
   1082 static WEBP_INLINE void HorizontalPred16_NEON(uint8_t* dst,
   1083                                              const uint8_t* left) {
   1084  uint8x16_t a;
   1085 
   1086  if (left == NULL) {
   1087    Fill_NEON(dst, 129);
   1088    return;
   1089  }
   1090 
   1091  a = vld1q_u8(left + 0);
   1092  vst1q_u8(dst + BPS * 0, vdupq_laneq_u8(a, 0));
   1093  vst1q_u8(dst + BPS * 1, vdupq_laneq_u8(a, 1));
   1094  vst1q_u8(dst + BPS * 2, vdupq_laneq_u8(a, 2));
   1095  vst1q_u8(dst + BPS * 3, vdupq_laneq_u8(a, 3));
   1096  vst1q_u8(dst + BPS * 4, vdupq_laneq_u8(a, 4));
   1097  vst1q_u8(dst + BPS * 5, vdupq_laneq_u8(a, 5));
   1098  vst1q_u8(dst + BPS * 6, vdupq_laneq_u8(a, 6));
   1099  vst1q_u8(dst + BPS * 7, vdupq_laneq_u8(a, 7));
   1100  vst1q_u8(dst + BPS * 8, vdupq_laneq_u8(a, 8));
   1101  vst1q_u8(dst + BPS * 9, vdupq_laneq_u8(a, 9));
   1102  vst1q_u8(dst + BPS * 10, vdupq_laneq_u8(a, 10));
   1103  vst1q_u8(dst + BPS * 11, vdupq_laneq_u8(a, 11));
   1104  vst1q_u8(dst + BPS * 12, vdupq_laneq_u8(a, 12));
   1105  vst1q_u8(dst + BPS * 13, vdupq_laneq_u8(a, 13));
   1106  vst1q_u8(dst + BPS * 14, vdupq_laneq_u8(a, 14));
   1107  vst1q_u8(dst + BPS * 15, vdupq_laneq_u8(a, 15));
   1108 }
   1109 
   1110 static WEBP_INLINE void VerticalPred16_NEON(uint8_t* dst, const uint8_t* top) {
   1111  if (top != NULL) {
   1112    Fill16_NEON(dst, top);
   1113  } else {
   1114    Fill_NEON(dst, 127);
   1115  }
   1116 }
   1117 
   1118 static WEBP_INLINE void DCMode_NEON(uint8_t* dst, const uint8_t* left,
   1119                                    const uint8_t* top) {
   1120  uint8_t s;
   1121 
   1122  if (top != NULL) {
   1123    uint16_t dc;
   1124    dc = vaddlvq_u8(vld1q_u8(top));
   1125    if (left != NULL) {
   1126      // top and left present.
   1127      dc += vaddlvq_u8(vld1q_u8(left));
   1128      s = vqrshrnh_n_u16(dc, 5);
   1129    } else {
   1130      // top but no left.
   1131      s = vqrshrnh_n_u16(dc, 4);
   1132    }
   1133  } else {
   1134    if (left != NULL) {
   1135      uint16_t dc;
   1136      // left but no top.
   1137      dc = vaddlvq_u8(vld1q_u8(left));
   1138      s = vqrshrnh_n_u16(dc, 4);
   1139    } else {
   1140      // No top, no left, nothing.
   1141      s = 0x80;
   1142    }
   1143  }
   1144  Fill_NEON(dst, s);
   1145 }
   1146 
// Writes one 16-pixel TrueMotion row: clip(outer + inner - a) where 'outer'
// holds the row's left sample broadcast, 'inner' the 16 top samples and 'a'
// the top-left sample left[-1] broadcast. vqsubq_u16 saturates at 0 and
// vqmovun clamps at 255, giving the required [0, 255] clipping.
static WEBP_INLINE void TrueMotionHelper_NEON(uint8_t* dst,
                                              const uint8x8_t outer,
                                              const uint8x8x2_t inner,
                                              const uint16x8_t a, int i,
                                              const int n) {
  uint8x8_t d1, d2;
  uint16x8_t r1, r2;

  r1 = vaddl_u8(outer, inner.val[0]);
  r1 = vqsubq_u16(r1, a);
  d1 = vqmovun_s16(vreinterpretq_s16_u16(r1));
  r2 = vaddl_u8(outer, inner.val[1]);
  r2 = vqsubq_u16(r2, a);
  d2 = vqmovun_s16(vreinterpretq_s16_u16(r2));
  // Row index is i * 4 + n (outer loop chunk i, lane n of the dup-load).
  vst1_u8(dst + BPS * (i * 4 + n), d1);
  vst1_u8(dst + BPS * (i * 4 + n) + 8, d2);
}
   1164 
// TrueMotion prediction for a 16x16 block:
// dst[y][x] = clip(left[y] + top[x] - left[-1]), with degenerate fallbacks
// when one or both neighbor arrays are missing.
static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, const uint8_t* left,
                                        const uint8_t* top) {
  int i;
  uint16x8_t a;
  uint8x8x2_t inner;

  if (left == NULL) {
    // True motion without left samples (hence: with default 129 value) is
    // equivalent to VE prediction where you just copy the top samples.
    // Note that if top samples are not available, the default value is then
    // 129, and not 127 as in the VerticalPred case.
    if (top != NULL) {
      VerticalPred16_NEON(dst, top);
    } else {
      Fill_NEON(dst, 129);
    }
    return;
  }

  // left is not NULL.
  if (top == NULL) {
    HorizontalPred16_NEON(dst, left);
    return;
  }

  // Neither left nor top are NULL.
  a = vdupq_n_u16(left[-1]);
  inner = Vld1U8x2(top);

  // Process 4 rows per iteration: vld4_dup_u8 broadcasts each of the four
  // left samples into its own 8-lane register.
  for (i = 0; i < 4; i++) {
    const uint8x8x4_t outer = vld4_dup_u8(&left[i * 4]);

    TrueMotionHelper_NEON(dst, outer.val[0], inner, a, i, 0);
    TrueMotionHelper_NEON(dst, outer.val[1], inner, a, i, 1);
    TrueMotionHelper_NEON(dst, outer.val[2], inner, a, i, 2);
    TrueMotionHelper_NEON(dst, outer.val[3], inner, a, i, 3);
  }
}
   1203 
   1204 static void Intra16Preds_NEON(uint8_t* WEBP_RESTRICT dst,
   1205                              const uint8_t* WEBP_RESTRICT left,
   1206                              const uint8_t* WEBP_RESTRICT top) {
   1207  DCMode_NEON(I16DC16 + dst, left, top);
   1208  VerticalPred16_NEON(I16VE16 + dst, top);
   1209  HorizontalPred16_NEON(I16HE16 + dst, left);
   1210  TrueMotion_NEON(I16TM16 + dst, left, top);
   1211 }
   1212 
   1213 #endif // WEBP_AARCH64
   1214 
   1215 //------------------------------------------------------------------------------
   1216 // Entry point
   1217 
   1218 extern void VP8EncDspInitNEON(void);
   1219 
// Installs the NEON implementations into the global encoder function-pointer
// table. Predictors are AArch64-only (and Intra4 additionally requires
// BPS == 32); the quantizers are skipped under WORK_AROUND_GCC.
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
  // Transforms.
  VP8ITransform = ITransform_NEON;
  VP8FTransform = FTransform_NEON;

  VP8FTransformWHT = FTransformWHT_NEON;

  // Distortion / histogram.
  VP8TDisto4x4 = Disto4x4_NEON;
  VP8TDisto16x16 = Disto16x16_NEON;
  VP8CollectHistogram = CollectHistogram_NEON;

  // Sum of squared errors.
  VP8SSE16x16 = SSE16x16_NEON;
  VP8SSE16x8 = SSE16x8_NEON;
  VP8SSE8x8 = SSE8x8_NEON;
  VP8SSE4x4 = SSE4x4_NEON;

#if WEBP_AARCH64
#if BPS == 32
  VP8EncPredLuma4 = Intra4Preds_NEON;
#endif
  VP8EncPredLuma16 = Intra16Preds_NEON;
#endif

#if !defined(WORK_AROUND_GCC)
  VP8EncQuantizeBlock = QuantizeBlock_NEON;
  VP8EncQuantize2Blocks = Quantize2Blocks_NEON;
  VP8EncQuantizeBlockWHT = QuantizeBlock_NEON;
#endif
}
   1248 
   1249 #else  // !WEBP_USE_NEON
   1250 
   1251 WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)
   1252 
   1253 #endif  // WEBP_USE_NEON