tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

dec_neon.c (69683B)


      1 // Copyright 2012 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 // ARM NEON version of dsp functions and loop filtering.
     11 //
     12 // Authors: Somnath Banerjee (somnath@google.com)
     13 //          Johann Koenig (johannkoenig@google.com)
     14 
     15 #include "src/dsp/dsp.h"
     16 
     17 #if defined(WEBP_USE_NEON)
     18 
     19 #include "src/dsp/neon.h"
     20 #include "src/dec/vp8i_dec.h"
     21 
     22 //------------------------------------------------------------------------------
     23 // NxM Loading functions
     24 
     25 #if !defined(WORK_AROUND_GCC)
     26 
     27 // This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation
     28 // (register alloc, probably). The variants somewhat mitigate the problem, but
     29 // not quite. HFilter16i() remains problematic.
     30 static WEBP_INLINE uint8x8x4_t Load4x8_NEON(const uint8_t* const src,
     31                                            int stride) {
     32  const uint8x8_t zero = vdup_n_u8(0);
     33  uint8x8x4_t out;
     34  INIT_VECTOR4(out, zero, zero, zero, zero);
     35  out = vld4_lane_u8(src + 0 * stride, out, 0);
     36  out = vld4_lane_u8(src + 1 * stride, out, 1);
     37  out = vld4_lane_u8(src + 2 * stride, out, 2);
     38  out = vld4_lane_u8(src + 3 * stride, out, 3);
     39  out = vld4_lane_u8(src + 4 * stride, out, 4);
     40  out = vld4_lane_u8(src + 5 * stride, out, 5);
     41  out = vld4_lane_u8(src + 6 * stride, out, 6);
     42  out = vld4_lane_u8(src + 7 * stride, out, 7);
     43  return out;
     44 }
     45 
     46 static WEBP_INLINE void Load4x16_NEON(const uint8_t* const src, int stride,
     47                                      uint8x16_t* const p1,
     48                                      uint8x16_t* const p0,
     49                                      uint8x16_t* const q0,
     50                                      uint8x16_t* const q1) {
     51  // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]
     52  // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]
     53  const uint8x8x4_t row0 = Load4x8_NEON(src - 2 + 0 * stride, stride);
     54  const uint8x8x4_t row8 = Load4x8_NEON(src - 2 + 8 * stride, stride);
     55  *p1 = vcombine_u8(row0.val[0], row8.val[0]);
     56  *p0 = vcombine_u8(row0.val[1], row8.val[1]);
     57  *q0 = vcombine_u8(row0.val[2], row8.val[2]);
     58  *q1 = vcombine_u8(row0.val[3], row8.val[3]);
     59 }
     60 
     61 #else  // WORK_AROUND_GCC
     62 
     63 #define LOADQ_LANE_32b(VALUE, LANE) do {                             \
     64  (VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE));   \
     65  src += stride;                                                     \
     66 } while (0)
     67 
     68 static WEBP_INLINE void Load4x16_NEON(const uint8_t* src, int stride,
     69                                      uint8x16_t* const p1,
     70                                      uint8x16_t* const p0,
     71                                      uint8x16_t* const q0,
     72                                      uint8x16_t* const q1) {
     73  const uint32x4_t zero = vdupq_n_u32(0);
     74  uint32x4x4_t in;
     75  INIT_VECTOR4(in, zero, zero, zero, zero);
     76  src -= 2;
     77  LOADQ_LANE_32b(in.val[0], 0);
     78  LOADQ_LANE_32b(in.val[1], 0);
     79  LOADQ_LANE_32b(in.val[2], 0);
     80  LOADQ_LANE_32b(in.val[3], 0);
     81  LOADQ_LANE_32b(in.val[0], 1);
     82  LOADQ_LANE_32b(in.val[1], 1);
     83  LOADQ_LANE_32b(in.val[2], 1);
     84  LOADQ_LANE_32b(in.val[3], 1);
     85  LOADQ_LANE_32b(in.val[0], 2);
     86  LOADQ_LANE_32b(in.val[1], 2);
     87  LOADQ_LANE_32b(in.val[2], 2);
     88  LOADQ_LANE_32b(in.val[3], 2);
     89  LOADQ_LANE_32b(in.val[0], 3);
     90  LOADQ_LANE_32b(in.val[1], 3);
     91  LOADQ_LANE_32b(in.val[2], 3);
     92  LOADQ_LANE_32b(in.val[3], 3);
     93  // Transpose four 4x4 parts:
     94  {
     95    const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]),
     96                                        vreinterpretq_u8_u32(in.val[1]));
     97    const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]),
     98                                        vreinterpretq_u8_u32(in.val[3]));
     99    const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
    100                                         vreinterpretq_u16_u8(row23.val[0]));
    101    const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
    102                                         vreinterpretq_u16_u8(row23.val[1]));
    103    *p1 = vreinterpretq_u8_u16(row02.val[0]);
    104    *p0 = vreinterpretq_u8_u16(row13.val[0]);
    105    *q0 = vreinterpretq_u8_u16(row02.val[1]);
    106    *q1 = vreinterpretq_u8_u16(row13.val[1]);
    107  }
    108 }
    109 #undef LOADQ_LANE_32b
    110 
    111 #endif  // !WORK_AROUND_GCC
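
// [Editorial sketch, not part of the upstream file.] Both Load4x16_NEON()
// variants above gather four *columns* around a vertical edge, one row per
// lane. A scalar statement of the same gather (the helper name is
// hypothetical):
static WEBP_INLINE void Load4x16_ScalarRef(const uint8_t* const src,
                                           int stride,
                                           uint8_t p1[16], uint8_t p0[16],
                                           uint8_t q0[16], uint8_t q1[16]) {
  int i;
  for (i = 0; i < 16; ++i) {
    const uint8_t* const row = src - 2 + i * stride;  // 2 pixels left of edge
    p1[i] = row[0];
    p0[i] = row[1];
    q0[i] = row[2];
    q1[i] = row[3];
  }
}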
    112 
    113 static WEBP_INLINE void Load8x16_NEON(
    114    const uint8_t* const src, int stride,
    115    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
    116    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
    117    uint8x16_t* const q2, uint8x16_t* const q3) {
    118  Load4x16_NEON(src - 2, stride, p3, p2, p1, p0);
    119  Load4x16_NEON(src + 2, stride, q0, q1, q2, q3);
    120 }
    121 
    122 static WEBP_INLINE void Load16x4_NEON(const uint8_t* const src, int stride,
    123                                      uint8x16_t* const p1,
    124                                      uint8x16_t* const p0,
    125                                      uint8x16_t* const q0,
    126                                      uint8x16_t* const q1) {
    127  *p1 = vld1q_u8(src - 2 * stride);
    128  *p0 = vld1q_u8(src - 1 * stride);
    129  *q0 = vld1q_u8(src + 0 * stride);
    130  *q1 = vld1q_u8(src + 1 * stride);
    131 }
    132 
    133 static WEBP_INLINE void Load16x8_NEON(
    134    const uint8_t* const src, int stride,
    135    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
    136    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
    137    uint8x16_t* const q2, uint8x16_t* const q3) {
    138  Load16x4_NEON(src - 2  * stride, stride, p3, p2, p1, p0);
    139  Load16x4_NEON(src + 2  * stride, stride, q0, q1, q2, q3);
    140 }
    141 
    142 static WEBP_INLINE void Load8x8x2_NEON(
    143    const uint8_t* const u, const uint8_t* const v, int stride,
    144    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
    145    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
    146    uint8x16_t* const q2, uint8x16_t* const q3) {
    147  // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
     148  // and the v-samples in the upper half.
    149  *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));
    150  *p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride));
    151  *p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride));
    152  *p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride));
    153  *q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride));
    154  *q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride));
    155  *q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride));
    156  *q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride));
    157 }
    158 
    159 #if !defined(WORK_AROUND_GCC)
    160 
    161 #define LOAD_UV_8(ROW) \
    162  vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))
    163 
    164 static WEBP_INLINE void Load8x8x2T_NEON(
    165    const uint8_t* const u, const uint8_t* const v, int stride,
    166    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
    167    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
    168    uint8x16_t* const q2, uint8x16_t* const q3) {
    169  // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
     170  // and the v-samples in the upper half.
    171  const uint8x16_t row0 = LOAD_UV_8(0);
    172  const uint8x16_t row1 = LOAD_UV_8(1);
    173  const uint8x16_t row2 = LOAD_UV_8(2);
    174  const uint8x16_t row3 = LOAD_UV_8(3);
    175  const uint8x16_t row4 = LOAD_UV_8(4);
    176  const uint8x16_t row5 = LOAD_UV_8(5);
    177  const uint8x16_t row6 = LOAD_UV_8(6);
    178  const uint8x16_t row7 = LOAD_UV_8(7);
    179  // Perform two side-by-side 8x8 transposes
    180  // u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07
    181  // u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 v12 ...
    182  // u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ...
    183  // u30 u31 u32 u33 u34 u35 u36 u37 | ...
    184  // u40 u41 u42 u43 u44 u45 u46 u47 | ...
    185  // u50 u51 u52 u53 u54 u55 u56 u57 | ...
    186  // u60 u61 u62 u63 u64 u65 u66 u67 | v60 ...
    187  // u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ...
    188  const uint8x16x2_t row01 = vtrnq_u8(row0, row1);  // u00 u10 u02 u12 ...
    189                                                    // u01 u11 u03 u13 ...
    190  const uint8x16x2_t row23 = vtrnq_u8(row2, row3);  // u20 u30 u22 u32 ...
    191                                                    // u21 u31 u23 u33 ...
    192  const uint8x16x2_t row45 = vtrnq_u8(row4, row5);  // ...
    193  const uint8x16x2_t row67 = vtrnq_u8(row6, row7);  // ...
    194  const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
    195                                       vreinterpretq_u16_u8(row23.val[0]));
    196  const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
    197                                       vreinterpretq_u16_u8(row23.val[1]));
    198  const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]),
    199                                       vreinterpretq_u16_u8(row67.val[0]));
    200  const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]),
    201                                       vreinterpretq_u16_u8(row67.val[1]));
    202  const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]),
    203                                       vreinterpretq_u32_u16(row46.val[0]));
    204  const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]),
    205                                       vreinterpretq_u32_u16(row46.val[1]));
    206  const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]),
    207                                       vreinterpretq_u32_u16(row57.val[0]));
    208  const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]),
    209                                       vreinterpretq_u32_u16(row57.val[1]));
    210  *p3 = vreinterpretq_u8_u32(row04.val[0]);
    211  *p2 = vreinterpretq_u8_u32(row15.val[0]);
    212  *p1 = vreinterpretq_u8_u32(row26.val[0]);
    213  *p0 = vreinterpretq_u8_u32(row37.val[0]);
    214  *q0 = vreinterpretq_u8_u32(row04.val[1]);
    215  *q1 = vreinterpretq_u8_u32(row15.val[1]);
    216  *q2 = vreinterpretq_u8_u32(row26.val[1]);
    217  *q3 = vreinterpretq_u8_u32(row37.val[1]);
    218 }
    219 #undef LOAD_UV_8
    220 
    221 #endif  // !WORK_AROUND_GCC
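
// [Editorial note, not part of the upstream file.] The vtrnq_u8/u16/u32
// ladder above is the standard NEON 8x8 byte transpose: vtrnq_u8 exchanges
// odd/even bytes between row pairs, vtrnq_u16 exchanges 16-bit pairs, and
// vtrnq_u32 exchanges 32-bit quads, three stages in total for log2(8) = 3.
// The net effect, stated as a (hypothetical) scalar helper, is simply
// out[j][i] = in[i][j], applied side by side to the u- and v-halves:
static WEBP_INLINE void Transpose8x8_ScalarRef(const uint8_t in[8][8],
                                               uint8_t out[8][8]) {
  int i, j;
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i) out[j][i] = in[i][j];
  }
}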
    222 
    223 static WEBP_INLINE void Store2x8_NEON(const uint8x8x2_t v,
    224                                      uint8_t* const dst, int stride) {
    225  vst2_lane_u8(dst + 0 * stride, v, 0);
    226  vst2_lane_u8(dst + 1 * stride, v, 1);
    227  vst2_lane_u8(dst + 2 * stride, v, 2);
    228  vst2_lane_u8(dst + 3 * stride, v, 3);
    229  vst2_lane_u8(dst + 4 * stride, v, 4);
    230  vst2_lane_u8(dst + 5 * stride, v, 5);
    231  vst2_lane_u8(dst + 6 * stride, v, 6);
    232  vst2_lane_u8(dst + 7 * stride, v, 7);
    233 }
    234 
    235 static WEBP_INLINE void Store2x16_NEON(const uint8x16_t p0, const uint8x16_t q0,
    236                                       uint8_t* const dst, int stride) {
    237  uint8x8x2_t lo, hi;
    238  lo.val[0] = vget_low_u8(p0);
    239  lo.val[1] = vget_low_u8(q0);
    240  hi.val[0] = vget_high_u8(p0);
    241  hi.val[1] = vget_high_u8(q0);
    242  Store2x8_NEON(lo, dst - 1 + 0 * stride, stride);
    243  Store2x8_NEON(hi, dst - 1 + 8 * stride, stride);
    244 }
    245 
    246 #if !defined(WORK_AROUND_GCC)
    247 static WEBP_INLINE void Store4x8_NEON(const uint8x8x4_t v,
    248                                      uint8_t* const dst, int stride) {
    249  vst4_lane_u8(dst + 0 * stride, v, 0);
    250  vst4_lane_u8(dst + 1 * stride, v, 1);
    251  vst4_lane_u8(dst + 2 * stride, v, 2);
    252  vst4_lane_u8(dst + 3 * stride, v, 3);
    253  vst4_lane_u8(dst + 4 * stride, v, 4);
    254  vst4_lane_u8(dst + 5 * stride, v, 5);
    255  vst4_lane_u8(dst + 6 * stride, v, 6);
    256  vst4_lane_u8(dst + 7 * stride, v, 7);
    257 }
    258 
    259 static WEBP_INLINE void Store4x16_NEON(const uint8x16_t p1, const uint8x16_t p0,
    260                                       const uint8x16_t q0, const uint8x16_t q1,
    261                                       uint8_t* const dst, int stride) {
    262  uint8x8x4_t lo, hi;
    263  INIT_VECTOR4(lo,
    264               vget_low_u8(p1), vget_low_u8(p0),
    265               vget_low_u8(q0), vget_low_u8(q1));
    266  INIT_VECTOR4(hi,
    267               vget_high_u8(p1), vget_high_u8(p0),
    268               vget_high_u8(q0), vget_high_u8(q1));
    269  Store4x8_NEON(lo, dst - 2 + 0 * stride, stride);
    270  Store4x8_NEON(hi, dst - 2 + 8 * stride, stride);
    271 }
    272 #endif  // !WORK_AROUND_GCC
    273 
    274 static WEBP_INLINE void Store16x2_NEON(const uint8x16_t p0, const uint8x16_t q0,
    275                                       uint8_t* const dst, int stride) {
    276  vst1q_u8(dst - stride, p0);
    277  vst1q_u8(dst, q0);
    278 }
    279 
    280 static WEBP_INLINE void Store16x4_NEON(const uint8x16_t p1, const uint8x16_t p0,
    281                                       const uint8x16_t q0, const uint8x16_t q1,
    282                                       uint8_t* const dst, int stride) {
    283  Store16x2_NEON(p1, p0, dst - stride, stride);
    284  Store16x2_NEON(q0, q1, dst + stride, stride);
    285 }
    286 
    287 static WEBP_INLINE void Store8x2x2_NEON(const uint8x16_t p0,
    288                                        const uint8x16_t q0,
    289                                        uint8_t* const u, uint8_t* const v,
    290                                        int stride) {
    291  // p0 and q0 contain the u+v samples packed in low/high halves.
    292  vst1_u8(u - stride, vget_low_u8(p0));
    293  vst1_u8(u,          vget_low_u8(q0));
    294  vst1_u8(v - stride, vget_high_u8(p0));
    295  vst1_u8(v,          vget_high_u8(q0));
    296 }
    297 
    298 static WEBP_INLINE void Store8x4x2_NEON(const uint8x16_t p1,
    299                                        const uint8x16_t p0,
    300                                        const uint8x16_t q0,
    301                                        const uint8x16_t q1,
    302                                        uint8_t* const u, uint8_t* const v,
    303                                        int stride) {
    304  // The p1...q1 registers contain the u+v samples packed in low/high halves.
    305  Store8x2x2_NEON(p1, p0, u - stride, v - stride, stride);
    306  Store8x2x2_NEON(q0, q1, u + stride, v + stride, stride);
    307 }
    308 
    309 #if !defined(WORK_AROUND_GCC)
    310 
    311 #define STORE6_LANE(DST, VAL0, VAL1, LANE) do {   \
    312  vst3_lane_u8((DST) - 3, (VAL0), (LANE));        \
    313  vst3_lane_u8((DST) + 0, (VAL1), (LANE));        \
    314  (DST) += stride;                                \
    315 } while (0)
    316 
    317 static WEBP_INLINE void Store6x8x2_NEON(
    318    const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
    319    const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
    320    uint8_t* u, uint8_t* v, int stride) {
    321  uint8x8x3_t u0, u1, v0, v1;
    322  INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));
    323  INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));
    324  INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0));
    325  INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2));
    326  STORE6_LANE(u, u0, u1, 0);
    327  STORE6_LANE(u, u0, u1, 1);
    328  STORE6_LANE(u, u0, u1, 2);
    329  STORE6_LANE(u, u0, u1, 3);
    330  STORE6_LANE(u, u0, u1, 4);
    331  STORE6_LANE(u, u0, u1, 5);
    332  STORE6_LANE(u, u0, u1, 6);
    333  STORE6_LANE(u, u0, u1, 7);
    334  STORE6_LANE(v, v0, v1, 0);
    335  STORE6_LANE(v, v0, v1, 1);
    336  STORE6_LANE(v, v0, v1, 2);
    337  STORE6_LANE(v, v0, v1, 3);
    338  STORE6_LANE(v, v0, v1, 4);
    339  STORE6_LANE(v, v0, v1, 5);
    340  STORE6_LANE(v, v0, v1, 6);
    341  STORE6_LANE(v, v0, v1, 7);
    342 }
    343 #undef STORE6_LANE
    344 
    345 static WEBP_INLINE void Store4x8x2_NEON(const uint8x16_t p1,
    346                                        const uint8x16_t p0,
    347                                        const uint8x16_t q0,
    348                                        const uint8x16_t q1,
    349                                        uint8_t* const u, uint8_t* const v,
    350                                        int stride) {
    351  uint8x8x4_t u0, v0;
    352  INIT_VECTOR4(u0,
    353               vget_low_u8(p1), vget_low_u8(p0),
    354               vget_low_u8(q0), vget_low_u8(q1));
    355  INIT_VECTOR4(v0,
    356               vget_high_u8(p1), vget_high_u8(p0),
    357               vget_high_u8(q0), vget_high_u8(q1));
    358  vst4_lane_u8(u - 2 + 0 * stride, u0, 0);
    359  vst4_lane_u8(u - 2 + 1 * stride, u0, 1);
    360  vst4_lane_u8(u - 2 + 2 * stride, u0, 2);
    361  vst4_lane_u8(u - 2 + 3 * stride, u0, 3);
    362  vst4_lane_u8(u - 2 + 4 * stride, u0, 4);
    363  vst4_lane_u8(u - 2 + 5 * stride, u0, 5);
    364  vst4_lane_u8(u - 2 + 6 * stride, u0, 6);
    365  vst4_lane_u8(u - 2 + 7 * stride, u0, 7);
    366  vst4_lane_u8(v - 2 + 0 * stride, v0, 0);
    367  vst4_lane_u8(v - 2 + 1 * stride, v0, 1);
    368  vst4_lane_u8(v - 2 + 2 * stride, v0, 2);
    369  vst4_lane_u8(v - 2 + 3 * stride, v0, 3);
    370  vst4_lane_u8(v - 2 + 4 * stride, v0, 4);
    371  vst4_lane_u8(v - 2 + 5 * stride, v0, 5);
    372  vst4_lane_u8(v - 2 + 6 * stride, v0, 6);
    373  vst4_lane_u8(v - 2 + 7 * stride, v0, 7);
    374 }
    375 
    376 #endif  // !WORK_AROUND_GCC
    377 
    378 // Zero extend 'v' to an int16x8_t.
    379 static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint8x8_t v) {
    380  return vreinterpretq_s16_u16(vmovl_u8(v));
    381 }
    382 
     383 // Performs unsigned 8b saturation on 'dst01' and 'dst23', storing the result
    384 // to the corresponding rows of 'dst'.
    385 static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
    386                                                 const int16x8_t dst01,
    387                                                 const int16x8_t dst23) {
    388  // Unsigned saturate to 8b.
    389  const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
    390  const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
    391 
    392  // Store the results.
    393  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
    394  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
    395  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
    396  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
    397 }
    398 
    399 static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
    400                                    const int16x8_t row23,
    401                                    uint8_t* const dst) {
    402  uint32x2_t dst01 = vdup_n_u32(0);
    403  uint32x2_t dst23 = vdup_n_u32(0);
    404 
    405  // Load the source pixels.
    406  dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);
    407  dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);
    408  dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);
    409  dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);
    410 
    411  {
    412    // Convert to 16b.
    413    const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst01));
    414    const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst23));
    415 
     416    // Descale with rounding and accumulate onto the source pixels.
    417    const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
    418    const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
     419    // Saturate to 8 bits and store.
    420    SaturateAndStore4x4_NEON(dst, out01, out23);
    421  }
    422 }
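
// [Editorial sketch, not part of the upstream file.] Per pixel, Add4x4_NEON()
// computes dst = clip8(dst + ((coeff + 4) >> 3)); vrsraq_n_s16(dst, row, 3)
// is the rounded descale-and-accumulate and vqmovun_s16 the unsigned clip:
static WEBP_INLINE uint8_t AddCoeff_ScalarRef(uint8_t dst, int16_t coeff) {
  const int v = dst + ((coeff + 4) >> 3);                // vrsraq_n_s16(.., 3)
  return (uint8_t)((v < 0) ? 0 : (v > 255) ? 255 : v);   // vqmovun_s16
}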
    423 
    424 //-----------------------------------------------------------------------------
    425 // Simple In-loop filtering (Paragraph 15.2)
    426 
    427 static uint8x16_t NeedsFilter_NEON(const uint8x16_t p1, const uint8x16_t p0,
    428                                   const uint8x16_t q0, const uint8x16_t q1,
    429                                   int thresh) {
    430  const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh);
    431  const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0);               // abs(p0-q0)
    432  const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1);               // abs(p1-q1)
    433  const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0);  // 2 * abs(p0-q0)
    434  const uint8x16_t a_p1_q1_2 = vshrq_n_u8(a_p1_q1, 1);       // abs(p1-q1) / 2
    435  const uint8x16_t sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2);
    436  const uint8x16_t mask = vcgeq_u8(thresh_v, sum);
    437  return mask;
    438 }
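
// [Editorial sketch, not part of the upstream file.] Per lane, the mask above
// implements VP8's simple-filter test 2 * |p0 - q0| + |p1 - q1| / 2 <= thresh
// (the NEON version uses saturating u8 adds, which this sketch elides):
static WEBP_INLINE int Abs_ScalarRef(int v) { return (v < 0) ? -v : v; }

static WEBP_INLINE int NeedsFilter_ScalarRef(int p1, int p0, int q0, int q1,
                                             int thresh) {
  return 2 * Abs_ScalarRef(p0 - q0) + Abs_ScalarRef(p1 - q1) / 2 <= thresh;
}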
    439 
    440 static int8x16_t FlipSign_NEON(const uint8x16_t v) {
    441  const uint8x16_t sign_bit = vdupq_n_u8(0x80);
    442  return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));
    443 }
    444 
    445 static uint8x16_t FlipSignBack_NEON(const int8x16_t v) {
    446  const int8x16_t sign_bit = vdupq_n_s8(0x80);
    447  return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));
    448 }
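
// [Editorial note, not part of the upstream file.] XOR with 0x80 maps the
// unsigned range [0, 255] onto the signed range [-128, 127] and back
// (0 -> -128, 128 -> 0, 255 -> 127), so the filter deltas below can use
// saturating *signed* arithmetic. A scalar statement of FlipSign:
static WEBP_INLINE int8_t FlipSign_ScalarRef(uint8_t v) {
  return (int8_t)(v ^ 0x80);   // same value as (int)v - 128
}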
    449 
    450 static int8x16_t GetBaseDelta_NEON(const int8x16_t p1, const int8x16_t p0,
    451                                   const int8x16_t q0, const int8x16_t q1) {
    452  const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
    453  const int8x16_t p1_q1 = vqsubq_s8(p1, q1);      // (p1-q1)
    454  const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0);   // (p1-q1) + 1 * (q0 - p0)
    455  const int8x16_t s2 = vqaddq_s8(q0_p0, s1);      // (p1-q1) + 2 * (q0 - p0)
    456  const int8x16_t s3 = vqaddq_s8(q0_p0, s2);      // (p1-q1) + 3 * (q0 - p0)
    457  return s3;
    458 }
    459 
    460 static int8x16_t GetBaseDelta0_NEON(const int8x16_t p0, const int8x16_t q0) {
    461  const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
    462  const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0);   // 2 * (q0 - p0)
    463  const int8x16_t s2 = vqaddq_s8(q0_p0, s1);      // 3 * (q0 - p0)
    464  return s2;
    465 }
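
// [Editorial sketch, not part of the upstream file.] With every partial sum
// saturated to [-128, 127] (each vqaddq_s8/vqsubq_s8 above), the two helpers
// compute the VP8 base filter value a = clamp((p1 - q1) + 3 * (q0 - p0))
// (and its p1/q1-free variant). Hypothetical scalar equivalent:
static WEBP_INLINE int ClampS8_ScalarRef(int v) {
  return (v < -128) ? -128 : (v > 127) ? 127 : v;
}

static WEBP_INLINE int GetBaseDelta_ScalarRef(int p1, int p0, int q0, int q1) {
  const int q0_p0 = ClampS8_ScalarRef(q0 - p0);   // vqsubq_s8
  int s = ClampS8_ScalarRef(p1 - q1);             // vqsubq_s8
  s = ClampS8_ScalarRef(s + q0_p0);               // + 1 * (q0 - p0)
  s = ClampS8_ScalarRef(s + q0_p0);               // + 2 * (q0 - p0)
  s = ClampS8_ScalarRef(s + q0_p0);               // + 3 * (q0 - p0)
  return s;
}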
    466 
    467 //------------------------------------------------------------------------------
    468 
    469 static void ApplyFilter2NoFlip_NEON(const int8x16_t p0s, const int8x16_t q0s,
    470                                    const int8x16_t delta,
    471                                    int8x16_t* const op0,
    472                                    int8x16_t* const oq0) {
    473  const int8x16_t kCst3 = vdupq_n_s8(0x03);
    474  const int8x16_t kCst4 = vdupq_n_s8(0x04);
    475  const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
    476  const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
    477  const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
    478  const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
    479  *op0 = vqaddq_s8(p0s, delta3);
    480  *oq0 = vqsubq_s8(q0s, delta4);
    481 }
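
// [Editorial sketch, not part of the upstream file.] Scalar form of the
// 2-pixel filter step above, on already sign-flipped values and with
// ClampS8_ScalarRef() as defined earlier; the +3/+4 biases give VP8's
// asymmetric rounding of delta / 8:
static WEBP_INLINE void ApplyFilter2_ScalarRef(int* const p0s, int* const q0s,
                                               int delta) {
  const int delta3 = ClampS8_ScalarRef(delta + 3) >> 3;  // vshrq_n_s8(.., 3)
  const int delta4 = ClampS8_ScalarRef(delta + 4) >> 3;
  *p0s = ClampS8_ScalarRef(*p0s + delta3);
  *q0s = ClampS8_ScalarRef(*q0s - delta4);
}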
    482 
    483 #if defined(WEBP_USE_INTRINSICS)
    484 
    485 static void ApplyFilter2_NEON(const int8x16_t p0s, const int8x16_t q0s,
    486                              const int8x16_t delta,
    487                              uint8x16_t* const op0, uint8x16_t* const oq0) {
    488  const int8x16_t kCst3 = vdupq_n_s8(0x03);
    489  const int8x16_t kCst4 = vdupq_n_s8(0x04);
    490  const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
    491  const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
    492  const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
    493  const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
    494  const int8x16_t sp0 = vqaddq_s8(p0s, delta3);
    495  const int8x16_t sq0 = vqsubq_s8(q0s, delta4);
    496  *op0 = FlipSignBack_NEON(sp0);
    497  *oq0 = FlipSignBack_NEON(sq0);
    498 }
    499 
    500 static void DoFilter2_NEON(const uint8x16_t p1, const uint8x16_t p0,
    501                           const uint8x16_t q0, const uint8x16_t q1,
    502                           const uint8x16_t mask,
    503                           uint8x16_t* const op0, uint8x16_t* const oq0) {
    504  const int8x16_t p1s = FlipSign_NEON(p1);
    505  const int8x16_t p0s = FlipSign_NEON(p0);
    506  const int8x16_t q0s = FlipSign_NEON(q0);
    507  const int8x16_t q1s = FlipSign_NEON(q1);
    508  const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
    509  const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));
    510  ApplyFilter2_NEON(p0s, q0s, delta1, op0, oq0);
    511 }
    512 
    513 static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
    514  uint8x16_t p1, p0, q0, q1, op0, oq0;
    515  Load16x4_NEON(p, stride, &p1, &p0, &q0, &q1);
    516  {
    517    const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
    518    DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
    519  }
    520  Store16x2_NEON(op0, oq0, p, stride);
    521 }
    522 
    523 static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
    524  uint8x16_t p1, p0, q0, q1, oq0, op0;
    525  Load4x16_NEON(p, stride, &p1, &p0, &q0, &q1);
    526  {
    527    const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
    528    DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
    529  }
    530  Store2x16_NEON(op0, oq0, p, stride);
    531 }
    532 
    533 #else
    534 
    535 // Load/Store vertical edge
    536 #define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                \
    537  "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \
    538  "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \
    539  "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \
    540  "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \
    541  "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \
    542  "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \
    543  "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \
    544  "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"
    545 
    546 #define STORE8x2(c1, c2, p, stride)                                            \
    547  "vst2.8   {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n"                    \
    548  "vst2.8   {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n"                    \
    549  "vst2.8   {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n"                    \
    550  "vst2.8   {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n"                    \
    551  "vst2.8   {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n"                    \
    552  "vst2.8   {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n"                    \
    553  "vst2.8   {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n"                    \
    554  "vst2.8   {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"
    555 
    556 #define QRegs "q0", "q1", "q2", "q3",                                          \
    557              "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
    558 
    559 #define FLIP_SIGN_BIT2(a, b, s)                                                \
    560  "veor     " #a "," #a "," #s "               \n"                             \
    561  "veor     " #b "," #b "," #s "               \n"                             \
    562 
    563 #define FLIP_SIGN_BIT4(a, b, c, d, s)                                          \
    564  FLIP_SIGN_BIT2(a, b, s)                                                      \
    565  FLIP_SIGN_BIT2(c, d, s)                                                      \
    566 
    567 #define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask)                             \
    568  "vabd.u8    q15," #p0 "," #q0 "         \n"  /* abs(p0 - q0) */              \
    569  "vabd.u8    q14," #p1 "," #q1 "         \n"  /* abs(p1 - q1) */              \
    570  "vqadd.u8   q15, q15, q15               \n"  /* abs(p0 - q0) * 2 */          \
    571  "vshr.u8    q14, q14, #1                \n"  /* abs(p1 - q1) / 2 */          \
    572  "vqadd.u8   q15, q15, q14     \n"  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
    573  "vdup.8     q14, " #thresh "            \n"                                  \
    574  "vcge.u8   " #mask ", q14, q15          \n"  /* mask <= thresh */
    575 
    576 #define GET_BASE_DELTA(p1, p0, q0, q1, o)                                      \
    577  "vqsub.s8   q15," #q0 "," #p0 "         \n"  /* (q0 - p0) */                 \
    578  "vqsub.s8  " #o "," #p1 "," #q1 "       \n"  /* (p1 - q1) */                 \
    579  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 1 * (p0 - q0) */ \
    580  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 2 * (p0 - q0) */ \
    581  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 3 * (p0 - q0) */
    582 
    583 #define DO_SIMPLE_FILTER(p0, q0, fl)                                           \
    584  "vmov.i8    q15, #0x03                  \n"                                  \
    585  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter1 = filter + 3 */      \
    586  "vshr.s8    q15, q15, #3                \n"  /* filter1 >> 3 */              \
    587  "vqadd.s8  " #p0 "," #p0 ", q15         \n"  /* p0 += filter1 */             \
    588                                                                               \
    589  "vmov.i8    q15, #0x04                  \n"                                  \
    590  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter1 = filter + 4 */      \
    591  "vshr.s8    q15, q15, #3                \n"  /* filter2 >> 3 */              \
    592  "vqsub.s8  " #q0 "," #q0 ", q15         \n"  /* q0 -= filter2 */
    593 
    594 // Applies filter on 2 pixels (p0 and q0)
    595 #define DO_FILTER2(p1, p0, q0, q1, thresh)                                     \
    596  NEEDS_FILTER(p1, p0, q0, q1, thresh, q9)     /* filter mask in q9 */         \
    597  "vmov.i8    q10, #0x80                  \n"  /* sign bit */                  \
    598  FLIP_SIGN_BIT4(p1, p0, q0, q1, q10)          /* convert to signed value */   \
    599  GET_BASE_DELTA(p1, p0, q0, q1, q11)          /* get filter level  */         \
    600  "vand       q9, q9, q11                 \n"  /* apply filter mask */         \
    601  DO_SIMPLE_FILTER(p0, q0, q9)                 /* apply filter */              \
    602  FLIP_SIGN_BIT2(p0, q0, q10)
    603 
    604 static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
    605  __asm__ volatile (
    606    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
    607 
    608    "vld1.u8    {q1}, [%[p]], %[stride]        \n"  // p1
    609    "vld1.u8    {q2}, [%[p]], %[stride]        \n"  // p0
    610    "vld1.u8    {q3}, [%[p]], %[stride]        \n"  // q0
    611    "vld1.u8    {q12}, [%[p]]                  \n"  // q1
    612 
    613    DO_FILTER2(q1, q2, q3, q12, %[thresh])
    614 
    615    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
    616 
    617    "vst1.u8    {q2}, [%[p]], %[stride]        \n"  // store op0
    618    "vst1.u8    {q3}, [%[p]]                   \n"  // store oq0
    619    : [p] "+r"(p)
    620    : [stride] "r"(stride), [thresh] "r"(thresh)
    621    : "memory", QRegs
    622  );
    623 }
    624 
    625 static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
    626  __asm__ volatile (
    627    "sub        r4, %[p], #2                   \n"  // base1 = p - 2
    628    "lsl        r6, %[stride], #1              \n"  // r6 = 2 * stride
    629    "add        r5, r4, %[stride]              \n"  // base2 = base1 + stride
    630 
    631    LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
    632    LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
    633    "vswp       d3, d24                        \n"  // p1:q1 p0:q3
    634    "vswp       d5, d26                        \n"  // q0:q2 q1:q4
    635    "vswp       q2, q12                        \n"  // p1:q1 p0:q2 q0:q3 q1:q4
    636 
    637    DO_FILTER2(q1, q2, q12, q13, %[thresh])
    638 
    639    "sub        %[p], %[p], #1                 \n"  // p - 1
    640 
    641    "vswp        d5, d24                       \n"
    642    STORE8x2(d4, d5, [%[p]], %[stride])
    643    STORE8x2(d24, d25, [%[p]], %[stride])
    644 
    645    : [p] "+r"(p)
    646    : [stride] "r"(stride), [thresh] "r"(thresh)
    647    : "memory", "r4", "r5", "r6", QRegs
    648  );
    649 }
    650 
    651 #undef LOAD8x4
    652 #undef STORE8x2
    653 
    654 #endif    // WEBP_USE_INTRINSICS
    655 
    656 static void SimpleVFilter16i_NEON(uint8_t* p, int stride, int thresh) {
    657  uint32_t k;
    658  for (k = 3; k != 0; --k) {
    659    p += 4 * stride;
    660    SimpleVFilter16_NEON(p, stride, thresh);
    661  }
    662 }
    663 
    664 static void SimpleHFilter16i_NEON(uint8_t* p, int stride, int thresh) {
    665  uint32_t k;
    666  for (k = 3; k != 0; --k) {
    667    p += 4;
    668    SimpleHFilter16_NEON(p, stride, thresh);
    669  }
    670 }
    671 
    672 //------------------------------------------------------------------------------
    673 // Complex In-loop filtering (Paragraph 15.3)
    674 
    675 static uint8x16_t NeedsHev_NEON(const uint8x16_t p1, const uint8x16_t p0,
    676                                const uint8x16_t q0, const uint8x16_t q1,
    677                                int hev_thresh) {
    678  const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh);
    679  const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);  // abs(p1 - p0)
    680  const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);  // abs(q1 - q0)
    681  const uint8x16_t a_max = vmaxq_u8(a_p1_p0, a_q1_q0);
    682  const uint8x16_t mask = vcgtq_u8(a_max, hev_thresh_v);
    683  return mask;
    684 }
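
// [Editorial sketch, not part of the upstream file.] Per lane, the
// high-edge-variance test above reduces to (Abs_ScalarRef() as earlier):
static WEBP_INLINE int NeedsHev_ScalarRef(int p1, int p0, int q0, int q1,
                                          int hev_thresh) {
  return Abs_ScalarRef(p1 - p0) > hev_thresh ||
         Abs_ScalarRef(q1 - q0) > hev_thresh;
}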
    685 
    686 static uint8x16_t NeedsFilter2_NEON(const uint8x16_t p3, const uint8x16_t p2,
    687                                    const uint8x16_t p1, const uint8x16_t p0,
    688                                    const uint8x16_t q0, const uint8x16_t q1,
    689                                    const uint8x16_t q2, const uint8x16_t q3,
    690                                    int ithresh, int thresh) {
    691  const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh);
    692  const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2);  // abs(p3 - p2)
    693  const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1);  // abs(p2 - p1)
    694  const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);  // abs(p1 - p0)
    695  const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2);  // abs(q3 - q2)
    696  const uint8x16_t a_q2_q1 = vabdq_u8(q2, q1);  // abs(q2 - q1)
    697  const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);  // abs(q1 - q0)
    698  const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1);
    699  const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2);
    700  const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0);
    701  const uint8x16_t max12 = vmaxq_u8(max1, max2);
    702  const uint8x16_t max123 = vmaxq_u8(max12, max3);
    703  const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123);
    704  const uint8x16_t mask1 = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
    705  const uint8x16_t mask = vandq_u8(mask1, mask2);
    706  return mask;
    707 }
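
// [Editorial sketch, not part of the upstream file.] The complex-filter mask
// is an "interior flatness" test on the six adjacent pixel pairs, ANDed with
// the simple-filter test; per lane (helpers as defined earlier):
static WEBP_INLINE int NeedsFilter2_ScalarRef(int p3, int p2, int p1, int p0,
                                              int q0, int q1, int q2, int q3,
                                              int ithresh, int thresh) {
  const int flat =
      Abs_ScalarRef(p3 - p2) <= ithresh && Abs_ScalarRef(p2 - p1) <= ithresh &&
      Abs_ScalarRef(p1 - p0) <= ithresh && Abs_ScalarRef(q3 - q2) <= ithresh &&
      Abs_ScalarRef(q2 - q1) <= ithresh && Abs_ScalarRef(q1 - q0) <= ithresh;
  return flat && NeedsFilter_ScalarRef(p1, p0, q0, q1, thresh);
}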
    708 
     709 //  4-point filter
    710 
    711 static void ApplyFilter4_NEON(
    712    const int8x16_t p1, const int8x16_t p0,
    713    const int8x16_t q0, const int8x16_t q1,
    714    const int8x16_t delta0,
    715    uint8x16_t* const op1, uint8x16_t* const op0,
    716    uint8x16_t* const oq0, uint8x16_t* const oq1) {
    717  const int8x16_t kCst3 = vdupq_n_s8(0x03);
    718  const int8x16_t kCst4 = vdupq_n_s8(0x04);
    719  const int8x16_t delta1 = vqaddq_s8(delta0, kCst4);
    720  const int8x16_t delta2 = vqaddq_s8(delta0, kCst3);
    721  const int8x16_t a1 = vshrq_n_s8(delta1, 3);
    722  const int8x16_t a2 = vshrq_n_s8(delta2, 3);
    723  const int8x16_t a3 = vrshrq_n_s8(a1, 1);   // a3 = (a1 + 1) >> 1
    724  *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a2));  // clip(p0 + a2)
    725  *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1));  // clip(q0 - a1)
    726  *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a3));  // clip(p1 + a3)
    727  *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a3));  // clip(q1 - a3)
    728 }
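
// [Editorial sketch, not part of the upstream file.] Scalar form of the
// 4-point filter above (signed inputs, ClampS8_ScalarRef() as earlier);
// p1/q1 move by half the rounded q0 step:
static WEBP_INLINE void ApplyFilter4_ScalarRef(int* const p1, int* const p0,
                                               int* const q0, int* const q1,
                                               int delta0) {
  const int a1 = ClampS8_ScalarRef(delta0 + 4) >> 3;
  const int a2 = ClampS8_ScalarRef(delta0 + 3) >> 3;
  const int a3 = (a1 + 1) >> 1;                  // vrshrq_n_s8(a1, 1)
  *p0 = ClampS8_ScalarRef(*p0 + a2);
  *q0 = ClampS8_ScalarRef(*q0 - a1);
  *p1 = ClampS8_ScalarRef(*p1 + a3);
  *q1 = ClampS8_ScalarRef(*q1 - a3);
}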
    729 
    730 static void DoFilter4_NEON(
    731    const uint8x16_t p1, const uint8x16_t p0,
    732    const uint8x16_t q0, const uint8x16_t q1,
    733    const uint8x16_t mask, const uint8x16_t hev_mask,
    734    uint8x16_t* const op1, uint8x16_t* const op0,
    735    uint8x16_t* const oq0, uint8x16_t* const oq1) {
     736  // This is a fused version of DoFilter2(), calling ApplyFilter2 directly.
    737  const int8x16_t p1s = FlipSign_NEON(p1);
    738  int8x16_t p0s = FlipSign_NEON(p0);
    739  int8x16_t q0s = FlipSign_NEON(q0);
    740  const int8x16_t q1s = FlipSign_NEON(q1);
    741  const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
    742 
    743  // do_filter2 part (simple loopfilter on pixels with hev)
    744  {
    745    const int8x16_t delta = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
    746    const int8x16_t simple_lf_delta =
    747        vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
    748    ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
    749  }
    750 
    751  // do_filter4 part (complex loopfilter on pixels without hev)
    752  {
    753    const int8x16_t delta0 = GetBaseDelta0_NEON(p0s, q0s);
    754    // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
    755    const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
    756    const int8x16_t complex_lf_delta =
    757        vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
    758    ApplyFilter4_NEON(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
    759  }
    760 }
    761 
     762 //  6-point filter
    763 
    764 static void ApplyFilter6_NEON(
    765    const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
    766    const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
    767    const int8x16_t delta,
    768    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
    769    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
    770  // We have to compute: X = (9*a+63) >> 7, Y = (18*a+63)>>7, Z = (27*a+63) >> 7
    771  // Turns out, there's a common sub-expression S=9 * a - 1 that can be used
    772  // with the special vqrshrn_n_s16 rounding-shift-and-narrow instruction:
    773  //   X = (S + 64) >> 7, Y = (S + 32) >> 6, Z = (18 * a + S + 64) >> 7
    774  const int8x8_t delta_lo = vget_low_s8(delta);
    775  const int8x8_t delta_hi = vget_high_s8(delta);
    776  const int8x8_t kCst9 = vdup_n_s8(9);
    777  const int16x8_t kCstm1 = vdupq_n_s16(-1);
    778  const int8x8_t kCst18 = vdup_n_s8(18);
    779  const int16x8_t S_lo = vmlal_s8(kCstm1, kCst9, delta_lo);  // S = 9 * a - 1
    780  const int16x8_t S_hi = vmlal_s8(kCstm1, kCst9, delta_hi);
    781  const int16x8_t Z_lo = vmlal_s8(S_lo, kCst18, delta_lo);   // S + 18 * a
    782  const int16x8_t Z_hi = vmlal_s8(S_hi, kCst18, delta_hi);
    783  const int8x8_t a3_lo = vqrshrn_n_s16(S_lo, 7);   // (9 * a + 63) >> 7
    784  const int8x8_t a3_hi = vqrshrn_n_s16(S_hi, 7);
    785  const int8x8_t a2_lo = vqrshrn_n_s16(S_lo, 6);   // (9 * a + 31) >> 6
    786  const int8x8_t a2_hi = vqrshrn_n_s16(S_hi, 6);
    787  const int8x8_t a1_lo = vqrshrn_n_s16(Z_lo, 7);   // (27 * a + 63) >> 7
    788  const int8x8_t a1_hi = vqrshrn_n_s16(Z_hi, 7);
    789  const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi);
    790  const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
    791  const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);
    792 
    793  *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a1));  // clip(p0 + a1)
     794  *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1));  // clip(q0 - a1)
    795  *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a2));  // clip(q1 - a2)
    796  *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a2));  // clip(p1 + a2)
    797  *oq2 = FlipSignBack_NEON(vqsubq_s8(q2, a3));  // clip(q2 - a3)
    798  *op2 = FlipSignBack_NEON(vqaddq_s8(p2, a3));  // clip(p2 + a3)
    799 }
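
// [Editorial worked example, not part of the upstream file.] Why S = 9*a - 1
// combines with vqrshrn_n_s16 (which computes (x + (1 << (n - 1))) >> n):
//   vqrshrn_n_s16(S, 7)        == (9*a - 1 + 64) >> 7 == (9*a + 63) >> 7
//   vqrshrn_n_s16(S, 6)        == (9*a - 1 + 32) >> 6 == (9*a + 31) >> 6
//   vqrshrn_n_s16(S + 18*a, 7) == (27*a + 63) >> 7
// so one widening multiply-accumulate pair (vmlal_s8) plus three rounding
// narrows replaces three separate widening multiplies.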
    800 
    801 static void DoFilter6_NEON(
    802    const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
    803    const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
    804    const uint8x16_t mask, const uint8x16_t hev_mask,
    805    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
    806    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
     807  // This is a fused version of DoFilter2(), calling ApplyFilter2 directly.
    808  const int8x16_t p2s = FlipSign_NEON(p2);
    809  const int8x16_t p1s = FlipSign_NEON(p1);
    810  int8x16_t p0s = FlipSign_NEON(p0);
    811  int8x16_t q0s = FlipSign_NEON(q0);
    812  const int8x16_t q1s = FlipSign_NEON(q1);
    813  const int8x16_t q2s = FlipSign_NEON(q2);
    814  const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
    815  const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
    816 
    817  // do_filter2 part (simple loopfilter on pixels with hev)
    818  {
    819    const int8x16_t simple_lf_delta =
    820        vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
    821    ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
    822  }
    823 
    824  // do_filter6 part (complex loopfilter on pixels without hev)
    825  {
    826    // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
    827    const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
    828    const int8x16_t complex_lf_delta =
    829        vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
    830    ApplyFilter6_NEON(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
    831                      op2, op1, op0, oq0, oq1, oq2);
    832  }
    833 }
    834 
    835 // on macroblock edges
    836 
    837 static void VFilter16_NEON(uint8_t* p, int stride,
    838                           int thresh, int ithresh, int hev_thresh) {
    839  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
    840  Load16x8_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
    841  {
    842    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
    843                                              ithresh, thresh);
    844    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    845    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    846    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
    847                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
    848    Store16x2_NEON(op2, op1, p - 2 * stride, stride);
    849    Store16x2_NEON(op0, oq0, p + 0 * stride, stride);
    850    Store16x2_NEON(oq1, oq2, p + 2 * stride, stride);
    851  }
    852 }
    853 
    854 static void HFilter16_NEON(uint8_t* p, int stride,
    855                           int thresh, int ithresh, int hev_thresh) {
    856  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
    857  Load8x16_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
    858  {
    859    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
    860                                              ithresh, thresh);
    861    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    862    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    863    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
    864                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
    865    Store2x16_NEON(op2, op1, p - 2, stride);
    866    Store2x16_NEON(op0, oq0, p + 0, stride);
    867    Store2x16_NEON(oq1, oq2, p + 2, stride);
    868  }
    869 }
    870 
    871 // on three inner edges
    872 static void VFilter16i_NEON(uint8_t* p, int stride,
    873                            int thresh, int ithresh, int hev_thresh) {
    874  uint32_t k;
    875  uint8x16_t p3, p2, p1, p0;
    876  Load16x4_NEON(p + 2  * stride, stride, &p3, &p2, &p1, &p0);
    877  for (k = 3; k != 0; --k) {
    878    uint8x16_t q0, q1, q2, q3;
    879    p += 4 * stride;
    880    Load16x4_NEON(p + 2  * stride, stride, &q0, &q1, &q2, &q3);
    881    {
    882      const uint8x16_t mask =
    883          NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
    884      const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    885      // p3 and p2 are not just temporary variables here: they will be
     886      // re-used for the next span. And q2/q3 will become p1/p0 accordingly.
    887      DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
    888      Store16x4_NEON(p1, p0, p3, p2, p, stride);
    889      p1 = q2;
    890      p0 = q3;
    891    }
    892  }
    893 }
    894 
    895 #if !defined(WORK_AROUND_GCC)
    896 static void HFilter16i_NEON(uint8_t* p, int stride,
    897                            int thresh, int ithresh, int hev_thresh) {
    898  uint32_t k;
    899  uint8x16_t p3, p2, p1, p0;
    900  Load4x16_NEON(p + 2, stride, &p3, &p2, &p1, &p0);
    901  for (k = 3; k != 0; --k) {
    902    uint8x16_t q0, q1, q2, q3;
    903    p += 4;
    904    Load4x16_NEON(p + 2, stride, &q0, &q1, &q2, &q3);
    905    {
    906      const uint8x16_t mask =
    907          NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
    908      const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    909      DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
    910      Store4x16_NEON(p1, p0, p3, p2, p, stride);
    911      p1 = q2;
    912      p0 = q3;
    913    }
    914  }
    915 }
    916 #endif  // !WORK_AROUND_GCC
    917 
    918 // 8-pixels wide variant, for chroma filtering
    919 static void VFilter8_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
    920                          int stride, int thresh, int ithresh, int hev_thresh) {
    921  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
    922  Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
    923  {
    924    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
    925                                              ithresh, thresh);
    926    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    927    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    928    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
    929                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
    930    Store8x2x2_NEON(op2, op1, u - 2 * stride, v - 2 * stride, stride);
    931    Store8x2x2_NEON(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
    932    Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
    933  }
    934 }
    935 static void VFilter8i_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
    936                           int stride,
    937                           int thresh, int ithresh, int hev_thresh) {
    938  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
    939  u += 4 * stride;
    940  v += 4 * stride;
    941  Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
    942  {
    943    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
    944                                              ithresh, thresh);
    945    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    946    uint8x16_t op1, op0, oq0, oq1;
    947    DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
    948    Store8x4x2_NEON(op1, op0, oq0, oq1, u, v, stride);
    949  }
    950 }
    951 
    952 #if !defined(WORK_AROUND_GCC)
    953 static void HFilter8_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
    954                          int stride, int thresh, int ithresh, int hev_thresh) {
    955  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
    956  Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
    957  {
    958    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
    959                                              ithresh, thresh);
    960    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    961    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    962    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
    963                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
    964    Store6x8x2_NEON(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
    965  }
    966 }
    967 
    968 static void HFilter8i_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
    969                           int stride,
    970                           int thresh, int ithresh, int hev_thresh) {
    971  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
    972  u += 4;
    973  v += 4;
    974  Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
    975  {
    976    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
    977                                              ithresh, thresh);
    978    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    979    uint8x16_t op1, op0, oq0, oq1;
    980    DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
    981    Store4x8x2_NEON(op1, op0, oq0, oq1, u, v, stride);
    982  }
    983 }
    984 #endif  // !WORK_AROUND_GCC
    985 
    986 //-----------------------------------------------------------------------------
    987 // Inverse transforms (Paragraph 14.4)
    988 
    989 // Technically these are unsigned but vqdmulh is only available in signed.
    990 // vqdmulh returns high half (effectively >> 16) but also doubles the value,
    991 // changing the >> 16 to >> 15 and requiring an additional >> 1.
    992 // We use this to our advantage with kC2. The canonical value is 35468.
    993 // However, the high bit is set so treating it as signed will give incorrect
    994 // results. We avoid this by down shifting by 1 here to clear the highest bit.
    995 // Combined with the doubling effect of vqdmulh we get >> 16.
     996 // This cannot be applied to kC1 because the lowest bit is set. Down shifting
    997 // the constant would reduce precision.
    998 
     999 // libwebp uses a trick to avoid an extra addition that libvpx performs.
   1000 // Instead of:
   1001 // temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
   1002 // libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
    1003 // same issue with kC1 and vqdmulh that we work around by down shifting kC2.
   1004 
   1005 static const int16_t kC1 = WEBP_TRANSFORM_AC3_C1;
   1006 static const int16_t kC2 =
   1007    WEBP_TRANSFORM_AC3_C2 / 2;  // half of kC2, actually. See comment above.
   1008 
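// [Editorial note, not part of the upstream file.] Working the arithmetic
// through with the VP8 constants (WEBP_TRANSFORM_AC3_C1 == 20091 and
// WEBP_TRANSFORM_AC3_C2 == 35468, assuming the current dsp.h values):
//   vqdmulhq_n_s16(x, kC2)  == (2 * x * (35468 / 2)) >> 16
//                           == (x * 35468) >> 16
//   vsraq_n_s16(x, vqdmulhq_n_s16(x, kC1), 1)
//                           == x + (((2 * x * 20091) >> 16) >> 1)
//                           == x + ((x * 20091) >> 16)
// i.e. exactly the two multiplies of the scalar reference transform, modulo
// the s16 saturation of the NEON ops.
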
   1009 #if defined(WEBP_USE_INTRINSICS)
   1010 static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
   1011                                          const int16x8_t in1,
   1012                                          int16x8x2_t* const out) {
   1013  // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
   1014  // c0 c1 c2 c3 | d0 d1 d2 d3      a2 b2 c2 d2 | a3 b3 c3 d3
   1015  const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
   1016                                                  // b0 d0 b1 d1 b2 d2 ...
   1017  *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
   1018 }
   1019 
   1020 static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
   1021  // {rows} = in0 | in4
   1022  //          in8 | in12
   1023  // B1 = in4 | in12
   1024  const int16x8_t B1 =
   1025      vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
   1026  // C0 = kC1 * in4 | kC1 * in12
   1027  // C1 = kC2 * in4 | kC2 * in12
   1028  const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
   1029  const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
   1030  const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
   1031                                vget_low_s16(rows->val[1]));   // in0 + in8
   1032  const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
   1033                                vget_low_s16(rows->val[1]));   // in0 - in8
   1034  // c = kC2 * in4 - kC1 * in12
   1035  // d = kC1 * in4 + kC2 * in12
   1036  const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
   1037  const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
   1038  const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
   1039  const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
   1040  const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
   1041  const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
   1042  const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
   1043  Transpose8x2_NEON(E0, E1, rows);
   1044 }
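
// [Editorial sketch, not part of the upstream file.] One pass of the VP8
// inverse transform on a single column, as TransformPass_NEON() computes it
// for four columns at once; here MUL1(x) = x + ((x * kC1) >> 16) and
// MUL2(x) = (x * 2 * kC2) >> 16, per the constant notes above:
//   a = in[0] + in[8];               b = in[0] - in[8];
//   c = MUL2(in[4]) - MUL1(in[12]);  d = MUL1(in[4]) + MUL2(in[12]);
//   out[0] = a + d;  out[1] = b + c;  out[2] = b - c;  out[3] = a - d;
// followed by Transpose8x2_NEON() so that the second pass again operates on
// columns.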
   1045 
   1046 static void TransformOne_NEON(const int16_t* WEBP_RESTRICT in,
   1047                              uint8_t* WEBP_RESTRICT dst) {
   1048  int16x8x2_t rows;
   1049  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
   1050  TransformPass_NEON(&rows);
   1051  TransformPass_NEON(&rows);
   1052  Add4x4_NEON(rows.val[0], rows.val[1], dst);
   1053 }
   1054 
   1055 #else
   1056 
   1057 static void TransformOne_NEON(const int16_t* WEBP_RESTRICT in,
   1058                              uint8_t* WEBP_RESTRICT dst) {
   1059  const int kBPS = BPS;
   1060  // kC1, kC2. Padded because vld1.16 loads 8 bytes
   1061  const int16_t constants[4] = { kC1, kC2, 0, 0 };
   1062  /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
   1063  __asm__ volatile (
   1064    "vld1.16         {q1, q2}, [%[in]]           \n"
   1065    "vld1.16         {d0}, [%[constants]]        \n"
   1066 
   1067    /* d2: in[0]
   1068     * d3: in[8]
   1069     * d4: in[4]
   1070     * d5: in[12]
   1071     */
   1072    "vswp            d3, d4                      \n"
   1073 
   1074    /* q8 = {in[4], in[12]} * kC1 * 2 >> 16
   1075     * q9 = {in[4], in[12]} * kC2 >> 16
   1076     */
   1077    "vqdmulh.s16     q8, q2, d0[0]               \n"
   1078    "vqdmulh.s16     q9, q2, d0[1]               \n"
   1079 
   1080    /* d22 = a = in[0] + in[8]
   1081     * d23 = b = in[0] - in[8]
   1082     */
   1083    "vqadd.s16       d22, d2, d3                 \n"
   1084    "vqsub.s16       d23, d2, d3                 \n"
   1085 
   1086    /* The multiplication should be x * kC1 >> 16
   1087     * However, with vqdmulh we get x * kC1 * 2 >> 16
   1088     * (multiply, double, return high half)
   1089     * We avoided this in kC2 by pre-shifting the constant.
   1090     * q8 = in[4]/[12] * kC1 >> 16
   1091     */
   1092    "vshr.s16        q8, q8, #1                  \n"
   1093 
   1094    /* Add {in[4], in[12]} back after the multiplication. This is handled by
   1095     * adding 1 << 16 to kC1 in the libwebp C code.
   1096     */
   1097    "vqadd.s16       q8, q2, q8                  \n"
   1098 
   1099    /* d20 = c = in[4]*kC2 - in[12]*kC1
   1100     * d21 = d = in[4]*kC1 + in[12]*kC2
   1101     */
   1102    "vqsub.s16       d20, d18, d17               \n"
   1103    "vqadd.s16       d21, d19, d16               \n"
   1104 
   1105    /* d2 = tmp[0] = a + d
   1106     * d3 = tmp[1] = b + c
   1107     * d4 = tmp[2] = b - c
   1108     * d5 = tmp[3] = a - d
   1109     */
   1110    "vqadd.s16       d2, d22, d21                \n"
   1111    "vqadd.s16       d3, d23, d20                \n"
   1112    "vqsub.s16       d4, d23, d20                \n"
   1113    "vqsub.s16       d5, d22, d21                \n"
   1114 
   1115    "vzip.16         q1, q2                      \n"
   1116    "vzip.16         q1, q2                      \n"
   1117 
   1118    "vswp            d3, d4                      \n"
   1119 
   1120    /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
   1121     * q9 = {tmp[4], tmp[12]} * kC2 >> 16
   1122     */
   1123    "vqdmulh.s16     q8, q2, d0[0]               \n"
   1124    "vqdmulh.s16     q9, q2, d0[1]               \n"
   1125 
   1126    /* d22 = a = tmp[0] + tmp[8]
   1127     * d23 = b = tmp[0] - tmp[8]
   1128     */
   1129    "vqadd.s16       d22, d2, d3                 \n"
   1130    "vqsub.s16       d23, d2, d3                 \n"
   1131 
   1132    /* See the long-winded explanation above. */
   1133    "vshr.s16        q8, q8, #1                  \n"
   1134    "vqadd.s16       q8, q2, q8                  \n"
   1135 
   1136    /* d20 = c = tmp[4]*kC2 - tmp[12]*kC1
   1137     * d21 = d = tmp[4]*kC1 + tmp[12]*kC2
   1138     */
   1139    "vqsub.s16       d20, d18, d17               \n"
   1140    "vqadd.s16       d21, d19, d16               \n"
   1141 
   1142    /* d2 = tmp[0] = a + d
   1143     * d3 = tmp[1] = b + c
   1144     * d4 = tmp[2] = b - c
   1145     * d5 = tmp[3] = a - d
   1146     */
   1147    "vqadd.s16       d2, d22, d21                \n"
   1148    "vqadd.s16       d3, d23, d20                \n"
   1149    "vqsub.s16       d4, d23, d20                \n"
   1150    "vqsub.s16       d5, d22, d21                \n"
   1151 
   1152    "vld1.32         d6[0], [%[dst]], %[kBPS]    \n"
   1153    "vld1.32         d6[1], [%[dst]], %[kBPS]    \n"
   1154    "vld1.32         d7[0], [%[dst]], %[kBPS]    \n"
   1155    "vld1.32         d7[1], [%[dst]], %[kBPS]    \n"
   1156 
   1157    "sub         %[dst], %[dst], %[kBPS], lsl #2 \n"
   1158 
   1159    /* ((val) + 4) >> 3 */
   1160    "vrshr.s16       d2, d2, #3                  \n"
   1161    "vrshr.s16       d3, d3, #3                  \n"
   1162    "vrshr.s16       d4, d4, #3                  \n"
   1163    "vrshr.s16       d5, d5, #3                  \n"
   1164 
   1165    "vzip.16         q1, q2                      \n"
   1166    "vzip.16         q1, q2                      \n"
   1167 
   1168    /* Must accumulate before saturating */
   1169    "vmovl.u8        q8, d6                      \n"
   1170    "vmovl.u8        q9, d7                      \n"
   1171 
   1172    "vqadd.s16       q1, q1, q8                  \n"
   1173    "vqadd.s16       q2, q2, q9                  \n"
   1174 
   1175    "vqmovun.s16     d0, q1                      \n"
   1176    "vqmovun.s16     d1, q2                      \n"
   1177 
   1178    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
   1179    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
   1180    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
   1181    "vst1.32         d1[1], [%[dst]]             \n"
   1182 
   1183    : [in] "+r"(in), [dst] "+r"(dst)  /* modified registers */
   1184    : [kBPS] "r"(kBPS), [constants] "r"(constants)  /* constants */
   1185    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  /* clobbered */
   1186  );
   1187 }
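
        /* Worked example of the multiply trick used above (a sketch, assuming
         * the usual VP8 constants: kC1 = 20091, kC2 stored pre-halved as
         * 35468 / 2 = 17734):
         *   vqdmulh.s16 q8, q2, d0[0]  yields  (v * kC1 * 2) >> 16  per lane v,
         *   so the vshr.s16 #1 restores (v * kC1) >> 16 and the vqadd adds v
         *   back, giving MUL1(v) = v + ((v * 20091) >> 16).  For kC2 the
         *   doubling is cancelled by the pre-halved constant, so vqdmulh alone
         *   already gives MUL2(v) = (v * 35468) >> 16.
         */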
   1188 
   1189 #endif    // WEBP_USE_INTRINSICS
   1190 
   1191 static void TransformTwo_NEON(const int16_t* WEBP_RESTRICT in,
   1192                              uint8_t* WEBP_RESTRICT dst, int do_two) {
   1193  TransformOne_NEON(in, dst);
   1194  if (do_two) {
   1195    TransformOne_NEON(in + 16, dst + 4);
   1196  }
   1197 }
   1198 
   1199 static void TransformDC_NEON(const int16_t* WEBP_RESTRICT in,
   1200                             uint8_t* WEBP_RESTRICT dst) {
   1201  const int16x8_t DC = vdupq_n_s16(in[0]);
   1202  Add4x4_NEON(DC, DC, dst);
   1203 }
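
        /* DC-only block: the full transform degenerates to adding the same
         * value, (in[0] + 4) >> 3, to every pixel of the 4x4 block;
         * Add4x4_NEON() performs the rounding shift, the add and the clamp.
         */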
   1204 
   1205 //------------------------------------------------------------------------------
   1206 
   1207 #define STORE_WHT(dst, col, rows) do {                  \
   1208  *dst = vgetq_lane_s32(rows.val[0], col); (dst) += 16; \
   1209  *dst = vgetq_lane_s32(rows.val[1], col); (dst) += 16; \
   1210  *dst = vgetq_lane_s32(rows.val[2], col); (dst) += 16; \
   1211  *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
   1212 } while (0)
   1213 
   1214 static void TransformWHT_NEON(const int16_t* WEBP_RESTRICT in,
   1215                              int16_t* WEBP_RESTRICT out) {
   1216  int32x4x4_t tmp;
   1217 
   1218  {
   1219    // Load the source.
   1220    const int16x4_t in00_03 = vld1_s16(in + 0);
   1221    const int16x4_t in04_07 = vld1_s16(in + 4);
   1222    const int16x4_t in08_11 = vld1_s16(in + 8);
   1223    const int16x4_t in12_15 = vld1_s16(in + 12);
   1224    const int32x4_t a0 = vaddl_s16(in00_03, in12_15);  // in[0..3] + in[12..15]
   1225    const int32x4_t a1 = vaddl_s16(in04_07, in08_11);  // in[4..7] + in[8..11]
   1226    const int32x4_t a2 = vsubl_s16(in04_07, in08_11);  // in[4..7] - in[8..11]
   1227    const int32x4_t a3 = vsubl_s16(in00_03, in12_15);  // in[0..3] - in[12..15]
   1228    tmp.val[0] = vaddq_s32(a0, a1);
   1229    tmp.val[1] = vaddq_s32(a3, a2);
   1230    tmp.val[2] = vsubq_s32(a0, a1);
   1231    tmp.val[3] = vsubq_s32(a3, a2);
   1232    // Arrange the temporary results column-wise.
   1233    tmp = Transpose4x4_NEON(tmp);
   1234  }
   1235 
   1236  {
   1237    const int32x4_t kCst3 = vdupq_n_s32(3);
   1238    const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3);  // add rounder
   1239    const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]);
   1240    const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]);
   1241    const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]);
   1242    const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]);
   1243 
   1244    tmp.val[0] = vaddq_s32(a0, a1);
   1245    tmp.val[1] = vaddq_s32(a3, a2);
   1246    tmp.val[2] = vsubq_s32(a0, a1);
   1247    tmp.val[3] = vsubq_s32(a3, a2);
   1248 
   1249    // Right-shift the results by 3.
   1250    tmp.val[0] = vshrq_n_s32(tmp.val[0], 3);
   1251    tmp.val[1] = vshrq_n_s32(tmp.val[1], 3);
   1252    tmp.val[2] = vshrq_n_s32(tmp.val[2], 3);
   1253    tmp.val[3] = vshrq_n_s32(tmp.val[3], 3);
   1254 
   1255    STORE_WHT(out, 0, tmp);
   1256    STORE_WHT(out, 1, tmp);
   1257    STORE_WHT(out, 2, tmp);
   1258    STORE_WHT(out, 3, tmp);
   1259  }
   1260 }
   1261 
   1262 #undef STORE_WHT
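
        /* The inverse WHT above produces the DC coefficients of the 16 luma
         * sub-blocks of a macroblock; each result is written 16 int16_t apart
         * (the `(dst) += 16` in STORE_WHT) so it lands in the DC slot of the
         * corresponding 4x4 coefficient block.  Scalar sketch of the second
         * pass for one column, matching the code above:
         *   dc   = tmp0 + 3;                        // rounder
         *   out0 = (dc + tmp3 + tmp1 + tmp2) >> 3;
         *   out1 = (dc - tmp3 + tmp1 - tmp2) >> 3;
         *   out2 = (dc + tmp3 - tmp1 - tmp2) >> 3;
         *   out3 = (dc - tmp3 - tmp1 + tmp2) >> 3;
         */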
   1263 
   1264 //------------------------------------------------------------------------------
   1265 
   1266 static void TransformAC3_NEON(const int16_t* WEBP_RESTRICT in,
   1267                              uint8_t* WEBP_RESTRICT dst) {
   1268  const int16x4_t A = vld1_dup_s16(in);
   1269  const int16x4_t c4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL2(in[4]));
   1270  const int16x4_t d4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL1(in[4]));
   1271  const int c1 = WEBP_TRANSFORM_AC3_MUL2(in[1]);
   1272  const int d1 = WEBP_TRANSFORM_AC3_MUL1(in[1]);
   1273  const uint64_t cd = (uint64_t)( d1 & 0xffff) <<  0 |
   1274                      (uint64_t)( c1 & 0xffff) << 16 |
   1275                      (uint64_t)(-c1 & 0xffff) << 32 |
   1276                      (uint64_t)(-d1 & 0xffff) << 48;
   1277  const int16x4_t CD = vcreate_s16(cd);
   1278  const int16x4_t B = vqadd_s16(A, CD);
   1279  const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4));
   1280  const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));
   1281  Add4x4_NEON(m0_m1, m2_m3, dst);
   1282 }
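
        /* TransformAC3 covers the frequent case where only in[0], in[1] and
         * in[4] are non-zero, so the two-pass transform collapses to a closed
         * form: pixel (row, col) receives in[0] + CD[col] + V[row], with
         *   CD = { MUL1(in[1]), MUL2(in[1]), -MUL2(in[1]), -MUL1(in[1]) },
         *   V  = { MUL1(in[4]), MUL2(in[4]), -MUL2(in[4]), -MUL1(in[4]) },
         * where MUL1/MUL2 are the WEBP_TRANSFORM_AC3_* helpers used above;
         * Add4x4_NEON() then applies the usual (x + 4) >> 3, add and clamp.
         */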
   1283 
   1284 //------------------------------------------------------------------------------
   1285 // 4x4
   1286 
   1287 static void DC4_NEON(uint8_t* dst) {    // DC
   1288  const uint8x8_t A = vld1_u8(dst - BPS);  // top row
   1289  const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
   1290  const uint16x4_t p1 = vpadd_u16(p0, p0);
   1291  const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
   1292  const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
   1293  const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
   1294  const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
   1295  const uint16x8_t s0 = vaddl_u8(L0, L1);
   1296  const uint16x8_t s1 = vaddl_u8(L2, L3);
   1297  const uint16x8_t s01 = vaddq_u16(s0, s1);
   1298  const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
   1299  const uint8x8_t dc0 = vrshrn_n_u16(sum, 3);  // (sum + 4) >> 3
   1300  const uint8x8_t dc = vdup_lane_u8(dc0, 0);
   1301  int i;
   1302  for (i = 0; i < 4; ++i) {
   1303    vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc), 0);
   1304  }
   1305 }
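
        /* Note: the four left-edge vld1_u8() each read 8 bytes, but only
         * lane 0 (the left pixel itself) matters: dc is broadcast from lane 0,
         * where sum = L0 + L1 + L2 + L3 + top[0..3], i.e. the usual DC average
         * (sum + 4) >> 3 over the 4 top and 4 left neighbours.
         */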
   1306 
   1307 // TrueMotion (4x4 + 8x8)
   1308 static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, int size) {
   1309  const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
   1310  const uint8x8_t T = vld1_u8(dst - BPS);  // top row 'A[0..7]' ('A[0..3]' when size == 4)
   1311  const uint16x8_t d = vsubl_u8(T, TL);  // A[c] - A[-1]
   1312  int y;
   1313  for (y = 0; y < size; y += 4) {
   1314    // left edge
   1315    const uint8x8_t L0 = vld1_dup_u8(dst + 0 * BPS - 1);
   1316    const uint8x8_t L1 = vld1_dup_u8(dst + 1 * BPS - 1);
   1317    const uint8x8_t L2 = vld1_dup_u8(dst + 2 * BPS - 1);
   1318    const uint8x8_t L3 = vld1_dup_u8(dst + 3 * BPS - 1);
   1319    // L[r] + A[c] - A[-1]
   1320    const int16x8_t r0 = vreinterpretq_s16_u16(vaddw_u8(d, L0));
   1321    const int16x8_t r1 = vreinterpretq_s16_u16(vaddw_u8(d, L1));
   1322    const int16x8_t r2 = vreinterpretq_s16_u16(vaddw_u8(d, L2));
   1323    const int16x8_t r3 = vreinterpretq_s16_u16(vaddw_u8(d, L3));
   1324    // Saturate and store the result.
   1325    const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
   1326    const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
   1327    const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2));
   1328    const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3));
   1329    if (size == 4) {
   1330      vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0);
   1331      vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0);
   1332      vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0);
   1333      vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0);
   1334    } else {
   1335      vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32);
   1336      vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32);
   1337      vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32);
   1338      vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32);
   1339    }
   1340    dst += 4 * BPS;
   1341  }
   1342 }
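
        /* Scalar sketch of the TrueMotion rule implemented above:
         *   dst[y][x] = clamp_to_uint8(L[y] + T[x] - TL)
         * i.e. the left pixel of the row plus the horizontal gradient of the
         * top row relative to the top-left corner; the widening arithmetic and
         * vqmovun provide the 0..255 clamp.
         */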
   1343 
   1344 static void TM4_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 4); }
   1345 
   1346 static void VE4_NEON(uint8_t* dst) {    // vertical
   1347  // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
   1348  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1));  // top-left + top row
   1349  const uint64x1_t A1 = vshr_n_u64(A0, 8);
   1350  const uint64x1_t A2 = vshr_n_u64(A0, 16);
   1351  const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
   1352  const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
   1353  const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
   1354  const uint8x8_t b = vhadd_u8(ABCDEFGH, CDEFGH00);
   1355  const uint8x8_t avg = vrhadd_u8(b, BCDEFGH0);
   1356  int i;
   1357  for (i = 0; i < 4; ++i) {
   1358    vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0);
   1359  }
   1360 }
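
        /* The vhadd/vrhadd pair evaluates the 3-tap smoothing filter
         *   (A + 2 * B + C + 2) >> 2
         * exactly: vhadd gives the truncated (A + C) >> 1 and vrhadd then adds
         * B with rounding; the dropped low bit of A + C never changes the final
         * value.  RD4 and LD4 below rely on the same combination.
         */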
   1361 
   1362 static void RD4_NEON(uint8_t* dst) {   // Down-right
   1363  const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1);
   1364  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
   1365  const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
   1366  const uint32_t I = dst[-1 + 0 * BPS];
   1367  const uint32_t J = dst[-1 + 1 * BPS];
   1368  const uint32_t K = dst[-1 + 2 * BPS];
   1369  const uint32_t L = dst[-1 + 3 * BPS];
   1370  const uint64x1_t LKJI____ =
   1371      vcreate_u64((uint64_t)L | (K << 8) | (J << 16) | (I << 24));
   1372  const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
   1373  const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
   1374  const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
   1375  const uint8_t D = vget_lane_u8(XABCD_u8, 4);
   1376  const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
   1377  const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
   1378  const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
   1379  const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
   1380  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
   1381  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
   1382  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
   1383  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
   1384  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
   1385  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
   1386  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
   1387  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
   1388  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
   1389 }
   1390 
   1391 static void LD4_NEON(uint8_t* dst) {    // Down-left
   1392  // Note: using the same shift trick as VE4() is slower here.
   1393  const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0);
   1394  const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1);
   1395  const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + 2);
   1396  const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + 7], CDEFGH00, 6);
   1397  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGHH0);
   1398  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
   1399  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
   1400  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
   1401  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
   1402  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
   1403  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
   1404  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
   1405  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
   1406  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
   1407  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
   1408 }
   1409 
   1410 //------------------------------------------------------------------------------
   1411 // Chroma
   1412 
   1413 static void VE8uv_NEON(uint8_t* dst) {    // vertical
   1414  const uint8x8_t top = vld1_u8(dst - BPS);
   1415  int j;
   1416  for (j = 0; j < 8; ++j) {
   1417    vst1_u8(dst + j * BPS, top);
   1418  }
   1419 }
   1420 
   1421 static void HE8uv_NEON(uint8_t* dst) {    // horizontal
   1422  int j;
   1423  for (j = 0; j < 8; ++j) {
   1424    const uint8x8_t left = vld1_dup_u8(dst - 1);
   1425    vst1_u8(dst, left);
   1426    dst += BPS;
   1427  }
   1428 }
   1429 
   1430 static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
   1431  uint16x8_t sum_top;
   1432  uint16x8_t sum_left;
   1433  uint8x8_t dc0;
   1434 
   1435  if (do_top) {
   1436    const uint8x8_t A = vld1_u8(dst - BPS);  // top row
   1437 #if WEBP_AARCH64
   1438    const uint16_t p2 = vaddlv_u8(A);
   1439    sum_top = vdupq_n_u16(p2);
   1440 #else
   1441    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
   1442    const uint16x4_t p1 = vpadd_u16(p0, p0);
   1443    const uint16x4_t p2 = vpadd_u16(p1, p1);
   1444    sum_top = vcombine_u16(p2, p2);
   1445 #endif
   1446  }
   1447 
   1448  if (do_left) {
   1449    const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
   1450    const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
   1451    const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
   1452    const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
   1453    const uint8x8_t L4 = vld1_u8(dst + 4 * BPS - 1);
   1454    const uint8x8_t L5 = vld1_u8(dst + 5 * BPS - 1);
   1455    const uint8x8_t L6 = vld1_u8(dst + 6 * BPS - 1);
   1456    const uint8x8_t L7 = vld1_u8(dst + 7 * BPS - 1);
   1457    const uint16x8_t s0 = vaddl_u8(L0, L1);
   1458    const uint16x8_t s1 = vaddl_u8(L2, L3);
   1459    const uint16x8_t s2 = vaddl_u8(L4, L5);
   1460    const uint16x8_t s3 = vaddl_u8(L6, L7);
   1461    const uint16x8_t s01 = vaddq_u16(s0, s1);
   1462    const uint16x8_t s23 = vaddq_u16(s2, s3);
   1463    sum_left = vaddq_u16(s01, s23);
   1464  }
   1465 
   1466  if (do_top && do_left) {
   1467    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
   1468    dc0 = vrshrn_n_u16(sum, 4);
   1469  } else if (do_top) {
   1470    dc0 = vrshrn_n_u16(sum_top, 3);
   1471  } else if (do_left) {
   1472    dc0 = vrshrn_n_u16(sum_left, 3);
   1473  } else {
   1474    dc0 = vdup_n_u8(0x80);
   1475  }
   1476 
   1477  {
   1478    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
   1479    int i;
   1480    for (i = 0; i < 8; ++i) {
   1481      vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc));
   1482    }
   1483  }
   1484 }
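
        /* The rounding shifts implement (sum + count / 2) / count: 16 samples
         * when both edges are present (shift by 4), 8 samples with a single
         * edge (shift by 3), and a flat 0x80 fill when neither is available.
         * DC16_NEON below follows the same pattern with shifts of 5 and 4.
         */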
   1485 
   1486 static void DC8uv_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 1); }
   1487 static void DC8uvNoTop_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 1); }
   1488 static void DC8uvNoLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 0); }
   1489 static void DC8uvNoTopLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 0); }
   1490 
   1491 static void TM8uv_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 8); }
   1492 
   1493 //------------------------------------------------------------------------------
   1494 // 16x16
   1495 
   1496 static void VE16_NEON(uint8_t* dst) {     // vertical
   1497  const uint8x16_t top = vld1q_u8(dst - BPS);
   1498  int j;
   1499  for (j = 0; j < 16; ++j) {
   1500    vst1q_u8(dst + j * BPS, top);
   1501  }
   1502 }
   1503 
   1504 static void HE16_NEON(uint8_t* dst) {     // horizontal
   1505  int j;
   1506  for (j = 0; j < 16; ++j) {
   1507    const uint8x16_t left = vld1q_dup_u8(dst - 1);
   1508    vst1q_u8(dst, left);
   1509    dst += BPS;
   1510  }
   1511 }
   1512 
   1513 static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
   1514  uint16x8_t sum_top;
   1515  uint16x8_t sum_left;
   1516  uint8x8_t dc0;
   1517 
   1518  if (do_top) {
   1519    const uint8x16_t A = vld1q_u8(dst - BPS);  // top row
   1520 #if WEBP_AARCH64
   1521    const uint16_t p3 = vaddlvq_u8(A);
   1522    sum_top = vdupq_n_u16(p3);
   1523 #else
   1524    const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
   1525    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
   1526    const uint16x4_t p2 = vpadd_u16(p1, p1);
   1527    const uint16x4_t p3 = vpadd_u16(p2, p2);
   1528    sum_top = vcombine_u16(p3, p3);
   1529 #endif
   1530  }
   1531 
   1532  if (do_left) {
   1533    int i;
   1534    sum_left = vdupq_n_u16(0);
   1535    for (i = 0; i < 16; i += 8) {
   1536      const uint8x8_t L0 = vld1_u8(dst + (i + 0) * BPS - 1);
   1537      const uint8x8_t L1 = vld1_u8(dst + (i + 1) * BPS - 1);
   1538      const uint8x8_t L2 = vld1_u8(dst + (i + 2) * BPS - 1);
   1539      const uint8x8_t L3 = vld1_u8(dst + (i + 3) * BPS - 1);
   1540      const uint8x8_t L4 = vld1_u8(dst + (i + 4) * BPS - 1);
   1541      const uint8x8_t L5 = vld1_u8(dst + (i + 5) * BPS - 1);
   1542      const uint8x8_t L6 = vld1_u8(dst + (i + 6) * BPS - 1);
   1543      const uint8x8_t L7 = vld1_u8(dst + (i + 7) * BPS - 1);
   1544      const uint16x8_t s0 = vaddl_u8(L0, L1);
   1545      const uint16x8_t s1 = vaddl_u8(L2, L3);
   1546      const uint16x8_t s2 = vaddl_u8(L4, L5);
   1547      const uint16x8_t s3 = vaddl_u8(L6, L7);
   1548      const uint16x8_t s01 = vaddq_u16(s0, s1);
   1549      const uint16x8_t s23 = vaddq_u16(s2, s3);
   1550      const uint16x8_t sum = vaddq_u16(s01, s23);
   1551      sum_left = vaddq_u16(sum_left, sum);
   1552    }
   1553  }
   1554 
   1555  if (do_top && do_left) {
   1556    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
   1557    dc0 = vrshrn_n_u16(sum, 5);
   1558  } else if (do_top) {
   1559    dc0 = vrshrn_n_u16(sum_top, 4);
   1560  } else if (do_left) {
   1561    dc0 = vrshrn_n_u16(sum_left, 4);
   1562  } else {
   1563    dc0 = vdup_n_u8(0x80);
   1564  }
   1565 
   1566  {
   1567    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
   1568    int i;
   1569    for (i = 0; i < 16; ++i) {
   1570      vst1q_u8(dst + i * BPS, dc);
   1571    }
   1572  }
   1573 }
   1574 
   1575 static void DC16TopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 1); }
   1576 static void DC16NoTop_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 1); }
   1577 static void DC16NoLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 0); }
   1578 static void DC16NoTopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 0); }
   1579 
   1580 static void TM16_NEON(uint8_t* dst) {
   1581  const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
   1582  const uint8x16_t T = vld1q_u8(dst - BPS);  // top row 'A[0..15]'
   1583  // A[c] - A[-1]
   1584  const uint16x8_t d_lo = vsubl_u8(vget_low_u8(T), TL);
   1585  const uint16x8_t d_hi = vsubl_u8(vget_high_u8(T), TL);
   1586  int y;
   1587  for (y = 0; y < 16; y += 4) {
   1588    // left edge
   1589    const uint8x8_t L0 = vld1_dup_u8(dst + 0 * BPS - 1);
   1590    const uint8x8_t L1 = vld1_dup_u8(dst + 1 * BPS - 1);
   1591    const uint8x8_t L2 = vld1_dup_u8(dst + 2 * BPS - 1);
   1592    const uint8x8_t L3 = vld1_dup_u8(dst + 3 * BPS - 1);
   1593    // L[r] + A[c] - A[-1]
   1594    const int16x8_t r0_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L0));
   1595    const int16x8_t r1_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L1));
   1596    const int16x8_t r2_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L2));
   1597    const int16x8_t r3_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L3));
   1598    const int16x8_t r0_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L0));
   1599    const int16x8_t r1_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L1));
   1600    const int16x8_t r2_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L2));
   1601    const int16x8_t r3_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L3));
   1602    // Saturate and store the result.
   1603    const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
   1604    const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));
   1605    const uint8x16_t row2 = vcombine_u8(vqmovun_s16(r2_lo), vqmovun_s16(r2_hi));
   1606    const uint8x16_t row3 = vcombine_u8(vqmovun_s16(r3_lo), vqmovun_s16(r3_hi));
   1607    vst1q_u8(dst + 0 * BPS, row0);
   1608    vst1q_u8(dst + 1 * BPS, row1);
   1609    vst1q_u8(dst + 2 * BPS, row2);
   1610    vst1q_u8(dst + 3 * BPS, row3);
   1611    dst += 4 * BPS;
   1612  }
   1613 }
   1614 
   1615 //------------------------------------------------------------------------------
   1616 // Entry point
   1617 
   1618 extern void VP8DspInitNEON(void);
   1619 
   1620 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
   1621  VP8Transform = TransformTwo_NEON;
   1622  VP8TransformAC3 = TransformAC3_NEON;
   1623  VP8TransformDC = TransformDC_NEON;
   1624  VP8TransformWHT = TransformWHT_NEON;
   1625 
   1626  VP8VFilter16 = VFilter16_NEON;
   1627  VP8VFilter16i = VFilter16i_NEON;
   1628  VP8HFilter16 = HFilter16_NEON;
   1629 #if !defined(WORK_AROUND_GCC)
   1630  VP8HFilter16i = HFilter16i_NEON;
   1631 #endif
   1632  VP8VFilter8 = VFilter8_NEON;
   1633  VP8VFilter8i = VFilter8i_NEON;
   1634 #if !defined(WORK_AROUND_GCC)
   1635  VP8HFilter8 = HFilter8_NEON;
   1636  VP8HFilter8i = HFilter8i_NEON;
   1637 #endif
   1638  VP8SimpleVFilter16 = SimpleVFilter16_NEON;
   1639  VP8SimpleHFilter16 = SimpleHFilter16_NEON;
   1640  VP8SimpleVFilter16i = SimpleVFilter16i_NEON;
   1641  VP8SimpleHFilter16i = SimpleHFilter16i_NEON;
   1642 
   1643  VP8PredLuma4[0] = DC4_NEON;
   1644  VP8PredLuma4[1] = TM4_NEON;
   1645  VP8PredLuma4[2] = VE4_NEON;
   1646  VP8PredLuma4[4] = RD4_NEON;
   1647  VP8PredLuma4[6] = LD4_NEON;
   1648 
   1649  VP8PredLuma16[0] = DC16TopLeft_NEON;
   1650  VP8PredLuma16[1] = TM16_NEON;
   1651  VP8PredLuma16[2] = VE16_NEON;
   1652  VP8PredLuma16[3] = HE16_NEON;
   1653  VP8PredLuma16[4] = DC16NoTop_NEON;
   1654  VP8PredLuma16[5] = DC16NoLeft_NEON;
   1655  VP8PredLuma16[6] = DC16NoTopLeft_NEON;
   1656 
   1657  VP8PredChroma8[0] = DC8uv_NEON;
   1658  VP8PredChroma8[1] = TM8uv_NEON;
   1659  VP8PredChroma8[2] = VE8uv_NEON;
   1660  VP8PredChroma8[3] = HE8uv_NEON;
   1661  VP8PredChroma8[4] = DC8uvNoTop_NEON;
   1662  VP8PredChroma8[5] = DC8uvNoLeft_NEON;
   1663  VP8PredChroma8[6] = DC8uvNoTopLeft_NEON;
   1664 }
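
        /* This initializer is not meant to be called directly: in libwebp the
         * generic VP8DspInit() typically installs it once runtime CPU detection
         * reports NEON support.  Pointers not reassigned above (e.g.
         * VP8PredLuma4[3]) keep their portable C implementations.
         */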
   1665 
   1666 #else  // !WEBP_USE_NEON
   1667 
   1668 WEBP_DSP_INIT_STUB(VP8DspInitNEON)
   1669 
   1670 #endif  // WEBP_USE_NEON