tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

cdef_block_neon.c (52372B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <arm_neon.h>
     13 #include <assert.h>
     14 
     15 #include "config/aom_config.h"
     16 #include "config/av1_rtcd.h"
     17 
     18 #include "aom_dsp/arm/mem_neon.h"
     19 #include "aom_dsp/arm/sum_neon.h"
     20 #include "av1/common/cdef_block.h"
     21 
     22 void cdef_copy_rect8_8bit_to_16bit_neon(uint16_t *dst, int dstride,
     23                                        const uint8_t *src, int sstride,
     24                                        int width, int height) {
     25  do {
     26    const uint8_t *src_ptr = src;
     27    uint16_t *dst_ptr = dst;
     28 
     29    int w = 0;
     30    while (width - w >= 16) {
     31      uint8x16_t row = vld1q_u8(src_ptr + w);
     32      uint8x16x2_t row_u16 = { { row, vdupq_n_u8(0) } };
     33      vst2q_u8((uint8_t *)(dst_ptr + w), row_u16);
     34 
     35      w += 16;
     36    }
     37    if (width - w >= 8) {
     38      uint8x8_t row = vld1_u8(src_ptr + w);
     39      vst1q_u16(dst_ptr + w, vmovl_u8(row));
     40      w += 8;
     41    }
     42    if (width - w == 4) {
     43      for (int i = w; i < w + 4; i++) {
     44        dst_ptr[i] = src_ptr[i];
     45      }
     46    }
     47 
     48    src += sstride;
     49    dst += dstride;
     50  } while (--height != 0);
     51 }
     52 
     53 #if CONFIG_AV1_HIGHBITDEPTH
     54 void cdef_copy_rect8_16bit_to_16bit_neon(uint16_t *dst, int dstride,
     55                                         const uint16_t *src, int sstride,
     56                                         int width, int height) {
     57  do {
     58    const uint16_t *src_ptr = src;
     59    uint16_t *dst_ptr = dst;
     60 
     61    int w = 0;
     62    while (width - w >= 8) {
     63      uint16x8_t row = vld1q_u16(src_ptr + w);
     64      vst1q_u16(dst_ptr + w, row);
     65 
     66      w += 8;
     67    }
     68    if (width - w == 4) {
     69      uint16x4_t row = vld1_u16(src_ptr + w);
     70      vst1_u16(dst_ptr + w, row);
     71    }
     72 
     73    src += sstride;
     74    dst += dstride;
     75  } while (--height != 0);
     76 }
     77 #endif  // CONFIG_AV1_HIGHBITDEPTH
     78 
     79 // partial A is a 16-bit vector of the form:
     80 // [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
     81 // [0  y1 y2 y3 y4 y5 y6 y7].
     82 // This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
     83 // (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
     84 // and const2.
// Squares and sums partiala with a byte-reversed partialb (see the shuffle
// pattern below), then weights the low four lanes by const1 and the high
// four lanes by const2, returning per-lane weighted costs. See the formula
// in the comment above this function.
static inline uint32x4_t fold_mul_and_sum_neon(int16x8_t partiala,
                                               int16x8_t partialb,
                                               uint32x4_t const1,
                                               uint32x4_t const2) {
  // Reverse partial B.
  // pattern = { 12 13 10 11 8 9 6 7 4 5 2 3 0 1 14 15 }.
  uint8x16_t pattern = vreinterpretq_u8_u64(
      vcombine_u64(vcreate_u64((uint64_t)0x07060908 << 32 | 0x0b0a0d0c),
                   vcreate_u64((uint64_t)0x0f0e0100 << 32 | 0x03020504)));

#if AOM_ARCH_AARCH64
  partialb =
      vreinterpretq_s16_s8(vqtbl1q_s8(vreinterpretq_s8_s16(partialb), pattern));
#else
  // Armv7 has no 128-bit table lookup: emulate vqtbl1q_s8 with two vtbl2
  // lookups over the low/high halves of partialb.
  int8x8x2_t p = { { vget_low_s8(vreinterpretq_s8_s16(partialb)),
                     vget_high_s8(vreinterpretq_s8_s16(partialb)) } };
  int8x8_t shuffle_hi = vtbl2_s8(p, vget_high_s8(vreinterpretq_s8_u8(pattern)));
  int8x8_t shuffle_lo = vtbl2_s8(p, vget_low_s8(vreinterpretq_s8_u8(pattern)));
  partialb = vreinterpretq_s16_s8(vcombine_s8(shuffle_lo, shuffle_hi));
#endif

  // Square and add the corresponding x and y values.
  int32x4_t cost_lo = vmull_s16(vget_low_s16(partiala), vget_low_s16(partiala));
  cost_lo = vmlal_s16(cost_lo, vget_low_s16(partialb), vget_low_s16(partialb));
  int32x4_t cost_hi =
      vmull_s16(vget_high_s16(partiala), vget_high_s16(partiala));
  cost_hi =
      vmlal_s16(cost_hi, vget_high_s16(partialb), vget_high_s16(partialb));

  // Multiply by constant.
  uint32x4_t cost = vmulq_u32(vreinterpretq_u32_s32(cost_lo), const1);
  cost = vmlaq_u32(cost, vreinterpretq_u32_s32(cost_hi), const2);
  return cost;
}
    119 
    120 // This function computes the cost along directions 4, 5, 6, 7. (4 is diagonal
    121 // down-right, 6 is vertical).
    122 //
    123 // For each direction the lines are shifted so that we can perform a
    124 // basic sum on each vector element. For example, direction 5 is "south by
    125 // southeast", so we need to add the pixels along each line i below:
    126 //
    127 // 0  1 2 3 4 5 6 7
    128 // 0  1 2 3 4 5 6 7
    129 // 8  0 1 2 3 4 5 6
    130 // 8  0 1 2 3 4 5 6
    131 // 9  8 0 1 2 3 4 5
    132 // 9  8 0 1 2 3 4 5
    133 // 10 9 8 0 1 2 3 4
    134 // 10 9 8 0 1 2 3 4
    135 //
    136 // For this to fit nicely in vectors, the lines need to be shifted like so:
    137 //        0 1 2 3 4 5 6 7
    138 //        0 1 2 3 4 5 6 7
    139 //      8 0 1 2 3 4 5 6
    140 //      8 0 1 2 3 4 5 6
    141 //    9 8 0 1 2 3 4 5
    142 //    9 8 0 1 2 3 4 5
    143 // 10 9 8 0 1 2 3 4
    144 // 10 9 8 0 1 2 3 4
    145 //
    146 // In this configuration we can now perform SIMD additions to get the cost
    147 // along direction 5. Since this won't fit into a single 128-bit vector, we use
    148 // two of them to compute each half of the new configuration, and pad the empty
    149 // spaces with zeros. Similar shifting is done for other directions, except
    150 // direction 6 which is straightforward as it's the vertical direction.
// Accumulates the shifted line sums for directions 4..7 (see the diagram in
// the comment above), then converts them to per-direction costs. Returns the
// four costs in one vector and also stores them to cost[0..3].
static inline uint32x4_t compute_vert_directions_neon(int16x8_t lines[8],
                                                      uint32_t cost[4]) {
  const int16x8_t zero = vdupq_n_s16(0);

  // Partial sums for lines 0 and 1.
  int16x8_t partial4a = vextq_s16(zero, lines[0], 1);
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[1], 2));
  int16x8_t partial4b = vextq_s16(lines[0], zero, 1);
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[1], zero, 2));
  int16x8_t tmp = vaddq_s16(lines[0], lines[1]);
  int16x8_t partial5a = vextq_s16(zero, tmp, 3);
  int16x8_t partial5b = vextq_s16(tmp, zero, 3);
  int16x8_t partial7a = vextq_s16(zero, tmp, 6);
  int16x8_t partial7b = vextq_s16(tmp, zero, 6);
  int16x8_t partial6 = tmp;

  // Partial sums for lines 2 and 3.
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[2], 3));
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[3], 4));
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[2], zero, 3));
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[3], zero, 4));
  tmp = vaddq_s16(lines[2], lines[3]);
  partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 4));
  partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 4));
  partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 5));
  partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 5));
  partial6 = vaddq_s16(partial6, tmp);

  // Partial sums for lines 4 and 5.
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[4], 5));
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[5], 6));
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[4], zero, 5));
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[5], zero, 6));
  tmp = vaddq_s16(lines[4], lines[5]);
  partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 5));
  partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 5));
  partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 4));
  partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 4));
  partial6 = vaddq_s16(partial6, tmp);

  // Partial sums for lines 6 and 7.
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[6], 7));
  partial4a = vaddq_s16(partial4a, lines[7]);
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[6], zero, 7));
  tmp = vaddq_s16(lines[6], lines[7]);
  partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 6));
  partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 6));
  partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 3));
  partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 3));
  partial6 = vaddq_s16(partial6, tmp);

  // Division-free weights: the lane values are 840/k for k = 1..8
  // (840, 420, 280, 210, 168, 140, 120, 105), packed to match the number
  // of elements folded into each partial-sum lane.
  uint32x4_t const0 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)420 << 32 | 840),
                   vcreate_u64((uint64_t)210 << 32 | 280)));
  uint32x4_t const1 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)140 << 32 | 168),
                   vcreate_u64((uint64_t)105 << 32 | 120)));
  uint32x4_t const2 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64(0), vcreate_u64((uint64_t)210 << 32 | 420)));
  uint32x4_t const3 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)105 << 32 | 140),
                   vcreate_u64((uint64_t)105 << 32 | 105)));

  // Compute costs in terms of partial sums.
  // Direction 6 sums whole lines, so it is just sum-of-squares scaled by 105.
  int32x4_t partial6_s32 =
      vmull_s16(vget_low_s16(partial6), vget_low_s16(partial6));
  partial6_s32 =
      vmlal_s16(partial6_s32, vget_high_s16(partial6), vget_high_s16(partial6));

  uint32x4_t costs[4];
  costs[0] = fold_mul_and_sum_neon(partial4a, partial4b, const0, const1);
  costs[1] = fold_mul_and_sum_neon(partial5a, partial5b, const2, const3);
  costs[2] = vmulq_n_u32(vreinterpretq_u32_s32(partial6_s32), 105);
  costs[3] = fold_mul_and_sum_neon(partial7a, partial7b, const2, const3);

  // Reduce each 4-lane cost vector to a single scalar: lane i of the result
  // is the total cost for direction 4 + i.
  costs[0] = horizontal_add_4d_u32x4(costs);
  vst1q_u32(cost, costs[0]);
  return costs[0];
}
    230 
// Cost helper for directions whose shifted lines span three vectors (see
// compute_horiz_directions_neon): pairwise-adds each partial down to four
// 32-bit sums, squares them, and returns per lane
//   105 * b^2 + const0 * (a^2 + reversed(c)^2).
static inline uint32x4_t fold_mul_and_sum_pairwise_neon(int16x8_t partiala,
                                                        int16x8_t partialb,
                                                        int16x8_t partialc,
                                                        uint32x4_t const0) {
  // Reverse partial c.
  // pattern = { 10 11 8 9 6 7 4 5 2 3 0 1 12 13 14 15 }.
  uint8x16_t pattern = vreinterpretq_u8_u64(
      vcombine_u64(vcreate_u64((uint64_t)0x05040706 << 32 | 0x09080b0a),
                   vcreate_u64((uint64_t)0x0f0e0d0c << 32 | 0x01000302)));

#if AOM_ARCH_AARCH64
  partialc =
      vreinterpretq_s16_s8(vqtbl1q_s8(vreinterpretq_s8_s16(partialc), pattern));
#else
  // Armv7 fallback: emulate the 128-bit table lookup with two vtbl2 lookups.
  int8x8x2_t p = { { vget_low_s8(vreinterpretq_s8_s16(partialc)),
                     vget_high_s8(vreinterpretq_s8_s16(partialc)) } };
  int8x8_t shuffle_hi = vtbl2_s8(p, vget_high_s8(vreinterpretq_s8_u8(pattern)));
  int8x8_t shuffle_lo = vtbl2_s8(p, vget_low_s8(vreinterpretq_s8_u8(pattern)));
  partialc = vreinterpretq_s16_s8(vcombine_s8(shuffle_lo, shuffle_hi));
#endif

  // Pairwise-add adjacent 16-bit elements, widening to 32 bits.
  int32x4_t partiala_s32 = vpaddlq_s16(partiala);
  int32x4_t partialb_s32 = vpaddlq_s16(partialb);
  int32x4_t partialc_s32 = vpaddlq_s16(partialc);

  partiala_s32 = vmulq_s32(partiala_s32, partiala_s32);
  partialb_s32 = vmulq_s32(partialb_s32, partialb_s32);
  partialc_s32 = vmulq_s32(partialc_s32, partialc_s32);

  partiala_s32 = vaddq_s32(partiala_s32, partialc_s32);

  uint32x4_t cost = vmulq_n_u32(vreinterpretq_u32_s32(partialb_s32), 105);
  cost = vmlaq_u32(cost, vreinterpretq_u32_s32(partiala_s32), const0);
  return cost;
}
    266 
    267 // This function computes the cost along directions 0, 1, 2, 3. (0 means
    268 // 45-degree up-right, 2 is horizontal).
    269 //
    270 // For direction 1 and 3 ("east northeast" and "east southeast") the shifted
    271 // lines need three vectors instead of two. For direction 1 for example, we need
    272 // to compute the sums along the line i below:
    273 // 0 0 1 1 2 2 3  3
    274 // 1 1 2 2 3 3 4  4
    275 // 2 2 3 3 4 4 5  5
    276 // 3 3 4 4 5 5 6  6
    277 // 4 4 5 5 6 6 7  7
    278 // 5 5 6 6 7 7 8  8
    279 // 6 6 7 7 8 8 9  9
    280 // 7 7 8 8 9 9 10 10
    281 //
    282 // Which means we need the following configuration:
    283 // 0 0 1 1 2 2 3 3
    284 //     1 1 2 2 3 3 4 4
    285 //         2 2 3 3 4 4 5 5
    286 //             3 3 4 4 5 5 6 6
    287 //                 4 4 5 5 6 6 7 7
    288 //                     5 5 6 6 7 7 8 8
    289 //                         6 6 7 7 8 8 9 9
    290 //                             7 7 8 8 9 9 10 10
    291 //
    292 // Three vectors are needed to compute this, as well as some extra pairwise
    293 // additions.
// Accumulates the shifted line sums for directions 0..3 (see the diagram in
// the comment above), then converts them to per-direction costs. Returns the
// four costs in one vector and also stores them to cost[0..3].
static uint32x4_t compute_horiz_directions_neon(int16x8_t lines[8],
                                                uint32_t cost[4]) {
  const int16x8_t zero = vdupq_n_s16(0);

  // Compute diagonal directions (1, 2, 3).
  // Partial sums for lines 0 and 1.
  int16x8_t partial0a = lines[0];
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[1], 7));
  int16x8_t partial0b = vextq_s16(lines[1], zero, 7);
  int16x8_t partial1a = vaddq_s16(lines[0], vextq_s16(zero, lines[1], 6));
  int16x8_t partial1b = vextq_s16(lines[1], zero, 6);
  int16x8_t partial3a = vextq_s16(lines[0], zero, 2);
  partial3a = vaddq_s16(partial3a, vextq_s16(lines[1], zero, 4));
  int16x8_t partial3b = vextq_s16(zero, lines[0], 2);
  partial3b = vaddq_s16(partial3b, vextq_s16(zero, lines[1], 4));

  // Partial sums for lines 2 and 3.
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[2], 6));
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[3], 5));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[2], zero, 6));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[3], zero, 5));
  partial1a = vaddq_s16(partial1a, vextq_s16(zero, lines[2], 4));
  partial1a = vaddq_s16(partial1a, vextq_s16(zero, lines[3], 2));
  partial1b = vaddq_s16(partial1b, vextq_s16(lines[2], zero, 4));
  partial1b = vaddq_s16(partial1b, vextq_s16(lines[3], zero, 2));
  partial3a = vaddq_s16(partial3a, vextq_s16(lines[2], zero, 6));
  partial3b = vaddq_s16(partial3b, vextq_s16(zero, lines[2], 6));
  partial3b = vaddq_s16(partial3b, lines[3]);

  // Partial sums for lines 4 and 5.
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[4], 4));
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[5], 3));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[4], zero, 4));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[5], zero, 3));
  partial1b = vaddq_s16(partial1b, lines[4]);
  partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[5], 6));
  // Directions 1 and 3 need a third accumulator (see the comment above).
  int16x8_t partial1c = vextq_s16(lines[5], zero, 6);
  partial3b = vaddq_s16(partial3b, vextq_s16(lines[4], zero, 2));
  partial3b = vaddq_s16(partial3b, vextq_s16(lines[5], zero, 4));
  int16x8_t partial3c = vextq_s16(zero, lines[4], 2);
  partial3c = vaddq_s16(partial3c, vextq_s16(zero, lines[5], 4));

  // Partial sums for lines 6 and 7.
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[6], 2));
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[7], 1));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[6], zero, 2));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[7], zero, 1));
  partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[6], 4));
  partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[7], 2));
  partial1c = vaddq_s16(partial1c, vextq_s16(lines[6], zero, 4));
  partial1c = vaddq_s16(partial1c, vextq_s16(lines[7], zero, 2));
  partial3b = vaddq_s16(partial3b, vextq_s16(lines[6], zero, 6));
  partial3c = vaddq_s16(partial3c, vextq_s16(zero, lines[6], 6));
  partial3c = vaddq_s16(partial3c, lines[7]);

  // Special case for direction 2 as it's just a sum along each line.
  int16x8_t lines03[4] = { lines[0], lines[1], lines[2], lines[3] };
  int16x8_t lines47[4] = { lines[4], lines[5], lines[6], lines[7] };
  int32x4_t partial2a = horizontal_add_4d_s16x8(lines03);
  int32x4_t partial2b = horizontal_add_4d_s16x8(lines47);

  uint32x4_t partial2a_u32 =
      vreinterpretq_u32_s32(vmulq_s32(partial2a, partial2a));
  uint32x4_t partial2b_u32 =
      vreinterpretq_u32_s32(vmulq_s32(partial2b, partial2b));

  // Division-free weights: lane values are 840/k for k = 1..8
  // (840, 420, 280, 210, 168, 140, 120, 105).
  uint32x4_t const0 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)420 << 32 | 840),
                   vcreate_u64((uint64_t)210 << 32 | 280)));
  uint32x4_t const1 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)140 << 32 | 168),
                   vcreate_u64((uint64_t)105 << 32 | 120)));
  uint32x4_t const2 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)210 << 32 | 420),
                   vcreate_u64((uint64_t)105 << 32 | 140)));

  uint32x4_t costs[4];
  costs[0] = fold_mul_and_sum_neon(partial0a, partial0b, const0, const1);
  costs[1] =
      fold_mul_and_sum_pairwise_neon(partial1a, partial1b, partial1c, const2);
  costs[2] = vaddq_u32(partial2a_u32, partial2b_u32);
  costs[2] = vmulq_n_u32(costs[2], 105);
  costs[3] =
      fold_mul_and_sum_pairwise_neon(partial3c, partial3b, partial3a, const2);

  // Reduce each 4-lane cost vector to a single scalar: lane i of the result
  // is the total cost for direction i.
  costs[0] = horizontal_add_4d_u32x4(costs);
  vst1q_u32(cost, costs[0]);
  return costs[0];
}
    383 
// Finds the dominant edge direction (0..7) of the 8x8 block at img, writing
// the variance contrast between the best direction and its orthogonal into
// *var. coeff_shift normalizes high-bit-depth input down to 8-bit range.
int cdef_find_dir_neon(const uint16_t *img, int stride, int32_t *var,
                       int coeff_shift) {
  uint32_t cost[8];
  uint32_t best_cost = 0;
  int best_dir = 0;
  int16x8_t lines[8];
  // Load the 8x8 block; normalize each sample to (s >> coeff_shift) - 128.
  for (int i = 0; i < 8; i++) {
    uint16x8_t s = vld1q_u16(&img[i * stride]);
    lines[i] = vreinterpretq_s16_u16(
        vsubq_u16(vshlq_u16(s, vdupq_n_s16(-coeff_shift)), vdupq_n_u16(128)));
  }

  // Compute "mostly vertical" directions.
  uint32x4_t cost47 = compute_vert_directions_neon(lines, cost + 4);

  // Compute "mostly horizontal" directions.
  uint32x4_t cost03 = compute_horiz_directions_neon(lines, cost);

  // Find max cost as well as its index to get best_dir.
  // The max cost needs to be propagated in the whole vector to find its
  // position in the original cost vectors cost03 and cost47.
  uint32x4_t cost07 = vmaxq_u32(cost03, cost47);
#if AOM_ARCH_AARCH64
  best_cost = vmaxvq_u32(cost07);
  uint32x4_t max_cost = vdupq_n_u32(best_cost);
  uint8x16x2_t costs = { { vreinterpretq_u8_u32(vceqq_u32(max_cost, cost03)),
                           vreinterpretq_u8_u32(
                               vceqq_u32(max_cost, cost47)) } };
  // idx = { 28, 24, 20, 16, 12, 8, 4, 0 };
  uint8x8_t idx = vreinterpret_u8_u64(vcreate_u64(0x0004080c1014181cULL));
  // Get the lowest 8 bit of each 32-bit elements and reverse them.
  uint8x8_t tbl = vqtbl2_u8(costs, idx);
  uint64_t a = vget_lane_u64(vreinterpret_u64_u8(tbl), 0);
  // After the reversal, direction 0 occupies the most significant byte of a,
  // so the leading-zero count in bytes is the lowest matching direction.
  best_dir = aom_clzll(a) >> 3;
#else
  // Armv7 has no vmaxv: reduce pairwise instead.
  uint32x2_t cost64 = vpmax_u32(vget_low_u32(cost07), vget_high_u32(cost07));
  cost64 = vpmax_u32(cost64, cost64);
  uint32x4_t max_cost = vcombine_u32(cost64, cost64);
  best_cost = vget_lane_u32(cost64, 0);
  uint16x8_t costs = vcombine_u16(vmovn_u32(vceqq_u32(max_cost, cost03)),
                                  vmovn_u32(vceqq_u32(max_cost, cost47)));
  // Mask each direction's compare result with a distinct bit (direction i
  // maps to bit i) and sum the bytes to build a match bitmask.
  uint8x8_t idx =
      vand_u8(vmovn_u16(costs),
              vreinterpret_u8_u64(vcreate_u64(0x8040201008040201ULL)));
  int sum = horizontal_add_u8x8(idx);
  // sum ^ (sum - 1) keeps only the lowest set bit (and the zeros below it);
  // its MSB position is therefore the lowest matching direction.
  best_dir = get_msb(sum ^ (sum - 1));
#endif

  // Difference between the optimal variance and the variance along the
  // orthogonal direction. Again, the sum(x^2) terms cancel out.
  *var = best_cost - cost[(best_dir + 4) & 7];
  // We'd normally divide by 840, but dividing by 1024 is close enough
  // for what we're going to do with this.
  *var >>= 10;
  return best_dir;
}
    440 
    441 void cdef_find_dir_dual_neon(const uint16_t *img1, const uint16_t *img2,
    442                             int stride, int32_t *var_out_1st,
    443                             int32_t *var_out_2nd, int coeff_shift,
    444                             int *out_dir_1st_8x8, int *out_dir_2nd_8x8) {
    445  // Process first 8x8.
    446  *out_dir_1st_8x8 = cdef_find_dir(img1, stride, var_out_1st, coeff_shift);
    447 
    448  // Process second 8x8.
    449  *out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift);
    450 }
    451 
    452 // sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp)))
    453 static inline int16x8_t constrain16(uint16x8_t a, uint16x8_t b,
    454                                    unsigned int threshold, int adjdamp) {
    455  uint16x8_t diff = vabdq_u16(a, b);
    456  const uint16x8_t a_gt_b = vcgtq_u16(a, b);
    457  const uint16x8_t s = vqsubq_u16(vdupq_n_u16(threshold),
    458                                  vshlq_u16(diff, vdupq_n_s16(-adjdamp)));
    459  const int16x8_t clip = vreinterpretq_s16_u16(vminq_u16(diff, s));
    460  return vbslq_s16(a_gt_b, clip, vnegq_s16(clip));
    461 }
    462 
    463 static inline void primary_filter(uint16x8_t s, uint16x8_t tap[4],
    464                                  const int *pri_taps, int pri_strength,
    465                                  int pri_damping, int16x8_t *sum) {
    466  // Near taps
    467  int16x8_t n0 = constrain16(tap[0], s, pri_strength, pri_damping);
    468  int16x8_t n1 = constrain16(tap[1], s, pri_strength, pri_damping);
    469  // sum += pri_taps[0] * (n0 + n1)
    470  n0 = vaddq_s16(n0, n1);
    471  *sum = vmlaq_n_s16(*sum, n0, pri_taps[0]);
    472 
    473  // Far taps
    474  int16x8_t f0 = constrain16(tap[2], s, pri_strength, pri_damping);
    475  int16x8_t f1 = constrain16(tap[3], s, pri_strength, pri_damping);
    476  // sum += pri_taps[1] * (f0 + f1)
    477  f0 = vaddq_s16(f0, f1);
    478  *sum = vmlaq_n_s16(*sum, f0, pri_taps[1]);
    479 }
    480 
    481 static inline void secondary_filter(uint16x8_t s, uint16x8_t tap[8],
    482                                    const int *sec_taps, int sec_strength,
    483                                    int sec_damping, int16x8_t *sum) {
    484  // Near taps
    485  int16x8_t s0 = constrain16(tap[0], s, sec_strength, sec_damping);
    486  int16x8_t s1 = constrain16(tap[1], s, sec_strength, sec_damping);
    487  int16x8_t s2 = constrain16(tap[2], s, sec_strength, sec_damping);
    488  int16x8_t s3 = constrain16(tap[3], s, sec_strength, sec_damping);
    489 
    490  // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
    491  s0 = vaddq_s16(s0, s1);
    492  s2 = vaddq_s16(s2, s3);
    493  s0 = vaddq_s16(s0, s2);
    494  *sum = vmlaq_n_s16(*sum, s0, sec_taps[0]);
    495 
    496  // Far taps
    497  s0 = constrain16(tap[4], s, sec_strength, sec_damping);
    498  s1 = constrain16(tap[5], s, sec_strength, sec_damping);
    499  s2 = constrain16(tap[6], s, sec_strength, sec_damping);
    500  s3 = constrain16(tap[7], s, sec_strength, sec_damping);
    501 
    502  // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
    503  s0 = vaddq_s16(s0, s1);
    504  s2 = vaddq_s16(s2, s3);
    505  s0 = vaddq_s16(s0, s2);
    506  *sum = vmlaq_n_s16(*sum, s0, sec_taps[1]);
    507 }
    508 
    509 void cdef_filter_8_0_neon(void *dest, int dstride, const uint16_t *in,
    510                          int pri_strength, int sec_strength, int dir,
    511                          int pri_damping, int sec_damping, int coeff_shift,
    512                          int block_width, int block_height) {
    513  uint16x8_t max, min;
    514  const uint16x8_t cdef_large_value_mask =
    515      vdupq_n_u16(((uint16_t)~CDEF_VERY_LARGE));
    516  const int po1 = cdef_directions[dir][0];
    517  const int po2 = cdef_directions[dir][1];
    518  const int s1o1 = cdef_directions[dir + 2][0];
    519  const int s1o2 = cdef_directions[dir + 2][1];
    520  const int s2o1 = cdef_directions[dir - 2][0];
    521  const int s2o2 = cdef_directions[dir - 2][1];
    522  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
    523  const int *sec_taps = cdef_sec_taps;
    524 
    525  if (pri_strength) {
    526    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
    527  }
    528  if (sec_strength) {
    529    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
    530  }
    531 
    532  if (block_width == 8) {
    533    uint8_t *dst8 = (uint8_t *)dest;
    534 
    535    int h = block_height;
    536    do {
    537      int16x8_t sum = vdupq_n_s16(0);
    538      uint16x8_t s = vld1q_u16(in);
    539      max = min = s;
    540 
    541      uint16x8_t pri_src[4];
    542 
    543      // Primary near taps
    544      pri_src[0] = vld1q_u16(in + po1);
    545      pri_src[1] = vld1q_u16(in - po1);
    546 
    547      // Primary far taps
    548      pri_src[2] = vld1q_u16(in + po2);
    549      pri_src[3] = vld1q_u16(in - po2);
    550 
    551      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);
    552 
    553      // The source is 16 bits, however, we only really care about the lower
    554      // 8 bits.  The upper 8 bits contain the "large" flag.  After the final
    555      // primary max has been calculated, zero out the upper 8 bits.  Use this
    556      // to find the "16 bit" max.
    557      uint8x16_t pri_max0 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[0]),
    558                                     vreinterpretq_u8_u16(pri_src[1]));
    559      uint8x16_t pri_max1 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[2]),
    560                                     vreinterpretq_u8_u16(pri_src[3]));
    561      pri_max0 = vmaxq_u8(pri_max0, pri_max1);
    562      max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(pri_max0),
    563                                     cdef_large_value_mask));
    564 
    565      uint16x8_t pri_min0 = vminq_u16(pri_src[0], pri_src[1]);
    566      uint16x8_t pri_min1 = vminq_u16(pri_src[2], pri_src[3]);
    567      pri_min0 = vminq_u16(pri_min0, pri_min1);
    568      min = vminq_u16(min, pri_min0);
    569 
    570      uint16x8_t sec_src[8];
    571 
    572      // Secondary near taps
    573      sec_src[0] = vld1q_u16(in + s1o1);
    574      sec_src[1] = vld1q_u16(in - s1o1);
    575      sec_src[2] = vld1q_u16(in + s2o1);
    576      sec_src[3] = vld1q_u16(in - s2o1);
    577 
    578      // Secondary far taps
    579      sec_src[4] = vld1q_u16(in + s1o2);
    580      sec_src[5] = vld1q_u16(in - s1o2);
    581      sec_src[6] = vld1q_u16(in + s2o2);
    582      sec_src[7] = vld1q_u16(in - s2o2);
    583 
    584      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);
    585 
    586      // The source is 16 bits, however, we only really care about the lower
    587      // 8 bits.  The upper 8 bits contain the "large" flag.  After the final
    588      // primary max has been calculated, zero out the upper 8 bits.  Use this
    589      // to find the "16 bit" max.
    590      uint8x16_t sec_max0 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[0]),
    591                                     vreinterpretq_u8_u16(sec_src[1]));
    592      uint8x16_t sec_max1 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[2]),
    593                                     vreinterpretq_u8_u16(sec_src[3]));
    594      uint8x16_t sec_max2 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[4]),
    595                                     vreinterpretq_u8_u16(sec_src[5]));
    596      uint8x16_t sec_max3 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[6]),
    597                                     vreinterpretq_u8_u16(sec_src[7]));
    598      sec_max0 = vmaxq_u8(sec_max0, sec_max1);
    599      sec_max2 = vmaxq_u8(sec_max2, sec_max3);
    600      sec_max0 = vmaxq_u8(sec_max0, sec_max2);
    601      max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(sec_max0),
    602                                     cdef_large_value_mask));
    603 
    604      uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]);
    605      uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]);
    606      uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]);
    607      uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]);
    608      sec_min0 = vminq_u16(sec_min0, sec_min1);
    609      sec_min2 = vminq_u16(sec_min2, sec_min3);
    610      sec_min0 = vminq_u16(sec_min0, sec_min2);
    611      min = vminq_u16(min, sec_min0);
    612 
    613      // res = s + ((sum - (sum < 0) + 8) >> 4)
    614      sum =
    615          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
    616      int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);
    617 
    618      res_s16 = vminq_s16(vmaxq_s16(res_s16, vreinterpretq_s16_u16(min)),
    619                          vreinterpretq_s16_u16(max));
    620 
    621      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
    622      vst1_u8(dst8, res_u8);
    623 
    624      in += CDEF_BSTRIDE;
    625      dst8 += dstride;
    626    } while (--h != 0);
    627  } else {
    628    uint8_t *dst8 = (uint8_t *)dest;
    629 
    630    int h = block_height;
    631    do {
    632      int16x8_t sum = vdupq_n_s16(0);
    633      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
    634      max = min = s;
    635 
    636      uint16x8_t pri_src[4];
    637 
    638      // Primary near taps
    639      pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE);
    640      pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE);
    641 
    642      // Primary far taps
    643      pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE);
    644      pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE);
    645 
    646      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);
    647 
    648      // The source is 16 bits, however, we only really care about the lower
    649      // 8 bits.  The upper 8 bits contain the "large" flag.  After the final
    650      // primary max has been calculated, zero out the upper 8 bits.  Use this
    651      // to find the "16 bit" max.
    652      uint8x16_t pri_max0 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[0]),
    653                                     vreinterpretq_u8_u16(pri_src[1]));
    654      uint8x16_t pri_max1 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[2]),
    655                                     vreinterpretq_u8_u16(pri_src[3]));
    656      pri_max0 = vmaxq_u8(pri_max0, pri_max1);
    657      max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(pri_max0),
    658                                     cdef_large_value_mask));
    659 
    660      uint16x8_t pri_min1 = vminq_u16(pri_src[0], pri_src[1]);
    661      uint16x8_t pri_min2 = vminq_u16(pri_src[2], pri_src[3]);
    662      pri_min1 = vminq_u16(pri_min1, pri_min2);
    663      min = vminq_u16(min, pri_min1);
    664 
    665      uint16x8_t sec_src[8];
    666 
    667      // Secondary near taps
    668      sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE);
    669      sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE);
    670      sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE);
    671      sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE);
    672 
    673      // Secondary far taps
    674      sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE);
    675      sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE);
    676      sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE);
    677      sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE);
    678 
    679      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);
    680 
    681      // The source is 16 bits, however, we only really care about the lower
    682      // 8 bits.  The upper 8 bits contain the "large" flag.  After the final
    683      // primary max has been calculated, zero out the upper 8 bits.  Use this
    684      // to find the "16 bit" max.
    685      uint8x16_t sec_max0 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[0]),
    686                                     vreinterpretq_u8_u16(sec_src[1]));
    687      uint8x16_t sec_max1 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[2]),
    688                                     vreinterpretq_u8_u16(sec_src[3]));
    689      uint8x16_t sec_max2 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[4]),
    690                                     vreinterpretq_u8_u16(sec_src[5]));
    691      uint8x16_t sec_max3 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[6]),
    692                                     vreinterpretq_u8_u16(sec_src[7]));
    693      sec_max0 = vmaxq_u8(sec_max0, sec_max1);
    694      sec_max2 = vmaxq_u8(sec_max2, sec_max3);
    695      sec_max0 = vmaxq_u8(sec_max0, sec_max2);
    696      max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(sec_max0),
    697                                     cdef_large_value_mask));
    698 
    699      uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]);
    700      uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]);
    701      uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]);
    702      uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]);
    703      sec_min0 = vminq_u16(sec_min0, sec_min1);
    704      sec_min2 = vminq_u16(sec_min2, sec_min3);
    705      sec_min0 = vminq_u16(sec_min0, sec_min2);
    706      min = vminq_u16(min, sec_min0);
    707 
    708      // res = s + ((sum - (sum < 0) + 8) >> 4)
    709      sum =
    710          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
    711      int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);
    712 
    713      res_s16 = vminq_s16(vmaxq_s16(res_s16, vreinterpretq_s16_u16(min)),
    714                          vreinterpretq_s16_u16(max));
    715 
    716      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
    717      store_u8x4_strided_x2(dst8, dstride, res_u8);
    718 
    719      in += 2 * CDEF_BSTRIDE;
    720      dst8 += 2 * dstride;
    721      h -= 2;
    722    } while (h != 0);
    723  }
    724 }
    725 
    726 void cdef_filter_8_1_neon(void *dest, int dstride, const uint16_t *in,
    727                          int pri_strength, int sec_strength, int dir,
    728                          int pri_damping, int sec_damping, int coeff_shift,
    729                          int block_width, int block_height) {
    730  (void)sec_strength;
    731  (void)sec_damping;
    732 
    733  const int po1 = cdef_directions[dir][0];
    734  const int po2 = cdef_directions[dir][1];
    735  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
    736 
    737  if (pri_strength) {
    738    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
    739  }
    740 
    741  if (block_width == 8) {
    742    uint8_t *dst8 = (uint8_t *)dest;
    743 
    744    int h = block_height;
    745    do {
    746      int16x8_t sum = vdupq_n_s16(0);
    747      uint16x8_t s = vld1q_u16(in);
    748 
    749      uint16x8_t tap[4];
    750 
    751      // Primary near taps
    752      tap[0] = vld1q_u16(in + po1);
    753      tap[1] = vld1q_u16(in - po1);
    754 
    755      // Primary far taps
    756      tap[2] = vld1q_u16(in + po2);
    757      tap[3] = vld1q_u16(in - po2);
    758 
    759      primary_filter(s, tap, pri_taps, pri_strength, pri_damping, &sum);
    760 
    761      // res = s + ((sum - (sum < 0) + 8) >> 4)
    762      sum =
    763          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
    764      const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);
    765 
    766      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
    767      vst1_u8(dst8, res_u8);
    768 
    769      in += CDEF_BSTRIDE;
    770      dst8 += dstride;
    771    } while (--h != 0);
    772 
    773  } else {
    774    uint8_t *dst8 = (uint8_t *)dest;
    775 
    776    int h = block_height;
    777    do {
    778      int16x8_t sum = vdupq_n_s16(0);
    779      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
    780 
    781      uint16x8_t pri_src[4];
    782 
    783      // Primary near taps
    784      pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE);
    785      pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE);
    786 
    787      // Primary far taps
    788      pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE);
    789      pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE);
    790 
    791      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);
    792 
    793      // res = s + ((sum - (sum < 0) + 8) >> 4)
    794      sum =
    795          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
    796      const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);
    797 
    798      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
    799      store_u8x4_strided_x2(dst8, dstride, res_u8);
    800 
    801      in += 2 * CDEF_BSTRIDE;
    802      dst8 += 2 * dstride;
    803      h -= 2;
    804    } while (h != 0);
    805  }
    806 }
    807 
    808 void cdef_filter_8_2_neon(void *dest, int dstride, const uint16_t *in,
    809                          int pri_strength, int sec_strength, int dir,
    810                          int pri_damping, int sec_damping, int coeff_shift,
    811                          int block_width, int block_height) {
    812  (void)pri_strength;
    813  (void)pri_damping;
    814  (void)coeff_shift;
    815 
    816  const int s1o1 = cdef_directions[dir + 2][0];
    817  const int s1o2 = cdef_directions[dir + 2][1];
    818  const int s2o1 = cdef_directions[dir - 2][0];
    819  const int s2o2 = cdef_directions[dir - 2][1];
    820  const int *sec_taps = cdef_sec_taps;
    821 
    822  if (sec_strength) {
    823    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
    824  }
    825 
    826  if (block_width == 8) {
    827    uint8_t *dst8 = (uint8_t *)dest;
    828 
    829    int h = block_height;
    830    do {
    831      int16x8_t sum = vdupq_n_s16(0);
    832      uint16x8_t s = vld1q_u16(in);
    833 
    834      uint16x8_t sec_src[8];
    835 
    836      // Secondary near taps
    837      sec_src[0] = vld1q_u16(in + s1o1);
    838      sec_src[1] = vld1q_u16(in - s1o1);
    839      sec_src[2] = vld1q_u16(in + s2o1);
    840      sec_src[3] = vld1q_u16(in - s2o1);
    841 
    842      // Secondary far taps
    843      sec_src[4] = vld1q_u16(in + s1o2);
    844      sec_src[5] = vld1q_u16(in - s1o2);
    845      sec_src[6] = vld1q_u16(in + s2o2);
    846      sec_src[7] = vld1q_u16(in - s2o2);
    847 
    848      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);
    849 
    850      // res = s + ((sum - (sum < 0) + 8) >> 4)
    851      sum =
    852          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
    853      const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);
    854 
    855      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
    856      vst1_u8(dst8, res_u8);
    857 
    858      in += CDEF_BSTRIDE;
    859      dst8 += dstride;
    860    } while (--h != 0);
    861  } else {
    862    uint8_t *dst8 = (uint8_t *)dest;
    863 
    864    int h = block_height;
    865    do {
    866      int16x8_t sum = vdupq_n_s16(0);
    867      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
    868 
    869      uint16x8_t sec_src[8];
    870 
    871      // Secondary near taps
    872      sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE);
    873      sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE);
    874      sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE);
    875      sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE);
    876 
    877      // Secondary far taps
    878      sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE);
    879      sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE);
    880      sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE);
    881      sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE);
    882 
    883      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);
    884 
    885      // res = s + ((sum - (sum < 0) + 8) >> 4)
    886      sum =
    887          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
    888      const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);
    889 
    890      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
    891      store_u8x4_strided_x2(dst8, dstride, res_u8);
    892 
    893      in += 2 * CDEF_BSTRIDE;
    894      dst8 += 2 * dstride;
    895      h -= 2;
    896    } while (h != 0);
    897  }
    898 }
    899 
    900 void cdef_filter_8_3_neon(void *dest, int dstride, const uint16_t *in,
    901                          int pri_strength, int sec_strength, int dir,
    902                          int pri_damping, int sec_damping, int coeff_shift,
    903                          int block_width, int block_height) {
    904  (void)pri_strength;
    905  (void)sec_strength;
    906  (void)dir;
    907  (void)pri_damping;
    908  (void)sec_damping;
    909  (void)coeff_shift;
    910  (void)block_width;
    911  if (block_width == 8) {
    912    uint8_t *dst8 = (uint8_t *)dest;
    913 
    914    int h = block_height;
    915    do {
    916      const uint16x8_t s = vld1q_u16(in);
    917      const uint8x8_t res = vqmovn_u16(s);
    918      vst1_u8(dst8, res);
    919 
    920      in += CDEF_BSTRIDE;
    921      dst8 += dstride;
    922    } while (--h != 0);
    923  } else {
    924    uint8_t *dst8 = (uint8_t *)dest;
    925 
    926    int h = block_height;
    927    do {
    928      const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
    929      const uint8x8_t res = vqmovn_u16(s);
    930      store_u8x4_strided_x2(dst8, dstride, res);
    931 
    932      in += 2 * CDEF_BSTRIDE;
    933      dst8 += 2 * dstride;
    934      h -= 2;
    935    } while (h != 0);
    936  }
    937 }
    938 
// Combined primary + secondary CDEF filter for 16-bit output. The filtered
// value is clamped to [min, max] of the centre pixel and every tap that
// contributed, so the filter cannot introduce values outside the local
// range. Padding pixels hold the CDEF_VERY_LARGE sentinel; they are masked
// to 0 before the max reduction (min is computed first, where the sentinel
// can never win).
void cdef_filter_16_0_neon(void *dest, int dstride, const uint16_t *in,
                          int pri_strength, int sec_strength, int dir,
                          int pri_damping, int sec_damping, int coeff_shift,
                          int block_width, int block_height) {
 uint16x8_t max, min;
 // AND-mask that zeroes exactly the bits set in CDEF_VERY_LARGE, turning
 // the padding sentinel into 0 for the max computation.
 const uint16x8_t cdef_large_value_mask =
     vdupq_n_u16(((uint16_t)~CDEF_VERY_LARGE));
 // Primary tap offsets (near/far) along `dir`, and secondary tap offsets
 // along the two cross directions dir + 2 and dir - 2.
 const int po1 = cdef_directions[dir][0];
 const int po2 = cdef_directions[dir][1];
 const int s1o1 = cdef_directions[dir + 2][0];
 const int s1o2 = cdef_directions[dir + 2][1];
 const int s2o1 = cdef_directions[dir - 2][0];
 const int s2o2 = cdef_directions[dir - 2][1];
 const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
 const int *sec_taps = cdef_sec_taps;

 if (pri_strength) {
   pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
 }
 if (sec_strength) {
   sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
 }

 if (block_width == 8) {
   // 8-wide blocks: one full row per iteration.
   uint16_t *dst16 = (uint16_t *)dest;

   int h = block_height;
   do {
     int16x8_t sum = vdupq_n_s16(0);
     uint16x8_t s = vld1q_u16(in);
     max = min = s;

     uint16x8_t pri_src[4];

     // Primary near taps
     pri_src[0] = vld1q_u16(in + po1);
     pri_src[1] = vld1q_u16(in - po1);

     // Primary far taps
     pri_src[2] = vld1q_u16(in + po2);
     pri_src[3] = vld1q_u16(in - po2);

     primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);

     // Fold the primary taps into the running minimum (the sentinel is the
     // largest possible value, so it cannot affect the min).
     uint16x8_t pri_min0 = vminq_u16(pri_src[0], pri_src[1]);
     uint16x8_t pri_min1 = vminq_u16(pri_src[2], pri_src[3]);
     pri_min0 = vminq_u16(pri_min0, pri_min1);
     min = vminq_u16(min, pri_min0);

     /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
     pri_src[0] = vandq_u16(pri_src[0], cdef_large_value_mask);
     pri_src[1] = vandq_u16(pri_src[1], cdef_large_value_mask);
     pri_src[2] = vandq_u16(pri_src[2], cdef_large_value_mask);
     pri_src[3] = vandq_u16(pri_src[3], cdef_large_value_mask);

     uint16x8_t pri_max0 = vmaxq_u16(pri_src[0], pri_src[1]);
     uint16x8_t pri_max1 = vmaxq_u16(pri_src[2], pri_src[3]);
     pri_max0 = vmaxq_u16(pri_max0, pri_max1);
     max = vmaxq_u16(max, pri_max0);

     uint16x8_t sec_src[8];

     // Secondary near taps
     sec_src[0] = vld1q_u16(in + s1o1);
     sec_src[1] = vld1q_u16(in - s1o1);
     sec_src[2] = vld1q_u16(in + s2o1);
     sec_src[3] = vld1q_u16(in - s2o1);

     // Secondary far taps
     sec_src[4] = vld1q_u16(in + s1o2);
     sec_src[5] = vld1q_u16(in - s1o2);
     sec_src[6] = vld1q_u16(in + s2o2);
     sec_src[7] = vld1q_u16(in - s2o2);

     secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

     // Fold the secondary taps into the running minimum.
     uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]);
     uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]);
     uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]);
     uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]);
     sec_min0 = vminq_u16(sec_min0, sec_min1);
     sec_min2 = vminq_u16(sec_min2, sec_min3);
     sec_min0 = vminq_u16(sec_min0, sec_min2);
     min = vminq_u16(min, sec_min0);

     /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
     sec_src[0] = vandq_u16(sec_src[0], cdef_large_value_mask);
     sec_src[1] = vandq_u16(sec_src[1], cdef_large_value_mask);
     sec_src[2] = vandq_u16(sec_src[2], cdef_large_value_mask);
     sec_src[3] = vandq_u16(sec_src[3], cdef_large_value_mask);
     sec_src[4] = vandq_u16(sec_src[4], cdef_large_value_mask);
     sec_src[5] = vandq_u16(sec_src[5], cdef_large_value_mask);
     sec_src[6] = vandq_u16(sec_src[6], cdef_large_value_mask);
     sec_src[7] = vandq_u16(sec_src[7], cdef_large_value_mask);

     uint16x8_t sec_max0 = vmaxq_u16(sec_src[0], sec_src[1]);
     uint16x8_t sec_max1 = vmaxq_u16(sec_src[2], sec_src[3]);
     uint16x8_t sec_max2 = vmaxq_u16(sec_src[4], sec_src[5]);
     uint16x8_t sec_max3 = vmaxq_u16(sec_src[6], sec_src[7]);
     sec_max0 = vmaxq_u16(sec_max0, sec_max1);
     sec_max2 = vmaxq_u16(sec_max2, sec_max3);
     sec_max0 = vmaxq_u16(sec_max0, sec_max2);
     max = vmaxq_u16(max, sec_max0);

     // res = s + ((sum - (sum < 0) + 8) >> 4)
     sum =
         vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
     int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

     // Clamp to the range spanned by the centre pixel and the taps.
     res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)),
                     vreinterpretq_s16_u16(max));

     vst1q_u16(dst16, vreinterpretq_u16_s16(res));

     in += CDEF_BSTRIDE;
     dst16 += dstride;
   } while (--h != 0);
 } else {
   // 4-wide blocks: two rows packed into each 128-bit vector per iteration.
   uint16_t *dst16 = (uint16_t *)dest;

   int h = block_height;
   do {
     int16x8_t sum = vdupq_n_s16(0);
     uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
     max = min = s;

     uint16x8_t pri_src[4];

     // Primary near taps
     pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE);
     pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE);

     // Primary far taps
     pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE);
     pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE);

     primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);

     // Fold the primary taps into the running minimum (the sentinel is the
     // largest possible value, so it cannot affect the min).
     uint16x8_t pri_min1 = vminq_u16(pri_src[0], pri_src[1]);
     uint16x8_t pri_min2 = vminq_u16(pri_src[2], pri_src[3]);
     pri_min1 = vminq_u16(pri_min1, pri_min2);
     min = vminq_u16(min, pri_min1);

     /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
     pri_src[0] = vandq_u16(pri_src[0], cdef_large_value_mask);
     pri_src[1] = vandq_u16(pri_src[1], cdef_large_value_mask);
     pri_src[2] = vandq_u16(pri_src[2], cdef_large_value_mask);
     pri_src[3] = vandq_u16(pri_src[3], cdef_large_value_mask);
     uint16x8_t pri_max0 = vmaxq_u16(pri_src[0], pri_src[1]);
     uint16x8_t pri_max1 = vmaxq_u16(pri_src[2], pri_src[3]);
     pri_max0 = vmaxq_u16(pri_max0, pri_max1);
     max = vmaxq_u16(max, pri_max0);

     uint16x8_t sec_src[8];

     // Secondary near taps
     sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE);
     sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE);
     sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE);
     sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE);

     // Secondary far taps
     sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE);
     sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE);
     sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE);
     sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE);

     secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

     // Fold the secondary taps into the running minimum.
     uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]);
     uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]);
     uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]);
     uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]);
     sec_min0 = vminq_u16(sec_min0, sec_min1);
     sec_min2 = vminq_u16(sec_min2, sec_min3);
     sec_min0 = vminq_u16(sec_min0, sec_min2);
     min = vminq_u16(min, sec_min0);

     /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
     sec_src[0] = vandq_u16(sec_src[0], cdef_large_value_mask);
     sec_src[1] = vandq_u16(sec_src[1], cdef_large_value_mask);
     sec_src[2] = vandq_u16(sec_src[2], cdef_large_value_mask);
     sec_src[3] = vandq_u16(sec_src[3], cdef_large_value_mask);
     sec_src[4] = vandq_u16(sec_src[4], cdef_large_value_mask);
     sec_src[5] = vandq_u16(sec_src[5], cdef_large_value_mask);
     sec_src[6] = vandq_u16(sec_src[6], cdef_large_value_mask);
     sec_src[7] = vandq_u16(sec_src[7], cdef_large_value_mask);

     uint16x8_t sec_max0 = vmaxq_u16(sec_src[0], sec_src[1]);
     uint16x8_t sec_max1 = vmaxq_u16(sec_src[2], sec_src[3]);
     uint16x8_t sec_max2 = vmaxq_u16(sec_src[4], sec_src[5]);
     uint16x8_t sec_max3 = vmaxq_u16(sec_src[6], sec_src[7]);
     sec_max0 = vmaxq_u16(sec_max0, sec_max1);
     sec_max2 = vmaxq_u16(sec_max2, sec_max3);
     sec_max0 = vmaxq_u16(sec_max0, sec_max2);
     max = vmaxq_u16(max, sec_max0);

     // res = s + ((sum - (sum < 0) + 8) >> 4)
     sum =
         vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
     int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

     // Clamp to the range spanned by the centre pixel and the taps.
     res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)),
                     vreinterpretq_s16_u16(max));

     store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res));

     in += 2 * CDEF_BSTRIDE;
     dst16 += 2 * dstride;
     h -= 2;
   } while (h != 0);
 }
}
   1152 
   1153 void cdef_filter_16_1_neon(void *dest, int dstride, const uint16_t *in,
   1154                           int pri_strength, int sec_strength, int dir,
   1155                           int pri_damping, int sec_damping, int coeff_shift,
   1156                           int block_width, int block_height) {
   1157  (void)sec_strength;
   1158  (void)sec_damping;
   1159 
   1160  const int po1 = cdef_directions[dir][0];
   1161  const int po2 = cdef_directions[dir][1];
   1162  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
   1163 
   1164  if (pri_strength) {
   1165    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
   1166  }
   1167 
   1168  if (block_width == 8) {
   1169    uint16_t *dst16 = (uint16_t *)dest;
   1170 
   1171    int h = block_height;
   1172    do {
   1173      int16x8_t sum = vdupq_n_s16(0);
   1174      uint16x8_t s = vld1q_u16(in);
   1175 
   1176      uint16x8_t tap[4];
   1177 
   1178      // Primary near taps
   1179      tap[0] = vld1q_u16(in + po1);
   1180      tap[1] = vld1q_u16(in - po1);
   1181 
   1182      // Primary far taps
   1183      tap[2] = vld1q_u16(in + po2);
   1184      tap[3] = vld1q_u16(in - po2);
   1185 
   1186      primary_filter(s, tap, pri_taps, pri_strength, pri_damping, &sum);
   1187 
   1188      // res = s + ((sum - (sum < 0) + 8) >> 4)
   1189      sum =
   1190          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
   1191      const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);
   1192 
   1193      vst1q_u16(dst16, vreinterpretq_u16_s16(res));
   1194 
   1195      in += CDEF_BSTRIDE;
   1196      dst16 += dstride;
   1197    } while (--h != 0);
   1198  } else {
   1199    uint16_t *dst16 = (uint16_t *)dest;
   1200 
   1201    int h = block_height;
   1202    do {
   1203      int16x8_t sum = vdupq_n_s16(0);
   1204      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
   1205 
   1206      uint16x8_t pri_src[4];
   1207 
   1208      // Primary near taps
   1209      pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE);
   1210      pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE);
   1211 
   1212      // Primary far taps
   1213      pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE);
   1214      pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE);
   1215 
   1216      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);
   1217 
   1218      // res = s + ((sum - (sum < 0) + 8) >> 4)
   1219      sum =
   1220          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
   1221      const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);
   1222 
   1223      store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res));
   1224 
   1225      in += 2 * CDEF_BSTRIDE;
   1226      dst16 += 2 * dstride;
   1227      h -= 2;
   1228    } while (h != 0);
   1229  }
   1230 }
   1231 
   1232 void cdef_filter_16_2_neon(void *dest, int dstride, const uint16_t *in,
   1233                           int pri_strength, int sec_strength, int dir,
   1234                           int pri_damping, int sec_damping, int coeff_shift,
   1235                           int block_width, int block_height) {
   1236  (void)pri_strength;
   1237  (void)pri_damping;
   1238  (void)coeff_shift;
   1239 
   1240  const int s1o1 = cdef_directions[dir + 2][0];
   1241  const int s1o2 = cdef_directions[dir + 2][1];
   1242  const int s2o1 = cdef_directions[dir - 2][0];
   1243  const int s2o2 = cdef_directions[dir - 2][1];
   1244  const int *sec_taps = cdef_sec_taps;
   1245 
   1246  if (sec_strength) {
   1247    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
   1248  }
   1249 
   1250  if (block_width == 8) {
   1251    uint16_t *dst16 = (uint16_t *)dest;
   1252 
   1253    int h = block_height;
   1254    do {
   1255      int16x8_t sum = vdupq_n_s16(0);
   1256      uint16x8_t s = vld1q_u16(in);
   1257 
   1258      uint16x8_t sec_src[8];
   1259 
   1260      // Secondary near taps
   1261      sec_src[0] = vld1q_u16(in + s1o1);
   1262      sec_src[1] = vld1q_u16(in - s1o1);
   1263      sec_src[2] = vld1q_u16(in + s2o1);
   1264      sec_src[3] = vld1q_u16(in - s2o1);
   1265 
   1266      // Secondary far taps
   1267      sec_src[4] = vld1q_u16(in + s1o2);
   1268      sec_src[5] = vld1q_u16(in - s1o2);
   1269      sec_src[6] = vld1q_u16(in + s2o2);
   1270      sec_src[7] = vld1q_u16(in - s2o2);
   1271 
   1272      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);
   1273 
   1274      // res = s + ((sum - (sum < 0) + 8) >> 4)
   1275      sum =
   1276          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
   1277      const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);
   1278 
   1279      vst1q_u16(dst16, vreinterpretq_u16_s16(res));
   1280 
   1281      in += CDEF_BSTRIDE;
   1282      dst16 += dstride;
   1283    } while (--h != 0);
   1284  } else {
   1285    uint16_t *dst16 = (uint16_t *)dest;
   1286 
   1287    int h = block_height;
   1288    do {
   1289      int16x8_t sum = vdupq_n_s16(0);
   1290      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
   1291 
   1292      uint16x8_t sec_src[8];
   1293 
   1294      // Secondary near taps
   1295      sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE);
   1296      sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE);
   1297      sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE);
   1298      sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE);
   1299 
   1300      // Secondary far taps
   1301      sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE);
   1302      sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE);
   1303      sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE);
   1304      sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE);
   1305 
   1306      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);
   1307 
   1308      // res = s + ((sum - (sum < 0) + 8) >> 4)
   1309      sum =
   1310          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
   1311      const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);
   1312 
   1313      store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res));
   1314 
   1315      in += 2 * CDEF_BSTRIDE;
   1316      dst16 += 2 * dstride;
   1317      h -= 2;
   1318    } while (h != 0);
   1319  }
   1320 }
   1321 
   1322 void cdef_filter_16_3_neon(void *dest, int dstride, const uint16_t *in,
   1323                           int pri_strength, int sec_strength, int dir,
   1324                           int pri_damping, int sec_damping, int coeff_shift,
   1325                           int block_width, int block_height) {
   1326  (void)pri_strength;
   1327  (void)sec_strength;
   1328  (void)dir;
   1329  (void)pri_damping;
   1330  (void)sec_damping;
   1331  (void)coeff_shift;
   1332  (void)block_width;
   1333  if (block_width == 8) {
   1334    uint16_t *dst16 = (uint16_t *)dest;
   1335 
   1336    int h = block_height;
   1337    do {
   1338      const uint16x8_t s = vld1q_u16(in);
   1339      vst1q_u16(dst16, s);
   1340 
   1341      in += CDEF_BSTRIDE;
   1342      dst16 += dstride;
   1343    } while (--h != 0);
   1344  } else {
   1345    uint16_t *dst16 = (uint16_t *)dest;
   1346 
   1347    int h = block_height;
   1348    do {
   1349      const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
   1350      store_u16x4_strided_x2(dst16, dstride, s);
   1351 
   1352      in += 2 * CDEF_BSTRIDE;
   1353      dst16 += 2 * dstride;
   1354      h -= 2;
   1355    } while (h != 0);
   1356  }
   1357 }