tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

encodetxb_neon.c (22872B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <arm_neon.h>
     13 #include <assert.h>
     14 #include <math.h>
     15 
     16 #include "config/aom_config.h"
     17 
     18 #include "aom_dsp/arm/mem_neon.h"
     19 #include "av1/common/txb_common.h"
     20 #include "av1/encoder/encodetxb.h"
     21 
     22 void av1_txb_init_levels_neon(const tran_low_t *const coeff, const int width,
     23                              const int height, uint8_t *const levels) {
     24  const int stride = height + TX_PAD_HOR;
     25  memset(levels - TX_PAD_TOP * stride, 0,
     26         sizeof(*levels) * TX_PAD_TOP * stride);
     27  memset(levels + stride * width, 0,
     28         sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));
     29 
     30  const int32x4_t zeros = vdupq_n_s32(0);
     31  int i = 0;
     32  uint8_t *ls = levels;
     33  const tran_low_t *cf = coeff;
     34  if (height == 4) {
     35    do {
     36      const int32x4_t coeffA = vld1q_s32(cf);
     37      const int32x4_t coeffB = vld1q_s32(cf + height);
     38      const int16x8_t coeffAB =
     39          vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB));
     40      const int16x8_t absAB = vqabsq_s16(coeffAB);
     41      const int8x8_t absABs = vqmovn_s16(absAB);
     42 #if AOM_ARCH_AARCH64
     43      const int8x16_t absAB8 =
     44          vcombine_s8(absABs, vreinterpret_s8_s32(vget_low_s32(zeros)));
     45      const uint8x16_t lsAB =
     46          vreinterpretq_u8_s32(vzip1q_s32(vreinterpretq_s32_s8(absAB8), zeros));
     47 #else
     48      const int32x2x2_t absAB8 =
     49          vzip_s32(vreinterpret_s32_s8(absABs), vget_low_s32(zeros));
     50      const uint8x16_t lsAB =
     51          vreinterpretq_u8_s32(vcombine_s32(absAB8.val[0], absAB8.val[1]));
     52 #endif
     53      vst1q_u8(ls, lsAB);
     54      ls += (stride << 1);
     55      cf += (height << 1);
     56      i += 2;
     57    } while (i < width);
     58  } else if (height == 8) {
     59    do {
     60      const int16x8_t coeffAB = load_tran_low_to_s16q(cf);
     61      const int16x8_t absAB = vqabsq_s16(coeffAB);
     62      const uint8x16_t absAB8 = vreinterpretq_u8_s8(vcombine_s8(
     63          vqmovn_s16(absAB), vreinterpret_s8_s32(vget_low_s32(zeros))));
     64      vst1q_u8(ls, absAB8);
     65      ls += stride;
     66      cf += height;
     67      i += 1;
     68    } while (i < width);
     69  } else {
     70    do {
     71      int j = 0;
     72      do {
     73        const int16x8_t coeffAB = load_tran_low_to_s16q(cf);
     74        const int16x8_t coeffCD = load_tran_low_to_s16q(cf + 8);
     75        const int16x8_t absAB = vqabsq_s16(coeffAB);
     76        const int16x8_t absCD = vqabsq_s16(coeffCD);
     77        const uint8x16_t absABCD = vreinterpretq_u8_s8(
     78            vcombine_s8(vqmovn_s16(absAB), vqmovn_s16(absCD)));
     79        vst1q_u8((ls + j), absABCD);
     80        j += 16;
     81        cf += 16;
     82      } while (j < height);
     83      *(int32_t *)(ls + height) = 0;
     84      ls += stride;
     85      i += 1;
     86    } while (i < width);
     87  }
     88 }
     89 
     90 // get_4_nz_map_contexts_2d coefficients:
     91 static const DECLARE_ALIGNED(16, uint8_t, c_4_po_2d[2][16]) = {
     92  { 0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21 },
     93  { 0, 16, 16, 16, 16, 16, 16, 16, 6, 6, 21, 21, 6, 21, 21, 21 }
     94 };
     95 
     96 // get_4_nz_map_contexts_hor coefficients:
     97 /* clang-format off */
     98 #define SIG_COEF_CONTEXTS_2D_X4_051010                        \
     99  (SIG_COEF_CONTEXTS_2D + ((SIG_COEF_CONTEXTS_2D + 5) << 8) + \
    100  ((SIG_COEF_CONTEXTS_2D + 10) << 16) + ((SIG_COEF_CONTEXTS_2D + 10) << 24))
    101 /* clang-format on */
    102 
    103 // get_4_nz_map_contexts_ver coefficients:
    104 static const DECLARE_ALIGNED(16, uint8_t, c_4_po_hor[16]) = {
    105  SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 0,
    106  SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 0,
    107  SIG_COEF_CONTEXTS_2D + 5,  SIG_COEF_CONTEXTS_2D + 5,
    108  SIG_COEF_CONTEXTS_2D + 5,  SIG_COEF_CONTEXTS_2D + 5,
    109  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
    110  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
    111  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
    112  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
    113 };
    114 
    115 // get_8_coeff_contexts_2d coefficients:
    116 // if (width == 8)
    117 static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_8[2][16]) = {
    118  { 0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21 },
    119  { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 }
    120 };
    121 // if (width < 8)
    122 static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_l[2][16]) = {
    123  { 0, 11, 6, 6, 21, 21, 21, 21, 11, 11, 6, 21, 21, 21, 21, 21 },
    124  { 11, 11, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21 }
    125 };
    126 
    127 // if (width > 8)
    128 static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_g[2][16]) = {
    129  { 0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 },
    130  { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 }
    131 };
    132 
    133 // get_4_nz_map_contexts_ver coefficients:
    134 static const DECLARE_ALIGNED(16, uint8_t, c_8_po_ver[16]) = {
    135  SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 5,
    136  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
    137  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
    138  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
    139  SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 5,
    140  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
    141  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
    142  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
    143 };
    144 
    145 // get_16n_coeff_contexts_2d coefficients:
    146 // real_width == real_height
    147 static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_e[4][16]) = {
    148  { 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
    149  { 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
    150  { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
    151  { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
    152 };
    153 
    154 // real_width < real_height
    155 static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_g[3][16]) = {
    156  { 0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
    157  { 11, 11, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
    158  { 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
    159 };
    160 
    161 // real_width > real_height
    162 static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_l[3][16]) = {
    163  { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 },
    164  { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
    165  { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
    166 };
    167 
    168 // get_16n_coeff_contexts_hor coefficients:
    169 static const DECLARE_ALIGNED(16, uint8_t, c_16_po_ver[16]) = {
    170  SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 5,
    171  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
    172  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
    173  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
    174  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
    175  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
    176  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
    177  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
    178 };
    179 
    180 // end of coefficients declaration area
    181 
    182 static inline uint8x16_t load_8bit_4x4_to_1_reg(const uint8_t *const src,
    183                                                const int byte_stride) {
    184 #if AOM_ARCH_AARCH64
    185  uint32x4_t v_data = vld1q_u32((uint32_t *)src);
    186  v_data = vld1q_lane_u32((uint32_t *)(src + 1 * byte_stride), v_data, 1);
    187  v_data = vld1q_lane_u32((uint32_t *)(src + 2 * byte_stride), v_data, 2);
    188  v_data = vld1q_lane_u32((uint32_t *)(src + 3 * byte_stride), v_data, 3);
    189 
    190  return vreinterpretq_u8_u32(v_data);
    191 #else
    192  return load_unaligned_u8q(src, byte_stride);
    193 #endif
    194 }
    195 
    196 static inline uint8x16_t load_8bit_8x2_to_1_reg(const uint8_t *const src,
    197                                                const int byte_stride) {
    198 #if AOM_ARCH_AARCH64
    199  uint64x2_t v_data = vld1q_u64((uint64_t *)src);
    200  v_data = vld1q_lane_u64((uint64_t *)(src + 1 * byte_stride), v_data, 1);
    201 
    202  return vreinterpretq_u8_u64(v_data);
    203 #else
    204  uint8x8_t v_data_low = vld1_u8(src);
    205  uint8x8_t v_data_high = vld1_u8(src + byte_stride);
    206 
    207  return vcombine_u8(v_data_low, v_data_high);
    208 #endif
    209 }
    210 
    211 static inline uint8x16_t load_8bit_16x1_to_1_reg(const uint8_t *const src,
    212                                                 const int byte_stride) {
    213  (void)byte_stride;
    214  return vld1q_u8(src);
    215 }
    216 
    217 static inline void load_levels_4x4x5(const uint8_t *const src, const int stride,
    218                                     const ptrdiff_t *const offsets,
    219                                     uint8x16_t *const level) {
    220  level[0] = load_8bit_4x4_to_1_reg(&src[1], stride);
    221  level[1] = load_8bit_4x4_to_1_reg(&src[stride], stride);
    222  level[2] = load_8bit_4x4_to_1_reg(&src[offsets[0]], stride);
    223  level[3] = load_8bit_4x4_to_1_reg(&src[offsets[1]], stride);
    224  level[4] = load_8bit_4x4_to_1_reg(&src[offsets[2]], stride);
    225 }
    226 
    227 static inline void load_levels_8x2x5(const uint8_t *const src, const int stride,
    228                                     const ptrdiff_t *const offsets,
    229                                     uint8x16_t *const level) {
    230  level[0] = load_8bit_8x2_to_1_reg(&src[1], stride);
    231  level[1] = load_8bit_8x2_to_1_reg(&src[stride], stride);
    232  level[2] = load_8bit_8x2_to_1_reg(&src[offsets[0]], stride);
    233  level[3] = load_8bit_8x2_to_1_reg(&src[offsets[1]], stride);
    234  level[4] = load_8bit_8x2_to_1_reg(&src[offsets[2]], stride);
    235 }
    236 
    237 static inline void load_levels_16x1x5(const uint8_t *const src,
    238                                      const int stride,
    239                                      const ptrdiff_t *const offsets,
    240                                      uint8x16_t *const level) {
    241  level[0] = load_8bit_16x1_to_1_reg(&src[1], stride);
    242  level[1] = load_8bit_16x1_to_1_reg(&src[stride], stride);
    243  level[2] = load_8bit_16x1_to_1_reg(&src[offsets[0]], stride);
    244  level[3] = load_8bit_16x1_to_1_reg(&src[offsets[1]], stride);
    245  level[4] = load_8bit_16x1_to_1_reg(&src[offsets[2]], stride);
    246 }
    247 
    248 static inline uint8x16_t get_coeff_contexts_kernel(uint8x16_t *const level) {
    249  const uint8x16_t const_3 = vdupq_n_u8(3);
    250  const uint8x16_t const_4 = vdupq_n_u8(4);
    251  uint8x16_t count;
    252 
    253  count = vminq_u8(level[0], const_3);
    254  level[1] = vminq_u8(level[1], const_3);
    255  level[2] = vminq_u8(level[2], const_3);
    256  level[3] = vminq_u8(level[3], const_3);
    257  level[4] = vminq_u8(level[4], const_3);
    258  count = vaddq_u8(count, level[1]);
    259  count = vaddq_u8(count, level[2]);
    260  count = vaddq_u8(count, level[3]);
    261  count = vaddq_u8(count, level[4]);
    262 
    263  count = vrshrq_n_u8(count, 1);
    264  count = vminq_u8(count, const_4);
    265  return count;
    266 }
    267 
    268 static inline void get_4_nz_map_contexts_2d(const uint8_t *levels,
    269                                            const int width,
    270                                            const ptrdiff_t *const offsets,
    271                                            uint8_t *const coeff_contexts) {
    272  const int stride = 4 + TX_PAD_HOR;
    273  const uint8x16_t pos_to_offset_large = vdupq_n_u8(21);
    274 
    275  uint8x16_t pos_to_offset =
    276      (width == 4) ? vld1q_u8(c_4_po_2d[0]) : vld1q_u8(c_4_po_2d[1]);
    277 
    278  uint8x16_t count;
    279  uint8x16_t level[5];
    280  uint8_t *cc = coeff_contexts;
    281 
    282  assert(!(width % 4));
    283 
    284  int col = width;
    285  do {
    286    load_levels_4x4x5(levels, stride, offsets, level);
    287    count = get_coeff_contexts_kernel(level);
    288    count = vaddq_u8(count, pos_to_offset);
    289    vst1q_u8(cc, count);
    290    pos_to_offset = pos_to_offset_large;
    291    levels += 4 * stride;
    292    cc += 16;
    293    col -= 4;
    294  } while (col);
    295 
    296  coeff_contexts[0] = 0;
    297 }
    298 
    299 static inline void get_4_nz_map_contexts_ver(const uint8_t *levels,
    300                                             const int width,
    301                                             const ptrdiff_t *const offsets,
    302                                             uint8_t *coeff_contexts) {
    303  const int stride = 4 + TX_PAD_HOR;
    304 
    305  const uint8x16_t pos_to_offset =
    306      vreinterpretq_u8_u32(vdupq_n_u32(SIG_COEF_CONTEXTS_2D_X4_051010));
    307 
    308  uint8x16_t count;
    309  uint8x16_t level[5];
    310 
    311  assert(!(width % 4));
    312 
    313  int col = width;
    314  do {
    315    load_levels_4x4x5(levels, stride, offsets, level);
    316    count = get_coeff_contexts_kernel(level);
    317    count = vaddq_u8(count, pos_to_offset);
    318    vst1q_u8(coeff_contexts, count);
    319    levels += 4 * stride;
    320    coeff_contexts += 16;
    321    col -= 4;
    322  } while (col);
    323 }
    324 
    325 static inline void get_4_nz_map_contexts_hor(const uint8_t *levels,
    326                                             const int width,
    327                                             const ptrdiff_t *const offsets,
    328                                             uint8_t *coeff_contexts) {
    329  const int stride = 4 + TX_PAD_HOR;
    330  const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
    331 
    332  uint8x16_t pos_to_offset = vld1q_u8(c_4_po_hor);
    333 
    334  uint8x16_t count;
    335  uint8x16_t level[5];
    336 
    337  assert(!(width % 4));
    338 
    339  int col = width;
    340  do {
    341    load_levels_4x4x5(levels, stride, offsets, level);
    342    count = get_coeff_contexts_kernel(level);
    343    count = vaddq_u8(count, pos_to_offset);
    344    vst1q_u8(coeff_contexts, count);
    345    pos_to_offset = pos_to_offset_large;
    346    levels += 4 * stride;
    347    coeff_contexts += 16;
    348    col -= 4;
    349  } while (col);
    350 }
    351 
    352 static inline void get_8_coeff_contexts_2d(const uint8_t *levels,
    353                                           const int width,
    354                                           const ptrdiff_t *const offsets,
    355                                           uint8_t *coeff_contexts) {
    356  const int stride = 8 + TX_PAD_HOR;
    357  uint8_t *cc = coeff_contexts;
    358  uint8x16_t count;
    359  uint8x16_t level[5];
    360  uint8x16_t pos_to_offset[3];
    361 
    362  assert(!(width % 2));
    363 
    364  if (width == 8) {
    365    pos_to_offset[0] = vld1q_u8(c_8_po_2d_8[0]);
    366    pos_to_offset[1] = vld1q_u8(c_8_po_2d_8[1]);
    367  } else if (width < 8) {
    368    pos_to_offset[0] = vld1q_u8(c_8_po_2d_l[0]);
    369    pos_to_offset[1] = vld1q_u8(c_8_po_2d_l[1]);
    370  } else {
    371    pos_to_offset[0] = vld1q_u8(c_8_po_2d_g[0]);
    372    pos_to_offset[1] = vld1q_u8(c_8_po_2d_g[1]);
    373  }
    374  pos_to_offset[2] = vdupq_n_u8(21);
    375 
    376  int col = width;
    377  do {
    378    load_levels_8x2x5(levels, stride, offsets, level);
    379    count = get_coeff_contexts_kernel(level);
    380    count = vaddq_u8(count, pos_to_offset[0]);
    381    vst1q_u8(cc, count);
    382    pos_to_offset[0] = pos_to_offset[1];
    383    pos_to_offset[1] = pos_to_offset[2];
    384    levels += 2 * stride;
    385    cc += 16;
    386    col -= 2;
    387  } while (col);
    388 
    389  coeff_contexts[0] = 0;
    390 }
    391 
    392 static inline void get_8_coeff_contexts_ver(const uint8_t *levels,
    393                                            const int width,
    394                                            const ptrdiff_t *const offsets,
    395                                            uint8_t *coeff_contexts) {
    396  const int stride = 8 + TX_PAD_HOR;
    397 
    398  const uint8x16_t pos_to_offset = vld1q_u8(c_8_po_ver);
    399 
    400  uint8x16_t count;
    401  uint8x16_t level[5];
    402 
    403  assert(!(width % 2));
    404 
    405  int col = width;
    406  do {
    407    load_levels_8x2x5(levels, stride, offsets, level);
    408    count = get_coeff_contexts_kernel(level);
    409    count = vaddq_u8(count, pos_to_offset);
    410    vst1q_u8(coeff_contexts, count);
    411    levels += 2 * stride;
    412    coeff_contexts += 16;
    413    col -= 2;
    414  } while (col);
    415 }
    416 
    417 static inline void get_8_coeff_contexts_hor(const uint8_t *levels,
    418                                            const int width,
    419                                            const ptrdiff_t *const offsets,
    420                                            uint8_t *coeff_contexts) {
    421  const int stride = 8 + TX_PAD_HOR;
    422  const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
    423 
    424  uint8x16_t pos_to_offset = vcombine_u8(vdup_n_u8(SIG_COEF_CONTEXTS_2D + 0),
    425                                         vdup_n_u8(SIG_COEF_CONTEXTS_2D + 5));
    426 
    427  uint8x16_t count;
    428  uint8x16_t level[5];
    429 
    430  assert(!(width % 2));
    431 
    432  int col = width;
    433  do {
    434    load_levels_8x2x5(levels, stride, offsets, level);
    435    count = get_coeff_contexts_kernel(level);
    436    count = vaddq_u8(count, pos_to_offset);
    437    vst1q_u8(coeff_contexts, count);
    438    pos_to_offset = pos_to_offset_large;
    439    levels += 2 * stride;
    440    coeff_contexts += 16;
    441    col -= 2;
    442  } while (col);
    443 }
    444 
    445 static inline void get_16n_coeff_contexts_2d(const uint8_t *levels,
    446                                             const int real_width,
    447                                             const int real_height,
    448                                             const int width, const int height,
    449                                             const ptrdiff_t *const offsets,
    450                                             uint8_t *coeff_contexts) {
    451  const int stride = height + TX_PAD_HOR;
    452  uint8_t *cc = coeff_contexts;
    453  int col = width;
    454  uint8x16_t pos_to_offset[5];
    455  uint8x16_t pos_to_offset_large[3];
    456  uint8x16_t count;
    457  uint8x16_t level[5];
    458 
    459  assert(!(height % 16));
    460 
    461  pos_to_offset_large[2] = vdupq_n_u8(21);
    462  if (real_width == real_height) {
    463    pos_to_offset[0] = vld1q_u8(c_16_po_2d_e[0]);
    464    pos_to_offset[1] = vld1q_u8(c_16_po_2d_e[1]);
    465    pos_to_offset[2] = vld1q_u8(c_16_po_2d_e[2]);
    466    pos_to_offset[3] = vld1q_u8(c_16_po_2d_e[3]);
    467    pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] =
    468        pos_to_offset_large[2];
    469  } else if (real_width < real_height) {
    470    pos_to_offset[0] = vld1q_u8(c_16_po_2d_g[0]);
    471    pos_to_offset[1] = vld1q_u8(c_16_po_2d_g[1]);
    472    pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] =
    473        vld1q_u8(c_16_po_2d_g[2]);
    474    pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2];
    475  } else {  // real_width > real_height
    476    pos_to_offset[0] = pos_to_offset[1] = vld1q_u8(c_16_po_2d_l[0]);
    477    pos_to_offset[2] = vld1q_u8(c_16_po_2d_l[1]);
    478    pos_to_offset[3] = vld1q_u8(c_16_po_2d_l[2]);
    479    pos_to_offset[4] = pos_to_offset_large[2];
    480    pos_to_offset_large[0] = pos_to_offset_large[1] = vdupq_n_u8(16);
    481  }
    482 
    483  do {
    484    int h = height;
    485 
    486    do {
    487      load_levels_16x1x5(levels, stride, offsets, level);
    488      count = get_coeff_contexts_kernel(level);
    489      count = vaddq_u8(count, pos_to_offset[0]);
    490      vst1q_u8(cc, count);
    491      levels += 16;
    492      cc += 16;
    493      h -= 16;
    494      pos_to_offset[0] = pos_to_offset_large[0];
    495    } while (h);
    496 
    497    pos_to_offset[0] = pos_to_offset[1];
    498    pos_to_offset[1] = pos_to_offset[2];
    499    pos_to_offset[2] = pos_to_offset[3];
    500    pos_to_offset[3] = pos_to_offset[4];
    501    pos_to_offset_large[0] = pos_to_offset_large[1];
    502    pos_to_offset_large[1] = pos_to_offset_large[2];
    503    levels += TX_PAD_HOR;
    504  } while (--col);
    505 
    506  coeff_contexts[0] = 0;
    507 }
    508 
    509 static inline void get_16n_coeff_contexts_ver(const uint8_t *levels,
    510                                              const int width, const int height,
    511                                              const ptrdiff_t *const offsets,
    512                                              uint8_t *coeff_contexts) {
    513  const int stride = height + TX_PAD_HOR;
    514 
    515  const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
    516 
    517  uint8x16_t count;
    518  uint8x16_t level[5];
    519 
    520  assert(!(height % 16));
    521 
    522  int col = width;
    523  do {
    524    uint8x16_t pos_to_offset = vld1q_u8(c_16_po_ver);
    525 
    526    int h = height;
    527    do {
    528      load_levels_16x1x5(levels, stride, offsets, level);
    529      count = get_coeff_contexts_kernel(level);
    530      count = vaddq_u8(count, pos_to_offset);
    531      vst1q_u8(coeff_contexts, count);
    532      pos_to_offset = pos_to_offset_large;
    533      levels += 16;
    534      coeff_contexts += 16;
    535      h -= 16;
    536    } while (h);
    537 
    538    levels += TX_PAD_HOR;
    539  } while (--col);
    540 }
    541 
    542 static inline void get_16n_coeff_contexts_hor(const uint8_t *levels,
    543                                              const int width, const int height,
    544                                              const ptrdiff_t *const offsets,
    545                                              uint8_t *coeff_contexts) {
    546  const int stride = height + TX_PAD_HOR;
    547 
    548  uint8x16_t pos_to_offset[3];
    549  uint8x16_t count;
    550  uint8x16_t level[5];
    551 
    552  assert(!(height % 16));
    553 
    554  pos_to_offset[0] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 0);
    555  pos_to_offset[1] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 5);
    556  pos_to_offset[2] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
    557 
    558  int col = width;
    559  do {
    560    int h = height;
    561    do {
    562      load_levels_16x1x5(levels, stride, offsets, level);
    563      count = get_coeff_contexts_kernel(level);
    564      count = vaddq_u8(count, pos_to_offset[0]);
    565      vst1q_u8(coeff_contexts, count);
    566      levels += 16;
    567      coeff_contexts += 16;
    568      h -= 16;
    569    } while (h);
    570 
    571    pos_to_offset[0] = pos_to_offset[1];
    572    pos_to_offset[1] = pos_to_offset[2];
    573    levels += TX_PAD_HOR;
    574  } while (--col);
    575 }
    576 
    577 // Note: levels[] must be in the range [0, 127], inclusive.
    578 void av1_get_nz_map_contexts_neon(const uint8_t *const levels,
    579                                  const int16_t *const scan, const uint16_t eob,
    580                                  const TX_SIZE tx_size,
    581                                  const TX_CLASS tx_class,
    582                                  int8_t *const coeff_contexts) {
    583  const int last_idx = eob - 1;
    584  if (!last_idx) {
    585    coeff_contexts[0] = 0;
    586    return;
    587  }
    588 
    589  uint8_t *const coefficients = (uint8_t *const)coeff_contexts;
    590 
    591  const int real_width = tx_size_wide[tx_size];
    592  const int real_height = tx_size_high[tx_size];
    593  const int width = get_txb_wide(tx_size);
    594  const int height = get_txb_high(tx_size);
    595  const int stride = height + TX_PAD_HOR;
    596  ptrdiff_t offsets[3];
    597 
    598  /* coeff_contexts must be 16 byte aligned. */
    599  assert(!((intptr_t)coeff_contexts & 0xf));
    600 
    601  if (tx_class == TX_CLASS_2D) {
    602    offsets[0] = 0 * stride + 2;
    603    offsets[1] = 1 * stride + 1;
    604    offsets[2] = 2 * stride + 0;
    605 
    606    if (height == 4) {
    607      get_4_nz_map_contexts_2d(levels, width, offsets, coefficients);
    608    } else if (height == 8) {
    609      get_8_coeff_contexts_2d(levels, width, offsets, coefficients);
    610    } else {
    611      get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height,
    612                                offsets, coefficients);
    613    }
    614  } else if (tx_class == TX_CLASS_HORIZ) {
    615    offsets[0] = 2 * stride;
    616    offsets[1] = 3 * stride;
    617    offsets[2] = 4 * stride;
    618    if (height == 4) {
    619      get_4_nz_map_contexts_hor(levels, width, offsets, coefficients);
    620    } else if (height == 8) {
    621      get_8_coeff_contexts_hor(levels, width, offsets, coefficients);
    622    } else {
    623      get_16n_coeff_contexts_hor(levels, width, height, offsets, coefficients);
    624    }
    625  } else {  // TX_CLASS_VERT
    626    offsets[0] = 2;
    627    offsets[1] = 3;
    628    offsets[2] = 4;
    629    if (height == 4) {
    630      get_4_nz_map_contexts_ver(levels, width, offsets, coefficients);
    631    } else if (height == 8) {
    632      get_8_coeff_contexts_ver(levels, width, offsets, coefficients);
    633    } else {
    634      get_16n_coeff_contexts_ver(levels, width, height, offsets, coefficients);
    635    }
    636  }
    637 
    638  const int bhl = get_txb_bhl(tx_size);
    639  const int pos = scan[last_idx];
    640  if (last_idx <= (width << bhl) / 8)
    641    coeff_contexts[pos] = 1;
    642  else if (last_idx <= (width << bhl) / 4)
    643    coeff_contexts[pos] = 2;
    644  else
    645    coeff_contexts[pos] = 3;
    646 }