tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

wedge_utils_neon.c (4813B)


      1 /*
      2 * Copyright (c) 2022, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <arm_neon.h>
     13 #include <assert.h>
     14 
     15 #include "aom_dsp/arm/sum_neon.h"
     16 #include "av1/common/reconinter.h"
     17 
     18 #define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
     19 
     20 /**
     21 * See av1_wedge_sse_from_residuals_c for details of the parameters and
     22 * computation.
     23 */
     24 uint64_t av1_wedge_sse_from_residuals_neon(const int16_t *r1, const int16_t *d,
     25                                           const uint8_t *m, int N) {
     26  assert(N % 64 == 0);
     27 
     28  uint64x2_t v_csse[2] = { vdupq_n_u64(0), vdupq_n_u64(0) };
     29 
     30  int i = 0;
     31  do {
     32    int32x4_t sum[4];
     33    int32x4_t sse[2];
     34    int16x4_t sum_s16[4];
     35 
     36    const int16x8_t r1_l = vld1q_s16(r1 + i);
     37    const int16x8_t r1_h = vld1q_s16(r1 + i + 8);
     38    const int16x8_t d_l = vld1q_s16(d + i);
     39    const int16x8_t d_h = vld1q_s16(d + i + 8);
     40    // The following three lines are a bit inelegant compared to using a pair
     41    // of vmovl_u8()... but it forces the compiler to emit a ZIP1, ZIP2 pair -
     42    // which can be executed in parallel with the subsequent SSHL instructions.
     43    // (SSHL can only be executed on half of the Neon pipes in modern Arm
     44    // cores, whereas ZIP1/2 can be executed on all of them.)
     45    const uint8x16x2_t m_u16 = vzipq_u8(vld1q_u8(m + i), vdupq_n_u8(0));
     46    const int16x8_t m_l = vreinterpretq_s16_u8(m_u16.val[0]);
     47    const int16x8_t m_h = vreinterpretq_s16_u8(m_u16.val[1]);
     48 
     49    sum[0] = vshll_n_s16(vget_low_s16(r1_l), WEDGE_WEIGHT_BITS);
     50    sum[1] = vshll_n_s16(vget_high_s16(r1_l), WEDGE_WEIGHT_BITS);
     51    sum[2] = vshll_n_s16(vget_low_s16(r1_h), WEDGE_WEIGHT_BITS);
     52    sum[3] = vshll_n_s16(vget_high_s16(r1_h), WEDGE_WEIGHT_BITS);
     53 
     54    sum[0] = vmlal_s16(sum[0], vget_low_s16(m_l), vget_low_s16(d_l));
     55    sum[1] = vmlal_s16(sum[1], vget_high_s16(m_l), vget_high_s16(d_l));
     56    sum[2] = vmlal_s16(sum[2], vget_low_s16(m_h), vget_low_s16(d_h));
     57    sum[3] = vmlal_s16(sum[3], vget_high_s16(m_h), vget_high_s16(d_h));
     58 
     59    sum_s16[0] = vqmovn_s32(sum[0]);
     60    sum_s16[1] = vqmovn_s32(sum[1]);
     61    sum_s16[2] = vqmovn_s32(sum[2]);
     62    sum_s16[3] = vqmovn_s32(sum[3]);
     63 
     64    sse[0] = vmull_s16(sum_s16[0], sum_s16[0]);
     65    sse[1] = vmull_s16(sum_s16[2], sum_s16[2]);
     66    sse[0] = vmlal_s16(sse[0], sum_s16[1], sum_s16[1]);
     67    sse[1] = vmlal_s16(sse[1], sum_s16[3], sum_s16[3]);
     68 
     69    v_csse[0] = vpadalq_u32(v_csse[0], vreinterpretq_u32_s32(sse[0]));
     70    v_csse[1] = vpadalq_u32(v_csse[1], vreinterpretq_u32_s32(sse[1]));
     71 
     72    i += 16;
     73  } while (i < N);
     74 
     75  uint64_t csse = horizontal_add_u64x2(vaddq_u64(v_csse[0], v_csse[1]));
     76  return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
     77 }
     78 
     79 int8_t av1_wedge_sign_from_residuals_neon(const int16_t *ds, const uint8_t *m,
     80                                          int N, int64_t limit) {
     81  int32x4_t acc[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
     82                       vdupq_n_s32(0) };
     83 
     84  do {
     85    int16x8_t ds_l = vld1q_s16(ds);
     86    int16x8_t ds_h = vld1q_s16(ds + 8);
     87 
     88    int8x16_t m_s8 = vreinterpretq_s8_u8(vld1q_u8(m));
     89    int16x8_t m_l = vmovl_s8(vget_low_s8(m_s8));
     90    int16x8_t m_h = vmovl_s8(vget_high_s8(m_s8));
     91 
     92    acc[0] = vmlal_s16(acc[0], vget_low_s16(ds_l), vget_low_s16(m_l));
     93    acc[1] = vmlal_s16(acc[1], vget_high_s16(ds_l), vget_high_s16(m_l));
     94    acc[2] = vmlal_s16(acc[2], vget_low_s16(ds_h), vget_low_s16(m_h));
     95    acc[3] = vmlal_s16(acc[3], vget_high_s16(ds_h), vget_high_s16(m_h));
     96 
     97    ds += 16;
     98    m += 16;
     99    N -= 16;
    100  } while (N != 0);
    101 
    102  int64x2_t sum = vpaddlq_s32(acc[0]);
    103  sum = vpadalq_s32(sum, acc[1]);
    104  sum = vpadalq_s32(sum, acc[2]);
    105  sum = vpadalq_s32(sum, acc[3]);
    106 
    107  return (horizontal_add_s64x2(sum) > limit);
    108 }
    109 
    110 void av1_wedge_compute_delta_squares_neon(int16_t *d_ptr, const int16_t *a_ptr,
    111                                          const int16_t *b_ptr, int N) {
    112  do {
    113    int16x8_t a = vld1q_s16(a_ptr);
    114    int16x8_t b = vld1q_s16(b_ptr);
    115 
    116    int32x4_t sq_lo = vmull_s16(vget_low_s16(a), vget_low_s16(a));
    117    int32x4_t sq_hi = vmull_s16(vget_high_s16(a), vget_high_s16(a));
    118 
    119    sq_lo = vmlsl_s16(sq_lo, vget_low_s16(b), vget_low_s16(b));
    120    sq_hi = vmlsl_s16(sq_hi, vget_high_s16(b), vget_high_s16(b));
    121 
    122    int16x8_t res = vcombine_s16(vqmovn_s32(sq_lo), vqmovn_s32(sq_hi));
    123 
    124    vst1q_s16(d_ptr, res);
    125 
    126    d_ptr += 8;
    127    a_ptr += 8;
    128    b_ptr += 8;
    129    N -= 8;
    130  } while (N != 0);
    131 }