tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

sad_neon_dotprod.c (9928B)


      1 /*
      2 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <arm_neon.h>
     13 
     14 #include "config/aom_config.h"
     15 #include "config/aom_dsp_rtcd.h"
     16 
     17 #include "aom/aom_integer.h"
     18 #include "aom_dsp/arm/mem_neon.h"
     19 #include "aom_dsp/arm/sum_neon.h"
     20 
     21 static inline unsigned int sadwxh_neon_dotprod(const uint8_t *src_ptr,
     22                                               int src_stride,
     23                                               const uint8_t *ref_ptr,
     24                                               int ref_stride, int w, int h) {
     25  // Only two accumulators are required for optimal instruction throughput of
     26  // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
     27  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
     28 
     29  int i = h;
     30  do {
     31    int j = 0;
     32    do {
     33      uint8x16_t s0, s1, r0, r1, diff0, diff1;
     34 
     35      s0 = vld1q_u8(src_ptr + j);
     36      r0 = vld1q_u8(ref_ptr + j);
     37      diff0 = vabdq_u8(s0, r0);
     38      sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
     39 
     40      s1 = vld1q_u8(src_ptr + j + 16);
     41      r1 = vld1q_u8(ref_ptr + j + 16);
     42      diff1 = vabdq_u8(s1, r1);
     43      sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
     44 
     45      j += 32;
     46    } while (j < w);
     47 
     48    src_ptr += src_stride;
     49    ref_ptr += ref_stride;
     50  } while (--i != 0);
     51 
     52  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
     53 }
     54 
     55 static inline unsigned int sad128xh_neon_dotprod(const uint8_t *src_ptr,
     56                                                 int src_stride,
     57                                                 const uint8_t *ref_ptr,
     58                                                 int ref_stride, int h) {
     59  return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 128, h);
     60 }
     61 
     62 static inline unsigned int sad64xh_neon_dotprod(const uint8_t *src_ptr,
     63                                                int src_stride,
     64                                                const uint8_t *ref_ptr,
     65                                                int ref_stride, int h) {
     66  return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64, h);
     67 }
     68 
     69 static inline unsigned int sad32xh_neon_dotprod(const uint8_t *src_ptr,
     70                                                int src_stride,
     71                                                const uint8_t *ref_ptr,
     72                                                int ref_stride, int h) {
     73  return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32, h);
     74 }
     75 
     76 static inline unsigned int sad16xh_neon_dotprod(const uint8_t *src_ptr,
     77                                                int src_stride,
     78                                                const uint8_t *ref_ptr,
     79                                                int ref_stride, int h) {
     80  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
     81 
     82  int i = h / 2;
     83  do {
     84    uint8x16_t s0, s1, r0, r1, diff0, diff1;
     85 
     86    s0 = vld1q_u8(src_ptr);
     87    r0 = vld1q_u8(ref_ptr);
     88    diff0 = vabdq_u8(s0, r0);
     89    sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
     90 
     91    src_ptr += src_stride;
     92    ref_ptr += ref_stride;
     93 
     94    s1 = vld1q_u8(src_ptr);
     95    r1 = vld1q_u8(ref_ptr);
     96    diff1 = vabdq_u8(s1, r1);
     97    sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
     98 
     99    src_ptr += src_stride;
    100    ref_ptr += ref_stride;
    101  } while (--i != 0);
    102 
    103  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
    104 }
    105 
/* Expands to the public RTCD entry point aom_sad<w>x<h>_neon_dotprod(),
 * which computes the SAD of a w x h block by delegating to the
 * corresponding fixed-width helper sad<w>xh_neon_dotprod() above. */
#define SAD_WXH_NEON_DOTPROD(w, h)                                         \
 unsigned int aom_sad##w##x##h##_neon_dotprod(                            \
     const uint8_t *src, int src_stride, const uint8_t *ref,              \
     int ref_stride) {                                                    \
   return sad##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, (h)); \
 }

SAD_WXH_NEON_DOTPROD(16, 8)
SAD_WXH_NEON_DOTPROD(16, 16)
SAD_WXH_NEON_DOTPROD(16, 32)

SAD_WXH_NEON_DOTPROD(32, 16)
SAD_WXH_NEON_DOTPROD(32, 32)
SAD_WXH_NEON_DOTPROD(32, 64)

SAD_WXH_NEON_DOTPROD(64, 32)
SAD_WXH_NEON_DOTPROD(64, 64)
SAD_WXH_NEON_DOTPROD(64, 128)

SAD_WXH_NEON_DOTPROD(128, 64)
SAD_WXH_NEON_DOTPROD(128, 128)

/* Block sizes below are only used outside the realtime-only build. */
#if !CONFIG_REALTIME_ONLY
SAD_WXH_NEON_DOTPROD(16, 4)
SAD_WXH_NEON_DOTPROD(16, 64)
SAD_WXH_NEON_DOTPROD(32, 8)
SAD_WXH_NEON_DOTPROD(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_WXH_NEON_DOTPROD
    136 
/* Expands to aom_sad_skip_<w>x<h>_neon_dotprod(): a downsampled SAD that
 * visits every other row (strides doubled, height halved) and doubles the
 * partial sum to approximate the full-block SAD at half the cost. */
#define SAD_SKIP_WXH_NEON_DOTPROD(w, h)                          \
 unsigned int aom_sad_skip_##w##x##h##_neon_dotprod(            \
     const uint8_t *src, int src_stride, const uint8_t *ref,    \
     int ref_stride) {                                          \
   return 2 * sad##w##xh_neon_dotprod(src, 2 * src_stride, ref, \
                                      2 * ref_stride, (h) / 2); \
 }

SAD_SKIP_WXH_NEON_DOTPROD(16, 16)
SAD_SKIP_WXH_NEON_DOTPROD(16, 32)

SAD_SKIP_WXH_NEON_DOTPROD(32, 16)
SAD_SKIP_WXH_NEON_DOTPROD(32, 32)
SAD_SKIP_WXH_NEON_DOTPROD(32, 64)

SAD_SKIP_WXH_NEON_DOTPROD(64, 32)
SAD_SKIP_WXH_NEON_DOTPROD(64, 64)
SAD_SKIP_WXH_NEON_DOTPROD(64, 128)

SAD_SKIP_WXH_NEON_DOTPROD(128, 64)
SAD_SKIP_WXH_NEON_DOTPROD(128, 128)

/* Block sizes below are only used outside the realtime-only build. */
#if !CONFIG_REALTIME_ONLY
SAD_SKIP_WXH_NEON_DOTPROD(16, 64)
SAD_SKIP_WXH_NEON_DOTPROD(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_SKIP_WXH_NEON_DOTPROD
    165 
    166 static inline unsigned int sadwxh_avg_neon_dotprod(const uint8_t *src_ptr,
    167                                                   int src_stride,
    168                                                   const uint8_t *ref_ptr,
    169                                                   int ref_stride, int w, int h,
    170                                                   const uint8_t *second_pred) {
    171  // Only two accumulators are required for optimal instruction throughput of
    172  // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
    173  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
    174 
    175  int i = h;
    176  do {
    177    int j = 0;
    178    do {
    179      uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
    180 
    181      s0 = vld1q_u8(src_ptr + j);
    182      r0 = vld1q_u8(ref_ptr + j);
    183      p0 = vld1q_u8(second_pred);
    184      avg0 = vrhaddq_u8(r0, p0);
    185      diff0 = vabdq_u8(s0, avg0);
    186      sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
    187 
    188      s1 = vld1q_u8(src_ptr + j + 16);
    189      r1 = vld1q_u8(ref_ptr + j + 16);
    190      p1 = vld1q_u8(second_pred + 16);
    191      avg1 = vrhaddq_u8(r1, p1);
    192      diff1 = vabdq_u8(s1, avg1);
    193      sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
    194 
    195      j += 32;
    196      second_pred += 32;
    197    } while (j < w);
    198 
    199    src_ptr += src_stride;
    200    ref_ptr += ref_stride;
    201  } while (--i != 0);
    202 
    203  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
    204 }
    205 
    206 static inline unsigned int sad128xh_avg_neon_dotprod(
    207    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
    208    int ref_stride, int h, const uint8_t *second_pred) {
    209  return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 128,
    210                                 h, second_pred);
    211 }
    212 
    213 static inline unsigned int sad64xh_avg_neon_dotprod(
    214    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
    215    int ref_stride, int h, const uint8_t *second_pred) {
    216  return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64,
    217                                 h, second_pred);
    218 }
    219 
    220 static inline unsigned int sad32xh_avg_neon_dotprod(
    221    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
    222    int ref_stride, int h, const uint8_t *second_pred) {
    223  return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32,
    224                                 h, second_pred);
    225 }
    226 
    227 static inline unsigned int sad16xh_avg_neon_dotprod(
    228    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
    229    int ref_stride, int h, const uint8_t *second_pred) {
    230  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
    231 
    232  int i = h / 2;
    233  do {
    234    uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
    235 
    236    s0 = vld1q_u8(src_ptr);
    237    r0 = vld1q_u8(ref_ptr);
    238    p0 = vld1q_u8(second_pred);
    239    avg0 = vrhaddq_u8(r0, p0);
    240    diff0 = vabdq_u8(s0, avg0);
    241    sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
    242 
    243    src_ptr += src_stride;
    244    ref_ptr += ref_stride;
    245    second_pred += 16;
    246 
    247    s1 = vld1q_u8(src_ptr);
    248    r1 = vld1q_u8(ref_ptr);
    249    p1 = vld1q_u8(second_pred);
    250    avg1 = vrhaddq_u8(r1, p1);
    251    diff1 = vabdq_u8(s1, avg1);
    252    sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
    253 
    254    src_ptr += src_stride;
    255    ref_ptr += ref_stride;
    256    second_pred += 16;
    257  } while (--i != 0);
    258 
    259  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
    260 }
    261 
/* Expands to the public RTCD entry point aom_sad<w>x<h>_avg_neon_dotprod():
 * SAD between src and the rounding average of ref and second_pred, by
 * delegating to the corresponding sad<w>xh_avg_neon_dotprod() helper. */
#define SAD_WXH_AVG_NEON_DOTPROD(w, h)                                        \
 unsigned int aom_sad##w##x##h##_avg_neon_dotprod(                           \
     const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
     const uint8_t *second_pred) {                                           \
   return sad##w##xh_avg_neon_dotprod(src, src_stride, ref, ref_stride, (h), \
                                      second_pred);                          \
 }

SAD_WXH_AVG_NEON_DOTPROD(16, 8)
SAD_WXH_AVG_NEON_DOTPROD(16, 16)
SAD_WXH_AVG_NEON_DOTPROD(16, 32)

SAD_WXH_AVG_NEON_DOTPROD(32, 16)
SAD_WXH_AVG_NEON_DOTPROD(32, 32)
SAD_WXH_AVG_NEON_DOTPROD(32, 64)

SAD_WXH_AVG_NEON_DOTPROD(64, 32)
SAD_WXH_AVG_NEON_DOTPROD(64, 64)
SAD_WXH_AVG_NEON_DOTPROD(64, 128)

SAD_WXH_AVG_NEON_DOTPROD(128, 64)
SAD_WXH_AVG_NEON_DOTPROD(128, 128)

/* Block sizes below are only used outside the realtime-only build. */
#if !CONFIG_REALTIME_ONLY
SAD_WXH_AVG_NEON_DOTPROD(16, 64)
SAD_WXH_AVG_NEON_DOTPROD(32, 8)
SAD_WXH_AVG_NEON_DOTPROD(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_WXH_AVG_NEON_DOTPROD