tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

highbd_sad_neon.c (16686B)


      1 /*
      2 * Copyright (c) 2023 The WebM project authors. All rights reserved.
      3 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
      4 *
      5 * This source code is subject to the terms of the BSD 2 Clause License and
      6 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      7 * was not distributed with this source code in the LICENSE file, you can
      8 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      9 * Media Patent License 1.0 was not distributed with this source code in the
     10 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     11 */
     12 
     13 #include <arm_neon.h>
     14 
     15 #include "config/aom_config.h"
     16 #include "config/aom_dsp_rtcd.h"
     17 
     18 #include "aom/aom_integer.h"
     19 #include "aom_dsp/arm/mem_neon.h"
     20 #include "aom_dsp/arm/sum_neon.h"
     21 
     22 static inline uint32_t highbd_sad4xh_small_neon(const uint8_t *src_ptr,
     23                                                int src_stride,
     24                                                const uint8_t *ref_ptr,
     25                                                int ref_stride, int h) {
     26  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
     27  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
     28  uint32x4_t sum = vdupq_n_u32(0);
     29 
     30  int i = h;
     31  do {
     32    uint16x4_t s = vld1_u16(src16_ptr);
     33    uint16x4_t r = vld1_u16(ref16_ptr);
     34    sum = vabal_u16(sum, s, r);
     35 
     36    src16_ptr += src_stride;
     37    ref16_ptr += ref_stride;
     38  } while (--i != 0);
     39 
     40  return horizontal_add_u32x4(sum);
     41 }
     42 
     43 static inline uint32_t highbd_sad8xh_small_neon(const uint8_t *src_ptr,
     44                                                int src_stride,
     45                                                const uint8_t *ref_ptr,
     46                                                int ref_stride, int h) {
     47  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
     48  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
     49  uint16x8_t sum = vdupq_n_u16(0);
     50 
     51  int i = h;
     52  do {
     53    uint16x8_t s = vld1q_u16(src16_ptr);
     54    uint16x8_t r = vld1q_u16(ref16_ptr);
     55    sum = vabaq_u16(sum, s, r);
     56 
     57    src16_ptr += src_stride;
     58    ref16_ptr += ref_stride;
     59  } while (--i != 0);
     60 
     61  return horizontal_add_u16x8(sum);
     62 }
     63 
     64 #if !CONFIG_REALTIME_ONLY
     65 static inline uint32_t highbd_sad8xh_large_neon(const uint8_t *src_ptr,
     66                                                int src_stride,
     67                                                const uint8_t *ref_ptr,
     68                                                int ref_stride, int h) {
     69  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
     70  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
     71  uint32x4_t sum_u32 = vdupq_n_u32(0);
     72 
     73  int i = h;
     74  do {
     75    uint16x8_t s = vld1q_u16(src16_ptr);
     76    uint16x8_t r = vld1q_u16(ref16_ptr);
     77    uint16x8_t sum_u16 = vabdq_u16(s, r);
     78    sum_u32 = vpadalq_u16(sum_u32, sum_u16);
     79 
     80    src16_ptr += src_stride;
     81    ref16_ptr += ref_stride;
     82  } while (--i != 0);
     83 
     84  return horizontal_add_u32x4(sum_u32);
     85 }
     86 #endif  // !CONFIG_REALTIME_ONLY
     87 
     88 static inline uint32_t highbd_sad16xh_large_neon(const uint8_t *src_ptr,
     89                                                 int src_stride,
     90                                                 const uint8_t *ref_ptr,
     91                                                 int ref_stride, int h) {
     92  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
     93  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
     94  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
     95 
     96  int i = h;
     97  do {
     98    uint16x8_t s0 = vld1q_u16(src16_ptr);
     99    uint16x8_t r0 = vld1q_u16(ref16_ptr);
    100    uint16x8_t diff0 = vabdq_u16(s0, r0);
    101    sum[0] = vpadalq_u16(sum[0], diff0);
    102 
    103    uint16x8_t s1 = vld1q_u16(src16_ptr + 8);
    104    uint16x8_t r1 = vld1q_u16(ref16_ptr + 8);
    105    uint16x8_t diff1 = vabdq_u16(s1, r1);
    106    sum[1] = vpadalq_u16(sum[1], diff1);
    107 
    108    src16_ptr += src_stride;
    109    ref16_ptr += ref_stride;
    110  } while (--i != 0);
    111 
    112  sum[0] = vaddq_u32(sum[0], sum[1]);
    113  return horizontal_add_u32x4(sum[0]);
    114 }
    115 
    116 static inline uint32_t highbd_sadwxh_large_neon(const uint8_t *src_ptr,
    117                                                int src_stride,
    118                                                const uint8_t *ref_ptr,
    119                                                int ref_stride, int w, int h) {
    120  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
    121  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
    122  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
    123                        vdupq_n_u32(0) };
    124 
    125  int i = h;
    126  do {
    127    int j = 0;
    128    do {
    129      uint16x8_t s0 = vld1q_u16(src16_ptr + j);
    130      uint16x8_t r0 = vld1q_u16(ref16_ptr + j);
    131      uint16x8_t diff0 = vabdq_u16(s0, r0);
    132      sum[0] = vpadalq_u16(sum[0], diff0);
    133 
    134      uint16x8_t s1 = vld1q_u16(src16_ptr + j + 8);
    135      uint16x8_t r1 = vld1q_u16(ref16_ptr + j + 8);
    136      uint16x8_t diff1 = vabdq_u16(s1, r1);
    137      sum[1] = vpadalq_u16(sum[1], diff1);
    138 
    139      uint16x8_t s2 = vld1q_u16(src16_ptr + j + 16);
    140      uint16x8_t r2 = vld1q_u16(ref16_ptr + j + 16);
    141      uint16x8_t diff2 = vabdq_u16(s2, r2);
    142      sum[2] = vpadalq_u16(sum[2], diff2);
    143 
    144      uint16x8_t s3 = vld1q_u16(src16_ptr + j + 24);
    145      uint16x8_t r3 = vld1q_u16(ref16_ptr + j + 24);
    146      uint16x8_t diff3 = vabdq_u16(s3, r3);
    147      sum[3] = vpadalq_u16(sum[3], diff3);
    148 
    149      j += 32;
    150    } while (j < w);
    151 
    152    src16_ptr += src_stride;
    153    ref16_ptr += ref_stride;
    154  } while (--i != 0);
    155 
    156  sum[0] = vaddq_u32(sum[0], sum[1]);
    157  sum[2] = vaddq_u32(sum[2], sum[3]);
    158  sum[0] = vaddq_u32(sum[0], sum[2]);
    159 
    160  return horizontal_add_u32x4(sum[0]);
    161 }
    162 
    163 static inline unsigned int highbd_sad128xh_large_neon(const uint8_t *src_ptr,
    164                                                      int src_stride,
    165                                                      const uint8_t *ref_ptr,
    166                                                      int ref_stride, int h) {
    167  return highbd_sadwxh_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128,
    168                                  h);
    169 }
    170 
    171 static inline unsigned int highbd_sad64xh_large_neon(const uint8_t *src_ptr,
    172                                                     int src_stride,
    173                                                     const uint8_t *ref_ptr,
    174                                                     int ref_stride, int h) {
    175  return highbd_sadwxh_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64,
    176                                  h);
    177 }
    178 
    179 static inline unsigned int highbd_sad32xh_large_neon(const uint8_t *src_ptr,
    180                                                     int src_stride,
    181                                                     const uint8_t *ref_ptr,
    182                                                     int ref_stride, int h) {
    183  return highbd_sadwxh_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32,
    184                                  h);
    185 }
    186 
    187 #define HBD_SAD_WXH_SMALL_NEON(w, h)                                      \
    188  unsigned int aom_highbd_sad##w##x##h##_neon(                            \
    189      const uint8_t *src, int src_stride, const uint8_t *ref,             \
    190      int ref_stride) {                                                   \
    191    return highbd_sad##w##xh_small_neon(src, src_stride, ref, ref_stride, \
    192                                        (h));                             \
    193  }
    194 
    195 #define HBD_SAD_WXH_LARGE_NEON(w, h)                                      \
    196  unsigned int aom_highbd_sad##w##x##h##_neon(                            \
    197      const uint8_t *src, int src_stride, const uint8_t *ref,             \
    198      int ref_stride) {                                                   \
    199    return highbd_sad##w##xh_large_neon(src, src_stride, ref, ref_stride, \
    200                                        (h));                             \
    201  }
    202 
    203 HBD_SAD_WXH_SMALL_NEON(4, 4)
    204 HBD_SAD_WXH_SMALL_NEON(4, 8)
    205 
    206 HBD_SAD_WXH_SMALL_NEON(8, 4)
    207 HBD_SAD_WXH_SMALL_NEON(8, 8)
    208 HBD_SAD_WXH_SMALL_NEON(8, 16)
    209 
    210 HBD_SAD_WXH_LARGE_NEON(16, 8)
    211 HBD_SAD_WXH_LARGE_NEON(16, 16)
    212 HBD_SAD_WXH_LARGE_NEON(16, 32)
    213 
    214 HBD_SAD_WXH_LARGE_NEON(32, 16)
    215 HBD_SAD_WXH_LARGE_NEON(32, 32)
    216 HBD_SAD_WXH_LARGE_NEON(32, 64)
    217 
    218 HBD_SAD_WXH_LARGE_NEON(64, 32)
    219 HBD_SAD_WXH_LARGE_NEON(64, 64)
    220 HBD_SAD_WXH_LARGE_NEON(64, 128)
    221 
    222 HBD_SAD_WXH_LARGE_NEON(128, 64)
    223 HBD_SAD_WXH_LARGE_NEON(128, 128)
    224 
    225 #if !CONFIG_REALTIME_ONLY
    226 HBD_SAD_WXH_SMALL_NEON(4, 16)
    227 
    228 HBD_SAD_WXH_LARGE_NEON(8, 32)
    229 
    230 HBD_SAD_WXH_LARGE_NEON(16, 4)
    231 HBD_SAD_WXH_LARGE_NEON(16, 64)
    232 
    233 HBD_SAD_WXH_LARGE_NEON(32, 8)
    234 
    235 HBD_SAD_WXH_LARGE_NEON(64, 16)
    236 #endif  // !CONFIG_REALTIME_ONLY
    237 
    238 #define HBD_SAD_SKIP_WXH_SMALL_NEON(w, h)                             \
    239  unsigned int aom_highbd_sad_skip_##w##x##h##_neon(                  \
    240      const uint8_t *src, int src_stride, const uint8_t *ref,         \
    241      int ref_stride) {                                               \
    242    return 2 * highbd_sad##w##xh_small_neon(src, 2 * src_stride, ref, \
    243                                            2 * ref_stride, (h) / 2); \
    244  }
    245 
    246 #define HBD_SAD_SKIP_WXH_LARGE_NEON(w, h)                             \
    247  unsigned int aom_highbd_sad_skip_##w##x##h##_neon(                  \
    248      const uint8_t *src, int src_stride, const uint8_t *ref,         \
    249      int ref_stride) {                                               \
    250    return 2 * highbd_sad##w##xh_large_neon(src, 2 * src_stride, ref, \
    251                                            2 * ref_stride, (h) / 2); \
    252  }
    253 
    254 HBD_SAD_SKIP_WXH_SMALL_NEON(8, 16)
    255 
    256 HBD_SAD_SKIP_WXH_LARGE_NEON(16, 16)
    257 HBD_SAD_SKIP_WXH_LARGE_NEON(16, 32)
    258 
    259 HBD_SAD_SKIP_WXH_LARGE_NEON(32, 16)
    260 HBD_SAD_SKIP_WXH_LARGE_NEON(32, 32)
    261 HBD_SAD_SKIP_WXH_LARGE_NEON(32, 64)
    262 
    263 HBD_SAD_SKIP_WXH_LARGE_NEON(64, 32)
    264 HBD_SAD_SKIP_WXH_LARGE_NEON(64, 64)
    265 HBD_SAD_SKIP_WXH_LARGE_NEON(64, 128)
    266 
    267 HBD_SAD_SKIP_WXH_LARGE_NEON(128, 64)
    268 HBD_SAD_SKIP_WXH_LARGE_NEON(128, 128)
    269 
    270 #if !CONFIG_REALTIME_ONLY
    271 HBD_SAD_SKIP_WXH_SMALL_NEON(4, 16)
    272 
    273 HBD_SAD_SKIP_WXH_SMALL_NEON(8, 32)
    274 
    275 HBD_SAD_SKIP_WXH_LARGE_NEON(16, 64)
    276 
    277 HBD_SAD_SKIP_WXH_LARGE_NEON(64, 16)
    278 #endif  // !CONFIG_REALTIME_ONLY
    279 
    280 static inline uint32_t highbd_sad8xh_avg_neon(const uint8_t *src_ptr,
    281                                              int src_stride,
    282                                              const uint8_t *ref_ptr,
    283                                              int ref_stride, int h,
    284                                              const uint8_t *second_pred) {
    285  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
    286  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
    287  const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
    288  uint32x4_t sum = vdupq_n_u32(0);
    289 
    290  int i = h;
    291  do {
    292    uint16x8_t s = vld1q_u16(src16_ptr);
    293    uint16x8_t r = vld1q_u16(ref16_ptr);
    294    uint16x8_t p = vld1q_u16(pred16_ptr);
    295 
    296    uint16x8_t avg = vrhaddq_u16(r, p);
    297    uint16x8_t diff = vabdq_u16(s, avg);
    298    sum = vpadalq_u16(sum, diff);
    299 
    300    src16_ptr += src_stride;
    301    ref16_ptr += ref_stride;
    302    pred16_ptr += 8;
    303  } while (--i != 0);
    304 
    305  return horizontal_add_u32x4(sum);
    306 }
    307 
    308 static inline uint32_t highbd_sad16xh_avg_neon(const uint8_t *src_ptr,
    309                                               int src_stride,
    310                                               const uint8_t *ref_ptr,
    311                                               int ref_stride, int h,
    312                                               const uint8_t *second_pred) {
    313  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
    314  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
    315  const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
    316  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
    317 
    318  int i = h;
    319  do {
    320    uint16x8_t s0, s1, r0, r1, p0, p1;
    321    uint16x8_t avg0, avg1, diff0, diff1;
    322 
    323    s0 = vld1q_u16(src16_ptr);
    324    r0 = vld1q_u16(ref16_ptr);
    325    p0 = vld1q_u16(pred16_ptr);
    326    avg0 = vrhaddq_u16(r0, p0);
    327    diff0 = vabdq_u16(s0, avg0);
    328    sum[0] = vpadalq_u16(sum[0], diff0);
    329 
    330    s1 = vld1q_u16(src16_ptr + 8);
    331    r1 = vld1q_u16(ref16_ptr + 8);
    332    p1 = vld1q_u16(pred16_ptr + 8);
    333    avg1 = vrhaddq_u16(r1, p1);
    334    diff1 = vabdq_u16(s1, avg1);
    335    sum[1] = vpadalq_u16(sum[1], diff1);
    336 
    337    src16_ptr += src_stride;
    338    ref16_ptr += ref_stride;
    339    pred16_ptr += 16;
    340  } while (--i != 0);
    341 
    342  sum[0] = vaddq_u32(sum[0], sum[1]);
    343  return horizontal_add_u32x4(sum[0]);
    344 }
    345 
    346 static inline uint32_t highbd_sadwxh_avg_neon(const uint8_t *src_ptr,
    347                                              int src_stride,
    348                                              const uint8_t *ref_ptr,
    349                                              int ref_stride, int w, int h,
    350                                              const uint8_t *second_pred) {
    351  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
    352  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
    353  const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
    354  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
    355                        vdupq_n_u32(0) };
    356 
    357  int i = h;
    358  do {
    359    int j = 0;
    360    do {
    361      uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3;
    362      uint16x8_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3;
    363 
    364      s0 = vld1q_u16(src16_ptr + j);
    365      r0 = vld1q_u16(ref16_ptr + j);
    366      p0 = vld1q_u16(pred16_ptr + j);
    367      avg0 = vrhaddq_u16(r0, p0);
    368      diff0 = vabdq_u16(s0, avg0);
    369      sum[0] = vpadalq_u16(sum[0], diff0);
    370 
    371      s1 = vld1q_u16(src16_ptr + j + 8);
    372      r1 = vld1q_u16(ref16_ptr + j + 8);
    373      p1 = vld1q_u16(pred16_ptr + j + 8);
    374      avg1 = vrhaddq_u16(r1, p1);
    375      diff1 = vabdq_u16(s1, avg1);
    376      sum[1] = vpadalq_u16(sum[1], diff1);
    377 
    378      s2 = vld1q_u16(src16_ptr + j + 16);
    379      r2 = vld1q_u16(ref16_ptr + j + 16);
    380      p2 = vld1q_u16(pred16_ptr + j + 16);
    381      avg2 = vrhaddq_u16(r2, p2);
    382      diff2 = vabdq_u16(s2, avg2);
    383      sum[2] = vpadalq_u16(sum[2], diff2);
    384 
    385      s3 = vld1q_u16(src16_ptr + j + 24);
    386      r3 = vld1q_u16(ref16_ptr + j + 24);
    387      p3 = vld1q_u16(pred16_ptr + j + 24);
    388      avg3 = vrhaddq_u16(r3, p3);
    389      diff3 = vabdq_u16(s3, avg3);
    390      sum[3] = vpadalq_u16(sum[3], diff3);
    391 
    392      j += 32;
    393    } while (j < w);
    394 
    395    src16_ptr += src_stride;
    396    ref16_ptr += ref_stride;
    397    pred16_ptr += w;
    398  } while (--i != 0);
    399 
    400  sum[0] = vaddq_u32(sum[0], sum[1]);
    401  sum[2] = vaddq_u32(sum[2], sum[3]);
    402  sum[0] = vaddq_u32(sum[0], sum[2]);
    403 
    404  return horizontal_add_u32x4(sum[0]);
    405 }
    406 
    407 static inline unsigned int highbd_sad128xh_avg_neon(
    408    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
    409    int ref_stride, int h, const uint8_t *second_pred) {
    410  return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128,
    411                                h, second_pred);
    412 }
    413 
    414 static inline unsigned int highbd_sad64xh_avg_neon(const uint8_t *src_ptr,
    415                                                   int src_stride,
    416                                                   const uint8_t *ref_ptr,
    417                                                   int ref_stride, int h,
    418                                                   const uint8_t *second_pred) {
    419  return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h,
    420                                second_pred);
    421 }
    422 
    423 static inline unsigned int highbd_sad32xh_avg_neon(const uint8_t *src_ptr,
    424                                                   int src_stride,
    425                                                   const uint8_t *ref_ptr,
    426                                                   int ref_stride, int h,
    427                                                   const uint8_t *second_pred) {
    428  return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h,
    429                                second_pred);
    430 }
    431 
    432 #define HBD_SAD_WXH_AVG_NEON(w, h)                                            \
    433  uint32_t aom_highbd_sad##w##x##h##_avg_neon(                                \
    434      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
    435      const uint8_t *second_pred) {                                           \
    436    return highbd_sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h),  \
    437                                      second_pred);                           \
    438  }
    439 
    440 HBD_SAD_WXH_AVG_NEON(8, 8)
    441 HBD_SAD_WXH_AVG_NEON(8, 16)
    442 
    443 HBD_SAD_WXH_AVG_NEON(16, 8)
    444 HBD_SAD_WXH_AVG_NEON(16, 16)
    445 HBD_SAD_WXH_AVG_NEON(16, 32)
    446 
    447 HBD_SAD_WXH_AVG_NEON(32, 16)
    448 HBD_SAD_WXH_AVG_NEON(32, 32)
    449 HBD_SAD_WXH_AVG_NEON(32, 64)
    450 
    451 HBD_SAD_WXH_AVG_NEON(64, 32)
    452 HBD_SAD_WXH_AVG_NEON(64, 64)
    453 HBD_SAD_WXH_AVG_NEON(64, 128)
    454 
    455 HBD_SAD_WXH_AVG_NEON(128, 64)
    456 HBD_SAD_WXH_AVG_NEON(128, 128)
    457 
    458 #if !CONFIG_REALTIME_ONLY
    459 HBD_SAD_WXH_AVG_NEON(8, 32)
    460 
    461 HBD_SAD_WXH_AVG_NEON(16, 64)
    462 
    463 HBD_SAD_WXH_AVG_NEON(32, 8)
    464 
    465 HBD_SAD_WXH_AVG_NEON(64, 16)
    466 #endif  // !CONFIG_REALTIME_ONLY