tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

highbd_obmc_sad_neon.c (7649B)


      1 /*
      2 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <arm_neon.h>
     13 
     14 #include "config/aom_config.h"
     15 #include "config/aom_dsp_rtcd.h"
     16 
     17 #include "aom/aom_integer.h"
     18 #include "aom_dsp/arm/mem_neon.h"
     19 #include "aom_dsp/arm/sum_neon.h"
     20 
     21 static inline void highbd_obmc_sad_8x1_s16_neon(uint16x8_t ref,
     22                                                const int32_t *mask,
     23                                                const int32_t *wsrc,
     24                                                uint32x4_t *sum) {
     25  int16x8_t ref_s16 = vreinterpretq_s16_u16(ref);
     26 
     27  int32x4_t wsrc_lo = vld1q_s32(wsrc);
     28  int32x4_t wsrc_hi = vld1q_s32(wsrc + 4);
     29 
     30  int32x4_t mask_lo = vld1q_s32(mask);
     31  int32x4_t mask_hi = vld1q_s32(mask + 4);
     32 
     33  int16x8_t mask_s16 = vcombine_s16(vmovn_s32(mask_lo), vmovn_s32(mask_hi));
     34 
     35  int32x4_t pre_lo = vmull_s16(vget_low_s16(ref_s16), vget_low_s16(mask_s16));
     36  int32x4_t pre_hi = vmull_s16(vget_high_s16(ref_s16), vget_high_s16(mask_s16));
     37 
     38  uint32x4_t abs_lo = vreinterpretq_u32_s32(vabdq_s32(wsrc_lo, pre_lo));
     39  uint32x4_t abs_hi = vreinterpretq_u32_s32(vabdq_s32(wsrc_hi, pre_hi));
     40 
     41  *sum = vrsraq_n_u32(*sum, abs_lo, 12);
     42  *sum = vrsraq_n_u32(*sum, abs_hi, 12);
     43 }
     44 
     45 static inline unsigned int highbd_obmc_sad_4xh_neon(const uint8_t *ref,
     46                                                    int ref_stride,
     47                                                    const int32_t *wsrc,
     48                                                    const int32_t *mask,
     49                                                    int height) {
     50  const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
     51  uint32x4_t sum = vdupq_n_u32(0);
     52 
     53  int h = height / 2;
     54  do {
     55    uint16x8_t r = load_unaligned_u16_4x2(ref_ptr, ref_stride);
     56 
     57    highbd_obmc_sad_8x1_s16_neon(r, mask, wsrc, &sum);
     58 
     59    ref_ptr += 2 * ref_stride;
     60    wsrc += 8;
     61    mask += 8;
     62  } while (--h != 0);
     63 
     64  return horizontal_add_u32x4(sum);
     65 }
     66 
     67 static inline unsigned int highbd_obmc_sad_8xh_neon(const uint8_t *ref,
     68                                                    int ref_stride,
     69                                                    const int32_t *wsrc,
     70                                                    const int32_t *mask,
     71                                                    int height) {
     72  const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
     73  uint32x4_t sum = vdupq_n_u32(0);
     74 
     75  do {
     76    uint16x8_t r = vld1q_u16(ref_ptr);
     77 
     78    highbd_obmc_sad_8x1_s16_neon(r, mask, wsrc, &sum);
     79 
     80    ref_ptr += ref_stride;
     81    wsrc += 8;
     82    mask += 8;
     83  } while (--height != 0);
     84 
     85  return horizontal_add_u32x4(sum);
     86 }
     87 
     88 static inline unsigned int highbd_obmc_sad_large_neon(const uint8_t *ref,
     89                                                      int ref_stride,
     90                                                      const int32_t *wsrc,
     91                                                      const int32_t *mask,
     92                                                      int width, int height) {
     93  const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
     94  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
     95 
     96  do {
     97    int i = 0;
     98    do {
     99      uint16x8_t r0 = vld1q_u16(ref_ptr + i);
    100      highbd_obmc_sad_8x1_s16_neon(r0, mask, wsrc, &sum[0]);
    101 
    102      uint16x8_t r1 = vld1q_u16(ref_ptr + i + 8);
    103      highbd_obmc_sad_8x1_s16_neon(r1, mask + 8, wsrc + 8, &sum[1]);
    104 
    105      wsrc += 16;
    106      mask += 16;
    107      i += 16;
    108    } while (i < width);
    109 
    110    ref_ptr += ref_stride;
    111  } while (--height != 0);
    112 
    113  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
    114 }
    115 
    116 static inline unsigned int highbd_obmc_sad_16xh_neon(const uint8_t *ref,
    117                                                     int ref_stride,
    118                                                     const int32_t *wsrc,
    119                                                     const int32_t *mask,
    120                                                     int h) {
    121  return highbd_obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 16, h);
    122 }
    123 
    124 static inline unsigned int highbd_obmc_sad_32xh_neon(const uint8_t *ref,
    125                                                     int ref_stride,
    126                                                     const int32_t *wsrc,
    127                                                     const int32_t *mask,
    128                                                     int height) {
    129  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
    130                        vdupq_n_u32(0) };
    131  const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
    132 
    133  do {
    134    uint16x8_t r0 = vld1q_u16(ref_ptr);
    135    uint16x8_t r1 = vld1q_u16(ref_ptr + 8);
    136    uint16x8_t r2 = vld1q_u16(ref_ptr + 16);
    137    uint16x8_t r3 = vld1q_u16(ref_ptr + 24);
    138 
    139    highbd_obmc_sad_8x1_s16_neon(r0, mask, wsrc, &sum[0]);
    140    highbd_obmc_sad_8x1_s16_neon(r1, mask + 8, wsrc + 8, &sum[1]);
    141    highbd_obmc_sad_8x1_s16_neon(r2, mask + 16, wsrc + 16, &sum[2]);
    142    highbd_obmc_sad_8x1_s16_neon(r3, mask + 24, wsrc + 24, &sum[3]);
    143 
    144    wsrc += 32;
    145    mask += 32;
    146    ref_ptr += ref_stride;
    147  } while (--height != 0);
    148 
    149  sum[0] = vaddq_u32(sum[0], sum[1]);
    150  sum[2] = vaddq_u32(sum[2], sum[3]);
    151 
    152  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[2]));
    153 }
    154 
    155 static inline unsigned int highbd_obmc_sad_64xh_neon(const uint8_t *ref,
    156                                                     int ref_stride,
    157                                                     const int32_t *wsrc,
    158                                                     const int32_t *mask,
    159                                                     int h) {
    160  return highbd_obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 64, h);
    161 }
    162 
    163 static inline unsigned int highbd_obmc_sad_128xh_neon(const uint8_t *ref,
    164                                                      int ref_stride,
    165                                                      const int32_t *wsrc,
    166                                                      const int32_t *mask,
    167                                                      int h) {
    168  return highbd_obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 128, h);
    169 }
    170 
    171 #define HIGHBD_OBMC_SAD_WXH_NEON(w, h)                                   \
    172  unsigned int aom_highbd_obmc_sad##w##x##h##_neon(                      \
    173      const uint8_t *ref, int ref_stride, const int32_t *wsrc,           \
    174      const int32_t *mask) {                                             \
    175    return highbd_obmc_sad_##w##xh_neon(ref, ref_stride, wsrc, mask, h); \
    176  }
    177 
    178 HIGHBD_OBMC_SAD_WXH_NEON(4, 4)
    179 HIGHBD_OBMC_SAD_WXH_NEON(4, 8)
    180 
    181 HIGHBD_OBMC_SAD_WXH_NEON(8, 4)
    182 HIGHBD_OBMC_SAD_WXH_NEON(8, 8)
    183 HIGHBD_OBMC_SAD_WXH_NEON(8, 16)
    184 
    185 HIGHBD_OBMC_SAD_WXH_NEON(16, 8)
    186 HIGHBD_OBMC_SAD_WXH_NEON(16, 16)
    187 HIGHBD_OBMC_SAD_WXH_NEON(16, 32)
    188 
    189 HIGHBD_OBMC_SAD_WXH_NEON(32, 16)
    190 HIGHBD_OBMC_SAD_WXH_NEON(32, 32)
    191 HIGHBD_OBMC_SAD_WXH_NEON(32, 64)
    192 
    193 HIGHBD_OBMC_SAD_WXH_NEON(64, 32)
    194 HIGHBD_OBMC_SAD_WXH_NEON(64, 64)
    195 HIGHBD_OBMC_SAD_WXH_NEON(64, 128)
    196 
    197 HIGHBD_OBMC_SAD_WXH_NEON(128, 64)
    198 HIGHBD_OBMC_SAD_WXH_NEON(128, 128)
    199 
    200 #if !CONFIG_REALTIME_ONLY
    201 HIGHBD_OBMC_SAD_WXH_NEON(4, 16)
    202 
    203 HIGHBD_OBMC_SAD_WXH_NEON(8, 32)
    204 
    205 HIGHBD_OBMC_SAD_WXH_NEON(16, 4)
    206 HIGHBD_OBMC_SAD_WXH_NEON(16, 64)
    207 
    208 HIGHBD_OBMC_SAD_WXH_NEON(32, 8)
    209 
    210 HIGHBD_OBMC_SAD_WXH_NEON(64, 16)
    211 #endif  // !CONFIG_REALTIME_ONLY