tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

dist_wtd_avg_neon.h (2609B)


      1 /*
      2 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #ifndef AOM_AOM_DSP_ARM_DIST_WTD_AVG_NEON_H_
     13 #define AOM_AOM_DSP_ARM_DIST_WTD_AVG_NEON_H_
     14 
     15 #include <arm_neon.h>
     16 
     17 #include "aom_dsp/aom_dsp_common.h"
     18 #include "av1/common/enums.h"
     19 
     20 static inline uint8x8_t dist_wtd_avg_u8x8(uint8x8_t a, uint8x8_t b,
     21                                          uint8x8_t wta, uint8x8_t wtb) {
     22  uint16x8_t wtd_sum = vmull_u8(a, wta);
     23 
     24  wtd_sum = vmlal_u8(wtd_sum, b, wtb);
     25 
     26  return vrshrn_n_u16(wtd_sum, DIST_PRECISION_BITS);
     27 }
     28 
     29 static inline uint16x4_t dist_wtd_avg_u16x4(uint16x4_t a, uint16x4_t b,
     30                                            uint16x4_t wta, uint16x4_t wtb) {
     31  uint32x4_t wtd_sum = vmull_u16(a, wta);
     32 
     33  wtd_sum = vmlal_u16(wtd_sum, b, wtb);
     34 
     35  return vrshrn_n_u32(wtd_sum, DIST_PRECISION_BITS);
     36 }
     37 
     38 static inline uint8x16_t dist_wtd_avg_u8x16(uint8x16_t a, uint8x16_t b,
     39                                            uint8x16_t wta, uint8x16_t wtb) {
     40  uint16x8_t wtd_sum_lo = vmull_u8(vget_low_u8(a), vget_low_u8(wta));
     41  uint16x8_t wtd_sum_hi = vmull_u8(vget_high_u8(a), vget_high_u8(wta));
     42 
     43  wtd_sum_lo = vmlal_u8(wtd_sum_lo, vget_low_u8(b), vget_low_u8(wtb));
     44  wtd_sum_hi = vmlal_u8(wtd_sum_hi, vget_high_u8(b), vget_high_u8(wtb));
     45 
     46  uint8x8_t wtd_avg_lo = vrshrn_n_u16(wtd_sum_lo, DIST_PRECISION_BITS);
     47  uint8x8_t wtd_avg_hi = vrshrn_n_u16(wtd_sum_hi, DIST_PRECISION_BITS);
     48 
     49  return vcombine_u8(wtd_avg_lo, wtd_avg_hi);
     50 }
     51 
     52 static inline uint16x8_t dist_wtd_avg_u16x8(uint16x8_t a, uint16x8_t b,
     53                                            uint16x8_t wta, uint16x8_t wtb) {
     54  uint32x4_t wtd_sum_lo = vmull_u16(vget_low_u16(a), vget_low_u16(wta));
     55  uint32x4_t wtd_sum_hi = vmull_u16(vget_high_u16(a), vget_high_u16(wta));
     56 
     57  wtd_sum_lo = vmlal_u16(wtd_sum_lo, vget_low_u16(b), vget_low_u16(wtb));
     58  wtd_sum_hi = vmlal_u16(wtd_sum_hi, vget_high_u16(b), vget_high_u16(wtb));
     59 
     60  uint16x4_t wtd_avg_lo = vrshrn_n_u32(wtd_sum_lo, DIST_PRECISION_BITS);
     61  uint16x4_t wtd_avg_hi = vrshrn_n_u32(wtd_sum_hi, DIST_PRECISION_BITS);
     62 
     63  return vcombine_u16(wtd_avg_lo, wtd_avg_hi);
     64 }
     65 
     66 #endif  // AOM_AOM_DSP_ARM_DIST_WTD_AVG_NEON_H_