tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

highbd_avg_neon.c (4730B)


      1 /*
      2 * Copyright (c) 2023 The WebM project authors. All rights reserved.
      3 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
      4 *
      5 *  This source code is subject to the terms of the BSD 2 Clause License and
      6 *  the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      7 *  was not distributed with this source code in the LICENSE file, you can
      8 *  obtain it at www.aomedia.org/license/software. If the Alliance for Open
      9 *  Media Patent License 1.0 was not distributed with this source code in the
     10 *  PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     11 */
     12 
     13 #include <arm_neon.h>
     14 
     15 #include "config/aom_config.h"
     16 #include "config/aom_dsp_rtcd.h"
     17 #include "aom/aom_integer.h"
     18 #include "aom_dsp/arm/mem_neon.h"
     19 #include "aom_dsp/arm/sum_neon.h"
     20 #include "aom_ports/mem.h"
     21 
     22 uint32_t aom_highbd_avg_4x4_neon(const uint8_t *a, int a_stride) {
     23  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a);
     24  uint16x4_t sum, a0, a1, a2, a3;
     25 
     26  load_u16_4x4(a_ptr, a_stride, &a0, &a1, &a2, &a3);
     27 
     28  sum = vadd_u16(a0, a1);
     29  sum = vadd_u16(sum, a2);
     30  sum = vadd_u16(sum, a3);
     31 
     32  return (horizontal_add_u16x4(sum) + (1 << 3)) >> 4;
     33 }
     34 
     35 uint32_t aom_highbd_avg_8x8_neon(const uint8_t *a, int a_stride) {
     36  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a);
     37  uint16x8_t sum, a0, a1, a2, a3, a4, a5, a6, a7;
     38 
     39  load_u16_8x8(a_ptr, a_stride, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
     40 
     41  sum = vaddq_u16(a0, a1);
     42  sum = vaddq_u16(sum, a2);
     43  sum = vaddq_u16(sum, a3);
     44  sum = vaddq_u16(sum, a4);
     45  sum = vaddq_u16(sum, a5);
     46  sum = vaddq_u16(sum, a6);
     47  sum = vaddq_u16(sum, a7);
     48 
     49  return (horizontal_add_u16x8(sum) + (1 << 5)) >> 6;
     50 }
     51 
     52 void aom_highbd_minmax_8x8_neon(const uint8_t *s8, int p, const uint8_t *d8,
     53                                int dp, int *min, int *max) {
     54  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8);
     55  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(d8);
     56 
     57  const uint16x8_t a0 = vld1q_u16(a_ptr + 0 * p);
     58  const uint16x8_t a1 = vld1q_u16(a_ptr + 1 * p);
     59  const uint16x8_t a2 = vld1q_u16(a_ptr + 2 * p);
     60  const uint16x8_t a3 = vld1q_u16(a_ptr + 3 * p);
     61  const uint16x8_t a4 = vld1q_u16(a_ptr + 4 * p);
     62  const uint16x8_t a5 = vld1q_u16(a_ptr + 5 * p);
     63  const uint16x8_t a6 = vld1q_u16(a_ptr + 6 * p);
     64  const uint16x8_t a7 = vld1q_u16(a_ptr + 7 * p);
     65 
     66  const uint16x8_t b0 = vld1q_u16(b_ptr + 0 * dp);
     67  const uint16x8_t b1 = vld1q_u16(b_ptr + 1 * dp);
     68  const uint16x8_t b2 = vld1q_u16(b_ptr + 2 * dp);
     69  const uint16x8_t b3 = vld1q_u16(b_ptr + 3 * dp);
     70  const uint16x8_t b4 = vld1q_u16(b_ptr + 4 * dp);
     71  const uint16x8_t b5 = vld1q_u16(b_ptr + 5 * dp);
     72  const uint16x8_t b6 = vld1q_u16(b_ptr + 6 * dp);
     73  const uint16x8_t b7 = vld1q_u16(b_ptr + 7 * dp);
     74 
     75  const uint16x8_t abs_diff0 = vabdq_u16(a0, b0);
     76  const uint16x8_t abs_diff1 = vabdq_u16(a1, b1);
     77  const uint16x8_t abs_diff2 = vabdq_u16(a2, b2);
     78  const uint16x8_t abs_diff3 = vabdq_u16(a3, b3);
     79  const uint16x8_t abs_diff4 = vabdq_u16(a4, b4);
     80  const uint16x8_t abs_diff5 = vabdq_u16(a5, b5);
     81  const uint16x8_t abs_diff6 = vabdq_u16(a6, b6);
     82  const uint16x8_t abs_diff7 = vabdq_u16(a7, b7);
     83 
     84  const uint16x8_t max01 = vmaxq_u16(abs_diff0, abs_diff1);
     85  const uint16x8_t max23 = vmaxq_u16(abs_diff2, abs_diff3);
     86  const uint16x8_t max45 = vmaxq_u16(abs_diff4, abs_diff5);
     87  const uint16x8_t max67 = vmaxq_u16(abs_diff6, abs_diff7);
     88 
     89  const uint16x8_t max0123 = vmaxq_u16(max01, max23);
     90  const uint16x8_t max4567 = vmaxq_u16(max45, max67);
     91  const uint16x8_t max07 = vmaxq_u16(max0123, max4567);
     92 
     93  const uint16x8_t min01 = vminq_u16(abs_diff0, abs_diff1);
     94  const uint16x8_t min23 = vminq_u16(abs_diff2, abs_diff3);
     95  const uint16x8_t min45 = vminq_u16(abs_diff4, abs_diff5);
     96  const uint16x8_t min67 = vminq_u16(abs_diff6, abs_diff7);
     97 
     98  const uint16x8_t min0123 = vminq_u16(min01, min23);
     99  const uint16x8_t min4567 = vminq_u16(min45, min67);
    100  const uint16x8_t min07 = vminq_u16(min0123, min4567);
    101 
    102 #if AOM_ARCH_AARCH64
    103  *max = (int)vmaxvq_u16(max07);
    104  *min = (int)vminvq_u16(min07);
    105 #else
    106  // Split into 64-bit vectors and execute pairwise min/max.
    107  uint16x4_t ab_max = vmax_u16(vget_high_u16(max07), vget_low_u16(max07));
    108  uint16x4_t ab_min = vmin_u16(vget_high_u16(min07), vget_low_u16(min07));
    109 
    110  // Enough runs of vpmax/min propagate the max/min values to every position.
    111  ab_max = vpmax_u16(ab_max, ab_max);
    112  ab_min = vpmin_u16(ab_min, ab_min);
    113 
    114  ab_max = vpmax_u16(ab_max, ab_max);
    115  ab_min = vpmin_u16(ab_min, ab_min);
    116 
    117  ab_max = vpmax_u16(ab_max, ab_max);
    118  ab_min = vpmin_u16(ab_min, ab_min);
    119 
    120  *min = *max = 0;  // Clear high bits
    121  // Store directly to avoid costly neon->gpr transfer.
    122  vst1_lane_u16((uint16_t *)max, ab_max, 0);
    123  vst1_lane_u16((uint16_t *)min, ab_min, 0);
    124 #endif
    125 }