tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

highbd_avg_pred_neon.c (3869B)


      1 /*
      2 * Copyright (c) 2023 The WebM project authors. All rights reserved.
      3 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
      4 *
      5 * This source code is subject to the terms of the BSD 2 Clause License and
      6 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      7 * was not distributed with this source code in the LICENSE file, you can
      8 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      9 * Media Patent License 1.0 was not distributed with this source code in the
     10 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     11 */
     12 
     13 #include <arm_neon.h>
     14 #include <assert.h>
     15 
     16 #include "config/aom_config.h"
     17 #include "config/aom_dsp_rtcd.h"
     18 
     19 #include "aom_dsp/arm/blend_neon.h"
     20 #include "aom_dsp/arm/dist_wtd_avg_neon.h"
     21 #include "aom_dsp/arm/mem_neon.h"
     22 #include "aom_dsp/blend.h"
     23 
     24 void aom_highbd_comp_avg_pred_neon(uint8_t *comp_pred8, const uint8_t *pred8,
     25                                   int width, int height, const uint8_t *ref8,
     26                                   int ref_stride) {
     27  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
     28  const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
     29  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
     30 
     31  int i = height;
     32  if (width > 8) {
     33    do {
     34      int j = 0;
     35      do {
     36        const uint16x8_t p = vld1q_u16(pred + j);
     37        const uint16x8_t r = vld1q_u16(ref + j);
     38 
     39        uint16x8_t avg = vrhaddq_u16(p, r);
     40        vst1q_u16(comp_pred + j, avg);
     41 
     42        j += 8;
     43      } while (j < width);
     44 
     45      comp_pred += width;
     46      pred += width;
     47      ref += ref_stride;
     48    } while (--i != 0);
     49  } else if (width == 8) {
     50    do {
     51      const uint16x8_t p = vld1q_u16(pred);
     52      const uint16x8_t r = vld1q_u16(ref);
     53 
     54      uint16x8_t avg = vrhaddq_u16(p, r);
     55      vst1q_u16(comp_pred, avg);
     56 
     57      comp_pred += width;
     58      pred += width;
     59      ref += ref_stride;
     60    } while (--i != 0);
     61  } else {
     62    assert(width == 4);
     63    do {
     64      const uint16x4_t p = vld1_u16(pred);
     65      const uint16x4_t r = vld1_u16(ref);
     66 
     67      uint16x4_t avg = vrhadd_u16(p, r);
     68      vst1_u16(comp_pred, avg);
     69 
     70      comp_pred += width;
     71      pred += width;
     72      ref += ref_stride;
     73    } while (--i != 0);
     74  }
     75 }
     76 
     77 void aom_highbd_comp_mask_pred_neon(uint8_t *comp_pred8, const uint8_t *pred8,
     78                                    int width, int height, const uint8_t *ref8,
     79                                    int ref_stride, const uint8_t *mask,
     80                                    int mask_stride, int invert_mask) {
     81  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
     82  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
     83  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
     84 
     85  const uint16_t *src0 = invert_mask ? pred : ref;
     86  const uint16_t *src1 = invert_mask ? ref : pred;
     87  const int src_stride0 = invert_mask ? width : ref_stride;
     88  const int src_stride1 = invert_mask ? ref_stride : width;
     89 
     90  if (width >= 8) {
     91    do {
     92      int j = 0;
     93 
     94      do {
     95        const uint16x8_t s0 = vld1q_u16(src0 + j);
     96        const uint16x8_t s1 = vld1q_u16(src1 + j);
     97        const uint16x8_t m0 = vmovl_u8(vld1_u8(mask + j));
     98 
     99        uint16x8_t blend_u16 = alpha_blend_a64_u16x8(m0, s0, s1);
    100 
    101        vst1q_u16(comp_pred + j, blend_u16);
    102 
    103        j += 8;
    104      } while (j < width);
    105 
    106      src0 += src_stride0;
    107      src1 += src_stride1;
    108      mask += mask_stride;
    109      comp_pred += width;
    110    } while (--height != 0);
    111  } else {
    112    assert(width == 4);
    113 
    114    do {
    115      const uint16x4_t s0 = vld1_u16(src0);
    116      const uint16x4_t s1 = vld1_u16(src1);
    117      const uint16x4_t m0 = vget_low_u16(vmovl_u8(load_unaligned_u8_4x1(mask)));
    118 
    119      uint16x4_t blend_u16 = alpha_blend_a64_u16x4(m0, s0, s1);
    120 
    121      vst1_u16(comp_pred, blend_u16);
    122 
    123      src0 += src_stride0;
    124      src1 += src_stride1;
    125      mask += mask_stride;
    126      comp_pred += 4;
    127    } while (--height != 0);
    128  }
    129 }