tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

subtract_neon.c (6474B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <arm_neon.h>
     13 
     14 #include "config/aom_config.h"
     15 #include "config/aom_dsp_rtcd.h"
     16 
     17 #include "aom/aom_integer.h"
     18 #include "aom_ports/mem.h"
     19 
     20 void aom_subtract_block_neon(int rows, int cols, int16_t *diff,
     21                             ptrdiff_t diff_stride, const uint8_t *src,
     22                             ptrdiff_t src_stride, const uint8_t *pred,
     23                             ptrdiff_t pred_stride) {
     24  if (cols > 16) {
     25    int r = rows;
     26    do {
     27      int c = 0;
     28      do {
     29        const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]);
     30        const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]);
     31        const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]);
     32        const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]);
     33        const uint16x8_t v_diff_lo_00 =
     34            vsubl_u8(vget_low_u8(v_src_00), vget_low_u8(v_pred_00));
     35        const uint16x8_t v_diff_hi_00 =
     36            vsubl_u8(vget_high_u8(v_src_00), vget_high_u8(v_pred_00));
     37        const uint16x8_t v_diff_lo_16 =
     38            vsubl_u8(vget_low_u8(v_src_16), vget_low_u8(v_pred_16));
     39        const uint16x8_t v_diff_hi_16 =
     40            vsubl_u8(vget_high_u8(v_src_16), vget_high_u8(v_pred_16));
     41        vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00));
     42        vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00));
     43        vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16));
     44        vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16));
     45        c += 32;
     46      } while (c < cols);
     47      diff += diff_stride;
     48      pred += pred_stride;
     49      src += src_stride;
     50    } while (--r != 0);
     51  } else if (cols > 8) {
     52    int r = rows;
     53    do {
     54      const uint8x16_t v_src = vld1q_u8(&src[0]);
     55      const uint8x16_t v_pred = vld1q_u8(&pred[0]);
     56      const uint16x8_t v_diff_lo =
     57          vsubl_u8(vget_low_u8(v_src), vget_low_u8(v_pred));
     58      const uint16x8_t v_diff_hi =
     59          vsubl_u8(vget_high_u8(v_src), vget_high_u8(v_pred));
     60      vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo));
     61      vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi));
     62      diff += diff_stride;
     63      pred += pred_stride;
     64      src += src_stride;
     65    } while (--r != 0);
     66  } else if (cols > 4) {
     67    int r = rows;
     68    do {
     69      const uint8x8_t v_src = vld1_u8(&src[0]);
     70      const uint8x8_t v_pred = vld1_u8(&pred[0]);
     71      const uint16x8_t v_diff = vsubl_u8(v_src, v_pred);
     72      vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff));
     73      diff += diff_stride;
     74      pred += pred_stride;
     75      src += src_stride;
     76    } while (--r != 0);
     77  } else {
     78    int r = rows;
     79    do {
     80      int c = 0;
     81      do {
     82        diff[c] = src[c] - pred[c];
     83      } while (++c < cols);
     84      diff += diff_stride;
     85      pred += pred_stride;
     86      src += src_stride;
     87    } while (--r != 0);
     88  }
     89 }
     90 
     91 #if CONFIG_AV1_HIGHBITDEPTH
     92 void aom_highbd_subtract_block_neon(int rows, int cols, int16_t *diff,
     93                                    ptrdiff_t diff_stride, const uint8_t *src8,
     94                                    ptrdiff_t src_stride, const uint8_t *pred8,
     95                                    ptrdiff_t pred_stride) {
     96  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
     97  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
     98 
     99  if (cols > 16) {
    100    int r = rows;
    101    do {
    102      int c = 0;
    103      do {
    104        const uint16x8_t v_src_00 = vld1q_u16(&src[c + 0]);
    105        const uint16x8_t v_pred_00 = vld1q_u16(&pred[c + 0]);
    106        const uint16x8_t v_diff_00 = vsubq_u16(v_src_00, v_pred_00);
    107        const uint16x8_t v_src_08 = vld1q_u16(&src[c + 8]);
    108        const uint16x8_t v_pred_08 = vld1q_u16(&pred[c + 8]);
    109        const uint16x8_t v_diff_08 = vsubq_u16(v_src_08, v_pred_08);
    110        vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_00));
    111        vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_08));
    112        c += 16;
    113      } while (c < cols);
    114      diff += diff_stride;
    115      pred += pred_stride;
    116      src += src_stride;
    117    } while (--r != 0);
    118  } else if (cols > 8) {
    119    int r = rows;
    120    do {
    121      const uint16x8_t v_src_00 = vld1q_u16(&src[0]);
    122      const uint16x8_t v_pred_00 = vld1q_u16(&pred[0]);
    123      const uint16x8_t v_diff_00 = vsubq_u16(v_src_00, v_pred_00);
    124      const uint16x8_t v_src_08 = vld1q_u16(&src[8]);
    125      const uint16x8_t v_pred_08 = vld1q_u16(&pred[8]);
    126      const uint16x8_t v_diff_08 = vsubq_u16(v_src_08, v_pred_08);
    127      vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_00));
    128      vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_08));
    129      diff += diff_stride;
    130      pred += pred_stride;
    131      src += src_stride;
    132    } while (--r != 0);
    133  } else if (cols > 4) {
    134    int r = rows;
    135    do {
    136      const uint16x8_t v_src_r0 = vld1q_u16(&src[0]);
    137      const uint16x8_t v_src_r1 = vld1q_u16(&src[src_stride]);
    138      const uint16x8_t v_pred_r0 = vld1q_u16(&pred[0]);
    139      const uint16x8_t v_pred_r1 = vld1q_u16(&pred[pred_stride]);
    140      const uint16x8_t v_diff_r0 = vsubq_u16(v_src_r0, v_pred_r0);
    141      const uint16x8_t v_diff_r1 = vsubq_u16(v_src_r1, v_pred_r1);
    142      vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_r0));
    143      vst1q_s16(&diff[diff_stride], vreinterpretq_s16_u16(v_diff_r1));
    144      diff += diff_stride << 1;
    145      pred += pred_stride << 1;
    146      src += src_stride << 1;
    147      r -= 2;
    148    } while (r != 0);
    149  } else {
    150    int r = rows;
    151    do {
    152      const uint16x4_t v_src_r0 = vld1_u16(&src[0]);
    153      const uint16x4_t v_src_r1 = vld1_u16(&src[src_stride]);
    154      const uint16x4_t v_pred_r0 = vld1_u16(&pred[0]);
    155      const uint16x4_t v_pred_r1 = vld1_u16(&pred[pred_stride]);
    156      const uint16x4_t v_diff_r0 = vsub_u16(v_src_r0, v_pred_r0);
    157      const uint16x4_t v_diff_r1 = vsub_u16(v_src_r1, v_pred_r1);
    158      vst1_s16(&diff[0], vreinterpret_s16_u16(v_diff_r0));
    159      vst1_s16(&diff[diff_stride], vreinterpret_s16_u16(v_diff_r1));
    160      diff += diff_stride << 1;
    161      pred += pred_stride << 1;
    162      src += src_stride << 1;
    163      r -= 2;
    164    } while (r != 0);
    165  }
    166 }
    167 #endif  // CONFIG_AV1_HIGHBITDEPTH