tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

disflow_neon.h (4716B)


      1 /*
      2 * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #ifndef AOM_AOM_DSP_FLOW_ESTIMATION_ARM_DISFLOW_NEON_H_
     13 #define AOM_AOM_DSP_FLOW_ESTIMATION_ARM_DISFLOW_NEON_H_
     14 
#include "aom_dsp/flow_estimation/disflow.h"

#include <arm_neon.h>
#include <assert.h>
#include <math.h>

#include "aom_dsp/arm/mem_neon.h"
#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
     23 
     24 static inline void get_cubic_kernel_dbl(double x, double kernel[4]) {
     25  // Check that the fractional position is in range.
     26  //
     27  // Note: x is calculated from, e.g., `u_frac = u - floor(u)`.
     28  // Mathematically, this implies that 0 <= x < 1. However, in practice it is
     29  // possible to have x == 1 due to floating point rounding. This is fine,
     30  // and we still interpolate correctly if we allow x = 1.
     31  assert(0 <= x && x <= 1);
     32 
     33  double x2 = x * x;
     34  double x3 = x2 * x;
     35  kernel[0] = -0.5 * x + x2 - 0.5 * x3;
     36  kernel[1] = 1.0 - 2.5 * x2 + 1.5 * x3;
     37  kernel[2] = 0.5 * x + 2.0 * x2 - 1.5 * x3;
     38  kernel[3] = -0.5 * x2 + 0.5 * x3;
     39 }
     40 
     41 static inline void get_cubic_kernel_int(double x, int kernel[4]) {
     42  double kernel_dbl[4];
     43  get_cubic_kernel_dbl(x, kernel_dbl);
     44 
     45  kernel[0] = (int)rint(kernel_dbl[0] * (1 << DISFLOW_INTERP_BITS));
     46  kernel[1] = (int)rint(kernel_dbl[1] * (1 << DISFLOW_INTERP_BITS));
     47  kernel[2] = (int)rint(kernel_dbl[2] * (1 << DISFLOW_INTERP_BITS));
     48  kernel[3] = (int)rint(kernel_dbl[3] * (1 << DISFLOW_INTERP_BITS));
     49 }
     50 
// Compute the horizontal Sobel derivative of a DISFLOW_PATCH_SIZE x
// DISFLOW_PATCH_SIZE patch: horizontal pass with kernel {1, 0, -1}, then
// vertical smoothing pass with kernel {1, 2, 1}. Results are written as
// int16 rows at `dst` with stride `dst_stride`.
//
// NOTE(review): this assumes DISFLOW_PATCH_SIZE == 8 (each vst1q_s16 writes
// exactly 8 lanes per row) — confirm against disflow.h. Each vld1q_u8 reads
// 16 bytes starting at src - src_stride - 1; the caller must guarantee those
// loads stay within the frame buffer.
static inline void sobel_filter_x(const uint8_t *src, int src_stride,
                                  int16_t *dst, int dst_stride) {
  // Intermediate rows: PATCH_SIZE + 2 rows are needed so the vertical
  // 3-tap pass can produce PATCH_SIZE output rows.
  int16_t tmp[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)];

  // Horizontal filter, using kernel {1, 0, -1}.
  // Start one row above and one column left of the patch origin.
  const uint8_t *src_start = src - 1 * src_stride - 1;

  for (int i = 0; i < DISFLOW_PATCH_SIZE + 2; i++) {
    uint8x16_t s = vld1q_u8(src_start + i * src_stride);
    // s0 = pixels at x-1, s2 = pixels at x+1 (vextq_u8 shifts by 2 bytes).
    uint8x8_t s0 = vget_low_u8(s);
    uint8x8_t s2 = vget_low_u8(vextq_u8(s, s, 2));

    // Given that the kernel is {1, 0, -1} the convolution is a simple
    // subtraction. The u16 modular subtraction reinterpreted as s16 yields
    // the correct signed difference, since |s0 - s2| <= 255.
    int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s0, s2));

    vst1q_s16(tmp + i * DISFLOW_PATCH_SIZE, diff);
  }

  // Vertical filter, using kernel {1, 2, 1}.
  // This kernel can be split into two 2-taps kernels of value {1, 1}.
  // That way we need only 3 add operations to perform the convolution, one of
  // which can be reused for the next line.
  int16x8_t s0 = vld1q_s16(tmp);
  int16x8_t s1 = vld1q_s16(tmp + DISFLOW_PATCH_SIZE);
  int16x8_t sum01 = vaddq_s16(s0, s1);
  for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) {
    int16x8_t s2 = vld1q_s16(tmp + (i + 2) * DISFLOW_PATCH_SIZE);

    // row[i] + 2*row[i+1] + row[i+2] == (row[i] + row[i+1]) +
    // (row[i+1] + row[i+2]).
    int16x8_t sum12 = vaddq_s16(s1, s2);
    int16x8_t sum = vaddq_s16(sum01, sum12);

    vst1q_s16(dst + i * dst_stride, sum);

    // Carry the shared partial sum and middle row into the next iteration.
    sum01 = sum12;
    s1 = s2;
  }
}
     89 
// Compute the vertical Sobel derivative of a DISFLOW_PATCH_SIZE x
// DISFLOW_PATCH_SIZE patch: horizontal smoothing pass with kernel {1, 2, 1},
// then vertical pass with kernel {1, 0, -1}. Results are written as int16
// rows at `dst` with stride `dst_stride`.
//
// NOTE(review): as with sobel_filter_x, this assumes DISFLOW_PATCH_SIZE == 8
// (8 lanes stored per row, and load_s16_8x10 loads exactly 10 rows of 8) —
// confirm against disflow.h. Each vld1q_u8 reads 16 bytes; the caller must
// guarantee those loads stay within the frame buffer.
static inline void sobel_filter_y(const uint8_t *src, int src_stride,
                                  int16_t *dst, int dst_stride) {
  // PATCH_SIZE + 2 smoothed rows feed the 3-tap vertical pass below.
  int16_t tmp[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)];

  // Horizontal filter, using kernel {1, 2, 1}.
  // This kernel can be split into two 2-taps kernels of value {1, 1}.
  // That way we need only 3 add operations to perform the convolution.
  const uint8_t *src_start = src - 1 * src_stride - 1;

  for (int i = 0; i < DISFLOW_PATCH_SIZE + 2; i++) {
    uint8x16_t s = vld1q_u8(src_start + i * src_stride);
    // s0/s1/s2 = pixels at x-1 / x / x+1 (byte shifts via vextq_u8).
    uint8x8_t s0 = vget_low_u8(s);
    uint8x8_t s1 = vget_low_u8(vextq_u8(s, s, 1));
    uint8x8_t s2 = vget_low_u8(vextq_u8(s, s, 2));

    // s0 + 2*s1 + s2 == (s0 + s1) + (s1 + s2); vaddl_u8 widens to u16 so no
    // overflow is possible (max value 4 * 255).
    uint16x8_t sum01 = vaddl_u8(s0, s1);
    uint16x8_t sum12 = vaddl_u8(s1, s2);
    uint16x8_t sum = vaddq_u16(sum01, sum12);

    vst1q_s16(tmp + i * DISFLOW_PATCH_SIZE, vreinterpretq_s16_u16(sum));
  }

  // Vertical filter, using kernel {1, 0, -1}.
  // Load the whole block at once to avoid redundant loads during convolution.
  int16x8_t t[10];
  load_s16_8x10(tmp, DISFLOW_PATCH_SIZE, &t[0], &t[1], &t[2], &t[3], &t[4],
                &t[5], &t[6], &t[7], &t[8], &t[9]);

  for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) {
    // Given that the kernel is {1, 0, -1} the convolution is a simple
    // subtraction.
    int16x8_t diff = vsubq_s16(t[i], t[i + 2]);

    vst1q_s16(dst + i * dst_stride, diff);
  }
}
    126 
    127 #endif  // AOM_AOM_DSP_FLOW_ESTIMATION_ARM_DISFLOW_NEON_H_