[ tor-browser ].git.dasho

sse_neon.c (6719B)
      1 /*
      2 * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <arm_neon.h>
     13 
     14 #include "config/aom_dsp_rtcd.h"
     15 #include "aom_dsp/arm/mem_neon.h"
     16 #include "aom_dsp/arm/sum_neon.h"
     17 
     18 static inline void sse_16x1_neon(const uint8_t *src, const uint8_t *ref,
     19                                 uint32x4_t *sse) {
     20  uint8x16_t s = vld1q_u8(src);
     21  uint8x16_t r = vld1q_u8(ref);
     22 
     23  uint8x16_t abs_diff = vabdq_u8(s, r);
     24  uint8x8_t abs_diff_lo = vget_low_u8(abs_diff);
     25  uint8x8_t abs_diff_hi = vget_high_u8(abs_diff);
     26 
     27  *sse = vpadalq_u16(*sse, vmull_u8(abs_diff_lo, abs_diff_lo));
     28  *sse = vpadalq_u16(*sse, vmull_u8(abs_diff_hi, abs_diff_hi));
     29 }
     30 
     31 static inline void sse_8x1_neon(const uint8_t *src, const uint8_t *ref,
     32                                uint32x4_t *sse) {
     33  uint8x8_t s = vld1_u8(src);
     34  uint8x8_t r = vld1_u8(ref);
     35 
     36  uint8x8_t abs_diff = vabd_u8(s, r);
     37 
     38  *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff));
     39 }
     40 
     41 static inline void sse_4x2_neon(const uint8_t *src, int src_stride,
     42                                const uint8_t *ref, int ref_stride,
     43                                uint32x4_t *sse) {
     44  uint8x8_t s = load_unaligned_u8(src, src_stride);
     45  uint8x8_t r = load_unaligned_u8(ref, ref_stride);
     46 
     47  uint8x8_t abs_diff = vabd_u8(s, r);
     48 
     49  *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff));
     50 }
     51 
     52 static inline uint32_t sse_wxh_neon(const uint8_t *src, int src_stride,
     53                                    const uint8_t *ref, int ref_stride,
     54                                    int width, int height) {
     55  uint32x4_t sse = vdupq_n_u32(0);
     56 
     57  if ((width & 0x07) && ((width & 0x07) < 5)) {
     58    int i = height;
     59    do {
     60      int j = 0;
     61      do {
     62        sse_8x1_neon(src + j, ref + j, &sse);
     63        sse_8x1_neon(src + j + src_stride, ref + j + ref_stride, &sse);
     64        j += 8;
     65      } while (j + 4 < width);
     66 
     67      sse_4x2_neon(src + j, src_stride, ref + j, ref_stride, &sse);
     68      src += 2 * src_stride;
     69      ref += 2 * ref_stride;
     70      i -= 2;
     71    } while (i != 0);
     72  } else {
     73    int i = height;
     74    do {
     75      int j = 0;
     76      do {
     77        sse_8x1_neon(src + j, ref + j, &sse);
     78        j += 8;
     79      } while (j < width);
     80 
     81      src += src_stride;
     82      ref += ref_stride;
     83    } while (--i != 0);
     84  }
     85  return horizontal_add_u32x4(sse);
     86 }
     87 
     88 static inline uint32_t sse_128xh_neon(const uint8_t *src, int src_stride,
     89                                      const uint8_t *ref, int ref_stride,
     90                                      int height) {
     91  uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
     92 
     93  int i = height;
     94  do {
     95    sse_16x1_neon(src, ref, &sse[0]);
     96    sse_16x1_neon(src + 16, ref + 16, &sse[1]);
     97    sse_16x1_neon(src + 32, ref + 32, &sse[0]);
     98    sse_16x1_neon(src + 48, ref + 48, &sse[1]);
     99    sse_16x1_neon(src + 64, ref + 64, &sse[0]);
    100    sse_16x1_neon(src + 80, ref + 80, &sse[1]);
    101    sse_16x1_neon(src + 96, ref + 96, &sse[0]);
    102    sse_16x1_neon(src + 112, ref + 112, &sse[1]);
    103 
    104    src += src_stride;
    105    ref += ref_stride;
    106  } while (--i != 0);
    107 
    108  return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
    109 }
    110 
    111 static inline uint32_t sse_64xh_neon(const uint8_t *src, int src_stride,
    112                                     const uint8_t *ref, int ref_stride,
    113                                     int height) {
    114  uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
    115 
    116  int i = height;
    117  do {
    118    sse_16x1_neon(src, ref, &sse[0]);
    119    sse_16x1_neon(src + 16, ref + 16, &sse[1]);
    120    sse_16x1_neon(src + 32, ref + 32, &sse[0]);
    121    sse_16x1_neon(src + 48, ref + 48, &sse[1]);
    122 
    123    src += src_stride;
    124    ref += ref_stride;
    125  } while (--i != 0);
    126 
    127  return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
    128 }
    129 
    130 static inline uint32_t sse_32xh_neon(const uint8_t *src, int src_stride,
    131                                     const uint8_t *ref, int ref_stride,
    132                                     int height) {
    133  uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
    134 
    135  int i = height;
    136  do {
    137    sse_16x1_neon(src, ref, &sse[0]);
    138    sse_16x1_neon(src + 16, ref + 16, &sse[1]);
    139 
    140    src += src_stride;
    141    ref += ref_stride;
    142  } while (--i != 0);
    143 
    144  return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
    145 }
    146 
    147 static inline uint32_t sse_16xh_neon(const uint8_t *src, int src_stride,
    148                                     const uint8_t *ref, int ref_stride,
    149                                     int height) {
    150  uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
    151 
    152  int i = height;
    153  do {
    154    sse_16x1_neon(src, ref, &sse[0]);
    155    src += src_stride;
    156    ref += ref_stride;
    157    sse_16x1_neon(src, ref, &sse[1]);
    158    src += src_stride;
    159    ref += ref_stride;
    160    i -= 2;
    161  } while (i != 0);
    162 
    163  return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
    164 }
    165 
    166 static inline uint32_t sse_8xh_neon(const uint8_t *src, int src_stride,
    167                                    const uint8_t *ref, int ref_stride,
    168                                    int height) {
    169  uint32x4_t sse = vdupq_n_u32(0);
    170 
    171  int i = height;
    172  do {
    173    sse_8x1_neon(src, ref, &sse);
    174 
    175    src += src_stride;
    176    ref += ref_stride;
    177  } while (--i != 0);
    178 
    179  return horizontal_add_u32x4(sse);
    180 }
    181 
    182 static inline uint32_t sse_4xh_neon(const uint8_t *src, int src_stride,
    183                                    const uint8_t *ref, int ref_stride,
    184                                    int height) {
    185  uint32x4_t sse = vdupq_n_u32(0);
    186 
    187  int i = height;
    188  do {
    189    sse_4x2_neon(src, src_stride, ref, ref_stride, &sse);
    190 
    191    src += 2 * src_stride;
    192    ref += 2 * ref_stride;
    193    i -= 2;
    194  } while (i != 0);
    195 
    196  return horizontal_add_u32x4(sse);
    197 }
    198 
    199 int64_t aom_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref,
    200                     int ref_stride, int width, int height) {
    201  switch (width) {
    202    case 4: return sse_4xh_neon(src, src_stride, ref, ref_stride, height);
    203    case 8: return sse_8xh_neon(src, src_stride, ref, ref_stride, height);
    204    case 16: return sse_16xh_neon(src, src_stride, ref, ref_stride, height);
    205    case 32: return sse_32xh_neon(src, src_stride, ref, ref_stride, height);
    206    case 64: return sse_64xh_neon(src, src_stride, ref, ref_stride, height);
    207    case 128: return sse_128xh_neon(src, src_stride, ref, ref_stride, height);
    208    default:
    209      return sse_wxh_neon(src, src_stride, ref, ref_stride, width, height);
    210  }
    211 }
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE