tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

aom_convolve_copy_neon.c (4333B)


      1 /*
      2 * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <arm_neon.h>
     13 #include <string.h>
     14 
     15 #include "config/aom_dsp_rtcd.h"
     16 
     /*
      * Copies a w x h block of 8-bit samples from src to dst, one row at a time.
      *
      * src and dst point at the first sample of the first row. After each row
      * src_stride / dst_stride are added to the respective pointer (the samples
      * are uint8_t, so the strides are in bytes).
      *
      * Exactly w bytes are written per row for every positive w. Nothing is
      * read or written when w <= 0 or h <= 0.
      *
      * Widths that are a multiple of 16, and the exact widths 8, 4 and 2, use
      * dedicated paths. Every other width (24, 12, 6, odd widths, ...) is copied
      * with a plain per-row memmove() so that the whole row is transferred
      * rather than only a leading 8-, 4- or 2-byte prefix.
      */
     void aom_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
       // Reject empty and negative widths up front: a negative multiple of 8
       // (e.g. -8) would otherwise still pass a low-bit divisibility test.
       if (w <= 0) return;

       if (!(w & 0x0F)) {
         // Multiple of 16: one 128-bit load/store per 16 bytes of the row.
         const int chunks = w >> 4;
         for (int y = 0; y < h; ++y) {
           const uint8_t *s = src;
           uint8_t *d = dst;
           for (int x = 0; x < chunks; ++x) {
             vst1q_u8(d, vld1q_u8(s));
             s += 16;
             d += 16;
           }
           src += src_stride;
           dst += dst_stride;
         }
       } else if (w == 8) {
         // A single 64-bit load/store covers the whole row.
         for (int y = 0; y < h; ++y) {
           vst1_u8(dst, vld1_u8(src));
           src += src_stride;
           dst += dst_stride;
         }
       } else if (w == 4) {
         // Fixed-size memcpy: no alignment requirement on src/dst.
         for (int y = 0; y < h; ++y) {
           memcpy(dst, src, sizeof(uint32_t));
           src += src_stride;
           dst += dst_stride;
         }
       } else if (w == 2) {
         for (int y = 0; y < h; ++y) {
           memcpy(dst, src, sizeof(uint16_t));
           src += src_stride;
           dst += dst_stride;
         }
       } else {
         // Any other width: copy exactly w bytes per row.
         for (int y = 0; y < h; ++y) {
           memmove(dst, src, (size_t)w);
           src += src_stride;
           dst += dst_stride;
         }
       }
     }
     55 
     56 #if CONFIG_AV1_HIGHBITDEPTH
     /*
      * Copies a w x h block of 16-bit samples from src to dst, one row at a time.
      *
      * src and dst point at the first sample of the first row. After each row
      * src_stride / dst_stride are added to the respective pointer (the pointers
      * are uint16_t *, so the strides are in 16-bit elements, not bytes).
      *
      * Exactly w samples are written per row for every positive w, and exactly
      * h rows are processed for every positive h, odd heights included. Nothing
      * is read or written when w <= 0 or h <= 0.
      *
      * Widths 2, 4, 8, 16 and 32 use dedicated paths. All remaining widths go
      * through a generic path that moves 64-sample groups, then 8-sample
      * vectors, then a memmove() tail, so the copy never runs past sample w - 1
      * of a row.
      */
     void aom_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride,
                                        uint16_t *dst, ptrdiff_t dst_stride, int w,
                                        int h) {
       // The row loops below are do/while loops counting h down to zero, so they
       // rely on h being strictly positive on entry.
       if (w <= 0 || h <= 0) return;

       if (w == 2) {  // copy2
         do {
           memmove(dst, src, 2 * sizeof(*src));
           src += src_stride;
           dst += dst_stride;
         } while (--h != 0);
       } else if (w == 4) {  // copy4
         do {
           vst1_u16(dst, vld1_u16(src));
           src += src_stride;
           dst += dst_stride;
         } while (--h != 0);
       } else if (w == 8) {  // copy8
         do {
           vst1q_u16(dst, vld1q_u16(src));
           src += src_stride;
           dst += dst_stride;
         } while (--h != 0);
       } else if (w == 16) {  // copy16
         do {
           const uint16x8_t s0 = vld1q_u16(src);
           const uint16x8_t s1 = vld1q_u16(src + 8);
           src += src_stride;

           vst1q_u16(dst, s0);
           vst1q_u16(dst + 8, s1);
           dst += dst_stride;
         } while (--h != 0);
       } else if (w == 32) {  // copy32
         do {
           const uint16x8_t s0 = vld1q_u16(src);
           const uint16x8_t s1 = vld1q_u16(src + 8);
           const uint16x8_t s2 = vld1q_u16(src + 16);
           const uint16x8_t s3 = vld1q_u16(src + 24);
           src += src_stride;

           vst1q_u16(dst, s0);
           vst1q_u16(dst + 8, s1);
           vst1q_u16(dst + 16, s2);
           vst1q_u16(dst + 24, s3);
           dst += dst_stride;
         } while (--h != 0);
       } else {  // copy64 and every other width
         do {
           const uint16_t *s = src;
           uint16_t *d = dst;
           int width = w;

           // Full groups of 64 samples: eight 128-bit loads, then eight stores.
           while (width >= 64) {
             const uint16x8_t s0 = vld1q_u16(s);
             const uint16x8_t s1 = vld1q_u16(s + 8);
             const uint16x8_t s2 = vld1q_u16(s + 16);
             const uint16x8_t s3 = vld1q_u16(s + 24);
             const uint16x8_t s4 = vld1q_u16(s + 32);
             const uint16x8_t s5 = vld1q_u16(s + 40);
             const uint16x8_t s6 = vld1q_u16(s + 48);
             const uint16x8_t s7 = vld1q_u16(s + 56);

             vst1q_u16(d, s0);
             vst1q_u16(d + 8, s1);
             vst1q_u16(d + 16, s2);
             vst1q_u16(d + 24, s3);
             vst1q_u16(d + 32, s4);
             vst1q_u16(d + 40, s5);
             vst1q_u16(d + 48, s6);
             vst1q_u16(d + 56, s7);
             s += 64;
             d += 64;
             width -= 64;
           }

           // Remaining whole vectors of 8 samples.
           while (width >= 8) {
             vst1q_u16(d, vld1q_u16(s));
             s += 8;
             d += 8;
             width -= 8;
           }

           // Fewer than 8 samples left: copy them without touching anything
           // beyond the end of the row.
           if (width > 0) {
             memmove(d, s, (size_t)width * sizeof(*s));
           }

           src += src_stride;
           dst += dst_stride;
         } while (--h != 0);
       }
     }
    165 
    166 #endif  // CONFIG_AV1_HIGHBITDEPTH