aom_convolve_copy_neon.c (4333B)
1 /* 2 * Copyright (c) 2020, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <arm_neon.h> 13 #include <string.h> 14 15 #include "config/aom_dsp_rtcd.h" 16 17 void aom_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, 18 uint8_t *dst, ptrdiff_t dst_stride, int w, int h) { 19 const uint8_t *src1; 20 uint8_t *dst1; 21 int y; 22 23 if (!(w & 0x0F)) { 24 for (y = 0; y < h; ++y) { 25 src1 = src; 26 dst1 = dst; 27 for (int x = 0; x < (w >> 4); ++x) { 28 vst1q_u8(dst1, vld1q_u8(src1)); 29 src1 += 16; 30 dst1 += 16; 31 } 32 src += src_stride; 33 dst += dst_stride; 34 } 35 } else if (!(w & 0x07)) { 36 for (y = 0; y < h; ++y) { 37 vst1_u8(dst, vld1_u8(src)); 38 src += src_stride; 39 dst += dst_stride; 40 } 41 } else if (!(w & 0x03)) { 42 for (y = 0; y < h; ++y) { 43 memcpy(dst, src, sizeof(uint32_t)); 44 src += src_stride; 45 dst += dst_stride; 46 } 47 } else if (!(w & 0x01)) { 48 for (y = 0; y < h; ++y) { 49 memcpy(dst, src, sizeof(uint16_t)); 50 src += src_stride; 51 dst += dst_stride; 52 } 53 } 54 } 55 56 #if CONFIG_AV1_HIGHBITDEPTH 57 void aom_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride, 58 uint16_t *dst, ptrdiff_t dst_stride, int w, 59 int h) { 60 if (w < 4) { // copy2 61 do { 62 memmove(dst, src, 2 * sizeof(*src)); 63 src += src_stride; 64 dst += dst_stride; 65 66 memmove(dst, src, 2 * sizeof(*src)); 67 src += src_stride; 68 dst += dst_stride; 69 h -= 2; 70 } while (h != 0); 71 } else if (w == 4) { // copy4 72 uint16x4_t s0, s1; 73 do { 74 s0 = vld1_u16(src); 75 src += src_stride; 76 s1 = vld1_u16(src); 77 src += src_stride; 78 79 vst1_u16(dst, s0); 80 dst += dst_stride; 81 vst1_u16(dst, s1); 82 dst += dst_stride; 83 h -= 2; 84 } while (h != 0); 85 } else if (w == 8) { // copy8 86 uint16x8_t s0, s1; 87 do { 88 s0 = vld1q_u16(src); 89 src += src_stride; 90 s1 = vld1q_u16(src); 91 src += src_stride; 92 93 vst1q_u16(dst, s0); 94 dst += dst_stride; 95 vst1q_u16(dst, s1); 96 dst += dst_stride; 97 h -= 2; 98 } while (h != 0); 99 } else if (w < 32) { // copy16 100 uint16x8_t s0, s1, s2, s3; 101 do { 102 s0 = vld1q_u16(src); 103 s1 = vld1q_u16(src + 8); 104 src += src_stride; 105 s2 = vld1q_u16(src); 106 s3 = vld1q_u16(src + 8); 107 src += src_stride; 108 109 vst1q_u16(dst, s0); 110 vst1q_u16(dst + 8, s1); 111 dst += dst_stride; 112 vst1q_u16(dst, s2); 113 vst1q_u16(dst + 8, s3); 114 dst += dst_stride; 115 h -= 2; 116 } while (h != 0); 117 } else if (w == 32) { // copy32 118 uint16x8_t s0, s1, s2, s3; 119 do { 120 s0 = vld1q_u16(src); 121 s1 = vld1q_u16(src + 8); 122 s2 = vld1q_u16(src + 16); 123 s3 = vld1q_u16(src + 24); 124 src += src_stride; 125 126 vst1q_u16(dst, s0); 127 vst1q_u16(dst + 8, s1); 128 vst1q_u16(dst + 16, s2); 129 vst1q_u16(dst + 24, s3); 130 dst += dst_stride; 131 } while (--h != 0); 132 } else { // copy64 133 uint16x8_t s0, s1, s2, s3, s4, s5, s6, s7; 134 do { 135 const uint16_t *s = src; 136 uint16_t *d = dst; 137 int width = w; 138 do { 139 s0 = vld1q_u16(s); 140 s1 = vld1q_u16(s + 8); 141 s2 = vld1q_u16(s + 16); 142 s3 = vld1q_u16(s + 24); 143 s4 = vld1q_u16(s + 32); 144 s5 = vld1q_u16(s + 40); 145 s6 = vld1q_u16(s + 48); 146 s7 = vld1q_u16(s + 56); 147 148 vst1q_u16(d, s0); 149 vst1q_u16(d + 8, s1); 150 vst1q_u16(d + 16, s2); 151 vst1q_u16(d + 24, s3); 152 vst1q_u16(d + 32, s4); 153 vst1q_u16(d + 40, s5); 154 vst1q_u16(d + 48, s6); 155 vst1q_u16(d + 56, s7); 156 s += 64; 157 d += 64; 158 width -= 64; 159 } while (width > 0); 160 src += src_stride; 161 dst += dst_stride; 162 } while (--h != 0); 163 } 164 } 165 166 #endif // CONFIG_AV1_HIGHBITDEPTH