aom_convolve_copy_avx2.c (8446B)
1 /* 2 * Copyright (c) 2020, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <immintrin.h> 13 14 #include "config/aom_dsp_rtcd.h" 15 16 static inline void copy_128(const uint8_t *src, uint8_t *dst) { 17 __m256i s[4]; 18 s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32)); 19 s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32)); 20 s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 32)); 21 s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 32)); 22 _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]); 23 _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]); 24 _mm256_storeu_si256((__m256i *)(dst + 2 * 32), s[2]); 25 _mm256_storeu_si256((__m256i *)(dst + 3 * 32), s[3]); 26 } 27 28 void aom_convolve_copy_avx2(const uint8_t *src, ptrdiff_t src_stride, 29 uint8_t *dst, ptrdiff_t dst_stride, int w, int h) { 30 // The w == 16 case uses _mm_store_si128(), which requires its output address 31 // be aligned on a 16-byte boundary. 32 if (w == 16) { 33 assert(!((intptr_t)dst % 16)); 34 assert(!(dst_stride % 16)); 35 } 36 37 if (w == 2) { 38 do { 39 memmove(dst, src, 2 * sizeof(*src)); 40 src += src_stride; 41 dst += dst_stride; 42 memmove(dst, src, 2 * sizeof(*src)); 43 src += src_stride; 44 dst += dst_stride; 45 h -= 2; 46 } while (h); 47 } else if (w == 4) { 48 do { 49 memmove(dst, src, 4 * sizeof(*src)); 50 src += src_stride; 51 dst += dst_stride; 52 memmove(dst, src, 4 * sizeof(*src)); 53 src += src_stride; 54 dst += dst_stride; 55 h -= 2; 56 } while (h); 57 } else if (w == 8) { 58 do { 59 __m128i s[2]; 60 s[0] = _mm_loadl_epi64((__m128i *)src); 61 src += src_stride; 62 s[1] = _mm_loadl_epi64((__m128i *)src); 63 src += src_stride; 64 _mm_storel_epi64((__m128i *)dst, s[0]); 65 dst += dst_stride; 66 _mm_storel_epi64((__m128i *)dst, s[1]); 67 dst += dst_stride; 68 h -= 2; 69 } while (h); 70 } else if (w == 16) { 71 do { 72 __m128i s[2]; 73 s[0] = _mm_loadu_si128((__m128i *)src); 74 src += src_stride; 75 s[1] = _mm_loadu_si128((__m128i *)src); 76 src += src_stride; 77 _mm_store_si128((__m128i *)dst, s[0]); 78 dst += dst_stride; 79 _mm_store_si128((__m128i *)dst, s[1]); 80 dst += dst_stride; 81 h -= 2; 82 } while (h); 83 } else if (w == 32) { 84 do { 85 __m256i s[2]; 86 s[0] = _mm256_loadu_si256((__m256i *)src); 87 src += src_stride; 88 s[1] = _mm256_loadu_si256((__m256i *)src); 89 src += src_stride; 90 _mm256_storeu_si256((__m256i *)dst, s[0]); 91 dst += dst_stride; 92 _mm256_storeu_si256((__m256i *)dst, s[1]); 93 dst += dst_stride; 94 h -= 2; 95 } while (h); 96 } else if (w == 64) { 97 do { 98 __m256i s[4]; 99 s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32)); 100 s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32)); 101 src += src_stride; 102 s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 32)); 103 s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 32)); 104 src += src_stride; 105 _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]); 106 _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]); 107 dst += dst_stride; 108 _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[2]); 109 _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[3]); 110 dst += dst_stride; 111 h -= 2; 112 } while (h); 113 } else { 114 do { 115 copy_128(src, dst); 116 src += src_stride; 117 dst += dst_stride; 118 copy_128(src, dst); 119 src += src_stride; 120 dst += dst_stride; 121 h -= 2; 122 } while (h); 123 } 124 } 125 126 #if CONFIG_AV1_HIGHBITDEPTH 127 128 static inline void highbd_copy_64(const uint16_t *src, uint16_t *dst) { 129 __m256i s[4]; 130 s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16)); 131 s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16)); 132 s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16)); 133 s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16)); 134 _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]); 135 _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]); 136 _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]); 137 _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]); 138 } 139 140 static inline void highbd_copy_128(const uint16_t *src, uint16_t *dst) { 141 __m256i s[8]; 142 s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16)); 143 s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16)); 144 s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16)); 145 s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16)); 146 s[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16)); 147 s[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 16)); 148 s[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 16)); 149 s[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 16)); 150 151 _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]); 152 _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]); 153 _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]); 154 _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]); 155 _mm256_storeu_si256((__m256i *)(dst + 4 * 16), s[4]); 156 _mm256_storeu_si256((__m256i *)(dst + 5 * 16), s[5]); 157 _mm256_storeu_si256((__m256i *)(dst + 6 * 16), s[6]); 158 _mm256_storeu_si256((__m256i *)(dst + 7 * 16), s[7]); 159 } 160 161 void aom_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, 162 uint16_t *dst, ptrdiff_t dst_stride, int w, 163 int h) { 164 // The w == 8 case uses _mm_store_si128(), which requires its output address 165 // be aligned on a 16-byte boundary. 166 if (w == 8) { 167 assert(!((intptr_t)dst % 16)); 168 assert(!(dst_stride % 8)); 169 } 170 171 if (w == 2) { 172 do { 173 memmove(dst, src, 2 * sizeof(*src)); 174 src += src_stride; 175 dst += dst_stride; 176 memmove(dst, src, 2 * sizeof(*src)); 177 src += src_stride; 178 dst += dst_stride; 179 h -= 2; 180 } while (h); 181 } else if (w == 4) { 182 do { 183 __m128i s[2]; 184 s[0] = _mm_loadl_epi64((__m128i *)src); 185 src += src_stride; 186 s[1] = _mm_loadl_epi64((__m128i *)src); 187 src += src_stride; 188 _mm_storel_epi64((__m128i *)dst, s[0]); 189 dst += dst_stride; 190 _mm_storel_epi64((__m128i *)dst, s[1]); 191 dst += dst_stride; 192 h -= 2; 193 } while (h); 194 } else if (w == 8) { 195 do { 196 __m128i s[2]; 197 s[0] = _mm_loadu_si128((__m128i *)src); 198 src += src_stride; 199 s[1] = _mm_loadu_si128((__m128i *)src); 200 src += src_stride; 201 _mm_store_si128((__m128i *)dst, s[0]); 202 dst += dst_stride; 203 _mm_store_si128((__m128i *)dst, s[1]); 204 dst += dst_stride; 205 h -= 2; 206 } while (h); 207 } else if (w == 16) { 208 do { 209 __m256i s[2]; 210 s[0] = _mm256_loadu_si256((__m256i *)src); 211 src += src_stride; 212 s[1] = _mm256_loadu_si256((__m256i *)src); 213 src += src_stride; 214 _mm256_storeu_si256((__m256i *)dst, s[0]); 215 dst += dst_stride; 216 _mm256_storeu_si256((__m256i *)dst, s[1]); 217 dst += dst_stride; 218 h -= 2; 219 } while (h); 220 } else if (w == 32) { 221 do { 222 __m256i s[4]; 223 s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16)); 224 s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16)); 225 src += src_stride; 226 s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 16)); 227 s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 16)); 228 src += src_stride; 229 _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]); 230 _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]); 231 dst += dst_stride; 232 _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[2]); 233 _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[3]); 234 dst += dst_stride; 235 h -= 2; 236 } while (h); 237 } else if (w == 64) { 238 do { 239 highbd_copy_64(src, dst); 240 src += src_stride; 241 dst += dst_stride; 242 highbd_copy_64(src, dst); 243 src += src_stride; 244 dst += dst_stride; 245 h -= 2; 246 } while (h); 247 } else { 248 assert(w == 128); 249 do { 250 highbd_copy_128(src, dst); 251 src += src_stride; 252 dst += dst_stride; 253 highbd_copy_128(src, dst); 254 src += src_stride; 255 dst += dst_stride; 256 h -= 2; 257 } while (h); 258 } 259 } 260 261 #endif // CONFIG_AV1_HIGHBITDEPTH