aom_convolve_copy_sse2.c (11394B)
1 /* 2 * Copyright (c) 2020, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <immintrin.h> 13 14 #include "config/aom_dsp_rtcd.h" 15 16 static inline void copy_128(const uint8_t *src, uint8_t *dst) { 17 __m128i s[8]; 18 s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); 19 s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); 20 s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16)); 21 s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16)); 22 s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 16)); 23 s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 16)); 24 s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 16)); 25 s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 16)); 26 _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]); 27 _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]); 28 _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]); 29 _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]); 30 _mm_store_si128((__m128i *)(dst + 4 * 16), s[4]); 31 _mm_store_si128((__m128i *)(dst + 5 * 16), s[5]); 32 _mm_store_si128((__m128i *)(dst + 6 * 16), s[6]); 33 _mm_store_si128((__m128i *)(dst + 7 * 16), s[7]); 34 } 35 36 void aom_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, 37 uint8_t *dst, ptrdiff_t dst_stride, int w, int h) { 38 // The w >= 16 cases use _mm_store_si128(), which requires its output address 39 // be aligned on a 16-byte boundary. 40 if (w >= 16) { 41 assert(!((intptr_t)dst % 16)); 42 assert(!(dst_stride % 16)); 43 } 44 45 if (w == 2) { 46 do { 47 memmove(dst, src, 2 * sizeof(*src)); 48 src += src_stride; 49 dst += dst_stride; 50 memmove(dst, src, 2 * sizeof(*src)); 51 src += src_stride; 52 dst += dst_stride; 53 h -= 2; 54 } while (h); 55 } else if (w == 4) { 56 do { 57 memmove(dst, src, 4 * sizeof(*src)); 58 src += src_stride; 59 dst += dst_stride; 60 memmove(dst, src, 4 * sizeof(*src)); 61 src += src_stride; 62 dst += dst_stride; 63 h -= 2; 64 } while (h); 65 } else if (w == 8) { 66 do { 67 __m128i s[2]; 68 s[0] = _mm_loadl_epi64((__m128i *)src); 69 src += src_stride; 70 s[1] = _mm_loadl_epi64((__m128i *)src); 71 src += src_stride; 72 _mm_storel_epi64((__m128i *)dst, s[0]); 73 dst += dst_stride; 74 _mm_storel_epi64((__m128i *)dst, s[1]); 75 dst += dst_stride; 76 h -= 2; 77 } while (h); 78 } else if (w == 16) { 79 do { 80 __m128i s[2]; 81 s[0] = _mm_loadu_si128((__m128i *)src); 82 src += src_stride; 83 s[1] = _mm_loadu_si128((__m128i *)src); 84 src += src_stride; 85 _mm_store_si128((__m128i *)dst, s[0]); 86 dst += dst_stride; 87 _mm_store_si128((__m128i *)dst, s[1]); 88 dst += dst_stride; 89 h -= 2; 90 } while (h); 91 } else if (w == 32) { 92 do { 93 __m128i s[4]; 94 s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); 95 s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); 96 src += src_stride; 97 s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); 98 s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); 99 src += src_stride; 100 _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]); 101 _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]); 102 dst += dst_stride; 103 _mm_store_si128((__m128i *)(dst + 0 * 16), s[2]); 104 _mm_store_si128((__m128i *)(dst + 1 * 16), s[3]); 105 dst += dst_stride; 106 h -= 2; 107 } while (h); 108 } else if (w == 64) { 109 do { 110 __m128i s[8]; 111 s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); 112 s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); 113 s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16)); 114 s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16)); 115 src += src_stride; 116 s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); 117 s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); 118 s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 16)); 119 s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 16)); 120 src += src_stride; 121 _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]); 122 _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]); 123 _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]); 124 _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]); 125 dst += dst_stride; 126 _mm_store_si128((__m128i *)(dst + 0 * 16), s[4]); 127 _mm_store_si128((__m128i *)(dst + 1 * 16), s[5]); 128 _mm_store_si128((__m128i *)(dst + 2 * 16), s[6]); 129 _mm_store_si128((__m128i *)(dst + 3 * 16), s[7]); 130 dst += dst_stride; 131 h -= 2; 132 } while (h); 133 } else { 134 do { 135 copy_128(src, dst); 136 src += src_stride; 137 dst += dst_stride; 138 copy_128(src, dst); 139 src += src_stride; 140 dst += dst_stride; 141 h -= 2; 142 } while (h); 143 } 144 } 145 146 #if CONFIG_AV1_HIGHBITDEPTH 147 static inline void highbd_copy_64(const uint16_t *src, uint16_t *dst) { 148 __m128i s[8]; 149 s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); 150 s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); 151 s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8)); 152 s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8)); 153 s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 8)); 154 s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 8)); 155 s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 8)); 156 s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 8)); 157 _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]); 158 _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]); 159 _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]); 160 _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]); 161 _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]); 162 _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]); 163 _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]); 164 _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]); 165 } 166 167 static inline void highbd_copy_128(const uint16_t *src, uint16_t *dst) { 168 __m128i s[16]; 169 s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); 170 s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); 171 s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8)); 172 s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8)); 173 s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 8)); 174 s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 8)); 175 s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 8)); 176 s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 8)); 177 s[8] = _mm_loadu_si128((__m128i *)(src + 8 * 8)); 178 s[9] = _mm_loadu_si128((__m128i *)(src + 9 * 8)); 179 s[10] = _mm_loadu_si128((__m128i *)(src + 10 * 8)); 180 s[11] = _mm_loadu_si128((__m128i *)(src + 11 * 8)); 181 s[12] = _mm_loadu_si128((__m128i *)(src + 12 * 8)); 182 s[13] = _mm_loadu_si128((__m128i *)(src + 13 * 8)); 183 s[14] = _mm_loadu_si128((__m128i *)(src + 14 * 8)); 184 s[15] = _mm_loadu_si128((__m128i *)(src + 15 * 8)); 185 _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]); 186 _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]); 187 _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]); 188 _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]); 189 _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]); 190 _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]); 191 _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]); 192 _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]); 193 _mm_store_si128((__m128i *)(dst + 8 * 8), s[8]); 194 _mm_store_si128((__m128i *)(dst + 9 * 8), s[9]); 195 _mm_store_si128((__m128i *)(dst + 10 * 8), s[10]); 196 _mm_store_si128((__m128i *)(dst + 11 * 8), s[11]); 197 _mm_store_si128((__m128i *)(dst + 12 * 8), s[12]); 198 _mm_store_si128((__m128i *)(dst + 13 * 8), s[13]); 199 _mm_store_si128((__m128i *)(dst + 14 * 8), s[14]); 200 _mm_store_si128((__m128i *)(dst + 15 * 8), s[15]); 201 } 202 203 void aom_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride, 204 uint16_t *dst, ptrdiff_t dst_stride, int w, 205 int h) { 206 // The w >= 8 cases use _mm_store_si128(), which requires its output address 207 // be aligned on a 16-byte boundary. 208 if (w >= 8) { 209 assert(!((intptr_t)dst % 16)); 210 assert(!(dst_stride % 8)); 211 } 212 213 if (w == 2) { 214 do { 215 __m128i s = _mm_loadl_epi64((__m128i *)src); 216 *(int *)dst = _mm_cvtsi128_si32(s); 217 src += src_stride; 218 dst += dst_stride; 219 s = _mm_loadl_epi64((__m128i *)src); 220 *(int *)dst = _mm_cvtsi128_si32(s); 221 src += src_stride; 222 dst += dst_stride; 223 h -= 2; 224 } while (h); 225 } else if (w == 4) { 226 do { 227 __m128i s[2]; 228 s[0] = _mm_loadl_epi64((__m128i *)src); 229 src += src_stride; 230 s[1] = _mm_loadl_epi64((__m128i *)src); 231 src += src_stride; 232 _mm_storel_epi64((__m128i *)dst, s[0]); 233 dst += dst_stride; 234 _mm_storel_epi64((__m128i *)dst, s[1]); 235 dst += dst_stride; 236 h -= 2; 237 } while (h); 238 } else if (w == 8) { 239 do { 240 __m128i s[2]; 241 s[0] = _mm_loadu_si128((__m128i *)src); 242 src += src_stride; 243 s[1] = _mm_loadu_si128((__m128i *)src); 244 src += src_stride; 245 _mm_store_si128((__m128i *)dst, s[0]); 246 dst += dst_stride; 247 _mm_store_si128((__m128i *)dst, s[1]); 248 dst += dst_stride; 249 h -= 2; 250 } while (h); 251 } else if (w == 16) { 252 do { 253 __m128i s[4]; 254 s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); 255 s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); 256 src += src_stride; 257 s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); 258 s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); 259 src += src_stride; 260 _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]); 261 _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]); 262 dst += dst_stride; 263 _mm_store_si128((__m128i *)(dst + 0 * 8), s[2]); 264 _mm_store_si128((__m128i *)(dst + 1 * 8), s[3]); 265 dst += dst_stride; 266 h -= 2; 267 } while (h); 268 } else if (w == 32) { 269 do { 270 __m128i s[8]; 271 s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); 272 s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); 273 s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8)); 274 s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8)); 275 src += src_stride; 276 s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); 277 s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); 278 s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 8)); 279 s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 8)); 280 src += src_stride; 281 _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]); 282 _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]); 283 _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]); 284 _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]); 285 dst += dst_stride; 286 _mm_store_si128((__m128i *)(dst + 0 * 8), s[4]); 287 _mm_store_si128((__m128i *)(dst + 1 * 8), s[5]); 288 _mm_store_si128((__m128i *)(dst + 2 * 8), s[6]); 289 _mm_store_si128((__m128i *)(dst + 3 * 8), s[7]); 290 dst += dst_stride; 291 h -= 2; 292 } while (h); 293 } else if (w == 64) { 294 do { 295 highbd_copy_64(src, dst); 296 src += src_stride; 297 dst += dst_stride; 298 highbd_copy_64(src, dst); 299 src += src_stride; 300 dst += dst_stride; 301 h -= 2; 302 } while (h); 303 } else { 304 do { 305 highbd_copy_128(src, dst); 306 src += src_stride; 307 dst += dst_stride; 308 highbd_copy_128(src, dst); 309 src += src_stride; 310 dst += dst_stride; 311 h -= 2; 312 } while (h); 313 } 314 } 315 #endif // CONFIG_AV1_HIGHBITDEPTH