convolve_sse2.c (19767B)
/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <emmintrin.h>

#include "config/av1_rtcd.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_common_intrin.h"
#include "aom_dsp/x86/synonyms.h"
#include "av1/common/convolve.h"

// SSE2 single-reference (sr) convolution kernels for AV1: horizontal (x) and
// vertical (y) 8-bit filtering with up-to-8-tap filters, plus 12-tap paths
// that delegate per-tap work to helpers (prepare_coeffs_12tap,
// convolve_lo_x_12tap, convolve_lo_y_12tap, convolve_hi_y_12tap) which are
// not defined in this file — presumably provided by
// aom_dsp/x86/convolve_common_intrin.h (included above); confirm there.

// Broadcast the 8-tap, 16-bit filter kernel selected by subpel_q4 into four
// __m128i registers. Each coeffs[k] holds one adjacent coefficient pair
// (2k, 2k+1) replicated across all four 32-bit lanes, which is the layout
// _mm_madd_epi16 needs when the pixel registers hold interleaved pairs.
static inline void prepare_coeffs(const InterpFilterParams *const filter_params,
                                  const int subpel_q4,
                                  __m128i *const coeffs /* [4] */) {
  const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);
  const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
  // coeffs 0 1 0 1 2 3 2 3
  const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
  // coeffs 4 5 4 5 6 7 6 7
  const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);

  coeffs[0] = _mm_unpacklo_epi64(tmp_0, tmp_0);  // coeffs 0 1 0 1 0 1 0 1
  coeffs[1] = _mm_unpackhi_epi64(tmp_0, tmp_0);  // coeffs 2 3 2 3 2 3 2 3
  coeffs[2] = _mm_unpacklo_epi64(tmp_1, tmp_1);  // coeffs 4 5 4 5 4 5 4 5
  coeffs[3] = _mm_unpackhi_epi64(tmp_1, tmp_1);  // coeffs 6 7 6 7 6 7 6 7
}

// Core 8-tap accumulate: each s[k] holds 16-bit pixel pairs interleaved to
// match coeffs[k] (pair 2k, 2k+1). madd multiplies and sums each pair into a
// 32-bit lane; the three adds combine all four tap pairs. Returns four 32-bit
// unrounded filter sums.
static inline __m128i convolve(const __m128i *const s,
                               const __m128i *const coeffs) {
  const __m128i d0 = _mm_madd_epi16(s[0], coeffs[0]);
  const __m128i d1 = _mm_madd_epi16(s[1], coeffs[1]);
  const __m128i d2 = _mm_madd_epi16(s[2], coeffs[2]);
  const __m128i d3 = _mm_madd_epi16(s[3], coeffs[3]);
  const __m128i d =
      _mm_add_epi32(_mm_add_epi32(d0, d1), _mm_add_epi32(d2, d3));
  return d;
}

// Horizontal path: zero-extend the low 8 bytes of each of the four source
// registers to 16 bits, then run the 8-tap accumulate above.
static inline __m128i convolve_lo_x(const __m128i *const s,
                                    const __m128i *const coeffs) {
  __m128i ss[4];
  ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
  ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128());
  ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
  ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128());
  return convolve(ss, coeffs);
}

// Vertical path, low 4 output pixels: s[] is a sliding window of row pairs
// (s[k] interleaves rows k and k+1), so only the even entries s[0,2,4,6] are
// needed to cover taps 0..7. unpacklo_epi8 against zero widens the byte pairs
// to 16-bit pairs in exactly the interleaved layout convolve() expects.
static inline __m128i convolve_lo_y(const __m128i *const s,
                                    const __m128i *const coeffs) {
  __m128i ss[4];
  ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
  ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
  ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128());
  ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128());
  return convolve(ss, coeffs);
}

// Vertical path, high 4 output pixels: same as convolve_lo_y but widening the
// upper byte pairs of each interleaved row-pair register.
static inline __m128i convolve_hi_y(const __m128i *const s,
                                    const __m128i *const coeffs) {
  __m128i ss[4];
  ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128());
  ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128());
  ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128());
  ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128());
  return convolve(ss, coeffs);
}

// Vertical 12-tap convolution for w >= 8 (caller guarantees this; w < 8 falls
// back to C). Processes one 8-pixel-wide column strip at a time, two output
// rows per inner iteration, maintaining a 12-entry sliding window s[] of
// interleaved adjacent-row pairs so each new iteration loads only two rows.
// NOTE(review): processing rows in pairs means h is assumed even on this
// path — confirm against callers.
static void convolve_y_sr_12tap_sse2(const uint8_t *src, int src_stride,
                                     uint8_t *dst, int dst_stride, int w, int h,
                                     const InterpFilterParams *filter_params_y,
                                     int subpel_y_qn) {
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const uint8_t *src_ptr = src - fo_vert * src_stride;
  // Single rounding stage: add half of 1 << FILTER_BITS, shift right.
  const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
  const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS);
  __m128i coeffs[6];  // 6 pair-registers cover 12 taps.

  prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs);

  int j = 0;
  do {
    __m128i s[12], src10, res_lo, res_hi;
    __m128i res_lo_round, res_hi_round, res16, res;
    const uint8_t *data = &src_ptr[j];

    // Prime the window: s[k] interleaves rows k and k+1 (bytes). src10 keeps
    // the most recently loaded bare row so each row is loaded only once
    // across iterations of the inner loop.
    src10 = _mm_loadl_epi64((__m128i *)(data + 10 * src_stride));
    s[0] =
        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
                          _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
    s[1] =
        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
                          _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
    s[2] =
        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
                          _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
    s[3] =
        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
                          _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
    s[4] =
        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
                          _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
    s[5] =
        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 5 * src_stride)),
                          _mm_loadl_epi64((__m128i *)(data + 6 * src_stride)));
    s[6] =
        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 6 * src_stride)),
                          _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
    s[7] =
        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 7 * src_stride)),
                          _mm_loadl_epi64((__m128i *)(data + 8 * src_stride)));
    s[8] =
        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 8 * src_stride)),
                          _mm_loadl_epi64((__m128i *)(data + 9 * src_stride)));
    s[9] = _mm_unpacklo_epi8(
        _mm_loadl_epi64((__m128i *)(data + 9 * src_stride)), src10);

    int i = 0;
    do {
      // Extend the window with rows i+11 and i+12 for the two new outputs.
      data = &src_ptr[i * src_stride + j];
      s[10] = _mm_unpacklo_epi8(
          src10, _mm_loadl_epi64((__m128i *)(data + 11 * src_stride)));
      src10 = _mm_loadl_epi64((__m128i *)(data + 12 * src_stride));
      s[11] = _mm_unpacklo_epi8(
          _mm_loadl_epi64((__m128i *)(data + 11 * src_stride)), src10);

      res_lo = convolve_lo_y_12tap(s, coeffs);  // Filter low index pixels
      res_hi = convolve_hi_y_12tap(s, coeffs);  // Filter high index pixels

      res_lo_round =
          _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
      res_hi_round =
          _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);

      // Narrow 32 -> 16 (signed saturate) then 16 -> 8 (unsigned saturate).
      res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
      res = _mm_packus_epi16(res16, res16);

      _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
      i++;

      // Second output row reuses the same window shifted by one row (s + 1).
      res_lo = convolve_lo_y_12tap(s + 1, coeffs);  // Filter low index pixels
      res_hi = convolve_hi_y_12tap(s + 1, coeffs);  // Filter high index pixels

      res_lo_round =
          _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
      res_hi_round =
          _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);

      res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
      res = _mm_packus_epi16(res16, res16);

      _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
      i++;

      // Slide the window down two rows.
      s[0] = s[2];
      s[1] = s[3];
      s[2] = s[4];
      s[3] = s[5];
      s[4] = s[6];
      s[5] = s[7];
      s[6] = s[8];
      s[7] = s[9];
      s[8] = s[10];
      s[9] = s[11];
    } while (i < h);
    j += 8;
  } while (j < w);
}

// Vertical single-reference convolution, 8-bit. Dispatches: >8-tap filters go
// to the 12-tap SSE2 kernel (or the C fallback when w < 8); otherwise runs an
// 8-tap SSE2 path with a narrow (w <= 4) and a wide (multiple-of-8) variant.
// Both variants use the same two-rows-per-iteration sliding-window scheme as
// convolve_y_sr_12tap_sse2 above, with a 6-deep history for 8 taps.
// NOTE(review): the w <= 4 loop decrements h by 2 per iteration and the wide
// loop stores two rows per iteration — h is assumed even here; confirm.
void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *filter_params_y,
                            const int subpel_y_qn) {
  if (filter_params_y->taps > 8) {
    if (w < 8) {
      av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
                          filter_params_y, subpel_y_qn);
    } else {
      convolve_y_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h,
                               filter_params_y, subpel_y_qn);
    }
  } else {
    const int fo_vert = filter_params_y->taps / 2 - 1;
    const uint8_t *src_ptr = src - fo_vert * src_stride;
    // Single rounding stage for the vertical-only path.
    const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
    const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS);
    __m128i coeffs[4];

    prepare_coeffs(filter_params_y, subpel_y_qn, coeffs);

    if (w <= 4) {
      __m128i s[8], src6, res, res_round, res16;
      int res_int;
      // Prime the 6-row history using 4-byte row loads; src6 caches the last
      // bare row loaded, mirroring src10 in the 12-tap kernel.
      s[0] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 0 * src_stride),
                               xx_loadl_32(src_ptr + 1 * src_stride));
      s[1] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 1 * src_stride),
                               xx_loadl_32(src_ptr + 2 * src_stride));
      s[2] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 2 * src_stride),
                               xx_loadl_32(src_ptr + 3 * src_stride));
      s[3] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 3 * src_stride),
                               xx_loadl_32(src_ptr + 4 * src_stride));
      s[4] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 4 * src_stride),
                               xx_loadl_32(src_ptr + 5 * src_stride));
      src6 = xx_loadl_32(src_ptr + 6 * src_stride);
      s[5] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 5 * src_stride), src6);

      do {
        s[6] = _mm_unpacklo_epi8(src6, xx_loadl_32(src_ptr + 7 * src_stride));
        src6 = xx_loadl_32(src_ptr + 8 * src_stride);
        s[7] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 7 * src_stride), src6);

        res = convolve_lo_y(s + 0, coeffs);
        res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
        res16 = _mm_packs_epi32(res_round, res_round);
        res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));

        // Store only as many bytes as the block is wide (2 or 4).
        if (w == 2)
          *(uint16_t *)dst = (uint16_t)res_int;
        else
          *(int *)dst = res_int;

        src_ptr += src_stride;
        dst += dst_stride;

        // Second output row from the window shifted one row (s + 1).
        res = convolve_lo_y(s + 1, coeffs);
        res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
        res16 = _mm_packs_epi32(res_round, res_round);
        res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));

        if (w == 2)
          *(uint16_t *)dst = (uint16_t)res_int;
        else
          *(int *)dst = res_int;

        src_ptr += src_stride;
        dst += dst_stride;

        // Slide the history down two rows.
        s[0] = s[2];
        s[1] = s[3];
        s[2] = s[4];
        s[3] = s[5];
        s[4] = s[6];
        s[5] = s[7];
        h -= 2;
      } while (h);
    } else {
      assert(!(w % 8));
      int j = 0;
      do {
        __m128i s[8], src6, res_lo, res_hi;
        __m128i res_lo_round, res_hi_round, res16, res;
        const uint8_t *data = &src_ptr[j];

        // Prime the 6-row history for this 8-pixel column strip.
        src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
        s[0] = _mm_unpacklo_epi8(
            _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
            _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
        s[1] = _mm_unpacklo_epi8(
            _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
            _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
        s[2] = _mm_unpacklo_epi8(
            _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
            _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
        s[3] = _mm_unpacklo_epi8(
            _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
            _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
        s[4] = _mm_unpacklo_epi8(
            _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
            _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
        s[5] = _mm_unpacklo_epi8(
            _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6);

        int i = 0;
        do {
          // Bring in rows i+7 and i+8 for the two new output rows.
          data = &src_ptr[i * src_stride + j];
          s[6] = _mm_unpacklo_epi8(
              src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
          src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
          s[7] = _mm_unpacklo_epi8(
              _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6);

          res_lo = convolve_lo_y(s, coeffs);  // Filter low index pixels
          res_hi = convolve_hi_y(s, coeffs);  // Filter high index pixels

          res_lo_round =
              _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
          res_hi_round =
              _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);

          // Narrow 32 -> 16 -> 8 with saturation and store 8 pixels.
          res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
          res = _mm_packus_epi16(res16, res16);

          _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
          i++;

          res_lo = convolve_lo_y(s + 1, coeffs);  // Filter low index pixels
          res_hi = convolve_hi_y(s + 1, coeffs);  // Filter high index pixels

          res_lo_round =
              _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
          res_hi_round =
              _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);

          res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
          res = _mm_packus_epi16(res16, res16);

          _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
          i++;

          // Slide the history down two rows.
          s[0] = s[2];
          s[1] = s[3];
          s[2] = s[4];
          s[3] = s[5];
          s[4] = s[6];
          s[5] = s[7];
        } while (i < h);
        j += 8;
      } while (j < w);
    }
  }
}

// Horizontal 12-tap convolution for w >= 4 (w < 4 uses the C fallback).
// Produces 4 output pixels per inner iteration from a single 16-byte row
// load, applying the two-stage rounding (round_0, then FILTER_BITS-round_0)
// used by the horizontal path.
// NOTE(review): the 16-byte unaligned load reads up to src_ptr[i*stride+j+15]
// — relies on the codec's guaranteed readable padding beyond the block.
static void convolve_x_sr_12tap_sse2(const uint8_t *src, int src_stride,
                                     uint8_t *dst, int dst_stride, int w, int h,
                                     const InterpFilterParams *filter_params_x,
                                     int subpel_x_qn,
                                     ConvolveParams *conv_params) {
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const uint8_t *src_ptr = src - fo_horiz;
  const int bits = FILTER_BITS - conv_params->round_0;
  const __m128i round_0_const =
      _mm_set1_epi32((1 << conv_params->round_0) >> 1);
  const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1);
  const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
  const __m128i round_shift = _mm_cvtsi32_si128(bits);
  const __m128i zero = _mm_setzero_si128();
  __m128i coeffs[6];  // 6 pair-registers cover 12 taps.

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs);

  int i = 0;
  do {
    int j = 0;
    do {
      const __m128i data =
          _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
      __m128i s[4];

      // Interleave byte pairs at successive horizontal offsets; the widening
      // to 16-bit happens inside convolve_lo_x_12tap (defined elsewhere —
      // presumably using the `zero` register passed below).
      s[0] = _mm_unpacklo_epi16(data, _mm_srli_si128(data, 1));
      s[1] =
          _mm_unpacklo_epi16(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
      s[2] =
          _mm_unpacklo_epi16(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
      s[3] =
          _mm_unpacklo_epi16(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));

      const __m128i res32 = convolve_lo_x_12tap(s, coeffs, zero);

      // Two-stage rounding: round_0, then the remaining FILTER_BITS - round_0.
      __m128i res32_round =
          _mm_sra_epi32(_mm_add_epi32(res32, round_0_const), round_0_shift);
      res32_round =
          _mm_sra_epi32(_mm_add_epi32(res32_round, round_const), round_shift);

      const __m128i res16 = _mm_packs_epi32(res32_round, zero);
      const __m128i res = _mm_packus_epi16(res16, zero);

      // memcpy avoids an unaligned/aliasing-unsafe 4-byte store.
      const int val = _mm_cvtsi128_si32(res);
      memcpy((dst + i * dst_stride + j), &val, sizeof(val));
      j += 4;
    } while (j < w);
  } while (++i < h);
}

// Horizontal single-reference convolution, 8-bit. Dispatches: >8-tap filters
// go to the 12-tap kernel (or the C fallback when w < 4); otherwise an 8-tap
// SSE2 path with a narrow (w <= 4, 4 pixels/row) and a wide (multiple-of-8,
// even/odd split) variant. All variants load 16 bytes per row position.
// NOTE(review): like the 12-tap path, the unaligned 16-byte loads may read
// past the last needed pixel — relies on readable frame padding.
void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *filter_params_x,
                            const int subpel_x_qn,
                            ConvolveParams *conv_params) {
  if (filter_params_x->taps > 8) {
    if (w < 4) {
      av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
                          filter_params_x, subpel_x_qn, conv_params);
    } else {
      convolve_x_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h,
                               filter_params_x, subpel_x_qn, conv_params);
    }
  } else {
    const int fo_horiz = filter_params_x->taps / 2 - 1;
    const uint8_t *src_ptr = src - fo_horiz;
    const int bits = FILTER_BITS - conv_params->round_0;
    const __m128i round_0_const =
        _mm_set1_epi32((1 << conv_params->round_0) >> 1);
    const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1);
    const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
    const __m128i round_shift = _mm_cvtsi32_si128(bits);
    __m128i coeffs[4];

    assert(bits >= 0);
    assert((FILTER_BITS - conv_params->round_1) >= 0 ||
           ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

    prepare_coeffs(filter_params_x, subpel_x_qn, coeffs);

    if (w <= 4) {
      do {
        const __m128i data = _mm_loadu_si128((__m128i *)src_ptr);
        __m128i s[4];

        // Pair bytes at offsets (0,1), (2,3), (4,5), (6,7) so each s[k]
        // matches the coefficient-pair layout after convolve_lo_x widens.
        s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1));
        s[1] =
            _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
        s[2] =
            _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
        s[3] =
            _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));
        const __m128i res_lo = convolve_lo_x(s, coeffs);
        // Two-stage rounding, as in the 12-tap horizontal kernel.
        __m128i res_lo_round =
            _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift);
        res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
                                     round_shift);

        const __m128i res16 = _mm_packs_epi32(res_lo_round, res_lo_round);
        const __m128i res = _mm_packus_epi16(res16, res16);

        // Store only as many bytes as the block is wide (2 or 4).
        int r = _mm_cvtsi128_si32(res);
        if (w == 2)
          *(uint16_t *)dst = (uint16_t)r;
        else
          *(int *)dst = r;

        src_ptr += src_stride;
        dst += dst_stride;
      } while (--h);
    } else {
      assert(!(w % 8));
      int i = 0;
      do {
        int j = 0;
        do {
          const __m128i data =
              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
          __m128i s[4];

          // Filter even-index pixels
          s[0] = data;
          s[1] = _mm_srli_si128(data, 2);
          s[2] = _mm_srli_si128(data, 4);
          s[3] = _mm_srli_si128(data, 6);
          const __m128i res_even = convolve_lo_x(s, coeffs);

          // Filter odd-index pixels
          s[0] = _mm_srli_si128(data, 1);
          s[1] = _mm_srli_si128(data, 3);
          s[2] = _mm_srli_si128(data, 5);
          s[3] = _mm_srli_si128(data, 7);
          const __m128i res_odd = convolve_lo_x(s, coeffs);

          // Rearrange pixels back into the order 0 ... 7
          const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
          const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
          __m128i res_lo_round = _mm_sra_epi32(
              _mm_add_epi32(res_lo, round_0_const), round_0_shift);
          res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
                                       round_shift);
          __m128i res_hi_round = _mm_sra_epi32(
              _mm_add_epi32(res_hi, round_0_const), round_0_shift);
          res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
                                       round_shift);

          // Narrow 32 -> 16 -> 8 with saturation and store 8 pixels.
          const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
          const __m128i res = _mm_packus_epi16(res16, res16);

          _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
          j += 8;
        } while (j < w);
      } while (++i < h);
    }
  }
}