// resize_ssse3.c (39070B)
/*
 *
 * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <tmmintrin.h>  // SSSE3
#include "config/av1_rtcd.h"
#include "config/aom_scale_rtcd.h"

#include "aom_dsp/x86/convolve_sse2.h"
#include "aom_dsp/x86/convolve_ssse3.h"
#include "aom_dsp/x86/mem_sse2.h"
#include "aom_dsp/x86/transpose_sse2.h"
#include "av1/common/resize.h"

// Loads 32 bytes from src (two unaligned 16-byte loads), ANDs both halves with
// *mask and packs the resulting 16-bit lanes into 16 bytes. With a mask that
// keeps only the low byte of each 16-bit lane this keeps every second source
// byte. It is also reused by the 4:1 path with a mask that keeps one byte out
// of four, followed by a second pack in the caller.
static inline __m128i scale_plane_2_to_1_phase_0_kernel(
    const uint8_t *const src, const __m128i *const mask) {
  const __m128i a = _mm_loadu_si128((const __m128i *)(&src[0]));
  const __m128i b = _mm_loadu_si128((const __m128i *)(&src[16]));
  const __m128i a_and = _mm_and_si128(a, *mask);
  const __m128i b_and = _mm_and_si128(b, *mask);
  return _mm_packus_epi16(a_and, b_and);
}

// Builds the tap-pair vectors used when the filter is applied at an odd pixel
// offset. The 8 taps are regrouped as (0, c0) (c1, c2) (c3, c4) (c5, c6)
// (c7, 0) and each pair is broadcast to all 16-bit lanes of f[0]..f[4].
// filter is read with an aligned 16-byte load, so it must be 16-byte aligned.
static inline void shuffle_filter_odd_ssse3(const int16_t *const filter,
                                            __m128i *const f) {
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
  // It utilizes the fact that the high byte of filter[3] is always 0 to clean
  // half of f[0] and f[4].
  assert(filter[3] >= 0 && filter[3] < 256);
  // Each shuffle constant holds two byte indices into f_values: the low byte
  // selects the first element of the pair and the high byte the second. Index
  // 7 is the (zero) high byte of filter[3].
  f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0007u));
  f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0402u));
  f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0806u));
  f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0c0au));
  f[4] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x070eu));
}

// 8-tap convolution of eight outputs for an even pixel offset. s[0..3] hold
// interleaved pixel byte pairs and f[0..3] the matching tap pairs, where f[1]
// has had 64 subtracted from every byte by the caller. The result is rounded,
// shifted right by 7 and returned as eight 16-bit values (not yet packed).
static inline __m128i convolve8_8_even_offset_ssse3(const __m128i *const s,
                                                    const __m128i *const f) {
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i k_64 = _mm_set1_epi16(1 << 6);
  const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
  const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
  const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
  const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
  // compensate the subtracted 64 in f[1]. x4 is always non negative.
  const __m128i x4 = _mm_maddubs_epi16(s[1], _mm_set1_epi8(64));
  // add and saturate the results together
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, x1);
  temp = _mm_adds_epi16(temp, x2);
  temp = _mm_adds_epi16(temp, x4);
  // round and shift by 7 bit each 16 bit
  temp = _mm_adds_epi16(temp, k_64);
  temp = _mm_srai_epi16(temp, 7);
  return temp;
}

// 8-tap convolution of eight outputs for an odd pixel offset. It uses the five
// tap pairs produced by shuffle_filter_odd_ssse3() together with s[0..4]; here
// the caller has subtracted 64 from f[2], which is compensated through s[2].
// The result is rounded, shifted right by 7 and returned as eight 16-bit
// values (not yet packed).
static inline __m128i convolve8_8_odd_offset_ssse3(const __m128i *const s,
                                                   const __m128i *const f) {
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i k_64 = _mm_set1_epi16(1 << 6);
  const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
  const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
  const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
  const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
  const __m128i x4 = _mm_maddubs_epi16(s[4], f[4]);
  // compensate the subtracted 64 in f[2]. x5 is always non negative.
  const __m128i x5 = _mm_maddubs_epi16(s[2], _mm_set1_epi8(64));
  __m128i temp;

  // add and saturate the results together
  temp = _mm_adds_epi16(x0, x1);
  temp = _mm_adds_epi16(temp, x2);
  temp = _mm_adds_epi16(temp, x3);
  temp = _mm_adds_epi16(temp, x4);
  temp = _mm_adds_epi16(temp, x5);
  // round and shift by 7 bit each 16 bit
  temp = _mm_adds_epi16(temp, k_64);
  temp = _mm_srai_epi16(temp, 7);
  return temp;
}

// 2:1 downscale without filtering: every output pixel is the source pixel at
// twice its column and twice its row. Each row is processed in groups of 16
// output pixels (32 source bytes), with dst_w rounded up to a multiple of 16,
// so up to 15 bytes past dst_w are written per row and the matching source
// bytes are read. dst_h must be non-zero (do/while loop).
static void scale_plane_2_to_1_phase_0(const uint8_t *src,
                                       const ptrdiff_t src_stride, uint8_t *dst,
                                       const ptrdiff_t dst_stride,
                                       const int dst_w, const int dst_h) {
  const int max_width = (dst_w + 15) & ~15;
  const __m128i mask = _mm_set1_epi16(0x00FF);
  int y = dst_h;

  do {
    int x = max_width;
    do {
      const __m128i d = scale_plane_2_to_1_phase_0_kernel(src, &mask);
      _mm_storeu_si128((__m128i *)dst, d);
      src += 32;
      dst += 16;
      x -= 16;
    } while (x);
    // The inner loop advanced src by 2 * max_width and dst by max_width;
    // complete the step to two source rows / one destination row further.
    src += 2 * (src_stride - max_width);
    dst += dst_stride - max_width;
  } while (--y);
}

// 4:1 downscale without filtering: every output pixel is the source pixel at
// four times its column and four times its row. Each row is processed in
// groups of 16 output pixels (64 source bytes), with dst_w rounded up to a
// multiple of 16. dst_h must be non-zero (do/while loop).
static void scale_plane_4_to_1_phase_0(const uint8_t *src,
                                       const ptrdiff_t src_stride, uint8_t *dst,
                                       const ptrdiff_t dst_stride,
                                       const int dst_w, const int dst_h) {
  const int max_width = (dst_w + 15) & ~15;
  const __m128i mask = _mm_set1_epi32(0x000000FF);
  int y = dst_h;

  do {
    int x = max_width;
    do {
      // The mask keeps one byte out of four; the kernel's pack and the pack
      // below squeeze the kept bytes together.
      const __m128i d0 = scale_plane_2_to_1_phase_0_kernel(&src[0], &mask);
      const __m128i d1 = scale_plane_2_to_1_phase_0_kernel(&src[32], &mask);
      const __m128i d2 = _mm_packus_epi16(d0, d1);
      _mm_storeu_si128((__m128i *)dst, d2);
      src += 64;
      dst += 16;
      x -= 16;
    } while (x);
    // The inner loop advanced src by 4 * max_width and dst by max_width;
    // complete the step to four source rows / one destination row further.
    src += 4 * (src_stride - max_width);
    dst += dst_stride - max_width;
  } while (--y);
}

// Applies the two-tap filter c0c1 (a byte pair replicated in every 16-bit
// lane) to the 32 bytes in s[0] and s[1], treated as 16 adjacent byte pairs.
// Each pair sum is rounded, shifted right by 7 and packed to a byte, giving 16
// output bytes.
static inline __m128i scale_plane_bilinear_kernel(const __m128i *const s,
                                                  const __m128i c0c1) {
  const __m128i k_64 = _mm_set1_epi16(1 << 6);
  const __m128i t0 = _mm_maddubs_epi16(s[0], c0c1);
  const __m128i t1 = _mm_maddubs_epi16(s[1], c0c1);
  // round and shift by 7 bit each 16 bit
  const __m128i t2 = _mm_adds_epi16(t0, k_64);
  const __m128i t3 = _mm_adds_epi16(t1, k_64);
  const __m128i t4 = _mm_srai_epi16(t2, 7);
  const __m128i t5 = _mm_srai_epi16(t3, 7);
  return _mm_packus_epi16(t4, t5);
}

// 2:1 bilinear downscale. For every group of 16 output pixels the two-tap
// filter is applied horizontally to two consecutive source rows, then the two
// filtered rows are byte-interleaved and the same filter is applied again to
// combine them vertically. dst_w is rounded up to a multiple of 16 and dst_h
// must be non-zero.
static void scale_plane_2_to_1_bilinear(const uint8_t *src,
                                        const ptrdiff_t src_stride,
                                        uint8_t *dst,
                                        const ptrdiff_t dst_stride,
                                        const int dst_w, const int dst_h,
                                        const __m128i c0c1) {
  const int max_width = (dst_w + 15) & ~15;
  int y = dst_h;

  do {
    int x = max_width;
    do {
      __m128i s[2], d[2];

      // Horizontal
      // Even rows
      s[0] = _mm_loadu_si128((const __m128i *)(src + 0));
      s[1] = _mm_loadu_si128((const __m128i *)(src + 16));
      d[0] = scale_plane_bilinear_kernel(s, c0c1);

      // odd rows
      s[0] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0));
      s[1] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16));
      d[1] = scale_plane_bilinear_kernel(s, c0c1);

      // Vertical
      s[0] = _mm_unpacklo_epi8(d[0], d[1]);
      s[1] = _mm_unpackhi_epi8(d[0], d[1]);
      d[0] = scale_plane_bilinear_kernel(s, c0c1);

      _mm_storeu_si128((__m128i *)dst, d[0]);
      src += 32;
      dst += 16;
      x -= 16;
    } while (x);
    src += 2 * (src_stride - max_width);
    dst += dst_stride - max_width;
  } while (--y);
}

// 4:1 bilinear downscale. Per the register layouts below, only the first two
// pixels of every four source columns, taken from the first two of every four
// source rows, contribute to an output pixel. The shuffles gather those 2x2
// groups so the two-tap kernel can be applied horizontally and then
// vertically. dst_w is rounded up to a multiple of 16 and dst_h must be
// non-zero.
static void scale_plane_4_to_1_bilinear(const uint8_t *src,
                                        const ptrdiff_t src_stride,
                                        uint8_t *dst,
                                        const ptrdiff_t dst_stride,
                                        const int dst_w, const int dst_h,
                                        const __m128i c0c1) {
  const int max_width = (dst_w + 15) & ~15;
  int y = dst_h;

  do {
    int x = max_width;
    do {
      __m128i s[8], d[8];

      // Note: Using _mm_packus_epi32() in SSE4.1 could be faster.
      //       Here we tried to not use shuffle instructions which would be slow
      //       on some x86 CPUs.

      // Horizontal
      // 000 001 xx xx 004 005 xx xx  008 009 xx xx 00C 00D xx xx
      // 010 011 xx xx 014 015 xx xx  018 019 xx xx 01C 01D xx xx
      // 020 021 xx xx 024 025 xx xx  028 029 xx xx 02C 02D xx xx
      // 030 031 xx xx 034 035 xx xx  038 039 xx xx 03C 03D xx xx
      // 100 101 xx xx 104 105 xx xx  108 109 xx xx 10C 10D xx xx
      // 110 111 xx xx 114 115 xx xx  118 119 xx xx 11C 11D xx xx
      // 120 121 xx xx 124 125 xx xx  128 129 xx xx 12C 12D xx xx
      // 130 131 xx xx 134 135 xx xx  138 139 xx xx 13C 13D xx xx
      s[0] = _mm_loadu_si128((const __m128i *)(&src[0]));
      s[1] = _mm_loadu_si128((const __m128i *)(&src[16]));
      s[2] = _mm_loadu_si128((const __m128i *)(&src[32]));
      s[3] = _mm_loadu_si128((const __m128i *)(&src[48]));
      s[4] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0));
      s[5] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16));
      s[6] = _mm_loadu_si128((const __m128i *)(src + src_stride + 32));
      s[7] = _mm_loadu_si128((const __m128i *)(src + src_stride + 48));

      // 000 001 100 101 xx xx xx xx  004 005 104 105 xx xx xx xx
      // 008 009 108 109 xx xx xx xx  00C 00D 10C 10D xx xx xx xx
      // 010 011 110 111 xx xx xx xx  014 015 114 115 xx xx xx xx
      // 018 019 118 119 xx xx xx xx  01C 01D 11C 11D xx xx xx xx
      // 020 021 120 121 xx xx xx xx  024 025 124 125 xx xx xx xx
      // 028 029 128 129 xx xx xx xx  02C 02D 12C 12D xx xx xx xx
      // 030 031 130 131 xx xx xx xx  034 035 134 135 xx xx xx xx
      // 038 039 138 139 xx xx xx xx  03C 03D 13C 13D xx xx xx xx
      d[0] = _mm_unpacklo_epi16(s[0], s[4]);
      d[1] = _mm_unpackhi_epi16(s[0], s[4]);
      d[2] = _mm_unpacklo_epi16(s[1], s[5]);
      d[3] = _mm_unpackhi_epi16(s[1], s[5]);
      d[4] = _mm_unpacklo_epi16(s[2], s[6]);
      d[5] = _mm_unpackhi_epi16(s[2], s[6]);
      d[6] = _mm_unpacklo_epi16(s[3], s[7]);
      d[7] = _mm_unpackhi_epi16(s[3], s[7]);

      // 000 001 100 101 008 009 108 109  xx xx xx xx xx xx xx xx
      // 004 005 104 105 00C 00D 10C 10D  xx xx xx xx xx xx xx xx
      // 010 011 110 111 018 019 118 119  xx xx xx xx xx xx xx xx
      // 014 015 114 115 01C 01D 11C 11D  xx xx xx xx xx xx xx xx
      // 020 021 120 121 028 029 128 129  xx xx xx xx xx xx xx xx
      // 024 025 124 125 02C 02D 12C 12D  xx xx xx xx xx xx xx xx
      // 030 031 130 131 038 039 138 139  xx xx xx xx xx xx xx xx
      // 034 035 134 135 03C 03D 13C 13D  xx xx xx xx xx xx xx xx
      s[0] = _mm_unpacklo_epi32(d[0], d[1]);
      s[1] = _mm_unpackhi_epi32(d[0], d[1]);
      s[2] = _mm_unpacklo_epi32(d[2], d[3]);
      s[3] = _mm_unpackhi_epi32(d[2], d[3]);
      s[4] = _mm_unpacklo_epi32(d[4], d[5]);
      s[5] = _mm_unpackhi_epi32(d[4], d[5]);
      s[6] = _mm_unpacklo_epi32(d[6], d[7]);
      s[7] = _mm_unpackhi_epi32(d[6], d[7]);

      // 000 001 100 101 004 005 104 105  008 009 108 109 00C 00D 10C 10D
      // 010 011 110 111 014 015 114 115  018 019 118 119 01C 01D 11C 11D
      // 020 021 120 121 024 025 124 125  028 029 128 129 02C 02D 12C 12D
      // 030 031 130 131 034 035 134 135  038 039 138 139 03C 03D 13C 13D
      d[0] = _mm_unpacklo_epi32(s[0], s[1]);
      d[1] = _mm_unpacklo_epi32(s[2], s[3]);
      d[2] = _mm_unpacklo_epi32(s[4], s[5]);
      d[3] = _mm_unpacklo_epi32(s[6], s[7]);

      d[0] = scale_plane_bilinear_kernel(&d[0], c0c1);
      d[1] = scale_plane_bilinear_kernel(&d[2], c0c1);

      // Vertical
      d[0] = scale_plane_bilinear_kernel(d, c0c1);

      _mm_storeu_si128((__m128i *)dst, d[0]);
      src += 64;
      dst += 16;
      x -= 16;
    } while (x);
    src += 4 * (src_stride - max_width);
    dst += dst_stride - max_width;
  } while (--y);
}

// 4:1 downscale with the 8-tap filter coef, done in two passes.
//  - Horizontal pass: filters 8 source rows at a time, 2 output columns per
//    iteration, and writes the result to temp_buffer with two rows interleaved
//    (each stored line is 2 * width_hor bytes).
//  - Vertical pass: filters 8 columns at a time from temp_buffer and writes 2
//    output rows per iteration to dst.
// The loop bounds are rounded up (width_hor and height_ver to a multiple of 2,
// width_ver and height_hor to a multiple of 8), so the passes can touch pixels
// beyond w x h. temp_buffer is provided by the caller; NOTE(review): its
// required size is decided there, confirm both stay in sync when changing the
// rounding here. shuffle_filter_ssse3(), convolve8_8_ssse3() and the
// load/store/transpose helpers come from the included x86 headers.
static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride,
                                       uint8_t *dst, const int dst_stride,
                                       const int w, const int h,
                                       const int16_t *const coef,
                                       uint8_t *const temp_buffer) {
  const int width_hor = (w + 1) & ~1;
  const int width_ver = (w + 7) & ~7;
  const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7;
  const int height_ver = (h + 1) & ~1;
  int x, y = height_hor;
  uint8_t *t = temp_buffer;
  __m128i s[11], d[4];
  __m128i f[4];

  assert(w && h);

  shuffle_filter_ssse3(coef, f);
  // Step back to the first row and column the filter needs. The column offset
  // is 4 further left than required because the first load below uses src + 4.
  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3;

  // horizontal 2x8
  do {
    load_8bit_8x8(src + 4, src_stride, s);
    // 00 01 10 11 20 21 30 31  40 41 50 51 60 61 70 71
    // 02 03 12 13 22 23 32 33  42 43 52 53 62 63 72 73
    // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75 (overlapped)
    // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77 (overlapped)
    transpose_16bit_4x8(s, s);
    x = width_hor;

    do {
      src += 8;
      load_8bit_8x8(src, src_stride, &s[2]);
      // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75
      // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77
      // 08 09 18 19 28 29 38 39  48 49 58 59 68 69 78 79
      // 0A 0B 1A 1B 2A 2B 3A 3B  4A 4B 5A 5B 6A 6B 7A 7B
      transpose_16bit_4x8(&s[2], &s[2]);

      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 10 20 30 40 50 60 70
      d[1] = convolve8_8_ssse3(&s[2], f);  // 01 11 21 31 41 51 61 71

      // 00 10 20 30 40 50 60 70  xx xx xx xx xx xx xx xx
      // 01 11 21 31 41 51 61 71  xx xx xx xx xx xx xx xx
      d[0] = _mm_packus_epi16(d[0], d[0]);
      d[1] = _mm_packus_epi16(d[1], d[1]);
      // 00 10 01 11 20 30 21 31  40 50 41 51 60 70 61 71
      d[0] = _mm_unpacklo_epi16(d[0], d[1]);
      store_8bit_4x4_sse2(d[0], t, 2 * width_hor);

      // Keep the last two column pairs as the start of the next window.
      s[0] = s[4];
      s[1] = s[5];

      t += 4;
      x -= 2;
    } while (x);
    src += 8 * src_stride - 4 * width_hor;
    t += 6 * width_hor;
    y -= 8;
  } while (y);

  // vertical 8x2
  x = width_ver;
  t = temp_buffer;
  do {
    // 00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
    // 20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
    s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor));
    s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor));
    t += 4 * width_hor;
    y = height_ver;

    do {
      // 40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
      // 60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
      // 80 90 81 91 82 92 83 93  84 94 85 95 86 96 87 97
      // A0 B0 A1 B1 A2 B2 A3 B3  A4 B4 A5 B5 A6 B6 A7 B7
      loadu_8bit_16x4(t, 2 * width_hor, &s[2]);
      t += 8 * width_hor;

      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 01 02 03 04 05 06 07
      d[1] = convolve8_8_ssse3(&s[2], f);  // 10 11 12 13 14 15 16 17

      // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
      d[0] = _mm_packus_epi16(d[0], d[1]);
      _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]);
      _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]);

      // Keep the last two row pairs as the start of the next window.
      s[0] = s[4];
      s[1] = s[5];

      dst += 2 * dst_stride;
      y -= 2;
    } while (y);
    // Rewind t and dst to the top and move on to the next 8 columns.
    t -= width_hor * (4 * height_ver + 4);
    t += 16;
    dst -= height_ver * dst_stride;
    dst += 8;
    x -= 8;
  } while (x);
}

// 2:1 downscale with the 8-tap filter coef, done in two passes.
//  - Horizontal pass: filters 8 source rows at a time, 4 output columns per
//    iteration, and writes the result to temp_buffer with two rows interleaved
//    (each stored line is 2 * width_hor bytes).
//  - Vertical pass: filters 8 columns at a time from temp_buffer and writes 4
//    output rows per iteration to dst.
// The loop bounds are rounded up (width_hor and height_ver to a multiple of 4,
// width_ver and height_hor to a multiple of 8), so the passes can touch pixels
// beyond w x h. temp_buffer is provided by the caller; NOTE(review): its
// required size is decided there, confirm both stay in sync when changing the
// rounding here.
static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride,
                                       uint8_t *dst, const int dst_stride,
                                       const int w, const int h,
                                       const int16_t *const coef,
                                       uint8_t *const temp_buffer) {
  const int width_hor = (w + 3) & ~3;
  const int width_ver = (w + 7) & ~7;
  const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7;
  const int height_ver = (h + 3) & ~3;
  int x, y = height_hor;
  uint8_t *t = temp_buffer;
  __m128i s[11], d[4];
  __m128i f[4];

  assert(w && h);

  shuffle_filter_ssse3(coef, f);
  // Step back to the first row and column the filter needs. The column offset
  // is 2 further left than required because the first load below uses src + 2.
  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1;

  // horizontal 4x8
  do {
    load_8bit_8x8(src + 2, src_stride, s);
    // 00 01 10 11 20 21 30 31  40 41 50 51 60 61 70 71
    // 02 03 12 13 22 23 32 33  42 43 52 53 62 63 72 73
    // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75
    // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77 (overlapped)
    transpose_16bit_4x8(s, s);
    x = width_hor;

    do {
      src += 8;
      load_8bit_8x8(src, src_stride, &s[3]);
      // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77
      // 08 09 18 19 28 29 38 39  48 49 58 59 68 69 78 79
      // 0A 0B 1A 1B 2A 2B 3A 3B  4A 4B 5A 5B 6A 6B 7A 7B
      // 0C 0D 1C 1D 2C 2D 3C 3D  4C 4D 5C 5D 6C 6D 7C 7D
      transpose_16bit_4x8(&s[3], &s[3]);

      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 10 20 30 40 50 60 70
      d[1] = convolve8_8_ssse3(&s[1], f);  // 01 11 21 31 41 51 61 71
      d[2] = convolve8_8_ssse3(&s[2], f);  // 02 12 22 32 42 52 62 72
      d[3] = convolve8_8_ssse3(&s[3], f);  // 03 13 23 33 43 53 63 73

      // 00 10 20 30 40 50 60 70  02 12 22 32 42 52 62 72
      // 01 11 21 31 41 51 61 71  03 13 23 33 43 53 63 73
      d[0] = _mm_packus_epi16(d[0], d[2]);
      d[1] = _mm_packus_epi16(d[1], d[3]);
      // 00 10 01 11 20 30 21 31  40 50 41 51 60 70 61 71
      // 02 12 03 13 22 32 23 33  42 52 43 53 62 72 63 73
      d[2] = _mm_unpacklo_epi16(d[0], d[1]);
      d[3] = _mm_unpackhi_epi16(d[0], d[1]);
      // 00 10 01 11 02 12 03 13  20 30 21 31 22 32 23 33
      // 40 50 41 51 42 52 43 53  60 70 61 71 62 72 63 73
      d[0] = _mm_unpacklo_epi32(d[2], d[3]);
      d[1] = _mm_unpackhi_epi32(d[2], d[3]);
      store_8bit_8x4_from_16x2(d, t, 2 * width_hor);

      // Keep the last three column pairs as the start of the next window.
      s[0] = s[4];
      s[1] = s[5];
      s[2] = s[6];

      t += 8;
      x -= 4;
    } while (x);
    src += 8 * src_stride - 2 * width_hor;
    t += 6 * width_hor;
    y -= 8;
  } while (y);

  // vertical 8x4
  x = width_ver;
  t = temp_buffer;
  do {
    // 00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
    // 20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
    // 40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
    s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor));
    s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor));
    s[2] = _mm_loadu_si128((const __m128i *)(t + 4 * width_hor));
    t += 6 * width_hor;
    y = height_ver;

    do {
      // 60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
      // 80 90 81 91 82 92 83 93  84 94 85 95 86 96 87 97
      // A0 B0 A1 B1 A2 B2 A3 B3  A4 B4 A5 B5 A6 B6 A7 B7
      // C0 D0 C1 D1 C2 D2 C3 D3  C4 D4 C5 D5 C6 D6 C7 D7
      loadu_8bit_16x4(t, 2 * width_hor, &s[3]);
      t += 8 * width_hor;

      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 01 02 03 04 05 06 07
      d[1] = convolve8_8_ssse3(&s[1], f);  // 10 11 12 13 14 15 16 17
      d[2] = convolve8_8_ssse3(&s[2], f);  // 20 21 22 23 24 25 26 27
      d[3] = convolve8_8_ssse3(&s[3], f);  // 30 31 32 33 34 35 36 37

      // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
      // 20 21 22 23 24 25 26 27  30 31 32 33 34 35 36 37
      d[0] = _mm_packus_epi16(d[0], d[1]);
      d[1] = _mm_packus_epi16(d[2], d[3]);
      store_8bit_8x4_from_16x2(d, dst, dst_stride);

      // Keep the last three row pairs as the start of the next window.
      s[0] = s[4];
      s[1] = s[5];
      s[2] = s[6];

      dst += 4 * dst_stride;
      y -= 4;
    } while (y);
    // Rewind t and dst to the top and move on to the next 8 columns.
    t -= width_hor * (2 * height_ver + 6);
    t += 16;
    dst -= height_ver * dst_stride;
    dst += 8;
    x -= 8;
  } while (x);
}

// Signature shared by shuffle_filter_ssse3() and shuffle_filter_odd_ssse3().
typedef void (*shuffle_filter_funcs)(const int16_t *const filter,
                                     __m128i *const f);

// Signature shared by the even-offset and odd-offset convolve8 helpers.
typedef __m128i (*convolve8_funcs)(const __m128i *const s,
                                   const __m128i *const f);

// 4:3 downscale with 8-tap filters taken from the coef table, done in two
// passes. Three consecutive outputs use three different sub-pixel positions
// (phase, phase + step_q4, phase + 2 * step_q4), so three filters f0/f1/f2 are
// prepared; for each of the latter two, the even- or odd-offset variant of the
// shuffle and convolve helpers is selected from the integer part of its
// offset.
//  - Horizontal pass: filters 8 source rows at a time, 6 output columns per
//    iteration, and writes to temp_buffer with two rows interleaved and a line
//    stride of stride_hor bytes.
//  - Vertical pass: filters 8 columns at a time from temp_buffer and writes 6
//    output rows per iteration to dst.
// The loop bounds are rounded up (width_hor and height_ver to a multiple of 6,
// width_ver and height_hor to a multiple of 8), so the passes can touch pixels
// beyond w x h. temp_buffer is provided by the caller; NOTE(review): its
// required size is decided there, confirm both stay in sync when changing the
// rounding here.
static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride,
                                       uint8_t *dst, const int dst_stride,
                                       const int w, const int h,
                                       const InterpKernel *const coef,
                                       const int phase,
                                       uint8_t *const temp_buffer) {
  static const int step_q4 = 16 * 4 / 3;
  const int width_hor = (w + 5) - ((w + 5) % 6);
  const int stride_hor = 2 * width_hor + 4;  // store 4 extra pixels
  const int width_ver = (w + 7) & ~7;
  // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows
  // above and (SUBPEL_TAPS / 2) extra rows below.
  const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
  const int height_ver = (h + 5) - ((h + 5) % 6);
  int x, y = height_hor;
  uint8_t *t = temp_buffer;
  __m128i s[12], d[6], dd[4];
  __m128i f0[4], f1[5], f2[5];
  // The offset of the first row is always less than 1 pixel.
  const int offset1_q4 = phase + 1 * step_q4;
  const int offset2_q4 = phase + 2 * step_q4;
  // offset_idx1 / offset_idx2 indicate whether the pixel offset is even (0) or
  // odd (1). They are used to choose the src offset and filter coefficient
  // offset.
  const int offset_idx1 = (offset1_q4 >> 4) & 1;
  const int offset_idx2 = (offset2_q4 >> 4) & 1;
  static const shuffle_filter_funcs shuffle_filter_func_list[2] = {
    shuffle_filter_ssse3, shuffle_filter_odd_ssse3
  };
  static const convolve8_funcs convolve8_func_list[2] = {
    convolve8_8_even_offset_ssse3, convolve8_8_odd_offset_ssse3
  };

  assert(w && h);

  shuffle_filter_ssse3(coef[(phase + 0 * step_q4) & SUBPEL_MASK], f0);
  shuffle_filter_func_list[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1);
  shuffle_filter_func_list[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2);

  // Sub 64 to avoid overflow.
  // Coef 128 would be treated as -128 in PMADDUBSW. Sub 64 here.
  // Coef 128 is in either fx[1] or fx[2] depending on the phase idx.
  // When filter phase idx is 1, the two biggest coefficients are shuffled
  // together, and the sum of them are always no less than 128. Sub 64 here.
  // After the subtraction, when the sum of all positive coefficients are no
  // larger than 128, and the sum of all negative coefficients are no
  // less than -128, there will be no overflow in the convolve8 functions.
  f0[1] = _mm_sub_epi8(f0[1], _mm_set1_epi8(64));
  f1[1 + offset_idx1] = _mm_sub_epi8(f1[1 + offset_idx1], _mm_set1_epi8(64));
  f2[1 + offset_idx2] = _mm_sub_epi8(f2[1 + offset_idx2], _mm_set1_epi8(64));

  // Step back to the first row and column the filter needs.
  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 - 1;

  // horizontal 6x8
  do {
    load_8bit_8x8(src, src_stride, s);
    // 00 01 10 11 20 21 30 31  40 41 50 51 60 61 70 71
    // 02 03 12 13 22 23 32 33  42 43 52 53 62 63 72 73
    // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75
    // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77
    transpose_16bit_4x8(s, s);
    x = width_hor;

    do {
      src += 8;
      load_8bit_8x8(src, src_stride, &s[4]);
      // 08 09 18 19 28 29 38 39  48 49 58 59 68 69 78 79
      // 0A 0B 1A 1B 2A 2B 3A 3B  4A 4B 5A 5B 6A 6B 7A 7B
      // 0C 0D 1C 1D 2C 2D 3C 3D  4C 4D 5C 5D 6C 6D 7C 7D
      // 0E 0F 1E 1F 2E 2F 3E 3F  4E 4F 5E 5F 6E 6F 7E 7F
      transpose_16bit_4x8(&s[4], &s[4]);

      // 00 10 20 30 40 50 60 70
      // 01 11 21 31 41 51 61 71
      // 02 12 22 32 42 52 62 72
      // 03 13 23 33 43 53 63 73
      // 04 14 24 34 44 54 64 74
      // 05 15 25 35 45 55 65 75
      d[0] = convolve8_8_even_offset_ssse3(&s[0], f0);
      d[1] = convolve8_func_list[offset_idx1](&s[offset1_q4 >> 5], f1);
      d[2] = convolve8_func_list[offset_idx2](&s[offset2_q4 >> 5], f2);
      d[3] = convolve8_8_even_offset_ssse3(&s[2], f0);
      d[4] = convolve8_func_list[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
      d[5] = convolve8_func_list[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);

      // 00 10 20 30 40 50 60 70  02 12 22 32 42 52 62 72
      // 01 11 21 31 41 51 61 71  03 13 23 33 43 53 63 73
      // 04 14 24 34 44 54 64 74  xx xx xx xx xx xx xx xx
      // 05 15 25 35 45 55 65 75  xx xx xx xx xx xx xx xx
      dd[0] = _mm_packus_epi16(d[0], d[2]);
      dd[1] = _mm_packus_epi16(d[1], d[3]);
      dd[2] = _mm_packus_epi16(d[4], d[4]);
      dd[3] = _mm_packus_epi16(d[5], d[5]);

      // 00 10 01 11 20 30 21 31  40 50 41 51 60 70 61 71
      // 02 12 03 13 22 32 23 33  42 52 43 53 62 72 63 73
      // 04 14 05 15 24 34 25 35  44 54 45 55 64 74 65 75
      d[0] = _mm_unpacklo_epi16(dd[0], dd[1]);
      d[1] = _mm_unpackhi_epi16(dd[0], dd[1]);
      d[2] = _mm_unpacklo_epi16(dd[2], dd[3]);

      // 00 10 01 11 02 12 03 13  20 30 21 31 22 32 23 33
      // 40 50 41 51 42 52 43 53  60 70 61 71 62 72 63 73
      // 04 14 05 15 xx xx xx xx  24 34 25 35 xx xx xx xx
      // 44 54 45 55 xx xx xx xx  64 74 65 75 xx xx xx xx
      dd[0] = _mm_unpacklo_epi32(d[0], d[1]);
      dd[1] = _mm_unpackhi_epi32(d[0], d[1]);
      dd[2] = _mm_unpacklo_epi32(d[2], d[2]);
      dd[3] = _mm_unpackhi_epi32(d[2], d[2]);

      // 00 10 01 11 02 12 03 13  04 14 05 15 xx xx xx xx
      // 20 30 21 31 22 32 23 33  24 34 25 35 xx xx xx xx
      // 40 50 41 51 42 52 43 53  44 54 45 55 xx xx xx xx
      // 60 70 61 71 62 72 63 73  64 74 65 75 xx xx xx xx
      d[0] = _mm_unpacklo_epi64(dd[0], dd[2]);
      d[1] = _mm_unpackhi_epi64(dd[0], dd[2]);
      d[2] = _mm_unpacklo_epi64(dd[1], dd[3]);
      d[3] = _mm_unpackhi_epi64(dd[1], dd[3]);

      // store 4 extra pixels
      storeu_8bit_16x4(d, t, stride_hor);

      // Keep the last four column pairs as the start of the next window.
      s[0] = s[4];
      s[1] = s[5];
      s[2] = s[6];
      s[3] = s[7];

      t += 12;
      x -= 6;
    } while (x);
    src += 8 * src_stride - 4 * width_hor / 3;
    t += 3 * stride_hor + 4;
    y -= 8;
  } while (y);

  // vertical 8x6
  x = width_ver;
  t = temp_buffer;
  do {
    // 00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
    // 20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
    // 40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
    // 60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
    loadu_8bit_16x4(t, stride_hor, s);
    y = height_ver;

    do {
      // 80 90 81 91 82 92 83 93  84 94 85 95 86 96 87 97
      // A0 B0 A1 B1 A2 B2 A3 B3  A4 B4 A5 B5 A6 B6 A7 B7
      // C0 D0 C1 D1 C2 D2 C3 D3  C4 D4 C5 D5 C6 D6 C7 D7
      // E0 F0 E1 F1 E2 F2 E3 F3  E4 F4 E5 F5 E6 F6 E7 F7
      t += 4 * stride_hor;
      loadu_8bit_16x4(t, stride_hor, &s[4]);

      d[0] = convolve8_8_even_offset_ssse3(&s[0], f0);
      d[1] = convolve8_func_list[offset_idx1](&s[offset1_q4 >> 5], f1);
      d[2] = convolve8_func_list[offset_idx2](&s[offset2_q4 >> 5], f2);
      d[3] = convolve8_8_even_offset_ssse3(&s[2], f0);
      d[4] = convolve8_func_list[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
      d[5] = convolve8_func_list[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);

      // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
      // 20 21 22 23 24 25 26 27  30 31 32 33 34 35 36 37
      // 40 41 42 43 44 45 46 47  50 51 52 53 54 55 56 57
      d[0] = _mm_packus_epi16(d[0], d[1]);
      d[2] = _mm_packus_epi16(d[2], d[3]);
      d[4] = _mm_packus_epi16(d[4], d[5]);

      _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]);
      _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]);
      _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), d[2]);
      _mm_storeh_epi64((__m128i *)(dst + 3 * dst_stride), d[2]);
      _mm_storel_epi64((__m128i *)(dst + 4 * dst_stride), d[4]);
      _mm_storeh_epi64((__m128i *)(dst + 5 * dst_stride), d[4]);

      // Keep the last four row pairs as the start of the next window.
      s[0] = s[4];
      s[1] = s[5];
      s[2] = s[6];
      s[3] = s[7];

      dst += 6 * dst_stride;
      y -= 6;
    } while (y);
    // Rewind t and dst to the top and move on to the next 8 columns.
    t -= stride_hor * 2 * height_ver / 3;
    t += 16;
    dst -= height_ver * dst_stride;
    dst += 8;
    x -= 8;
  } while (x);
}

// Filters eight pixels for the 1:2 upscaler. s[0..7] each carry eight bytes in
// their low half, one vector per filter tap; adjacent taps are byte-interleaved
// so convolve8_8_ssse3() can consume them, and the eight results are packed to
// bytes (duplicated in both halves of the returned vector).
static inline __m128i scale_1_to_2_phase_0_kernel(const __m128i *const s,
                                                  const __m128i *const f) {
  __m128i ss[4], temp;

  ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
  ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
  ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
  ss[3] = _mm_unpacklo_epi8(s[6], s[7]);
  temp = convolve8_8_ssse3(ss, f);
  return _mm_packus_epi16(temp, temp);
}

// Only calculate odd columns since even columns are just src pixels' copies.
712 static void scale_1_to_2_phase_0_row(const uint8_t *src, uint8_t *dst, 713 const int w, const __m128i *const f) { 714 int x = w; 715 716 do { 717 __m128i s[8], temp; 718 s[0] = _mm_loadl_epi64((const __m128i *)(src + 0)); 719 s[1] = _mm_loadl_epi64((const __m128i *)(src + 1)); 720 s[2] = _mm_loadl_epi64((const __m128i *)(src + 2)); 721 s[3] = _mm_loadl_epi64((const __m128i *)(src + 3)); 722 s[4] = _mm_loadl_epi64((const __m128i *)(src + 4)); 723 s[5] = _mm_loadl_epi64((const __m128i *)(src + 5)); 724 s[6] = _mm_loadl_epi64((const __m128i *)(src + 6)); 725 s[7] = _mm_loadl_epi64((const __m128i *)(src + 7)); 726 temp = scale_1_to_2_phase_0_kernel(s, f); 727 _mm_storel_epi64((__m128i *)dst, temp); 728 src += 8; 729 dst += 8; 730 x -= 8; 731 } while (x); 732 } 733 734 static void scale_plane_1_to_2_phase_0(const uint8_t *src, 735 const ptrdiff_t src_stride, uint8_t *dst, 736 const ptrdiff_t dst_stride, 737 const int src_w, const int src_h, 738 const int16_t *const coef, 739 uint8_t *const temp_buffer) { 740 int max_width; 741 int y; 742 uint8_t *tmp[9]; 743 __m128i f[4]; 744 745 max_width = (src_w + 7) & ~7; 746 tmp[0] = temp_buffer + 0 * max_width; 747 tmp[1] = temp_buffer + 1 * max_width; 748 tmp[2] = temp_buffer + 2 * max_width; 749 tmp[3] = temp_buffer + 3 * max_width; 750 tmp[4] = temp_buffer + 4 * max_width; 751 tmp[5] = temp_buffer + 5 * max_width; 752 tmp[6] = temp_buffer + 6 * max_width; 753 tmp[7] = temp_buffer + 7 * max_width; 754 755 shuffle_filter_ssse3(coef, f); 756 757 scale_1_to_2_phase_0_row(src - 3 * src_stride - 3, tmp[0], max_width, f); 758 scale_1_to_2_phase_0_row(src - 2 * src_stride - 3, tmp[1], max_width, f); 759 scale_1_to_2_phase_0_row(src - 1 * src_stride - 3, tmp[2], max_width, f); 760 scale_1_to_2_phase_0_row(src + 0 * src_stride - 3, tmp[3], max_width, f); 761 scale_1_to_2_phase_0_row(src + 1 * src_stride - 3, tmp[4], max_width, f); 762 scale_1_to_2_phase_0_row(src + 2 * src_stride - 3, tmp[5], max_width, f); 763 
scale_1_to_2_phase_0_row(src + 3 * src_stride - 3, tmp[6], max_width, f); 764 765 y = src_h; 766 do { 767 int x; 768 scale_1_to_2_phase_0_row(src + 4 * src_stride - 3, tmp[7], max_width, f); 769 for (x = 0; x < max_width; x += 8) { 770 __m128i s[8], C, D, CD; 771 772 // Even rows 773 const __m128i a = _mm_loadl_epi64((const __m128i *)(src + x)); 774 const __m128i b = _mm_loadl_epi64((const __m128i *)(tmp[3] + x)); 775 const __m128i ab = _mm_unpacklo_epi8(a, b); 776 _mm_storeu_si128((__m128i *)(dst + 2 * x), ab); 777 778 // Odd rows 779 // Even columns 780 load_8bit_8x8(src + x - 3 * src_stride, src_stride, s); 781 C = scale_1_to_2_phase_0_kernel(s, f); 782 783 // Odd columns 784 s[0] = _mm_loadl_epi64((const __m128i *)(tmp[0] + x)); 785 s[1] = _mm_loadl_epi64((const __m128i *)(tmp[1] + x)); 786 s[2] = _mm_loadl_epi64((const __m128i *)(tmp[2] + x)); 787 s[3] = _mm_loadl_epi64((const __m128i *)(tmp[3] + x)); 788 s[4] = _mm_loadl_epi64((const __m128i *)(tmp[4] + x)); 789 s[5] = _mm_loadl_epi64((const __m128i *)(tmp[5] + x)); 790 s[6] = _mm_loadl_epi64((const __m128i *)(tmp[6] + x)); 791 s[7] = _mm_loadl_epi64((const __m128i *)(tmp[7] + x)); 792 D = scale_1_to_2_phase_0_kernel(s, f); 793 794 CD = _mm_unpacklo_epi8(C, D); 795 _mm_storeu_si128((__m128i *)(dst + dst_stride + 2 * x), CD); 796 } 797 798 src += src_stride; 799 dst += 2 * dst_stride; 800 tmp[8] = tmp[0]; 801 tmp[0] = tmp[1]; 802 tmp[1] = tmp[2]; 803 tmp[2] = tmp[3]; 804 tmp[3] = tmp[4]; 805 tmp[4] = tmp[5]; 806 tmp[5] = tmp[6]; 807 tmp[6] = tmp[7]; 808 tmp[7] = tmp[8]; 809 } while (--y); 810 } 811 812 // There's SIMD optimizations for 1/4, 1/2 and 3/4 downscaling and 2x upscaling 813 // in SSSE3. 
814 static inline bool has_normative_scaler_ssse3(const int src_width, 815 const int src_height, 816 const int dst_width, 817 const int dst_height) { 818 const bool has_normative_scaler = 819 (2 * dst_width == src_width && 2 * dst_height == src_height) || 820 (4 * dst_width == src_width && 4 * dst_height == src_height) || 821 (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) || 822 (dst_width == src_width * 2 && dst_height == src_height * 2); 823 824 return has_normative_scaler; 825 } 826 827 void av1_resize_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, 828 YV12_BUFFER_CONFIG *dst, 829 const InterpFilter filter, 830 const int phase, const int num_planes) { 831 bool has_normative_scaler = 832 has_normative_scaler_ssse3(src->y_crop_width, src->y_crop_height, 833 dst->y_crop_width, dst->y_crop_height); 834 835 if (num_planes > 1) { 836 has_normative_scaler = 837 has_normative_scaler && 838 has_normative_scaler_ssse3(src->uv_crop_width, src->uv_crop_height, 839 dst->uv_crop_width, dst->uv_crop_height); 840 } 841 842 if (!has_normative_scaler) { 843 av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes); 844 return; 845 } 846 847 // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet 848 // the static analysis warnings. 
849 int malloc_failed = 0; 850 for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) { 851 const int is_uv = i > 0; 852 const int src_w = src->crop_widths[is_uv]; 853 const int src_h = src->crop_heights[is_uv]; 854 const int src_y_w = (src->crop_widths[0] + 1) & ~1; 855 const int dst_w = dst->crop_widths[is_uv]; 856 const int dst_h = dst->crop_heights[is_uv]; 857 const int dst_y_w = (dst->crop_widths[0] + 1) & ~1; 858 const int dst_y_h = (dst->crop_heights[0] + 1) & ~1; 859 860 if (2 * dst_w == src_w && 2 * dst_h == src_h) { 861 // 2 to 1 862 if (phase == 0) { 863 scale_plane_2_to_1_phase_0(src->buffers[i], src->strides[is_uv], 864 dst->buffers[i], dst->strides[is_uv], dst_w, 865 dst_h); 866 } else if (filter == BILINEAR) { 867 const int16_t c0 = av1_bilinear_filters[phase][3]; 868 const int16_t c1 = av1_bilinear_filters[phase][4]; 869 const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and c1 >= 0 870 scale_plane_2_to_1_bilinear(src->buffers[i], src->strides[is_uv], 871 dst->buffers[i], dst->strides[is_uv], dst_w, 872 dst_h, c0c1); 873 } else { 874 const int buffer_stride = (dst_y_w + 3) & ~3; 875 const int buffer_height = (2 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7; 876 uint8_t *const temp_buffer = 877 (uint8_t *)malloc(buffer_stride * buffer_height); 878 if (!temp_buffer) { 879 malloc_failed = 1; 880 break; 881 } 882 const InterpKernel *interp_kernel = 883 (const InterpKernel *)av1_interp_filter_params_list[filter] 884 .filter_ptr; 885 scale_plane_2_to_1_general(src->buffers[i], src->strides[is_uv], 886 dst->buffers[i], dst->strides[is_uv], dst_w, 887 dst_h, interp_kernel[phase], temp_buffer); 888 free(temp_buffer); 889 } 890 } else if (4 * dst_w == src_w && 4 * dst_h == src_h) { 891 // 4 to 1 892 if (phase == 0) { 893 scale_plane_4_to_1_phase_0(src->buffers[i], src->strides[is_uv], 894 dst->buffers[i], dst->strides[is_uv], dst_w, 895 dst_h); 896 } else if (filter == BILINEAR) { 897 const int16_t c0 = av1_bilinear_filters[phase][3]; 898 const int16_t c1 
= av1_bilinear_filters[phase][4]; 899 const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and c1 >= 0 900 scale_plane_4_to_1_bilinear(src->buffers[i], src->strides[is_uv], 901 dst->buffers[i], dst->strides[is_uv], dst_w, 902 dst_h, c0c1); 903 } else { 904 const int buffer_stride = (dst_y_w + 1) & ~1; 905 const int buffer_height = (4 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7; 906 // When dst_w is 1 or 2, we need extra padding to avoid heap read 907 // overflow 908 const int extra_padding = 16; 909 uint8_t *const temp_buffer = 910 (uint8_t *)malloc(buffer_stride * buffer_height + extra_padding); 911 if (!temp_buffer) { 912 malloc_failed = 1; 913 break; 914 } 915 const InterpKernel *interp_kernel = 916 (const InterpKernel *)av1_interp_filter_params_list[filter] 917 .filter_ptr; 918 scale_plane_4_to_1_general(src->buffers[i], src->strides[is_uv], 919 dst->buffers[i], dst->strides[is_uv], dst_w, 920 dst_h, interp_kernel[phase], temp_buffer); 921 free(temp_buffer); 922 } 923 } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) { 924 // 4 to 3 925 const int buffer_stride_hor = (dst_y_w + 5) - ((dst_y_w + 5) % 6) + 2; 926 const int buffer_stride_ver = (dst_y_w + 7) & ~7; 927 const int buffer_height = (4 * dst_y_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; 928 // When the vertical filter reads more pixels than the horizontal filter 929 // generated in each row, we need extra padding to avoid heap read 930 // overflow. For example, the horizontal filter generates 18 pixels but 931 // the vertical filter reads 24 pixels in a row. The difference is 932 // multiplied by 2 since two rows are interlaced together in the 933 // optimization. 934 const int extra_padding = 935 (buffer_stride_ver > buffer_stride_hor) 936 ? 
2 * (buffer_stride_ver - buffer_stride_hor) 937 : 0; 938 const int buffer_size = buffer_stride_hor * buffer_height + extra_padding; 939 uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_size); 940 if (!temp_buffer) { 941 malloc_failed = 1; 942 break; 943 } 944 const InterpKernel *interp_kernel = 945 (const InterpKernel *)av1_interp_filter_params_list[filter] 946 .filter_ptr; 947 scale_plane_4_to_3_general(src->buffers[i], src->strides[is_uv], 948 dst->buffers[i], dst->strides[is_uv], dst_w, 949 dst_h, interp_kernel, phase, temp_buffer); 950 free(temp_buffer); 951 } else { 952 assert(dst_w == src_w * 2 && dst_h == src_h * 2); 953 // 1 to 2 954 uint8_t *const temp_buffer = (uint8_t *)malloc(8 * ((src_y_w + 7) & ~7)); 955 if (!temp_buffer) { 956 malloc_failed = 1; 957 break; 958 } 959 const InterpKernel *interp_kernel = 960 (const InterpKernel *)av1_interp_filter_params_list[filter] 961 .filter_ptr; 962 scale_plane_1_to_2_phase_0(src->buffers[i], src->strides[is_uv], 963 dst->buffers[i], dst->strides[is_uv], src_w, 964 src_h, interp_kernel[8], temp_buffer); 965 free(temp_buffer); 966 } 967 } 968 969 if (malloc_failed) { 970 av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes); 971 } else { 972 aom_extend_frame_borders(dst, num_planes); 973 } 974 }