resize_neon.c (30210B)
1 /* 2 * 3 * Copyright (c) 2020, Alliance for Open Media. All rights reserved. 4 * 5 * This source code is subject to the terms of the BSD 2 Clause License and 6 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 7 * was not distributed with this source code in the LICENSE file, you can 8 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 9 * Media Patent License 1.0 was not distributed with this source code in the 10 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 11 */ 12 13 #include <arm_neon.h> 14 #include <assert.h> 15 16 #include "aom_dsp/arm/mem_neon.h" 17 #include "aom_dsp/arm/transpose_neon.h" 18 #include "av1/common/arm/resize_neon.h" 19 #include "av1/common/resize.h" 20 #include "config/aom_scale_rtcd.h" 21 #include "config/av1_rtcd.h" 22 23 static inline void scale_plane_2_to_1_phase_0(const uint8_t *src, 24 const int src_stride, 25 uint8_t *dst, 26 const int dst_stride, int w, 27 int h) { 28 assert(w > 0 && h > 0); 29 30 do { 31 const uint8_t *s = src; 32 uint8_t *d = dst; 33 int width = w; 34 35 do { 36 const uint8x16x2_t s0 = vld2q_u8(s); 37 38 vst1q_u8(d, s0.val[0]); 39 40 s += 32; 41 d += 16; 42 width -= 16; 43 } while (width > 0); 44 45 src += 2 * src_stride; 46 dst += dst_stride; 47 } while (--h != 0); 48 } 49 50 static inline void scale_plane_4_to_1_phase_0(const uint8_t *src, 51 const int src_stride, 52 uint8_t *dst, 53 const int dst_stride, int w, 54 int h) { 55 assert(w > 0 && h > 0); 56 57 do { 58 const uint8_t *s = src; 59 uint8_t *d = dst; 60 int width = w; 61 62 do { 63 const uint8x16x4_t s0 = vld4q_u8(s); 64 65 vst1q_u8(d, s0.val[0]); 66 67 s += 64; 68 d += 16; 69 width -= 16; 70 } while (width > 0); 71 72 src += 4 * src_stride; 73 dst += dst_stride; 74 } while (--h != 0); 75 } 76 77 static inline uint8x16_t scale_plane_bilinear_kernel( 78 const uint8x16_t s0_even, const uint8x16_t s0_odd, const uint8x16_t s1_even, 79 const uint8x16_t s1_odd, const uint8x8_t filter0, const uint8x8_t filter1) { 80 // A shim of 1 << (FILTER_BITS - 1) enables us to use non-rounding 81 // shifts - which are generally faster than rounding shifts on modern CPUs. 82 uint16x8_t offset = vdupq_n_u16(1 << (FILTER_BITS - 1)); 83 84 // Horizontal filtering 85 uint16x8_t h0_lo = vmlal_u8(offset, vget_low_u8(s0_even), filter0); 86 uint16x8_t h0_hi = vmlal_u8(offset, vget_high_u8(s0_even), filter0); 87 uint16x8_t h1_lo = vmlal_u8(offset, vget_low_u8(s1_even), filter0); 88 uint16x8_t h1_hi = vmlal_u8(offset, vget_high_u8(s1_even), filter0); 89 90 h0_lo = vmlal_u8(h0_lo, vget_low_u8(s0_odd), filter1); 91 h0_hi = vmlal_u8(h0_hi, vget_high_u8(s0_odd), filter1); 92 h1_lo = vmlal_u8(h1_lo, vget_low_u8(s1_odd), filter1); 93 h1_hi = vmlal_u8(h1_hi, vget_high_u8(s1_odd), filter1); 94 95 const uint8x8_t h0_lo_u8 = vshrn_n_u16(h0_lo, FILTER_BITS); 96 const uint8x8_t h0_hi_u8 = vshrn_n_u16(h0_hi, FILTER_BITS); 97 const uint8x8_t h1_lo_u8 = vshrn_n_u16(h1_lo, FILTER_BITS); 98 const uint8x8_t h1_hi_u8 = vshrn_n_u16(h1_hi, FILTER_BITS); 99 100 // Vertical filtering 101 uint16x8_t v_lo = vmlal_u8(offset, h0_lo_u8, filter0); 102 uint16x8_t v_hi = vmlal_u8(offset, h0_hi_u8, filter0); 103 104 v_lo = vmlal_u8(v_lo, h1_lo_u8, filter1); 105 v_hi = vmlal_u8(v_hi, h1_hi_u8, filter1); 106 107 return vcombine_u8(vshrn_n_u16(v_lo, FILTER_BITS), 108 vshrn_n_u16(v_hi, FILTER_BITS)); 109 } 110 111 static inline void scale_plane_2_to_1_bilinear( 112 const uint8_t *src, const int src_stride, uint8_t *dst, 113 const int dst_stride, int w, int h, const int16_t f0, const int16_t f1) { 114 assert(w > 0 && h > 0); 115 const uint8x8_t filter0 = vdup_n_u8(f0); 116 const uint8x8_t filter1 = vdup_n_u8(f1); 117 118 do { 119 const uint8_t *s = src; 120 uint8_t *d = dst; 121 int width = w; 122 123 do { 124 const uint8x16x2_t s0 = vld2q_u8(s + 0 * src_stride); 125 const uint8x16x2_t s1 = vld2q_u8(s + 1 * src_stride); 126 127 uint8x16_t d0 = scale_plane_bilinear_kernel( 128 s0.val[0], s0.val[1], s1.val[0], s1.val[1], filter0, filter1); 129 130 vst1q_u8(d, d0); 131 132 s += 32; 133 d += 16; 134 width -= 16; 135 } while (width > 0); 136 137 src += 2 * src_stride; 138 dst += dst_stride; 139 } while (--h != 0); 140 } 141 142 static inline void scale_plane_4_to_1_bilinear( 143 const uint8_t *src, const int src_stride, uint8_t *dst, 144 const int dst_stride, int w, int h, const int16_t f0, const int16_t f1) { 145 assert(w > 0 && h > 0); 146 const uint8x8_t filter0 = vdup_n_u8(f0); 147 const uint8x8_t filter1 = vdup_n_u8(f1); 148 149 do { 150 const uint8_t *s = src; 151 uint8_t *d = dst; 152 int width = w; 153 154 do { 155 const uint8x16x4_t s0 = vld4q_u8(s + 0 * src_stride); 156 const uint8x16x4_t s1 = vld4q_u8(s + 1 * src_stride); 157 158 uint8x16_t d0 = scale_plane_bilinear_kernel( 159 s0.val[0], s0.val[1], s1.val[0], s1.val[1], filter0, filter1); 160 161 vst1q_u8(d, d0); 162 163 s += 64; 164 d += 16; 165 width -= 16; 166 } while (width > 0); 167 168 src += 4 * src_stride; 169 dst += dst_stride; 170 } while (--h != 0); 171 } 172 173 static inline void scale_2_to_1_horiz_6tap(const uint8_t *src, 174 const int src_stride, int w, int h, 175 uint8_t *dst, const int dst_stride, 176 const int16x8_t filters) { 177 do { 178 uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; 179 load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); 180 181 transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); 182 183 int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); 184 int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); 185 int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); 186 int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); 187 int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); 188 int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); 189 190 const uint8_t *s = src + 6; 191 uint8_t *d = dst; 192 int width = w; 193 194 do { 195 uint8x8_t t8, t9, t10, t11, t12, t13; 196 load_u8_8x8(s, src_stride, &t6, &t7, &t8, &t9, &t10, &t11, &t12, &t13); 197 198 transpose_elems_inplace_u8_8x8(&t6, &t7, &t8, &t9, &t10, &t11, &t12, 199 &t13); 200 201 int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); 202 int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); 203 int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); 204 int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9)); 205 int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10)); 206 int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11)); 207 int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12)); 208 int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13)); 209 210 uint8x8_t d0 = scale_filter6_8(s0, s1, s2, s3, s4, s5, filters); 211 uint8x8_t d1 = scale_filter6_8(s2, s3, s4, s5, s6, s7, filters); 212 uint8x8_t d2 = scale_filter6_8(s4, s5, s6, s7, s8, s9, filters); 213 uint8x8_t d3 = scale_filter6_8(s6, s7, s8, s9, s10, s11, filters); 214 215 transpose_elems_inplace_u8_8x4(&d0, &d1, &d2, &d3); 216 217 store_u8x4_strided_x2(d + 0 * dst_stride, 4 * dst_stride, d0); 218 store_u8x4_strided_x2(d + 1 * dst_stride, 4 * dst_stride, d1); 219 store_u8x4_strided_x2(d + 2 * dst_stride, 4 * dst_stride, d2); 220 store_u8x4_strided_x2(d + 3 * dst_stride, 4 * dst_stride, d3); 221 222 s0 = s8; 223 s1 = s9; 224 s2 = s10; 225 s3 = s11; 226 s4 = s12; 227 s5 = s13; 228 229 d += 4; 230 s += 8; 231 width -= 4; 232 } while (width > 0); 233 234 dst += 8 * dst_stride; 235 src += 8 * src_stride; 236 h -= 8; 237 } while (h > 0); 238 } 239 240 static inline void scale_plane_2_to_1_6tap(const uint8_t *src, 241 const int src_stride, uint8_t *dst, 242 const int dst_stride, const int w, 243 const int h, 244 const int16_t *const filter_ptr, 245 uint8_t *const im_block) { 246 assert(w > 0 && h > 0); 247 const int im_h = 2 * h + SUBPEL_TAPS - 3; 248 const int im_stride = (w + 3) & ~3; 249 250 // All filter values are even, halve them to stay in 16-bit elements when 251 // applying filter. 252 const int16x8_t filters = vshrq_n_s16(vld1q_s16(filter_ptr), 1); 253 254 const ptrdiff_t horiz_offset = SUBPEL_TAPS / 2 - 2; 255 const ptrdiff_t vert_offset = (SUBPEL_TAPS / 2 - 2) * src_stride; 256 257 scale_2_to_1_horiz_6tap(src - horiz_offset - vert_offset, src_stride, w, im_h, 258 im_block, im_stride, filters); 259 260 scale_2_to_1_vert_6tap(im_block, im_stride, w, h, dst, dst_stride, filters); 261 } 262 263 static inline void scale_4_to_1_horiz_6tap(const uint8_t *src, 264 const int src_stride, int w, int h, 265 uint8_t *dst, const int dst_stride, 266 const int16x8_t filters) { 267 do { 268 uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; 269 load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); 270 271 transpose_elems_u8_4x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2, &t3); 272 273 int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); 274 int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); 275 int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); 276 int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); 277 278 const uint8_t *s = src + 4; 279 uint8_t *d = dst; 280 int width = w; 281 282 do { 283 uint8x8_t t8, t9, t10, t11; 284 load_u8_8x8(s, src_stride, &t4, &t5, &t6, &t7, &t8, &t9, &t10, &t11); 285 286 transpose_elems_inplace_u8_8x8(&t4, &t5, &t6, &t7, &t8, &t9, &t10, &t11); 287 288 int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); 289 int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); 290 int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); 291 int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); 292 int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); 293 int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9)); 294 int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10)); 295 int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11)); 296 297 uint8x8_t d0 = scale_filter6_8(s0, s1, s2, s3, s4, s5, filters); 298 uint8x8_t d1 = scale_filter6_8(s4, s5, s6, s7, s8, s9, filters); 299 300 uint8x8x2_t d01 = vtrn_u8(d0, d1); 301 302 store_u8x2_strided_x4(d + 0 * dst_stride, 2 * dst_stride, d01.val[0]); 303 store_u8x2_strided_x4(d + 1 * dst_stride, 2 * dst_stride, d01.val[1]); 304 305 s0 = s8; 306 s1 = s9; 307 s2 = s10; 308 s3 = s11; 309 310 d += 2; 311 s += 8; 312 width -= 2; 313 } while (width > 0); 314 315 dst += 8 * dst_stride; 316 src += 8 * src_stride; 317 h -= 8; 318 } while (h > 0); 319 } 320 321 static inline void scale_plane_4_to_1_6tap(const uint8_t *src, 322 const int src_stride, uint8_t *dst, 323 const int dst_stride, const int w, 324 const int h, 325 const int16_t *const filter_ptr, 326 uint8_t *const im_block) { 327 assert(w > 0 && h > 0); 328 const int im_h = 4 * h + SUBPEL_TAPS - 3; 329 const int im_stride = (w + 1) & ~1; 330 // All filter values are even, halve them to stay in 16-bit elements when 331 // applying filter. 332 const int16x8_t filters = vshrq_n_s16(vld1q_s16(filter_ptr), 1); 333 334 const ptrdiff_t horiz_offset = SUBPEL_TAPS / 2 - 2; 335 const ptrdiff_t vert_offset = (SUBPEL_TAPS / 2 - 2) * src_stride; 336 337 scale_4_to_1_horiz_6tap(src - horiz_offset - vert_offset, src_stride, w, im_h, 338 im_block, im_stride, filters); 339 340 scale_4_to_1_vert_6tap(im_block, im_stride, w, h, dst, dst_stride, filters); 341 } 342 343 static inline uint8x8_t scale_filter_bilinear(const uint8x8_t *const s, 344 const uint8x8_t *const coef) { 345 const uint16x8_t h0 = vmull_u8(s[0], coef[0]); 346 const uint16x8_t h1 = vmlal_u8(h0, s[1], coef[1]); 347 348 return vrshrn_n_u16(h1, 7); 349 } 350 351 // Notes for 4 to 3 scaling: 352 // 353 // 1. 6 rows are calculated in each horizontal inner loop, so width_hor must be 354 // multiple of 6, and no less than w. 355 // 356 // 2. 8 rows are calculated in each vertical inner loop, so width_ver must be 357 // multiple of 8, and no less than w. 358 // 359 // 3. 8 columns are calculated in each horizontal inner loop for further 360 // vertical scaling, so height_hor must be multiple of 8, and no less than 361 // 4 * h / 3. 362 // 363 // 4. 6 columns are calculated in each vertical inner loop, so height_ver must 364 // be multiple of 6, and no less than h. 365 // 366 // 5. The physical location of the last row of the 4 to 3 scaled frame is 367 // decided by phase_scaler, and are always less than 1 pixel below the last row 368 // of the original image. 369 static inline void scale_plane_4_to_3_bilinear( 370 const uint8_t *src, const int src_stride, uint8_t *dst, 371 const int dst_stride, const int w, const int h, const int phase_scaler, 372 uint8_t *const temp_buffer) { 373 static const int step_q4 = 16 * 4 / 3; 374 const int width_hor = (w + 5) - ((w + 5) % 6); 375 const int stride_hor = width_hor + 2; // store 2 extra pixels 376 const int width_ver = (w + 7) & ~7; 377 // We only need 1 extra row below because there are only 2 bilinear 378 // coefficients. 379 const int height_hor = (4 * h / 3 + 1 + 7) & ~7; 380 const int height_ver = (h + 5) - ((h + 5) % 6); 381 int x, y = height_hor; 382 uint8_t *t = temp_buffer; 383 uint8x8_t s[9], d[8], c[6]; 384 const InterpKernel *interp_kernel = 385 (const InterpKernel *)av1_interp_filter_params_list[BILINEAR].filter_ptr; 386 assert(w && h); 387 388 c[0] = vdup_n_u8((uint8_t)interp_kernel[phase_scaler][3]); 389 c[1] = vdup_n_u8((uint8_t)interp_kernel[phase_scaler][4]); 390 c[2] = vdup_n_u8( 391 (uint8_t)interp_kernel[(phase_scaler + 1 * step_q4) & SUBPEL_MASK][3]); 392 c[3] = vdup_n_u8( 393 (uint8_t)interp_kernel[(phase_scaler + 1 * step_q4) & SUBPEL_MASK][4]); 394 c[4] = vdup_n_u8( 395 (uint8_t)interp_kernel[(phase_scaler + 2 * step_q4) & SUBPEL_MASK][3]); 396 c[5] = vdup_n_u8( 397 (uint8_t)interp_kernel[(phase_scaler + 2 * step_q4) & SUBPEL_MASK][4]); 398 399 d[6] = vdup_n_u8(0); 400 d[7] = vdup_n_u8(0); 401 402 // horizontal 6x8 403 do { 404 load_u8_8x8(src, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], 405 &s[6], &s[7]); 406 src += 1; 407 transpose_elems_inplace_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], 408 &s[6], &s[7]); 409 x = width_hor; 410 411 do { 412 load_u8_8x8(src, src_stride, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], 413 &s[7], &s[8]); 414 src += 8; 415 transpose_elems_inplace_u8_8x8(&s[1], &s[2], &s[3], &s[4], &s[5], &s[6], 416 &s[7], &s[8]); 417 418 // 00 10 20 30 40 50 60 70 419 // 01 11 21 31 41 51 61 71 420 // 02 12 22 32 42 52 62 72 421 // 03 13 23 33 43 53 63 73 422 // 04 14 24 34 44 54 64 74 423 // 05 15 25 35 45 55 65 75 424 d[0] = scale_filter_bilinear(&s[0], &c[0]); 425 d[1] = 426 scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]); 427 d[2] = 428 scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]); 429 d[3] = scale_filter_bilinear(&s[4], &c[0]); 430 d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], 431 &c[2]); 432 d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], 433 &c[4]); 434 435 // 00 01 02 03 04 05 xx xx 436 // 10 11 12 13 14 15 xx xx 437 // 20 21 22 23 24 25 xx xx 438 // 30 31 32 33 34 35 xx xx 439 // 40 41 42 43 44 45 xx xx 440 // 50 51 52 53 54 55 xx xx 441 // 60 61 62 63 64 65 xx xx 442 // 70 71 72 73 74 75 xx xx 443 transpose_elems_inplace_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], 444 &d[6], &d[7]); 445 // store 2 extra pixels 446 vst1_u8(t + 0 * stride_hor, d[0]); 447 vst1_u8(t + 1 * stride_hor, d[1]); 448 vst1_u8(t + 2 * stride_hor, d[2]); 449 vst1_u8(t + 3 * stride_hor, d[3]); 450 vst1_u8(t + 4 * stride_hor, d[4]); 451 vst1_u8(t + 5 * stride_hor, d[5]); 452 vst1_u8(t + 6 * stride_hor, d[6]); 453 vst1_u8(t + 7 * stride_hor, d[7]); 454 455 s[0] = s[8]; 456 457 t += 6; 458 x -= 6; 459 } while (x); 460 src += 8 * src_stride - 4 * width_hor / 3 - 1; 461 t += 7 * stride_hor + 2; 462 y -= 8; 463 } while (y); 464 465 // vertical 8x6 466 x = width_ver; 467 t = temp_buffer; 468 do { 469 load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], 470 &s[7]); 471 t += stride_hor; 472 y = height_ver; 473 474 do { 475 load_u8_8x8(t, stride_hor, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], 476 &s[7], &s[8]); 477 t += 8 * stride_hor; 478 479 d[0] = scale_filter_bilinear(&s[0], &c[0]); 480 d[1] = 481 scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]); 482 d[2] = 483 scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]); 484 d[3] = scale_filter_bilinear(&s[4], &c[0]); 485 d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], 486 &c[2]); 487 d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], 488 &c[4]); 489 vst1_u8(dst + 0 * dst_stride, d[0]); 490 vst1_u8(dst + 1 * dst_stride, d[1]); 491 vst1_u8(dst + 2 * dst_stride, d[2]); 492 vst1_u8(dst + 3 * dst_stride, d[3]); 493 vst1_u8(dst + 4 * dst_stride, d[4]); 494 vst1_u8(dst + 5 * dst_stride, d[5]); 495 496 s[0] = s[8]; 497 498 dst += 6 * dst_stride; 499 y -= 6; 500 } while (y); 501 t -= stride_hor * (4 * height_ver / 3 + 1); 502 t += 8; 503 dst -= height_ver * dst_stride; 504 dst += 8; 505 x -= 8; 506 } while (x); 507 } 508 509 static inline uint8x8_t scale_filter_8(const uint8x8_t *const s, 510 const int16x8_t filter) { 511 const int16x4_t filter_lo = vget_low_s16(filter); 512 const int16x4_t filter_hi = vget_high_s16(filter); 513 514 int16x8_t ss0 = vreinterpretq_s16_u16(vmovl_u8(s[0])); 515 int16x8_t ss1 = vreinterpretq_s16_u16(vmovl_u8(s[1])); 516 int16x8_t ss2 = vreinterpretq_s16_u16(vmovl_u8(s[2])); 517 int16x8_t ss3 = vreinterpretq_s16_u16(vmovl_u8(s[3])); 518 int16x8_t ss4 = vreinterpretq_s16_u16(vmovl_u8(s[4])); 519 int16x8_t ss5 = vreinterpretq_s16_u16(vmovl_u8(s[5])); 520 int16x8_t ss6 = vreinterpretq_s16_u16(vmovl_u8(s[6])); 521 int16x8_t ss7 = vreinterpretq_s16_u16(vmovl_u8(s[7])); 522 523 int16x8_t sum = vmulq_lane_s16(ss0, filter_lo, 0); 524 sum = vmlaq_lane_s16(sum, ss1, filter_lo, 1); 525 sum = vmlaq_lane_s16(sum, ss2, filter_lo, 2); 526 sum = vmlaq_lane_s16(sum, ss5, filter_hi, 1); 527 sum = vmlaq_lane_s16(sum, ss6, filter_hi, 2); 528 sum = vmlaq_lane_s16(sum, ss7, filter_hi, 3); 529 sum = vqaddq_s16(sum, vmulq_lane_s16(ss3, filter_lo, 3)); 530 sum = vqaddq_s16(sum, vmulq_lane_s16(ss4, filter_hi, 0)); 531 532 return vqrshrun_n_s16(sum, FILTER_BITS); 533 } 534 535 static inline void scale_plane_4_to_3_8tap(const uint8_t *src, 536 const int src_stride, uint8_t *dst, 537 const int dst_stride, const int w, 538 const int h, 539 const InterpKernel *const coef, 540 const int phase_scaler, 541 uint8_t *const temp_buffer) { 542 static const int step_q4 = 16 * 4 / 3; 543 const int width_hor = (w + 5) - ((w + 5) % 6); 544 const int stride_hor = width_hor + 2; // store 2 extra pixels 545 const int width_ver = (w + 7) & ~7; 546 // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows 547 // above and (SUBPEL_TAPS / 2) extra rows below. 548 const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; 549 const int height_ver = (h + 5) - ((h + 5) % 6); 550 const int16x8_t filters0 = vld1q_s16( 551 (const int16_t *)&coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK]); 552 const int16x8_t filters1 = vld1q_s16( 553 (const int16_t *)&coef[(phase_scaler + 1 * step_q4) & SUBPEL_MASK]); 554 const int16x8_t filters2 = vld1q_s16( 555 (const int16_t *)&coef[(phase_scaler + 2 * step_q4) & SUBPEL_MASK]); 556 int x, y = height_hor; 557 uint8_t *t = temp_buffer; 558 uint8x8_t s[15], d[8]; 559 560 assert(w > 0 && h > 0); 561 562 src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2; 563 d[6] = vdup_n_u8(0); 564 d[7] = vdup_n_u8(0); 565 566 // horizontal 6x8 567 do { 568 load_u8_8x8(src + 1, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], 569 &s[6], &s[7]); 570 transpose_elems_inplace_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], 571 &s[6], &s[7]); 572 x = width_hor; 573 574 do { 575 src += 8; 576 load_u8_8x8(src, src_stride, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12], 577 &s[13], &s[14]); 578 transpose_elems_inplace_u8_8x8(&s[7], &s[8], &s[9], &s[10], &s[11], 579 &s[12], &s[13], &s[14]); 580 581 // 00 10 20 30 40 50 60 70 582 // 01 11 21 31 41 51 61 71 583 // 02 12 22 32 42 52 62 72 584 // 03 13 23 33 43 53 63 73 585 // 04 14 24 34 44 54 64 74 586 // 05 15 25 35 45 55 65 75 587 d[0] = scale_filter_8(&s[0], filters0); 588 d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1); 589 d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2); 590 d[3] = scale_filter_8(&s[4], filters0); 591 d[4] = 592 scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1); 593 d[5] = 594 scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2); 595 596 // 00 01 02 03 04 05 xx xx 597 // 10 11 12 13 14 15 xx xx 598 // 20 21 22 23 24 25 xx xx 599 // 30 31 32 33 34 35 xx xx 600 // 40 41 42 43 44 45 xx xx 601 // 50 51 52 53 54 55 xx xx 602 // 60 61 62 63 64 65 xx xx 603 // 70 71 72 73 74 75 xx xx 604 transpose_elems_inplace_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], 605 &d[6], &d[7]); 606 // store 2 extra pixels 607 vst1_u8(t + 0 * stride_hor, d[0]); 608 vst1_u8(t + 1 * stride_hor, d[1]); 609 vst1_u8(t + 2 * stride_hor, d[2]); 610 vst1_u8(t + 3 * stride_hor, d[3]); 611 vst1_u8(t + 4 * stride_hor, d[4]); 612 vst1_u8(t + 5 * stride_hor, d[5]); 613 vst1_u8(t + 6 * stride_hor, d[6]); 614 vst1_u8(t + 7 * stride_hor, d[7]); 615 616 s[0] = s[8]; 617 s[1] = s[9]; 618 s[2] = s[10]; 619 s[3] = s[11]; 620 s[4] = s[12]; 621 s[5] = s[13]; 622 s[6] = s[14]; 623 624 t += 6; 625 x -= 6; 626 } while (x); 627 src += 8 * src_stride - 4 * width_hor / 3; 628 t += 7 * stride_hor + 2; 629 y -= 8; 630 } while (y); 631 632 // vertical 8x6 633 x = width_ver; 634 t = temp_buffer; 635 do { 636 load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], 637 &s[7]); 638 t += 7 * stride_hor; 639 y = height_ver; 640 641 do { 642 load_u8_8x8(t, stride_hor, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12], 643 &s[13], &s[14]); 644 t += 8 * stride_hor; 645 646 d[0] = scale_filter_8(&s[0], filters0); 647 d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1); 648 d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2); 649 d[3] = scale_filter_8(&s[4], filters0); 650 d[4] = 651 scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1); 652 d[5] = 653 scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2); 654 vst1_u8(dst + 0 * dst_stride, d[0]); 655 vst1_u8(dst + 1 * dst_stride, d[1]); 656 vst1_u8(dst + 2 * dst_stride, d[2]); 657 vst1_u8(dst + 3 * dst_stride, d[3]); 658 vst1_u8(dst + 4 * dst_stride, d[4]); 659 vst1_u8(dst + 5 * dst_stride, d[5]); 660 661 s[0] = s[8]; 662 s[1] = s[9]; 663 s[2] = s[10]; 664 s[3] = s[11]; 665 s[4] = s[12]; 666 s[5] = s[13]; 667 s[6] = s[14]; 668 669 dst += 6 * dst_stride; 670 y -= 6; 671 } while (y); 672 t -= stride_hor * (4 * height_ver / 3 + 7); 673 t += 8; 674 dst -= height_ver * dst_stride; 675 dst += 8; 676 x -= 8; 677 } while (x); 678 } 679 680 // There's SIMD optimizations for 1/4, 1/2 and 3/4 downscaling in NEON. 681 static inline bool has_normative_scaler_neon(const int src_width, 682 const int src_height, 683 const int dst_width, 684 const int dst_height) { 685 const bool has_normative_scaler = 686 (2 * dst_width == src_width && 2 * dst_height == src_height) || 687 (4 * dst_width == src_width && 4 * dst_height == src_height) || 688 (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height); 689 690 return has_normative_scaler; 691 } 692 693 void av1_resize_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src, 694 YV12_BUFFER_CONFIG *dst, 695 const InterpFilter filter, 696 const int phase, const int num_planes) { 697 assert(filter == BILINEAR || filter == EIGHTTAP_SMOOTH || 698 filter == EIGHTTAP_REGULAR); 699 700 bool has_normative_scaler = 701 has_normative_scaler_neon(src->y_crop_width, src->y_crop_height, 702 dst->y_crop_width, dst->y_crop_height); 703 704 if (num_planes > 1) { 705 has_normative_scaler = 706 has_normative_scaler && 707 has_normative_scaler_neon(src->uv_crop_width, src->uv_crop_height, 708 dst->uv_crop_width, dst->uv_crop_height); 709 } 710 711 if (!has_normative_scaler) { 712 av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes); 713 return; 714 } 715 716 // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet 717 // the static analysis warnings. 718 int malloc_failed = 0; 719 for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) { 720 const int is_uv = i > 0; 721 const int src_w = src->crop_widths[is_uv]; 722 const int src_h = src->crop_heights[is_uv]; 723 const int dst_w = dst->crop_widths[is_uv]; 724 const int dst_h = dst->crop_heights[is_uv]; 725 const int dst_y_w = (dst->crop_widths[0] + 1) & ~1; 726 const int dst_y_h = (dst->crop_heights[0] + 1) & ~1; 727 728 if (2 * dst_w == src_w && 2 * dst_h == src_h) { 729 if (phase == 0) { 730 scale_plane_2_to_1_phase_0(src->buffers[i], src->strides[is_uv], 731 dst->buffers[i], dst->strides[is_uv], dst_w, 732 dst_h); 733 } else if (filter == BILINEAR) { 734 const int16_t c0 = av1_bilinear_filters[phase][3]; 735 const int16_t c1 = av1_bilinear_filters[phase][4]; 736 scale_plane_2_to_1_bilinear(src->buffers[i], src->strides[is_uv], 737 dst->buffers[i], dst->strides[is_uv], dst_w, 738 dst_h, c0, c1); 739 } else { 740 const int buffer_stride = (dst_y_w + 3) & ~3; 741 const int buffer_height = (2 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7; 742 uint8_t *const temp_buffer = 743 (uint8_t *)malloc(buffer_stride * buffer_height); 744 if (!temp_buffer) { 745 malloc_failed = 1; 746 break; 747 } 748 const InterpKernel *interp_kernel = 749 (const InterpKernel *)av1_interp_filter_params_list[filter] 750 .filter_ptr; 751 scale_plane_2_to_1_6tap(src->buffers[i], src->strides[is_uv], 752 dst->buffers[i], dst->strides[is_uv], dst_w, 753 dst_h, interp_kernel[phase], temp_buffer); 754 free(temp_buffer); 755 } 756 } else if (4 * dst_w == src_w && 4 * dst_h == src_h) { 757 if (phase == 0) { 758 scale_plane_4_to_1_phase_0(src->buffers[i], src->strides[is_uv], 759 dst->buffers[i], dst->strides[is_uv], dst_w, 760 dst_h); 761 } else if (filter == BILINEAR) { 762 const int16_t c0 = av1_bilinear_filters[phase][3]; 763 const int16_t c1 = av1_bilinear_filters[phase][4]; 764 scale_plane_4_to_1_bilinear(src->buffers[i], src->strides[is_uv], 765 dst->buffers[i], dst->strides[is_uv], dst_w, 766 dst_h, c0, c1); 767 } else { 768 const int buffer_stride = (dst_y_w + 1) & ~1; 769 const int buffer_height = (4 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7; 770 uint8_t *const temp_buffer = 771 (uint8_t *)malloc(buffer_stride * buffer_height); 772 if (!temp_buffer) { 773 malloc_failed = 1; 774 break; 775 } 776 const InterpKernel *interp_kernel = 777 (const InterpKernel *)av1_interp_filter_params_list[filter] 778 .filter_ptr; 779 scale_plane_4_to_1_6tap(src->buffers[i], src->strides[is_uv], 780 dst->buffers[i], dst->strides[is_uv], dst_w, 781 dst_h, interp_kernel[phase], temp_buffer); 782 free(temp_buffer); 783 } 784 } else { 785 assert(4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h); 786 // 4 to 3 787 const int buffer_stride = (dst_y_w + 5) - ((dst_y_w + 5) % 6) + 2; 788 const int buffer_height = (4 * dst_y_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; 789 uint8_t *const temp_buffer = 790 (uint8_t *)malloc(buffer_stride * buffer_height); 791 if (!temp_buffer) { 792 malloc_failed = 1; 793 break; 794 } 795 if (filter == BILINEAR) { 796 scale_plane_4_to_3_bilinear(src->buffers[i], src->strides[is_uv], 797 dst->buffers[i], dst->strides[is_uv], dst_w, 798 dst_h, phase, temp_buffer); 799 } else { 800 const InterpKernel *interp_kernel = 801 (const InterpKernel *)av1_interp_filter_params_list[filter] 802 .filter_ptr; 803 scale_plane_4_to_3_8tap(src->buffers[i], src->strides[is_uv], 804 dst->buffers[i], dst->strides[is_uv], dst_w, 805 dst_h, interp_kernel, phase, temp_buffer); 806 } 807 free(temp_buffer); 808 } 809 } 810 811 if (malloc_failed) { 812 av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes); 813 } else { 814 aom_extend_frame_borders(dst, num_planes); 815 } 816 }