convolve.c (63089B)
1 /* 2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <assert.h> 13 #include <string.h> 14 15 #include "config/aom_dsp_rtcd.h" 16 #include "config/av1_rtcd.h" 17 18 #include "av1/common/av1_common_int.h" 19 #include "av1/common/blockd.h" 20 #include "av1/common/convolve.h" 21 #include "av1/common/filter.h" 22 #include "av1/common/resize.h" 23 #include "aom_dsp/aom_dsp_common.h" 24 #include "aom_ports/mem.h" 25 26 void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, 27 int dst_stride, int w, int h, 28 const int16_t *x_filters, int x0_qn, 29 int x_step_qn) { 30 src -= UPSCALE_NORMATIVE_TAPS / 2 - 1; 31 for (int y = 0; y < h; ++y) { 32 int x_qn = x0_qn; 33 for (int x = 0; x < w; ++x) { 34 const uint8_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS]; 35 const int x_filter_idx = 36 (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; 37 assert(x_filter_idx <= RS_SUBPEL_MASK); 38 const int16_t *const x_filter = 39 &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS]; 40 int sum = 0; 41 for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k) 42 sum += src_x[k] * x_filter[k]; 43 dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); 44 x_qn += x_step_qn; 45 } 46 src += src_stride; 47 dst += dst_stride; 48 } 49 } 50 51 #if CONFIG_AV1_HIGHBITDEPTH 52 void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, 53 uint16_t *dst, int dst_stride, int w, int h, 54 const int16_t *x_filters, int x0_qn, 55 int x_step_qn, int bd) { 56 src -= UPSCALE_NORMATIVE_TAPS / 2 - 1; 57 for (int y = 0; y < h; ++y) { 58 int x_qn = x0_qn; 59 for (int x = 0; x < w; ++x) { 60 const uint16_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS]; 61 const int x_filter_idx = 62 (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; 63 assert(x_filter_idx <= RS_SUBPEL_MASK); 64 const int16_t *const x_filter = 65 &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS]; 66 int sum = 0; 67 for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k) 68 sum += src_x[k] * x_filter[k]; 69 dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); 70 x_qn += x_step_qn; 71 } 72 src += src_stride; 73 dst += dst_stride; 74 } 75 } 76 #endif // CONFIG_AV1_HIGHBITDEPTH 77 78 void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, 79 int dst_stride, int w, int h, 80 const InterpFilterParams *filter_params_x, 81 const InterpFilterParams *filter_params_y, 82 const int subpel_x_qn, const int subpel_y_qn, 83 ConvolveParams *conv_params) { 84 int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; 85 int im_h = h + filter_params_y->taps - 1; 86 int im_stride = w; 87 assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); 88 const int fo_vert = filter_params_y->taps / 2 - 1; 89 const int fo_horiz = filter_params_x->taps / 2 - 1; 90 const int bd = 8; 91 const int bits = 92 FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; 93 94 // horizontal filter 95 const uint8_t *src_horiz = src - fo_vert * src_stride; 96 const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( 97 filter_params_x, subpel_x_qn & SUBPEL_MASK); 98 for (int y = 0; y < im_h; ++y) { 99 for (int x = 0; x < w; ++x) { 100 int32_t sum = (1 << (bd + FILTER_BITS - 1)); 101 for (int k = 0; k < filter_params_x->taps; ++k) { 102 sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; 103 } 104 105 // TODO(aomedia:3393): for 12-tap filter, in extreme cases, the result can 106 // be beyond the following range. For better prediction, a clamping can be 107 // added for 12 tap filter to ensure the horizontal filtering result is 108 // within 16 bit. The same applies to the vertical filtering. 109 assert(filter_params_x->taps > 8 || 110 (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)))); 111 im_block[y * im_stride + x] = 112 (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); 113 } 114 } 115 116 // vertical filter 117 int16_t *src_vert = im_block + fo_vert * im_stride; 118 const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( 119 filter_params_y, subpel_y_qn & SUBPEL_MASK); 120 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; 121 for (int y = 0; y < h; ++y) { 122 for (int x = 0; x < w; ++x) { 123 int32_t sum = 1 << offset_bits; 124 for (int k = 0; k < filter_params_y->taps; ++k) { 125 sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; 126 } 127 assert(filter_params_y->taps > 8 || 128 (0 <= sum && sum < (1 << (offset_bits + 2)))); 129 int16_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - 130 ((1 << (offset_bits - conv_params->round_1)) + 131 (1 << (offset_bits - conv_params->round_1 - 1))); 132 dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits)); 133 } 134 } 135 } 136 137 void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, 138 int dst_stride, int w, int h, 139 const InterpFilterParams *filter_params_y, 140 const int subpel_y_qn) { 141 const int fo_vert = filter_params_y->taps / 2 - 1; 142 143 // vertical filter 144 const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( 145 filter_params_y, subpel_y_qn & SUBPEL_MASK); 146 for (int y = 0; y < h; ++y) { 147 for (int x = 0; x < w; ++x) { 148 int32_t res = 0; 149 for (int k = 0; k < filter_params_y->taps; ++k) { 150 res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x]; 151 } 152 dst[y * dst_stride + x] = 153 clip_pixel(ROUND_POWER_OF_TWO(res, FILTER_BITS)); 154 } 155 } 156 } 157 158 void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, 159 int dst_stride, int w, int h, 160 const InterpFilterParams *filter_params_x, 161 const int subpel_x_qn, ConvolveParams *conv_params) { 162 const int fo_horiz = filter_params_x->taps / 2 - 1; 163 const int bits = FILTER_BITS - conv_params->round_0; 164 165 assert(bits >= 0); 166 assert((FILTER_BITS - conv_params->round_1) >= 0 || 167 ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); 168 169 // horizontal filter 170 const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( 171 filter_params_x, subpel_x_qn & SUBPEL_MASK); 172 173 for (int y = 0; y < h; ++y) { 174 for (int x = 0; x < w; ++x) { 175 int32_t res = 0; 176 for (int k = 0; k < filter_params_x->taps; ++k) { 177 res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; 178 } 179 res = ROUND_POWER_OF_TWO(res, conv_params->round_0); 180 dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits)); 181 } 182 } 183 } 184 185 // This function is exactly the same as av1_convolve_2d_sr_c, and is an 186 // optimized version for intrabc. Use the following 2-tap filter: 187 // DECLARE_ALIGNED(256, static const int16_t, 188 // av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = { 189 // 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 190 // 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 191 // }; 192 void av1_convolve_2d_sr_intrabc_c(const uint8_t *src, int src_stride, 193 uint8_t *dst, int dst_stride, int w, int h, 194 const InterpFilterParams *filter_params_x, 195 const InterpFilterParams *filter_params_y, 196 const int subpel_x_qn, const int subpel_y_qn, 197 ConvolveParams *conv_params) { 198 assert(subpel_x_qn == 8); 199 assert(subpel_y_qn == 8); 200 assert(filter_params_x->taps == 2 && filter_params_y->taps == 2); 201 assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS); 202 (void)filter_params_x; 203 (void)subpel_x_qn; 204 (void)filter_params_y; 205 (void)subpel_y_qn; 206 (void)conv_params; 207 208 int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; 209 int im_h = h + 1; 210 int im_stride = w; 211 assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); 212 const int bd = 8; 213 214 // horizontal filter 215 // explicitly operate for subpel_x_qn = 8. 216 int16_t *im = im_block; 217 for (int y = 0; y < im_h; ++y) { 218 for (int x = 0; x < w; ++x) { 219 const int32_t sum = (1 << bd) + src[x] + src[x + 1]; 220 assert(0 <= sum && sum < (1 << (bd + 2))); 221 im[x] = sum; 222 } 223 src += src_stride; 224 im += im_stride; 225 } 226 227 // vertical filter 228 // explicitly operate for subpel_y_qn = 8. 229 int16_t *src_vert = im_block; 230 for (int y = 0; y < h; ++y) { 231 for (int x = 0; x < w; ++x) { 232 const int32_t sum = 233 (1 << (bd + 2)) + src_vert[x] + src_vert[im_stride + x]; 234 assert(0 <= sum && sum < (1 << (bd + 4))); 235 const int16_t res = 236 ROUND_POWER_OF_TWO(sum, 2) - ((1 << bd) + (1 << (bd - 1))); 237 dst[x] = clip_pixel(res); 238 } 239 src_vert += im_stride; 240 dst += dst_stride; 241 } 242 } 243 244 // This function is exactly the same as av1_convolve_y_sr_c, and is an 245 // optimized version for intrabc. 246 void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride, 247 uint8_t *dst, int dst_stride, int w, int h, 248 const InterpFilterParams *filter_params_y, 249 const int subpel_y_qn) { 250 assert(subpel_y_qn == 8); 251 assert(filter_params_y->taps == 2); 252 (void)filter_params_y; 253 (void)subpel_y_qn; 254 255 // vertical filter 256 // explicitly operate for subpel_y_qn = 8. 257 for (int y = 0; y < h; ++y) { 258 for (int x = 0; x < w; ++x) { 259 const int32_t res = src[x] + src[src_stride + x]; 260 dst[x] = clip_pixel(ROUND_POWER_OF_TWO(res, 1)); 261 } 262 src += src_stride; 263 dst += dst_stride; 264 } 265 } 266 267 // This function is exactly the same as av1_convolve_x_sr_c, and is an 268 // optimized version for intrabc. 269 void av1_convolve_x_sr_intrabc_c(const uint8_t *src, int src_stride, 270 uint8_t *dst, int dst_stride, int w, int h, 271 const InterpFilterParams *filter_params_x, 272 const int subpel_x_qn, 273 ConvolveParams *conv_params) { 274 assert(subpel_x_qn == 8); 275 assert(filter_params_x->taps == 2); 276 assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS); 277 (void)filter_params_x; 278 (void)subpel_x_qn; 279 (void)conv_params; 280 281 // horizontal filter 282 // explicitly operate for subpel_x_qn = 8. 283 for (int y = 0; y < h; ++y) { 284 for (int x = 0; x < w; ++x) { 285 const int32_t res = src[x] + src[x + 1]; 286 dst[x] = clip_pixel(ROUND_POWER_OF_TWO(res, 1)); 287 } 288 src += src_stride; 289 dst += dst_stride; 290 } 291 } 292 293 void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, 294 uint8_t *dst, int dst_stride, int w, int h, 295 const InterpFilterParams *filter_params_x, 296 const InterpFilterParams *filter_params_y, 297 const int subpel_x_qn, const int subpel_y_qn, 298 ConvolveParams *conv_params) { 299 CONV_BUF_TYPE *dst16 = conv_params->dst; 300 int dst16_stride = conv_params->dst_stride; 301 int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; 302 int im_h = h + filter_params_y->taps - 1; 303 int im_stride = w; 304 const int fo_vert = filter_params_y->taps / 2 - 1; 305 const int fo_horiz = filter_params_x->taps / 2 - 1; 306 const int bd = 8; 307 const int round_bits = 308 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; 309 310 // horizontal filter 311 const uint8_t *src_horiz = src - fo_vert * src_stride; 312 const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( 313 filter_params_x, subpel_x_qn & SUBPEL_MASK); 314 for (int y = 0; y < im_h; ++y) { 315 for (int x = 0; x < w; ++x) { 316 int32_t sum = (1 << (bd + FILTER_BITS - 1)); 317 for (int k = 0; k < filter_params_x->taps; ++k) { 318 sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; 319 } 320 assert(filter_params_x->taps > 8 || 321 (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)))); 322 im_block[y * im_stride + x] = 323 (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); 324 } 325 } 326 327 // vertical filter 328 int16_t *src_vert = im_block + fo_vert * im_stride; 329 const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( 330 filter_params_y, subpel_y_qn & SUBPEL_MASK); 331 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; 332 for (int y = 0; y < h; ++y) { 333 for (int x = 0; x < w; ++x) { 334 int32_t sum = 1 << offset_bits; 335 for (int k = 0; k < filter_params_y->taps; ++k) { 336 sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; 337 } 338 assert(filter_params_y->taps > 8 || 339 (0 <= sum && sum < (1 << (offset_bits + 2)))); 340 CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); 341 if (conv_params->do_average) { 342 int32_t tmp = dst16[y * dst16_stride + x]; 343 if (conv_params->use_dist_wtd_comp_avg) { 344 tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; 345 tmp = tmp >> DIST_PRECISION_BITS; 346 } else { 347 tmp += res; 348 tmp = tmp >> 1; 349 } 350 tmp -= (1 << (offset_bits - conv_params->round_1)) + 351 (1 << (offset_bits - conv_params->round_1 - 1)); 352 dst[y * dst_stride + x] = 353 clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits)); 354 } else { 355 dst16[y * dst16_stride + x] = res; 356 } 357 } 358 } 359 } 360 361 void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, 362 int dst_stride, int w, int h, 363 const InterpFilterParams *filter_params_y, 364 const int subpel_y_qn, 365 ConvolveParams *conv_params) { 366 CONV_BUF_TYPE *dst16 = conv_params->dst; 367 int dst16_stride = conv_params->dst_stride; 368 const int fo_vert = filter_params_y->taps / 2 - 1; 369 const int bits = FILTER_BITS - conv_params->round_0; 370 const int bd = 8; 371 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; 372 const int round_offset = (1 << (offset_bits - conv_params->round_1)) + 373 (1 << (offset_bits - conv_params->round_1 - 1)); 374 const int round_bits = 375 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; 376 377 // vertical filter 378 const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( 379 filter_params_y, subpel_y_qn & SUBPEL_MASK); 380 for (int y = 0; y < h; ++y) { 381 for (int x = 0; x < w; ++x) { 382 int32_t res = 0; 383 for (int k = 0; k < filter_params_y->taps; ++k) { 384 res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x]; 385 } 386 res *= (1 << bits); 387 res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset; 388 389 if (conv_params->do_average) { 390 int32_t tmp = dst16[y * dst16_stride + x]; 391 if (conv_params->use_dist_wtd_comp_avg) { 392 tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; 393 tmp = tmp >> DIST_PRECISION_BITS; 394 } else { 395 tmp += res; 396 tmp = tmp >> 1; 397 } 398 tmp -= round_offset; 399 dst[y * dst_stride + x] = 400 clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits)); 401 } else { 402 dst16[y * dst16_stride + x] = res; 403 } 404 } 405 } 406 } 407 408 void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, 409 int dst_stride, int w, int h, 410 const InterpFilterParams *filter_params_x, 411 const int subpel_x_qn, 412 ConvolveParams *conv_params) { 413 CONV_BUF_TYPE *dst16 = conv_params->dst; 414 int dst16_stride = conv_params->dst_stride; 415 const int fo_horiz = filter_params_x->taps / 2 - 1; 416 const int bits = FILTER_BITS - conv_params->round_1; 417 const int bd = 8; 418 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; 419 const int round_offset = (1 << (offset_bits - conv_params->round_1)) + 420 (1 << (offset_bits - conv_params->round_1 - 1)); 421 const int round_bits = 422 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; 423 424 // horizontal filter 425 const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( 426 filter_params_x, subpel_x_qn & SUBPEL_MASK); 427 for (int y = 0; y < h; ++y) { 428 for (int x = 0; x < w; ++x) { 429 int32_t res = 0; 430 for (int k = 0; k < filter_params_x->taps; ++k) { 431 res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; 432 } 433 res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0); 434 res += round_offset; 435 436 if (conv_params->do_average) { 437 int32_t tmp = dst16[y * dst16_stride + x]; 438 if (conv_params->use_dist_wtd_comp_avg) { 439 tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; 440 tmp = tmp >> DIST_PRECISION_BITS; 441 } else { 442 tmp += res; 443 tmp = tmp >> 1; 444 } 445 tmp -= round_offset; 446 dst[y * dst_stride + x] = 447 clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits)); 448 } else { 449 dst16[y * dst16_stride + x] = res; 450 } 451 } 452 } 453 } 454 455 void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride, 456 uint8_t *dst, int dst_stride, int w, int h, 457 ConvolveParams *conv_params) { 458 CONV_BUF_TYPE *dst16 = conv_params->dst; 459 int dst16_stride = conv_params->dst_stride; 460 const int bits = 461 FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; 462 const int bd = 8; 463 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; 464 const int round_offset = (1 << (offset_bits - conv_params->round_1)) + 465 (1 << (offset_bits - conv_params->round_1 - 1)); 466 467 for (int y = 0; y < h; ++y) { 468 for (int x = 0; x < w; ++x) { 469 CONV_BUF_TYPE res = src[y * src_stride + x] << bits; 470 res += round_offset; 471 472 if (conv_params->do_average) { 473 int32_t tmp = dst16[y * dst16_stride + x]; 474 if (conv_params->use_dist_wtd_comp_avg) { 475 tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; 476 tmp = tmp >> DIST_PRECISION_BITS; 477 } else { 478 tmp += res; 479 tmp = tmp >> 1; 480 } 481 tmp -= round_offset; 482 dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); 483 } else { 484 dst16[y * dst16_stride + x] = res; 485 } 486 } 487 } 488 } 489 490 void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, 491 int dst_stride, int w, int h, 492 const InterpFilterParams *filter_params_x, 493 const InterpFilterParams *filter_params_y, 494 const int subpel_x_qn, const int x_step_qn, 495 const int subpel_y_qn, const int y_step_qn, 496 ConvolveParams *conv_params) { 497 int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; 498 int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + 499 filter_params_y->taps; 500 CONV_BUF_TYPE *dst16 = conv_params->dst; 501 const int dst16_stride = conv_params->dst_stride; 502 const int bits = 503 FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; 504 assert(bits >= 0); 505 int im_stride = w; 506 const int fo_vert = filter_params_y->taps / 2 - 1; 507 const int fo_horiz = filter_params_x->taps / 2 - 1; 508 const int bd = 8; 509 510 // horizontal filter 511 const uint8_t *src_horiz = src - fo_vert * src_stride; 512 for (int y = 0; y < im_h; ++y) { 513 int x_qn = subpel_x_qn; 514 for (int x = 0; x < w; ++x, x_qn += x_step_qn) { 515 const uint8_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)]; 516 const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; 517 assert(x_filter_idx < SUBPEL_SHIFTS); 518 const int16_t *x_filter = 519 av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx); 520 int32_t sum = (1 << (bd + FILTER_BITS - 1)); 521 for (int k = 0; k < filter_params_x->taps; ++k) { 522 sum += x_filter[k] * src_x[k - fo_horiz]; 523 } 524 assert(filter_params_x->taps > 8 || 525 (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)))); 526 im_block[y * im_stride + x] = 527 (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); 528 } 529 src_horiz += src_stride; 530 } 531 532 // vertical filter 533 int16_t *src_vert = im_block + fo_vert * im_stride; 534 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; 535 for (int x = 0; x < w; ++x) { 536 int y_qn = subpel_y_qn; 537 for (int y = 0; y < h; ++y, y_qn += y_step_qn) { 538 const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride]; 539 const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; 540 assert(y_filter_idx < SUBPEL_SHIFTS); 541 const int16_t *y_filter = 542 av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx); 543 int32_t sum = 1 << offset_bits; 544 for (int k = 0; k < filter_params_y->taps; ++k) { 545 sum += y_filter[k] * src_y[(k - fo_vert) * im_stride]; 546 } 547 assert(filter_params_y->taps > 8 || 548 (0 <= sum && sum < (1 << (offset_bits + 2)))); 549 CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); 550 if (conv_params->is_compound) { 551 if (conv_params->do_average) { 552 int32_t tmp = dst16[y * dst16_stride + x]; 553 if (conv_params->use_dist_wtd_comp_avg) { 554 tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; 555 tmp = tmp >> DIST_PRECISION_BITS; 556 } else { 557 tmp += res; 558 tmp = tmp >> 1; 559 } 560 /* Subtract round offset and convolve round */ 561 tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) + 562 (1 << (offset_bits - conv_params->round_1 - 1))); 563 dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); 564 } else { 565 dst16[y * dst16_stride + x] = res; 566 } 567 } else { 568 /* Subtract round offset and convolve round */ 569 int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) + 570 (1 << (offset_bits - conv_params->round_1 - 1))); 571 dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); 572 } 573 } 574 src_vert++; 575 } 576 } 577 578 static void convolve_2d_scale_wrapper( 579 const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, 580 int h, const InterpFilterParams *filter_params_x, 581 const InterpFilterParams *filter_params_y, const int subpel_x_qn, 582 const int x_step_qn, const int subpel_y_qn, const int y_step_qn, 583 ConvolveParams *conv_params) { 584 if (conv_params->is_compound) { 585 assert(conv_params->dst != NULL); 586 } 587 av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, filter_params_x, 588 filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn, 589 y_step_qn, conv_params); 590 } 591 592 static void convolve_2d_facade_compound( 593 const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, 594 int h, const InterpFilterParams *filter_params_x, 595 const InterpFilterParams *filter_params_y, const int subpel_x_qn, 596 const int subpel_y_qn, ConvolveParams *conv_params) { 597 const bool need_x = subpel_x_qn != 0; 598 const bool need_y = subpel_y_qn != 0; 599 if (!need_x && !need_y) { 600 av1_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h, 601 conv_params); 602 } else if (need_x && !need_y) { 603 av1_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h, 604 filter_params_x, subpel_x_qn, conv_params); 605 } else if (!need_x && need_y) { 606 av1_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h, 607 filter_params_y, subpel_y_qn, conv_params); 608 } else { 609 assert(need_y && need_x); 610 av1_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h, 611 filter_params_x, filter_params_y, subpel_x_qn, 612 subpel_y_qn, conv_params); 613 } 614 } 615 616 static void convolve_2d_facade_single( 617 const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, 618 int h, const InterpFilterParams *filter_params_x, 619 const InterpFilterParams *filter_params_y, const int subpel_x_qn, 620 const int subpel_y_qn, ConvolveParams *conv_params) { 621 const bool need_x = subpel_x_qn != 0; 622 const bool need_y = subpel_y_qn != 0; 623 if (!need_x && !need_y) { 624 aom_convolve_copy(src, src_stride, dst, dst_stride, w, h); 625 } else if (need_x && !need_y) { 626 av1_convolve_x_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x, 627 subpel_x_qn, conv_params); 628 } else if (!need_x && need_y) { 629 av1_convolve_y_sr(src, src_stride, dst, dst_stride, w, h, filter_params_y, 630 subpel_y_qn); 631 } else { 632 assert(need_x && need_y); 633 av1_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x, 634 filter_params_y, subpel_x_qn, subpel_y_qn, conv_params); 635 } 636 } 637 638 void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst, 639 int dst_stride, int w, int h, 640 const InterpFilterParams *interp_filters[2], 641 const int subpel_x_qn, int x_step_q4, 642 const int subpel_y_qn, int y_step_q4, int scaled, 643 ConvolveParams *conv_params) { 644 (void)x_step_q4; 645 (void)y_step_q4; 646 (void)dst; 647 (void)dst_stride; 648 649 const InterpFilterParams *filter_params_x = interp_filters[0]; 650 const InterpFilterParams *filter_params_y = interp_filters[1]; 651 652 // TODO(jingning, yunqing): Add SIMD support to 2-tap filter case. 653 // 2-tap filter indicates that it is for IntraBC. 654 if (filter_params_x->taps == 2 || filter_params_y->taps == 2) { 655 assert(filter_params_x->taps == 2 && filter_params_y->taps == 2); 656 assert(!scaled); 657 if (subpel_x_qn && subpel_y_qn) { 658 av1_convolve_2d_sr_intrabc(src, src_stride, dst, dst_stride, w, h, 659 filter_params_x, filter_params_y, subpel_x_qn, 660 subpel_y_qn, conv_params); 661 return; 662 } else if (subpel_x_qn) { 663 av1_convolve_x_sr_intrabc(src, src_stride, dst, dst_stride, w, h, 664 filter_params_x, subpel_x_qn, conv_params); 665 return; 666 } else if (subpel_y_qn) { 667 av1_convolve_y_sr_intrabc(src, src_stride, dst, dst_stride, w, h, 668 filter_params_y, subpel_y_qn); 669 return; 670 } 671 } 672 673 if (scaled) { 674 convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h, 675 filter_params_x, filter_params_y, subpel_x_qn, 676 x_step_q4, subpel_y_qn, y_step_q4, conv_params); 677 } else if (conv_params->is_compound) { 678 convolve_2d_facade_compound(src, src_stride, dst, dst_stride, w, h, 679 filter_params_x, filter_params_y, subpel_x_qn, 680 subpel_y_qn, conv_params); 681 } else { 682 convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h, 683 filter_params_x, filter_params_y, subpel_x_qn, 684 subpel_y_qn, conv_params); 685 } 686 } 687 688 #if CONFIG_AV1_HIGHBITDEPTH 689 void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, 690 uint16_t *dst, int dst_stride, int w, int h, 691 const InterpFilterParams *filter_params_x, 692 const int subpel_x_qn, 693 ConvolveParams *conv_params, int bd) { 694 const int fo_horiz = filter_params_x->taps / 2 - 1; 695 const int bits = FILTER_BITS - conv_params->round_0; 696 697 assert(bits >= 0); 698 assert((FILTER_BITS - conv_params->round_1) >= 0 || 699 ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); 700 701 // horizontal filter 702 const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( 703 filter_params_x, subpel_x_qn & SUBPEL_MASK); 704 for (int y = 0; y < h; ++y) { 705 for (int x = 0; x < w; ++x) { 706 int32_t res = 0; 707 for (int k = 0; k < filter_params_x->taps; ++k) { 708 res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; 709 } 710 res = ROUND_POWER_OF_TWO(res, conv_params->round_0); 711 dst[y * dst_stride + x] = 712 clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd); 713 } 714 } 715 } 716 717 void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, 718 uint16_t *dst, int dst_stride, int w, int h, 719 const InterpFilterParams *filter_params_y, 720 const int subpel_y_qn, int bd) { 721 const int fo_vert = filter_params_y->taps / 2 - 1; 722 // vertical filter 723 const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( 724 filter_params_y, subpel_y_qn & SUBPEL_MASK); 725 for (int y = 0; y < h; ++y) { 726 for (int x = 0; x < w; ++x) { 727 int32_t res = 0; 728 for (int k = 0; k < filter_params_y->taps; ++k) { 729 res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x]; 730 } 731 dst[y * dst_stride + x] = 732 clip_pixel_highbd(ROUND_POWER_OF_TWO(res, FILTER_BITS), bd); 733 } 734 } 735 } 736 737 void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, 738 uint16_t *dst, int dst_stride, int w, int h, 739 const InterpFilterParams *filter_params_x, 740 const InterpFilterParams *filter_params_y, 741 const int subpel_x_qn, const int subpel_y_qn, 742 ConvolveParams *conv_params, int bd) { 743 int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; 744 int im_h = h + filter_params_y->taps - 1; 745 int im_stride = w; 746 assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); 747 const int fo_vert = filter_params_y->taps / 2 - 1; 748 const int fo_horiz = filter_params_x->taps / 2 - 1; 749 const int bits = 750 FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; 751 assert(bits >= 0); 752 753 // horizontal filter 754 const uint16_t *src_horiz = src - fo_vert * src_stride; 755 const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( 756 filter_params_x, subpel_x_qn & SUBPEL_MASK); 757 for (int y = 0; y < im_h; ++y) { 758 for (int x = 0; x < w; ++x) { 759 int32_t sum = (1 << (bd + FILTER_BITS - 1)); 760 for (int k = 0; k < filter_params_x->taps; ++k) { 761 sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; 762 } 763 assert(filter_params_x->taps > 8 || 764 (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)))); 765 im_block[y * im_stride + x] = 766 ROUND_POWER_OF_TWO(sum, conv_params->round_0); 767 } 768 } 769 770 // vertical filter 771 int16_t *src_vert = im_block + fo_vert * im_stride; 772 const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( 773 filter_params_y, subpel_y_qn & SUBPEL_MASK); 774 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; 775 for (int y = 0; y < h; ++y) { 776 for (int x = 0; x < w; ++x) { 777 int32_t sum = 1 << offset_bits; 778 for (int k = 0; k < filter_params_y->taps; ++k) { 779 sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; 780 } 781 assert(filter_params_y->taps > 8 || 782 (0 <= sum && sum < (1 << (offset_bits + 2)))); 783 int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - 784 ((1 << (offset_bits - conv_params->round_1)) + 785 (1 << (offset_bits - conv_params->round_1 - 1))); 786 dst[y * dst_stride + x] = 787 clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd); 788 } 789 } 790 } 791 792 // This function is exactly the same as av1_highbd_convolve_2d_sr_c, and is an 793 // optimized version for intrabc. Use the following 2-tap filter: 794 // DECLARE_ALIGNED(256, static const int16_t, 795 // av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = { 796 // 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 797 // 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 798 // }; 799 void av1_highbd_convolve_2d_sr_intrabc_c( 800 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, 801 int h, const InterpFilterParams *filter_params_x, 802 const InterpFilterParams *filter_params_y, const int subpel_x_qn, 803 const int subpel_y_qn, ConvolveParams *conv_params, int bd) { 804 const int bits = 805 FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; 806 assert(bits >= 0); 807 assert(subpel_x_qn == 8); 808 assert(subpel_y_qn == 8); 809 assert(filter_params_x->taps == 2 && filter_params_y->taps == 2); 810 assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS); 811 (void)filter_params_x; 812 (void)subpel_x_qn; 813 (void)filter_params_y; 814 (void)subpel_y_qn; 815 (void)conv_params; 816 817 int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; 818 int im_h = h + 1; 819 int im_stride = w; 820 assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); 821 822 // horizontal filter 823 // explicitly operate for subpel_x_qn = 8. 824 int16_t *im = im_block; 825 for (int y = 0; y < im_h; ++y) { 826 for (int x = 0; x < w; ++x) { 827 int32_t sum = (1 << (bd + FILTER_BITS - 1)) + 64 * (src[x] + src[x + 1]); 828 assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))); 829 sum = ROUND_POWER_OF_TWO(sum, conv_params->round_0); 830 im[x] = sum; 831 } 832 src += src_stride; 833 im += im_stride; 834 } 835 836 // vertical filter 837 // explicitly operate for subpel_y_qn = 8. 838 int16_t *src_vert = im_block; 839 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; 840 for (int y = 0; y < h; ++y) { 841 for (int x = 0; x < w; ++x) { 842 const int32_t sum = 843 (1 << offset_bits) + 64 * (src_vert[x] + src_vert[im_stride + x]); 844 assert(0 <= sum && sum < (1 << (offset_bits + 2))); 845 const int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - 846 ((1 << (offset_bits - conv_params->round_1)) + 847 (1 << (offset_bits - conv_params->round_1 - 1))); 848 849 dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd); 850 } 851 src_vert += im_stride; 852 dst += dst_stride; 853 } 854 } 855 856 // This function is exactly the same as av1_highbd_convolve_y_sr_c, and is an 857 // optimized version for intrabc. 858 void av1_highbd_convolve_y_sr_intrabc_c( 859 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, 860 int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, 861 int bd) { 862 assert(subpel_y_qn == 8); 863 assert(filter_params_y->taps == 2); 864 (void)filter_params_y; 865 (void)subpel_y_qn; 866 867 // vertical filter 868 // explicitly operate for subpel_y_qn = 8. 869 for (int y = 0; y < h; ++y) { 870 for (int x = 0; x < w; ++x) { 871 const int32_t res = src[x] + src[src_stride + x]; 872 dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, 1), bd); 873 } 874 src += src_stride; 875 dst += dst_stride; 876 } 877 } 878 879 // This function is exactly the same as av1_highbd_convolve_x_sr_c, and is an 880 // optimized version for intrabc. 881 void av1_highbd_convolve_x_sr_intrabc_c( 882 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, 883 int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, 884 ConvolveParams *conv_params, int bd) { 885 const int bits = FILTER_BITS - conv_params->round_0; 886 assert(bits >= 0); 887 assert(subpel_x_qn == 8); 888 assert(filter_params_x->taps == 2); 889 assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS); 890 (void)filter_params_x; 891 (void)subpel_x_qn; 892 893 // horizontal filter 894 // explicitly operate for subpel_x_qn = 8. 895 for (int y = 0; y < h; ++y) { 896 for (int x = 0; x < w; ++x) { 897 int32_t res = 64 * (src[x] + src[x + 1]); 898 res = ROUND_POWER_OF_TWO(res, conv_params->round_0); 899 dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd); 900 } 901 src += src_stride; 902 dst += dst_stride; 903 } 904 } 905 906 void av1_highbd_dist_wtd_convolve_2d_c( 907 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, 908 int h, const InterpFilterParams *filter_params_x, 909 const InterpFilterParams *filter_params_y, const int subpel_x_qn, 910 const int subpel_y_qn, ConvolveParams *conv_params, int bd) { 911 int x, y, k; 912 int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; 913 CONV_BUF_TYPE *dst16 = conv_params->dst; 914 int dst16_stride = conv_params->dst_stride; 915 int im_h = h + filter_params_y->taps - 1; 916 int im_stride = w; 917 const int fo_vert = filter_params_y->taps / 2 - 1; 918 const int fo_horiz = filter_params_x->taps / 2 - 1; 919 const int round_bits = 920 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; 921 assert(round_bits >= 0); 922 923 // horizontal filter 924 const uint16_t *src_horiz = src - fo_vert * src_stride; 925 const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( 926 filter_params_x, subpel_x_qn & SUBPEL_MASK); 927 for (y = 0; y < im_h; ++y) { 928 for (x = 0; x < w; ++x) { 929 int32_t sum = (1 << (bd + FILTER_BITS - 1)); 930 for (k = 0; k < filter_params_x->taps; ++k) { 931 sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; 932 } 933 assert(filter_params_x->taps > 8 || 934 (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)))); 935 (void)bd; 936 im_block[y * im_stride + x] = 937 (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); 938 } 939 } 940 941 // vertical filter 942 int16_t *src_vert = im_block + fo_vert * im_stride; 943 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; 944 const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( 945 filter_params_y, subpel_y_qn & SUBPEL_MASK); 946 for (y = 0; y < h; ++y) { 947 for (x = 0; x < w; ++x) { 948 int32_t sum = 1 << offset_bits; 949 for (k = 0; k < filter_params_y->taps; ++k) { 950 sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; 951 } 952 assert(filter_params_y->taps > 8 || 953 (0 <= sum && sum < (1 << (offset_bits + 2)))); 954 CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); 955 if (conv_params->do_average) { 956 int32_t tmp = dst16[y * dst16_stride + x]; 957 if (conv_params->use_dist_wtd_comp_avg) { 958 tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; 959 tmp = tmp >> DIST_PRECISION_BITS; 960 } else { 961 tmp += res; 962 tmp = tmp >> 1; 963 } 964 tmp -= (1 << (offset_bits - conv_params->round_1)) + 965 (1 << (offset_bits - conv_params->round_1 - 1)); 966 dst[y * dst_stride + x] = 967 clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd); 968 } else { 969 dst16[y * dst16_stride + x] = res; 970 } 971 } 972 } 973 } 974 975 void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride, 976 uint16_t *dst, int dst_stride, int w, 977 int h, 978 const InterpFilterParams *filter_params_x, 979 const int subpel_x_qn, 980 ConvolveParams *conv_params, int bd) { 981 CONV_BUF_TYPE *dst16 = conv_params->dst; 982 int dst16_stride = conv_params->dst_stride; 983 const int fo_horiz = filter_params_x->taps / 2 - 1; 984 const int bits = FILTER_BITS - conv_params->round_1; 985 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; 986 const int round_offset = (1 << (offset_bits - conv_params->round_1)) + 987 (1 << (offset_bits - conv_params->round_1 - 1)); 988 const int round_bits = 989 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; 990 assert(round_bits >= 0); 991 assert(bits >= 0); 992 // horizontal filter 993 const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( 994 filter_params_x, subpel_x_qn & SUBPEL_MASK); 995 for (int y = 0; y < h; ++y) { 996 for (int x = 0; x < w; ++x) { 997 int32_t res = 0; 998 for (int k = 0; k < filter_params_x->taps; ++k) { 999 res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; 1000 } 1001 res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0); 1002 res += round_offset; 1003 1004 if (conv_params->do_average) { 1005 int32_t tmp = dst16[y * dst16_stride + x]; 1006 if (conv_params->use_dist_wtd_comp_avg) { 1007 tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; 1008 tmp = tmp >> DIST_PRECISION_BITS; 1009 } else { 1010 tmp += res; 1011 tmp = tmp >> 1; 1012 } 1013 tmp -= round_offset; 1014 dst[y * dst_stride + x] = 1015 clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd); 1016 } else { 1017 dst16[y * dst16_stride + x] = res; 1018 } 1019 } 1020 } 1021 } 1022 1023 void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride, 1024 uint16_t *dst, int dst_stride, int w, 1025 int h, 1026 const InterpFilterParams *filter_params_y, 1027 const int subpel_y_qn, 1028 ConvolveParams *conv_params, int bd) { 1029 CONV_BUF_TYPE *dst16 = conv_params->dst; 1030 int dst16_stride = conv_params->dst_stride; 1031 const int fo_vert = filter_params_y->taps / 2 - 1; 1032 const int bits = FILTER_BITS - conv_params->round_0; 1033 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; 1034 const int round_offset = (1 << (offset_bits - conv_params->round_1)) + 1035 (1 << (offset_bits - conv_params->round_1 - 1)); 1036 const int round_bits = 1037 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; 1038 assert(round_bits >= 0); 1039 assert(bits >= 0); 1040 // vertical filter 1041 const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( 1042 filter_params_y, subpel_y_qn & SUBPEL_MASK); 1043 for (int y = 0; y < h; ++y) { 1044 for (int x = 0; x < w; ++x) { 1045 int32_t res = 0; 1046 for (int k = 0; k < filter_params_y->taps; ++k) { 1047 res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x]; 1048 } 1049 res *= (1 << bits); 1050 res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset; 1051 1052 if (conv_params->do_average) { 1053 int32_t tmp = dst16[y * dst16_stride + x]; 1054 if (conv_params->use_dist_wtd_comp_avg) { 1055 tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; 1056 tmp = tmp >> DIST_PRECISION_BITS; 1057 } else { 1058 tmp += res; 1059 tmp = tmp >> 1; 1060 } 1061 tmp -= round_offset; 1062 dst[y * dst_stride + x] = 1063 clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd); 1064 } else { 1065 dst16[y * dst16_stride + x] = res; 1066 } 1067 } 1068 } 1069 } 1070 1071 void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride, 1072 uint16_t *dst, int dst_stride, 1073 int w, int h, 1074 ConvolveParams *conv_params, 1075 int bd) { 1076 CONV_BUF_TYPE *dst16 = conv_params->dst; 1077 int dst16_stride = conv_params->dst_stride; 1078 const int bits = 1079 FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; 1080 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; 1081 const int round_offset = (1 << (offset_bits - conv_params->round_1)) + 1082 (1 << (offset_bits - conv_params->round_1 - 1)); 1083 assert(bits >= 0); 1084 1085 for (int y = 0; y < h; ++y) { 1086 for (int x = 0; x < w; ++x) { 1087 CONV_BUF_TYPE res = src[y * src_stride + x] << bits; 1088 res += round_offset; 1089 if (conv_params->do_average) { 1090 int32_t tmp = dst16[y * dst16_stride + x]; 1091 if (conv_params->use_dist_wtd_comp_avg) { 1092 tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; 1093 tmp = tmp >> DIST_PRECISION_BITS; 1094 } else { 1095 tmp += res; 1096 tmp = tmp >> 1; 1097 } 1098 tmp -= round_offset; 1099 dst[y * dst_stride + x] = 1100 clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); 1101 } else { 1102 dst16[y * dst16_stride + x] = res; 1103 } 1104 } 1105 } 1106 } 1107 1108 void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, 1109 uint16_t *dst, int dst_stride, int w, int h, 1110 const InterpFilterParams *filter_params_x, 1111 const InterpFilterParams *filter_params_y, 1112 const int subpel_x_qn, const int x_step_qn, 1113 const int subpel_y_qn, const int y_step_qn, 1114 ConvolveParams *conv_params, int bd) { 1115 int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; 1116 int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + 1117 filter_params_y->taps; 1118 int im_stride = w; 1119 const int fo_vert = filter_params_y->taps / 2 - 1; 1120 const int fo_horiz = filter_params_x->taps / 2 - 1; 1121 CONV_BUF_TYPE *dst16 = conv_params->dst; 1122 const int dst16_stride = conv_params->dst_stride; 1123 const int bits = 1124 FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; 1125 assert(bits >= 0); 1126 // horizontal filter 1127 const uint16_t *src_horiz = src - fo_vert * src_stride; 1128 for (int y = 0; y < im_h; ++y) { 1129 int x_qn = subpel_x_qn; 1130 for (int x = 0; x < w; ++x, x_qn += x_step_qn) { 1131 const uint16_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)]; 1132 const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; 1133 assert(x_filter_idx < SUBPEL_SHIFTS); 1134 const int16_t *x_filter = 1135 av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx); 1136 int32_t sum = (1 << (bd + FILTER_BITS - 1)); 1137 for (int k = 0; k < filter_params_x->taps; ++k) { 1138 sum += x_filter[k] * src_x[k - fo_horiz]; 1139 } 1140 assert(filter_params_x->taps > 8 || 1141 (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)))); 1142 im_block[y * im_stride + x] = 1143 (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); 1144 } 1145 src_horiz += src_stride; 1146 } 1147 1148 // vertical filter 1149 int16_t *src_vert = im_block + fo_vert * im_stride; 1150 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; 1151 for (int x = 0; x < w; ++x) { 1152 int y_qn = subpel_y_qn; 1153 for (int y = 0; y < h; ++y, y_qn += y_step_qn) { 1154 const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride]; 1155 const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; 1156 assert(y_filter_idx < SUBPEL_SHIFTS); 1157 const int16_t *y_filter = 1158 av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx); 1159 int32_t sum = 1 << offset_bits; 1160 for (int k = 0; k < filter_params_y->taps; ++k) { 1161 sum += y_filter[k] * src_y[(k - fo_vert) * im_stride]; 1162 } 1163 assert(filter_params_y->taps > 8 || 1164 (0 <= sum && sum < (1 << (offset_bits + 2)))); 1165 CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); 1166 if (conv_params->is_compound) { 1167 if (conv_params->do_average) { 1168 int32_t tmp = dst16[y * dst16_stride + x]; 1169 if (conv_params->use_dist_wtd_comp_avg) { 1170 tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; 1171 tmp = tmp >> DIST_PRECISION_BITS; 1172 } else { 1173 tmp += res; 1174 tmp = tmp >> 1; 1175 } 1176 /* Subtract round offset and convolve round */ 1177 tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) + 1178 (1 << (offset_bits - conv_params->round_1 - 1))); 1179 dst[y * dst_stride + x] = 1180 clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); 1181 } else { 1182 dst16[y * dst16_stride + x] = res; 1183 } 1184 } else { 1185 /* Subtract round offset and convolve round */ 1186 int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) + 1187 (1 << (offset_bits - conv_params->round_1 - 1))); 1188 dst[y * dst_stride + x] = 1189 clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); 1190 } 1191 } 1192 src_vert++; 1193 } 1194 } 1195 1196 static void highbd_convolve_2d_facade_compound( 1197 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, 1198 const int w, const int h, const InterpFilterParams *filter_params_x, 1199 const InterpFilterParams *filter_params_y, const int subpel_x_qn, 1200 const int subpel_y_qn, ConvolveParams *conv_params, int bd) { 1201 const bool need_x = subpel_x_qn != 0; 1202 const bool need_y = subpel_y_qn != 0; 1203 if (!need_x && !need_y) { 1204 av1_highbd_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h, 1205 conv_params, bd); 1206 } else if (need_x && !need_y) { 1207 av1_highbd_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h, 1208 filter_params_x, subpel_x_qn, conv_params, 1209 bd); 1210 } else if (!need_x && need_y) { 1211 av1_highbd_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h, 1212 filter_params_y, subpel_y_qn, conv_params, 1213 bd); 1214 } else { 1215 assert(need_x && need_y); 1216 av1_highbd_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h, 1217 filter_params_x, filter_params_y, 1218 subpel_x_qn, subpel_y_qn, conv_params, bd); 1219 } 1220 } 1221 1222 static void highbd_convolve_2d_facade_single( 1223 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, 1224 const int w, const int h, const InterpFilterParams *filter_params_x, 1225 const InterpFilterParams *filter_params_y, const int subpel_x_qn, 1226 const int subpel_y_qn, ConvolveParams *conv_params, int bd) { 1227 const bool need_x = subpel_x_qn != 0; 1228 const bool need_y = subpel_y_qn != 0; 1229 1230 if (!need_x && !need_y) { 1231 aom_highbd_convolve_copy(src, src_stride, dst, dst_stride, w, h); 1232 } else if (need_x && !need_y) { 1233 av1_highbd_convolve_x_sr(src, src_stride, dst, dst_stride, w, h, 1234 filter_params_x, subpel_x_qn, conv_params, bd); 1235 } else if (!need_x && need_y) { 1236 av1_highbd_convolve_y_sr(src, src_stride, dst, dst_stride, w, h, 1237 filter_params_y, subpel_y_qn, bd); 1238 } else { 1239 assert(need_x && need_y); 1240 av1_highbd_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h, 1241 filter_params_x, filter_params_y, subpel_x_qn, 1242 subpel_y_qn, conv_params, bd); 1243 } 1244 } 1245 1246 void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride, 1247 uint8_t *dst8, int dst_stride, int w, int h, 1248 const InterpFilterParams *interp_filters[2], 1249 const int subpel_x_qn, int x_step_q4, 1250 const int subpel_y_qn, int y_step_q4, 1251 int scaled, ConvolveParams *conv_params, 1252 int bd) { 1253 (void)x_step_q4; 1254 (void)y_step_q4; 1255 (void)dst_stride; 1256 const uint16_t *src = CONVERT_TO_SHORTPTR(src8); 1257 1258 const InterpFilterParams *filter_params_x = interp_filters[0]; 1259 const InterpFilterParams *filter_params_y = interp_filters[1]; 1260 1261 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); 1262 // 2-tap filter indicates that it is for IntraBC. 1263 if (filter_params_x->taps == 2 || filter_params_y->taps == 2) { 1264 assert(filter_params_x->taps == 2 && filter_params_y->taps == 2); 1265 assert(!scaled); 1266 if (subpel_x_qn && subpel_y_qn) { 1267 av1_highbd_convolve_2d_sr_intrabc_c( 1268 src, src_stride, dst, dst_stride, w, h, filter_params_x, 1269 filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd); 1270 return; 1271 } else if (subpel_x_qn) { 1272 av1_highbd_convolve_x_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h, 1273 filter_params_x, subpel_x_qn, 1274 conv_params, bd); 1275 return; 1276 } else if (subpel_y_qn) { 1277 av1_highbd_convolve_y_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h, 1278 filter_params_y, subpel_y_qn, bd); 1279 return; 1280 } 1281 } 1282 1283 if (scaled) { 1284 if (conv_params->is_compound) { 1285 assert(conv_params->dst != NULL); 1286 } 1287 av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, 1288 filter_params_x, filter_params_y, subpel_x_qn, 1289 x_step_q4, subpel_y_qn, y_step_q4, conv_params, 1290 bd); 1291 } else if (conv_params->is_compound) { 1292 highbd_convolve_2d_facade_compound( 1293 src, src_stride, dst, dst_stride, w, h, filter_params_x, 1294 filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd); 1295 } else { 1296 highbd_convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h, 1297 filter_params_x, filter_params_y, 1298 subpel_x_qn, subpel_y_qn, conv_params, bd); 1299 } 1300 } 1301 #endif // CONFIG_AV1_HIGHBITDEPTH 1302 1303 // Note: Fixed size intermediate buffers, place limits on parameters 1304 // of some functions. 2d filtering proceeds in 2 steps: 1305 // (1) Interpolate horizontally into an intermediate buffer, temp. 1306 // (2) Interpolate temp vertically to derive the sub-pixel result. 1307 // Deriving the maximum number of rows in the temp buffer (135): 1308 // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). 1309 // --Largest block size is 128x128 pixels. 1310 // --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the 1311 // original frame (in 1/16th pixel units). 1312 // --Must round-up because block may be located at sub-pixel position. 1313 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. 1314 // --((128 - 1) * 32 + 15) >> 4 + 8 = 263. 1315 #define WIENER_MAX_EXT_SIZE 263 1316 1317 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 1318 static inline int horz_scalar_product(const uint8_t *a, const int16_t *b) { 1319 int sum = 0; 1320 for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k]; 1321 return sum; 1322 } 1323 1324 #if CONFIG_AV1_HIGHBITDEPTH 1325 static inline int highbd_horz_scalar_product(const uint16_t *a, 1326 const int16_t *b) { 1327 int sum = 0; 1328 for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k]; 1329 return sum; 1330 } 1331 #endif 1332 1333 static inline int highbd_vert_scalar_product(const uint16_t *a, 1334 ptrdiff_t a_stride, 1335 const int16_t *b) { 1336 int sum = 0; 1337 for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k]; 1338 return sum; 1339 } 1340 1341 static const InterpKernel *get_filter_base(const int16_t *filter) { 1342 // NOTE: This assumes that the filter table is 256-byte aligned. 1343 // TODO(agrange) Modify to make independent of table alignment. 1344 return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); 1345 } 1346 1347 static int get_filter_offset(const int16_t *f, const InterpKernel *base) { 1348 return (int)((const InterpKernel *)(intptr_t)f - base); 1349 } 1350 1351 static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride, 1352 uint16_t *dst, ptrdiff_t dst_stride, 1353 const InterpKernel *x_filters, int x0_q4, 1354 int x_step_q4, int w, int h, 1355 int round0_bits) { 1356 const int bd = 8; 1357 src -= SUBPEL_TAPS / 2 - 1; 1358 for (int y = 0; y < h; ++y) { 1359 int x_q4 = x0_q4; 1360 for (int x = 0; x < w; ++x) { 1361 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; 1362 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; 1363 const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) + 1364 (1 << (bd + FILTER_BITS - 1)); 1365 const int sum = horz_scalar_product(src_x, x_filter) + rounding; 1366 dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0, 1367 WIENER_CLAMP_LIMIT(round0_bits, bd) - 1); 1368 x_q4 += x_step_q4; 1369 } 1370 src += src_stride; 1371 dst += dst_stride; 1372 } 1373 } 1374 1375 static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride, 1376 uint8_t *dst, ptrdiff_t dst_stride, 1377 const InterpKernel *y_filters, int y0_q4, 1378 int y_step_q4, int w, int h, 1379 int round1_bits) { 1380 const int bd = 8; 1381 src -= src_stride * (SUBPEL_TAPS / 2 - 1); 1382 1383 for (int x = 0; x < w; ++x) { 1384 int y_q4 = y0_q4; 1385 for (int y = 0; y < h; ++y) { 1386 const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; 1387 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; 1388 const int rounding = 1389 ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) - 1390 (1 << (bd + round1_bits - 1)); 1391 const int sum = 1392 highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding; 1393 dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits)); 1394 y_q4 += y_step_q4; 1395 } 1396 ++src; 1397 ++dst; 1398 } 1399 } 1400 1401 void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, 1402 uint8_t *dst, ptrdiff_t dst_stride, 1403 const int16_t *filter_x, int x_step_q4, 1404 const int16_t *filter_y, int y_step_q4, 1405 int w, int h, 1406 const WienerConvolveParams *conv_params) { 1407 const InterpKernel *const filters_x = get_filter_base(filter_x); 1408 const int x0_q4 = get_filter_offset(filter_x, filters_x); 1409 1410 const InterpKernel *const filters_y = get_filter_base(filter_y); 1411 const int y0_q4 = get_filter_offset(filter_y, filters_y); 1412 1413 uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE]; 1414 const int intermediate_height = 1415 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1; 1416 memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE); 1417 1418 assert(w <= MAX_SB_SIZE); 1419 assert(h <= MAX_SB_SIZE); 1420 assert(y_step_q4 <= 32); 1421 assert(x_step_q4 <= 32); 1422 1423 convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1), 1424 src_stride, temp, MAX_SB_SIZE, filters_x, x0_q4, 1425 x_step_q4, w, intermediate_height, 1426 conv_params->round_0); 1427 convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), 1428 MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4, 1429 y_step_q4, w, h, conv_params->round_1); 1430 } 1431 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 1432 1433 #if CONFIG_AV1_HIGHBITDEPTH 1434 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 1435 static void highbd_convolve_add_src_horiz_hip( 1436 const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst, 1437 ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, 1438 int x_step_q4, int w, int h, int round0_bits, int bd) { 1439 const int extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd); 1440 uint16_t *src = CONVERT_TO_SHORTPTR(src8); 1441 src -= SUBPEL_TAPS / 2 - 1; 1442 for (int y = 0; y < h; ++y) { 1443 int x_q4 = x0_q4; 1444 for (int x = 0; x < w; ++x) { 1445 const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; 1446 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; 1447 const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) + 1448 (1 << (bd + FILTER_BITS - 1)); 1449 const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding; 1450 dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0, 1451 extraprec_clamp_limit - 1); 1452 x_q4 += x_step_q4; 1453 } 1454 src += src_stride; 1455 dst += dst_stride; 1456 } 1457 } 1458 1459 static void highbd_convolve_add_src_vert_hip( 1460 const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8, 1461 ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, 1462 int y_step_q4, int w, int h, int round1_bits, int bd) { 1463 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); 1464 src -= src_stride * (SUBPEL_TAPS / 2 - 1); 1465 for (int x = 0; x < w; ++x) { 1466 int y_q4 = y0_q4; 1467 for (int y = 0; y < h; ++y) { 1468 const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; 1469 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; 1470 const int rounding = 1471 ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) - 1472 (1 << (bd + round1_bits - 1)); 1473 const int sum = 1474 highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding; 1475 dst[y * dst_stride] = 1476 clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd); 1477 y_q4 += y_step_q4; 1478 } 1479 ++src; 1480 ++dst; 1481 } 1482 } 1483 1484 void av1_highbd_wiener_convolve_add_src_c( 1485 const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, 1486 ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, 1487 const int16_t *filter_y, int y_step_q4, int w, int h, 1488 const WienerConvolveParams *conv_params, int bd) { 1489 const InterpKernel *const filters_x = get_filter_base(filter_x); 1490 const int x0_q4 = get_filter_offset(filter_x, filters_x); 1491 1492 const InterpKernel *const filters_y = get_filter_base(filter_y); 1493 const int y0_q4 = get_filter_offset(filter_y, filters_y); 1494 1495 uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE]; 1496 const int intermediate_height = 1497 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; 1498 1499 assert(w <= MAX_SB_SIZE); 1500 assert(h <= MAX_SB_SIZE); 1501 assert(y_step_q4 <= 32); 1502 assert(x_step_q4 <= 32); 1503 assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16); 1504 1505 highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1), 1506 src_stride, temp, MAX_SB_SIZE, filters_x, 1507 x0_q4, x_step_q4, w, intermediate_height, 1508 conv_params->round_0, bd); 1509 highbd_convolve_add_src_vert_hip( 1510 temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride, 1511 filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd); 1512 } 1513 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 1514 #endif // CONFIG_AV1_HIGHBITDEPTH