/*
 * Copyright © 2024, VideoLAN and dav1d authors
 * Copyright © 2024, Luca Barbato
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/ppc/dav1d_types.h"
#include "src/ppc/itx.h"
#include "src/ppc/utils.h"

#if BITDEPTH == 8

#define LOAD_4(src, stride, a, b, c, d) \
{ \
    uint8_t *s = src; \
    a = vec_xl(0, s); \
    s += stride; \
    b = vec_xl(0, s); \
    s += stride; \
    c = vec_xl(0, s); \
    s += stride; \
    d = vec_xl(0, s); \
}

#define LOAD_DECLARE_2_I16(src, a, b) \
    i16x8 a = vec_xl(0, src); \
    i16x8 b = vec_xl(0, src + 8);

#define UNPACK_DECLARE_4_I16_I32(sa, sb, a, b, c, d) \
    i32x4 a = i16h_to_i32(sa); \
    i32x4 b = i16l_to_i32(sa); \
    i32x4 c = i16h_to_i32(sb); \
    i32x4 d = i16l_to_i32(sb);

#define LOAD_COEFF_4(coeff) \
    LOAD_DECLARE_2_I16(coeff, c01, c23) \
    UNPACK_DECLARE_4_I16_I32(c01, c23, c0, c1, c2, c3)

#define LOAD_SCALE_COEFF_4x8(coeff, scale) \
    LOAD_DECLARE_2_I16(coeff, c04, c15) \
    LOAD_DECLARE_2_I16(coeff+16, c26, c37) \
    i16x8 c01 = (i16x8)vec_mergeh((i64x2)c04, (i64x2)c15); \
    i16x8 c23 = (i16x8)vec_mergeh((i64x2)c26, (i64x2)c37); \
    i16x8 c45 = (i16x8)vec_mergel((i64x2)c04, (i64x2)c15); \
    i16x8 c67 = (i16x8)vec_mergel((i64x2)c26, (i64x2)c37); \
    c01 = vec_mradds(c01, scale, vec_splat_s16(0)); \
    c23 = vec_mradds(c23, scale, vec_splat_s16(0)); \
    UNPACK_DECLARE_4_I16_I32(c01, c23, c0, c1, c2, c3) \
    c45 = vec_mradds(c45, scale, vec_splat_s16(0)); \
    c67 = vec_mradds(c67, scale, vec_splat_s16(0)); \
    UNPACK_DECLARE_4_I16_I32(c45, c67, c4, c5, c6, c7)

#define LOAD_SCALE_COEFF_8x4(coeff, scale) \
    LOAD_DECLARE_2_I16(coeff, c01, c23) \
    LOAD_DECLARE_2_I16(coeff+16, c45, c67) \
    c01 = vec_mradds(c01, scale, vec_splat_s16(0)); \
    c23 = vec_mradds(c23, scale, vec_splat_s16(0)); \
    UNPACK_DECLARE_4_I16_I32(c01, c23, c0, c1, c2, c3) \
    c45 = vec_mradds(c45, scale, vec_splat_s16(0)); \
    c67 = vec_mradds(c67, scale, vec_splat_s16(0)); \
    UNPACK_DECLARE_4_I16_I32(c45, c67, c4, c5, c6, c7)
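
/* Convention used below: cN / cNM vectors hold packed rows of i16
 * coefficients, and the h/l suffixes (e.g. c0h/c0l) are the high and low
 * four lanes of a row once widened to i32, so an 8-wide row spans two
 * i32x4 registers. */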

#define LOAD_COEFF_8x8(coeff) \
    LOAD_DECLARE_2_I16(coeff, c0, c1) \
    LOAD_DECLARE_2_I16(coeff+16, c2, c3) \
    LOAD_DECLARE_2_I16(coeff+32, c4, c5) \
    LOAD_DECLARE_2_I16(coeff+48, c6, c7) \
    UNPACK_DECLARE_4_I16_I32(c0, c1, c0h, c0l, c1h, c1l) \
    UNPACK_DECLARE_4_I16_I32(c2, c3, c2h, c2l, c3h, c3l) \
    UNPACK_DECLARE_4_I16_I32(c4, c5, c4h, c4l, c5h, c5l) \
    UNPACK_DECLARE_4_I16_I32(c6, c7, c6h, c6l, c7h, c7l) \

#define LOAD_COEFF_4x16(coeff) \
    LOAD_DECLARE_2_I16(coeff, a0b0, c0d0) \
    LOAD_DECLARE_2_I16(coeff+16, a1b1, c1d1) \
    LOAD_DECLARE_2_I16(coeff+32, a2b2, c2d2) \
    LOAD_DECLARE_2_I16(coeff+48, a3b3, c3d3) \
    UNPACK_DECLARE_4_I16_I32(a0b0, c0d0, cA0, cB0, cC0, cD0) \
    UNPACK_DECLARE_4_I16_I32(a1b1, c1d1, cA1, cB1, cC1, cD1) \
    UNPACK_DECLARE_4_I16_I32(a2b2, c2d2, cA2, cB2, cC2, cD2) \
    UNPACK_DECLARE_4_I16_I32(a3b3, c3d3, cA3, cB3, cC3, cD3)

#define LOAD_DECLARE_4(src, stride, a, b, c, d) \
    u8x16 a, b, c, d; \
    LOAD_4(src, stride, a, b, c, d)

#define STORE_LEN(l, dst, stride, a, b, c, d) \
{ \
    uint8_t *dst2 = dst; \
    vec_xst_len(a, dst2, l); \
    dst2 += stride; \
    vec_xst_len(b, dst2, l); \
    dst2 += stride; \
    vec_xst_len(c, dst2, l); \
    dst2 += stride; \
    vec_xst_len(d, dst2, l); \
}

#define STORE_4(dst, stride, a, b, c, d) \
    STORE_LEN(4, dst, stride, a, b, c, d)

#define STORE_8(dst, stride, ab, cd, ef, gh) \
    STORE_LEN(8, dst, stride, ab, cd, ef, gh)

#define STORE_16(dst, stride, l0, l1, l2, l3) \
{ \
    uint8_t *dst##2 = dst; \
    vec_xst(l0, 0, dst##2); \
    dst##2 += stride; \
    vec_xst(l1, 0, dst##2); \
    dst##2 += stride; \
    vec_xst(l2, 0, dst##2); \
    dst##2 += stride; \
    vec_xst(l3, 0, dst##2); \
}

#define APPLY_COEFF_4(a, b, c, d, c01, c23) \
{ \
    u8x16 ab = (u8x16)vec_mergeh((u32x4)a, (u32x4)b); \
    u8x16 cd = (u8x16)vec_mergeh((u32x4)c, (u32x4)d); \
\
    c01 = vec_adds(c01, vec_splat_s16(8)); \
    c23 = vec_adds(c23, vec_splat_s16(8)); \
    c01 = vec_sra(c01, vec_splat_u16(4)); \
    c23 = vec_sra(c23, vec_splat_u16(4)); \
\
    i16x8 abs = u8h_to_i16(ab); \
    i16x8 cds = u8h_to_i16(cd); \
\
    abs = vec_adds(abs, c01); \
    cds = vec_adds(cds, c23); \
\
    a = vec_packsu(abs, abs); \
    c = vec_packsu(cds, cds); \
\
    b = (u8x16)vec_mergeo((u32x4)a, (u32x4)a); \
    d = (u8x16)vec_mergeo((u32x4)c, (u32x4)c); \
}

#define APPLY_COEFF_8x4(ab, cd, c01, c23) \
{ \
    i16x8 abs = u8h_to_i16(ab); \
    i16x8 cds = u8h_to_i16(cd); \
    c01 = vec_adds(c01, vec_splat_s16(8)); \
    c23 = vec_adds(c23, vec_splat_s16(8)); \
    c01 = vec_sra(c01, vec_splat_u16(4)); \
    c23 = vec_sra(c23, vec_splat_u16(4)); \
\
    abs = vec_adds(abs, c01); \
    cds = vec_adds(cds, c23); \
\
    ab = vec_packsu(abs, abs); \
    cd = vec_packsu(cds, cds); \
}

#define APPLY_COEFF_16x4(a, b, c, d, \
                         c00c01, c02c03, c04c05, c06c07, \
                         c08c09, c10c11, c12c13, c14c15) \
{ \
    i16x8 ah = u8h_to_i16(a); \
    i16x8 al = u8l_to_i16(a); \
    i16x8 bh = u8h_to_i16(b); \
    i16x8 bl = u8l_to_i16(b); \
    i16x8 ch = u8h_to_i16(c); \
    i16x8 cl = u8l_to_i16(c); \
    i16x8 dh = u8h_to_i16(d); \
    i16x8 dl = u8l_to_i16(d); \
    SCALE_ROUND_4(c00c01, c02c03, c04c05, c06c07, vec_splat_s16(8), vec_splat_u16(4)) \
    SCALE_ROUND_4(c08c09, c10c11, c12c13, c14c15, vec_splat_s16(8), vec_splat_u16(4)) \
\
    ah = vec_adds(ah, c00c01); \
    al = vec_adds(al, c02c03); \
    bh = vec_adds(bh, c04c05); \
    bl = vec_adds(bl, c06c07); \
    ch = vec_adds(ch, c08c09); \
    cl = vec_adds(cl, c10c11); \
    dh = vec_adds(dh, c12c13); \
    dl = vec_adds(dl, c14c15); \
\
    a = vec_packsu(ah, al); \
    b = vec_packsu(bh, bl); \
    c = vec_packsu(ch, cl); \
    d = vec_packsu(dh, dl); \
}
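
/* 4-point inverse DCT kernel. The constants are AV1's 12-bit rotation
 * factors: 2896 ~= 4096/sqrt(2) and 1567/3784 ~= 4096*sin/cos(pi/8);
 * adding 2048 before the >> 12 gives a rounded down-shift. */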

#define IDCT_4_INNER(c0, c1, c2, c3) \
{ \
    i32x4 o0 = vec_add(c0, c2); \
    i32x4 o1 = vec_sub(c0, c2); \
\
    i32x4 v2896 = vec_splats(2896); \
    i32x4 v1567 = vec_splats(1567); \
    i32x4 v3784 = vec_splats(3784); \
    i32x4 v2048 = vec_splats(2048); \
\
    o0 = vec_mul(o0, v2896); \
    o1 = vec_mul(o1, v2896); \
\
    i32x4 o2a = vec_mul(c1, v1567); \
    i32x4 o2b = vec_mul(c3, v3784); \
    i32x4 o3a = vec_mul(c1, v3784); \
    i32x4 o3b = vec_mul(c3, v1567); \
\
    i32x4 o2 = vec_sub(o2a, o2b); \
    i32x4 o3 = vec_add(o3a, o3b); \
\
    u32x4 v12 = vec_splat_u32(12); \
\
    o0 = vec_add(o0, v2048); \
    o1 = vec_add(o1, v2048); \
    o2 = vec_add(o2, v2048); \
    o3 = vec_add(o3, v2048); \
\
    o0 = vec_sra(o0, v12); \
    o1 = vec_sra(o1, v12); \
    o2 = vec_sra(o2, v12); \
    o3 = vec_sra(o3, v12); \
\
    c0 = vec_add(o0, o3); \
    c1 = vec_add(o1, o2); \
    c2 = vec_sub(o1, o2); \
    c3 = vec_sub(o0, o3); \
\
}

#define dct4_for_dct8(c0, c1, c2, c3, c03, c12) \
    IDCT_4_INNER(c0, c1, c2, c3) \
    c03 = vec_packs(c0, c3); \
    c12 = vec_packs(c1, c2); \

#define dct_4_in(c0, c1, c2, c3, c01, c23) \
{ \
    IDCT_4_INNER(c0, c1, c2, c3) \
    c01 = vec_packs(c0, c1); \
    c23 = vec_packs(c2, c3); \
    c0 = i16h_to_i32(c01); \
    c1 = i16l_to_i32(c01); \
    c2 = i16h_to_i32(c23); \
    c3 = i16l_to_i32(c23); \
}

#define dct_4_out(c0, c1, c2, c3, c01, c23) \
    IDCT_4_INNER(c0, c1, c2, c3) \
    c01 = vec_packs(c0, c1); \
    c23 = vec_packs(c2, c3); \


#define IDENTITY_4(c01, c23) \
{ \
    i16x8 v1697 = vec_splats((int16_t)(1697*8)); \
    i16x8 o01 = vec_mradds(c01, v1697, vec_splat_s16(0)); \
    i16x8 o23 = vec_mradds(c23, v1697, vec_splat_s16(0)); \
    c01 = vec_adds(c01, o01); \
    c23 = vec_adds(c23, o23); \
}

#define identity_4_in(c0, c1, c2, c3, c01, c23) \
{ \
    IDENTITY_4(c01, c23) \
    c0 = i16h_to_i32(c01); \
    c1 = i16l_to_i32(c01); \
    c2 = i16h_to_i32(c23); \
    c3 = i16l_to_i32(c23); \
}

#define identity_4_out(c0, c1, c2, c3, c01, c23) \
{ \
    c01 = vec_packs(c0, c1); \
    c23 = vec_packs(c2, c3); \
    IDENTITY_4(c01, c23) \
}
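
/* 4-point inverse ADST kernel, using the 12-bit AV1 sinpi constants
 * 1321, 2482, 3344 and 3803. The separate oc0..oc3 output arguments let
 * the flipadst variants below write the results in reverse order. */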

#define ADST_INNER_4(c0, c1, c2, c3, oc0, oc1, oc2, oc3) \
{ \
    i32x4 v1321 = vec_splats(1321); \
    i32x4 v3803 = vec_splats(3803); \
    i32x4 v2482 = vec_splats(2482); \
    i32x4 v3344 = vec_splats(3344); \
    i32x4 v2048 = vec_splats(2048); \
    i32x4 i0_v1321 = vec_mul(c0, v1321); \
    i32x4 i0_v2482 = vec_mul(c0, v2482); \
    i32x4 i0_v3803 = vec_mul(c0, v3803); \
    i32x4 i1 = vec_mul(c1, v3344); \
    i32x4 i2_v1321 = vec_mul(c2, v1321); \
    i32x4 i2_v2482 = vec_mul(c2, v2482); \
    i32x4 i2_v3803 = vec_mul(c2, v3803); \
    i32x4 i3_v1321 = vec_mul(c3, v1321); \
    i32x4 i3_v2482 = vec_mul(c3, v2482); \
    i32x4 i3_v3803 = vec_mul(c3, v3803); \
\
    i32x4 n1 = vec_sub(i1, v2048); \
    i1 = vec_add(i1, v2048); \
\
\
    i32x4 o0 = vec_add(i0_v1321, i2_v3803); \
    i32x4 o1 = vec_sub(i0_v2482, i2_v1321); \
    i32x4 o2 = vec_sub(c0, c2); \
    i32x4 o3 = vec_add(i0_v3803, i2_v2482); \
\
    o0 = vec_add(o0, i3_v2482); \
    o1 = vec_sub(o1, i3_v3803); \
    o2 = vec_add(o2, c3); \
    o3 = vec_sub(o3, i3_v1321); \
\
    o0 = vec_add(o0, i1); \
    o1 = vec_add(o1, i1); \
    o2 = vec_mul(o2, v3344); \
    o3 = vec_sub(o3, n1); \
\
    o2 = vec_add(o2, v2048); \
\
    oc0 = vec_sra(o0, vec_splat_u32(12)); \
    oc1 = vec_sra(o1, vec_splat_u32(12)); \
    oc2 = vec_sra(o2, vec_splat_u32(12)); \
    oc3 = vec_sra(o3, vec_splat_u32(12)); \
}

#define adst_4_in(c0, c1, c2, c3, c01, c23) \
{ \
    ADST_INNER_4(c0, c1, c2, c3, c0, c1, c2, c3) \
}

#define flipadst_4_in(c0, c1, c2, c3, c01, c23) \
{ \
    ADST_INNER_4(c0, c1, c2, c3, c3, c2, c1, c0) \
}

#define adst_4_out(c0, c1, c2, c3, c01, c23) \
{ \
    ADST_INNER_4(c0, c1, c2, c3, c0, c1, c2, c3) \
    c01 = vec_packs(c0, c1); \
    c23 = vec_packs(c2, c3); \
}

#define flipadst_4_out(c0, c1, c2, c3, c01, c23) \
{ \
    ADST_INNER_4(c0, c1, c2, c3, c3, c2, c1, c0) \
    c01 = vec_packs(c0, c1); \
    c23 = vec_packs(c2, c3); \
}
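
/* Fast path for eob < 1: only the DC coefficient is non-zero, so the 2-D
 * transform collapses to a single value added to every pixel. Each
 * (dc * 181 + 128) >> 8 step multiplies by ~1/sqrt(2) (181/256), with an
 * extra step for rect2 (rectangular) sizes, and the final
 * (dc * 181 + 128 + 2048) >> 12 folds the last scale and the
 * (x + 8) >> 4 output rounding together; e.g. dc = 100 with shift = 0
 * becomes 71, then 3, so 3 is added to each pixel. */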

static void dc_only_4xN(uint8_t *dst, const ptrdiff_t stride, int16_t *const coeff, int n, int is_rect2, int shift)
{
    int dc = coeff[0];
    const int rnd = (1 << shift) >> 1;
    if (is_rect2)
        dc = (dc * 181 + 128) >> 8;
    dc = (dc * 181 + 128) >> 8;
    dc = (dc + rnd) >> shift;
    dc = (dc * 181 + 128 + 2048) >> 12;

    i16x8 vdc = vec_splats((int16_t)dc);
    coeff[0] = 0;
    for (int i = 0; i < n; i++, dst += 4 * stride) {
        LOAD_DECLARE_4(dst, stride, a, b, c, d)

        i16x8 as = u8h_to_i16(a);
        i16x8 bs = u8h_to_i16(b);
        i16x8 cs = u8h_to_i16(c);
        i16x8 ds = u8h_to_i16(d);

        as = vec_adds(as, vdc);
        bs = vec_adds(bs, vdc);
        cs = vec_adds(cs, vdc);
        ds = vec_adds(ds, vdc);

        a = vec_packsu(as, as);
        b = vec_packsu(bs, bs);
        c = vec_packsu(cs, cs);
        d = vec_packsu(ds, ds);

        STORE_4(dst, stride, a, b, c, d)
    }
}

static void dc_only_8xN(uint8_t *dst, const ptrdiff_t stride, int16_t *const coeff, int n, int is_rect2, int shift)
{
    int dc = coeff[0];
    const int rnd = (1 << shift) >> 1;
    if (is_rect2)
        dc = (dc * 181 + 128) >> 8;
    dc = (dc * 181 + 128) >> 8;
    dc = (dc + rnd) >> shift;
    dc = (dc * 181 + 128 + 2048) >> 12;

    i16x8 vdc = vec_splats((int16_t)dc);
    coeff[0] = 0;

    for (int i = 0; i < n; i++, dst += 4 * stride) {
        LOAD_DECLARE_4(dst, stride, a, b, c, d)

        i16x8 as = u8h_to_i16(a);
        i16x8 bs = u8h_to_i16(b);
        i16x8 cs = u8h_to_i16(c);
        i16x8 ds = u8h_to_i16(d);

        as = vec_adds(as, vdc);
        bs = vec_adds(bs, vdc);
        cs = vec_adds(cs, vdc);
        ds = vec_adds(ds, vdc);

        a = vec_packsu(as, as);
        b = vec_packsu(bs, bs);
        c = vec_packsu(cs, cs);
        d = vec_packsu(ds, ds);

        STORE_8(dst, stride, a, b, c, d)
    }
}

static void dc_only_16xN(uint8_t *dst, const ptrdiff_t stride, int16_t *const coeff, int n, int is_rect2, int shift)
{
    int dc = coeff[0];
    const int rnd = (1 << shift) >> 1;
    if (is_rect2)
        dc = (dc * 181 + 128) >> 8;
    dc = (dc * 181 + 128) >> 8;
    dc = (dc + rnd) >> shift;
    dc = (dc * 181 + 128 + 2048) >> 12;

    i16x8 vdc = vec_splats((int16_t)dc);
    coeff[0] = 0;

    for (int i = 0; i < n; i++, dst += 4 * stride) {
        LOAD_DECLARE_4(dst, stride, a, b, c, d)

        i16x8 ah = u8h_to_i16(a);
        i16x8 bh = u8h_to_i16(b);
        i16x8 ch = u8h_to_i16(c);
        i16x8 dh = u8h_to_i16(d);
        i16x8 al = u8l_to_i16(a);
        i16x8 bl = u8l_to_i16(b);
        i16x8 cl = u8l_to_i16(c);
        i16x8 dl = u8l_to_i16(d);

        ah = vec_adds(ah, vdc);
        bh = vec_adds(bh, vdc);
        ch = vec_adds(ch, vdc);
        dh = vec_adds(dh, vdc);
        al = vec_adds(al, vdc);
        bl = vec_adds(bl, vdc);
        cl = vec_adds(cl, vdc);
        dl = vec_adds(dl, vdc);

        a = vec_packsu(ah, al);
        b = vec_packsu(bh, bl);
        c = vec_packsu(ch, cl);
        d = vec_packsu(dh, dl);

        STORE_16(dst, stride, a, b, c, d)
    }
}

void dav1d_inv_txfm_add_dct_dct_4x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
                                              int16_t *const coeff, const int eob)
{
    assert(eob >= 0);

    if (eob < 1) {
        return dc_only_4xN(dst, stride, coeff, 1, 0, 0);
    }

    LOAD_COEFF_4(coeff)

    dct_4_in(c0, c1, c2, c3, c01, c23)

    TRANSPOSE4_I32(c0, c1, c2, c3)

    memset(coeff, 0, sizeof(*coeff) * 4 * 4);

    dct_4_out(c0, c1, c2, c3, c01, c23)

    LOAD_DECLARE_4(dst, stride, a, b, c, d)

    APPLY_COEFF_4(a, b, c, d, c01, c23)

    STORE_4(dst, stride, a, b, c, d)
}

void dav1d_inv_txfm_add_wht_wht_4x4_8bpc_pwr9(pixel *dst, const ptrdiff_t stride,
                                              coef *const coeff, const int eob)
{
    LOAD_COEFF_4(coeff)

    u32x4 v2 = vec_splat_u32(2);

    c0 = vec_sra(c0, v2);
    c1 = vec_sra(c1, v2);
    c2 = vec_sra(c2, v2);
    c3 = vec_sra(c3, v2);

    i32x4 t0 = vec_add(c0, c1);
    i32x4 t2 = vec_sub(c2, c3);
    i32x4 t4 = vec_sra(vec_sub(t0, t2), vec_splat_u32(1));
    i32x4 t3 = vec_sub(t4, c3);
    i32x4 t1 = vec_sub(t4, c1);
    c0 = vec_sub(t0, t3);
    c1 = t3;
    c2 = t1;
    c3 = vec_add(t2, t1);

    memset(coeff, 0, sizeof(*coeff) * 4 * 4);

    TRANSPOSE4_I32(c0, c1, c2, c3)

    t0 = vec_add(c0, c1);
    t2 = vec_sub(c2, c3);
    t4 = vec_sra(vec_sub(t0, t2), vec_splat_u32(1));
    t3 = vec_sub(t4, c3);
    t1 = vec_sub(t4, c1);
    c0 = vec_sub(t0, t3);
    c1 = t3;
    c2 = t1;
    c3 = vec_add(t2, t1);

    c01 = vec_packs(c0, c1);
    c23 = vec_packs(c2, c3);

    LOAD_DECLARE_4(dst, stride, a, b, c, d)

    u8x16 ab = (u8x16)vec_mergeh((u32x4)a, (u32x4)b);
    u8x16 cd = (u8x16)vec_mergeh((u32x4)c, (u32x4)d);

    i16x8 abs = u8h_to_i16(ab);
    i16x8 cds = u8h_to_i16(cd);

    abs = vec_adds(abs, c01);
    cds = vec_adds(cds, c23);

    a = vec_packsu(abs, abs);
    c = vec_packsu(cds, cds);

    b = (u8x16)vec_mergeo((u32x4)a, (u32x4)a);
    d = (u8x16)vec_mergeo((u32x4)c, (u32x4)c);

    STORE_4(dst, stride, a, b, c, d)
}

#define inv_txfm_fn4x4(type1, type2) \
void dav1d_inv_txfm_add_##type1##_##type2##_4x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
                                                          int16_t *const coeff, const int eob) \
{ \
    LOAD_COEFF_4(coeff) \
    type1##_4_in(c0, c1, c2, c3, c01, c23) \
    memset(coeff, 0, sizeof(*coeff) * 4 * 4); \
    TRANSPOSE4_I32(c0, c1, c2, c3) \
    type2##_4_out(c0, c1, c2, c3, c01, c23) \
    LOAD_DECLARE_4(dst, stride, a, b, c, d) \
    APPLY_COEFF_4(a, b, c, d, c01, c23) \
    STORE_4(dst, stride, a, b, c, d) \
}

inv_txfm_fn4x4(adst,     dct     )
inv_txfm_fn4x4(dct,      adst    )
inv_txfm_fn4x4(dct,      flipadst)
inv_txfm_fn4x4(flipadst, dct     )
inv_txfm_fn4x4(adst,     flipadst)
inv_txfm_fn4x4(flipadst, adst    )
inv_txfm_fn4x4(identity, dct     )
inv_txfm_fn4x4(dct,      identity)
inv_txfm_fn4x4(identity, flipadst)
inv_txfm_fn4x4(flipadst, identity)
inv_txfm_fn4x4(identity, adst    )
inv_txfm_fn4x4(adst,     identity)
inv_txfm_fn4x4(identity, identity)
inv_txfm_fn4x4(adst,     adst    )
inv_txfm_fn4x4(flipadst, flipadst)
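
/* 8-point inverse DCT: even inputs go through dct4_for_dct8, odd inputs
 * through a rotation stage with 799/4017 and 3406/2276 (12-bit sin/cos
 * pairs), then t5/t6 get the 181 ... >> 8 (~1/sqrt(2)) half-butterfly.
 * The vec_packs round-trips clamp intermediates to the i16 range. */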

#define IDCT_8_INNER(c0, c1, c2, c3, c4, c5, c6, c7, c03, c12, c74, c65) \
    dct4_for_dct8(c0, c2, c4, c6, c03, c12) \
\
    i32x4 v799 = vec_splats(799); \
    i32x4 v4017 = vec_splats(4017); \
    i32x4 v3406 = vec_splats(3406); \
    i32x4 v2276 = vec_splats(2276); \
    i32x4 v2048 = vec_splats(2048); \
    u32x4 v12 = vec_splat_u32(12); \
\
    i32x4 c1v799 = vec_mul(c1, v799); \
    i32x4 c7v4017 = vec_mul(c7, v4017); \
    i32x4 c5v3406 = vec_mul(c5, v3406); \
    i32x4 c3v2276 = vec_mul(c3, v2276); \
    i32x4 c5v2276 = vec_mul(c5, v2276); \
    i32x4 c3v3406 = vec_mul(c3, v3406); \
    i32x4 c1v4017 = vec_mul(c1, v4017); \
    i32x4 c7v799 = vec_mul(c7, v799); \
\
    i32x4 t4a = vec_subs(c1v799, c7v4017); \
    i32x4 t5a = vec_subs(c5v3406, c3v2276); \
    i32x4 t6a = vec_adds(c5v2276, c3v3406); \
    i32x4 t7a = vec_adds(c1v4017, c7v799); \
\
    t4a = vec_adds(t4a, v2048); \
    t5a = vec_adds(t5a, v2048); \
    t6a = vec_adds(t6a, v2048); \
    t7a = vec_adds(t7a, v2048); \
\
    t4a = vec_sra(t4a, v12); \
    t7a = vec_sra(t7a, v12); \
    t5a = vec_sra(t5a, v12); \
    t6a = vec_sra(t6a, v12); \
\
    i16x8 t7at4a = vec_packs(t7a, t4a); \
    i16x8 t6at5a = vec_packs(t6a, t5a); \
\
    i16x8 t7t4 = vec_adds(t7at4a, t6at5a); \
    t6at5a = vec_subs(t7at4a, t6at5a); \
\
    t6a = i16h_to_i32(t6at5a); \
    t5a = i16l_to_i32(t6at5a); \
\
    i32x4 t6 = vec_add(t6a, t5a); \
    i32x4 t5 = vec_sub(t6a, t5a); \
\
    t6 = vec_mul(t6, vec_splats(181)); \
    t5 = vec_mul(t5, vec_splats(181)); \
    t6 = vec_add(t6, vec_splats(128)); \
    t5 = vec_add(t5, vec_splats(128)); \
\
    t6 = vec_sra(t6, vec_splat_u32(8)); \
    t5 = vec_sra(t5, vec_splat_u32(8)); \
\
    i16x8 t6t5 = vec_packs(t6, t5); \
\
    c74 = vec_subs(c03, t7t4); \
    c65 = vec_subs(c12, t6t5); \
    c03 = vec_adds(c03, t7t4); \
    c12 = vec_adds(c12, t6t5); \

#define UNPACK_4_I16_I32(t0, t1, t2, t3) \
    t0 = i16h_to_i32(t0##t1); \
    t1 = i16l_to_i32(t0##t1); \
    t2 = i16h_to_i32(t2##t3); \
    t3 = i16l_to_i32(t2##t3);

#define UNPACK_PAIR_I16_I32(hi, lo, v) \
    hi = i16h_to_i32(v); \
    lo = i16l_to_i32(v); \
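
/* dct_8_in takes trailing ... so it can be called through the generic
 * type##_8_in interface and silently drop the pack destinations other
 * kernels use; its local pair names are built by token pasting. */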

#define dct_8_in(c0, c1, c2, c3, c4, c5, c6, c7, ...) \
{ \
    i16x8 c0##c3, c1##c2, c7##c4, c6##c5; \
    IDCT_8_INNER(c0, c1, c2, c3, c4, c5, c6, c7, c0##c3, c1##c2, c7##c4, c6##c5) \
    UNPACK_4_I16_I32(c0, c3, c1, c2) \
    UNPACK_4_I16_I32(c7, c4, c6, c5) \
}

#define dct_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
{ \
    i16x8 c03, c12, c74, c65; \
    IDCT_8_INNER(c0, c1, c2, c3, c4, c5, c6, c7, c03, c12, c74, c65) \
    c01 = (i16x8)vec_mergeh((u64x2)c03, (u64x2)c12); \
    c23 = (i16x8)vec_mergel((u64x2)c12, (u64x2)c03); \
    c45 = (i16x8)vec_mergel((u64x2)c74, (u64x2)c65); \
    c67 = (i16x8)vec_mergeh((u64x2)c65, (u64x2)c74); \
}

#define dct_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                   c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                   c0, c1, c2, c3, c4, c5, c6, c7) \
{ \
    dct_8_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h,) \
    dct_8_in(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l,) \
}

#define dct_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                    c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                    c0, c1, c2, c3, c4, c5, c6, c7) \
{ \
    i16x8 c03h, c12h, c74h, c65h; \
    i16x8 c03l, c12l, c74l, c65l; \
    { \
        IDCT_8_INNER(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, c03h, c12h, c74h, c65h) \
    } \
    { \
        IDCT_8_INNER(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, c03l, c12l, c74l, c65l) \
    } \
    c0 = (i16x8)vec_mergeh((u64x2)c03h, (u64x2)c03l); \
    c3 = (i16x8)vec_mergel((u64x2)c03h, (u64x2)c03l); \
    c1 = (i16x8)vec_mergeh((u64x2)c12h, (u64x2)c12l); \
    c2 = (i16x8)vec_mergel((u64x2)c12h, (u64x2)c12l); \
    c7 = (i16x8)vec_mergeh((u64x2)c74h, (u64x2)c74l); \
    c4 = (i16x8)vec_mergel((u64x2)c74h, (u64x2)c74l); \
    c6 = (i16x8)vec_mergeh((u64x2)c65h, (u64x2)c65l); \
    c5 = (i16x8)vec_mergel((u64x2)c65h, (u64x2)c65l); \
}

#define IDENTITY_8(c01, c23, c45, c67) \
{ \
    c01 = vec_adds(c01, c01); \
    c23 = vec_adds(c23, c23); \
    c45 = vec_adds(c45, c45); \
    c67 = vec_adds(c67, c67); \
}

#define identity_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
{ \
    IDENTITY_8(c01, c23, c45, c67) \
    UNPACK_PAIR_I16_I32(c0, c1, c01) \
    UNPACK_PAIR_I16_I32(c2, c3, c23) \
    UNPACK_PAIR_I16_I32(c4, c5, c45) \
    UNPACK_PAIR_I16_I32(c6, c7, c67) \
}

#define identity_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
    c01 = vec_packs(c0, c1); \
    c23 = vec_packs(c2, c3); \
    c45 = vec_packs(c4, c5); \
    c67 = vec_packs(c6, c7); \
    IDENTITY_8(c01, c23, c45, c67)

#define identity_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                        c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                        c0, c1, c2, c3, c4, c5, c6, c7) \
{ \
    IDENTITY_8(c0, c1, c2, c3) \
    IDENTITY_8(c4, c5, c6, c7) \
    UNPACK_PAIR_I16_I32(c0h, c0l, c0) \
    UNPACK_PAIR_I16_I32(c1h, c1l, c1) \
    UNPACK_PAIR_I16_I32(c2h, c2l, c2) \
    UNPACK_PAIR_I16_I32(c3h, c3l, c3) \
    UNPACK_PAIR_I16_I32(c4h, c4l, c4) \
    UNPACK_PAIR_I16_I32(c5h, c5l, c5) \
    UNPACK_PAIR_I16_I32(c6h, c6l, c6) \
    UNPACK_PAIR_I16_I32(c7h, c7l, c7) \
}

#define PACK_4(c0, c1, c2, c3, \
               c0h, c1h, c2h, c3h, \
               c0l, c1l, c2l, c3l) \
{ \
    c0 = vec_packs(c0h, c0l); \
    c1 = vec_packs(c1h, c1l); \
    c2 = vec_packs(c2h, c2l); \
    c3 = vec_packs(c3h, c3l); \
}

#define DECLARE_PACK_4(c0, c1, c2, c3, \
                       c0h, c1h, c2h, c3h, \
                       c0l, c1l, c2l, c3l) \
    i16x8 c0, c1, c2, c3; \
    PACK_4(c0, c1, c2, c3, c0h, c1h, c2h, c3h, c0l, c1l, c2l, c3l);

#define PACK_8(c0, c1, c2, c3, c4, c5, c6, c7, \
               c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
               c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
{ \
    c0 = vec_packs(c0h, c0l); \
    c1 = vec_packs(c1h, c1l); \
    c2 = vec_packs(c2h, c2l); \
    c3 = vec_packs(c3h, c3l); \
    c4 = vec_packs(c4h, c4l); \
    c5 = vec_packs(c5h, c5l); \
    c6 = vec_packs(c6h, c6l); \
    c7 = vec_packs(c7h, c7l); \
}

#define identity_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                         c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                         c0, c1, c2, c3, c4, c5, c6, c7) \
{ \
    PACK_8(c0, c1, c2, c3, c4, c5, c6, c7, \
           c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
           c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
    IDENTITY_8(c0, c1, c2, c3) \
    IDENTITY_8(c4, c5, c6, c7) \
}

#define DECLARE_SPLAT_I32(val) \
    i32x4 v##val = vec_splats(val);

#define DECLARE_MUL_PAIR_I32(ca, cb, va, vb) \
    i32x4 ca##va = vec_mul(ca, va); \
    i32x4 cb##vb = vec_mul(cb, vb); \
    i32x4 ca##vb = vec_mul(ca, vb); \
    i32x4 cb##va = vec_mul(cb, va);

#define ADD_SUB_PAIR(r0, r1, ca, cb, va, vb) \
    r0 = vec_adds(ca##va, cb##vb); \
    r1 = vec_subs(ca##vb, cb##va);

#define DECLARE_ADD_SUB_PAIR(r0, r1, ca, cb, va, vb) \
    i32x4 r0, r1; \
    ADD_SUB_PAIR(r0, r1, ca, cb, va, vb)

#define SCALE_ROUND_4(a, b, c, d, rnd, shift) \
    a = vec_adds(a, rnd); \
    b = vec_adds(b, rnd); \
    c = vec_adds(c, rnd); \
    d = vec_adds(d, rnd); \
    a = vec_sra(a, shift); \
    b = vec_sra(b, shift); \
    c = vec_sra(c, shift); \
    d = vec_sra(d, shift);
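
/* 8-point inverse ADST. The first-stage pairs (4076/401, 3612/1931,
 * 2598/3166, 1189/3920) appear to be 12-bit cos/sin values of odd
 * multiples of pi/32, with the pi/8 pair 3784/1567 reused from the DCT;
 * the negations at the end implement the sign flips on the odd
 * outputs. */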

#define ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \
                     o0, o1, o2, o3, o4, o5, o6, o7) \
{ \
    DECLARE_SPLAT_I32(4076) \
    DECLARE_SPLAT_I32(401) \
\
    DECLARE_SPLAT_I32(3612) \
    DECLARE_SPLAT_I32(1931) \
\
    DECLARE_SPLAT_I32(2598) \
    DECLARE_SPLAT_I32(3166) \
\
    DECLARE_SPLAT_I32(1189) \
    DECLARE_SPLAT_I32(3920) \
\
    DECLARE_SPLAT_I32(3784) \
    DECLARE_SPLAT_I32(1567) \
\
    DECLARE_SPLAT_I32(2048) \
    u32x4 v12 = vec_splat_u32(12); \
\
    DECLARE_MUL_PAIR_I32(c7, c0, v4076, v401) \
    DECLARE_MUL_PAIR_I32(c5, c2, v3612, v1931) \
    DECLARE_MUL_PAIR_I32(c3, c4, v2598, v3166) \
    DECLARE_MUL_PAIR_I32(c1, c6, v1189, v3920) \
\
    DECLARE_ADD_SUB_PAIR(t0a, t1a, c7, c0, v4076, v401) \
    DECLARE_ADD_SUB_PAIR(t2a, t3a, c5, c2, v3612, v1931) \
    DECLARE_ADD_SUB_PAIR(t4a, t5a, c3, c4, v2598, v3166) \
    DECLARE_ADD_SUB_PAIR(t6a, t7a, c1, c6, v1189, v3920) \
\
    SCALE_ROUND_4(t0a, t1a, t2a, t3a, v2048, v12) \
    SCALE_ROUND_4(t4a, t5a, t6a, t7a, v2048, v12) \
\
    i32x4 t0 = vec_add(t0a, t4a); \
    i32x4 t1 = vec_add(t1a, t5a); \
    i32x4 t2 = vec_add(t2a, t6a); \
    i32x4 t3 = vec_add(t3a, t7a); \
    i32x4 t4 = vec_sub(t0a, t4a); \
    i32x4 t5 = vec_sub(t1a, t5a); \
    i32x4 t6 = vec_sub(t2a, t6a); \
    i32x4 t7 = vec_sub(t3a, t7a); \
\
    i16x8 t0t1 = vec_packs(t0, t1); \
    i16x8 t2t3 = vec_packs(t2, t3); \
    i16x8 t4t5 = vec_packs(t4, t5); \
    i16x8 t6t7 = vec_packs(t6, t7); \
\
    UNPACK_4_I16_I32(t4, t5, t6, t7) \
    UNPACK_4_I16_I32(t0, t1, t2, t3) \
\
    DECLARE_MUL_PAIR_I32(t4, t5, v3784, v1567) \
    DECLARE_MUL_PAIR_I32(t7, t6, v3784, v1567) \
\
    ADD_SUB_PAIR(t4a, t5a, t4, t5, v3784, v1567) \
    ADD_SUB_PAIR(t7a, t6a, t7, t6, v1567, v3784) \
\
    SCALE_ROUND_4(t4a, t5a, t6a, t7a, v2048, v12) \
\
    o0 = vec_add(t0, t2); \
    o1 = vec_add(t4a, t6a); \
    o7 = vec_add(t1, t3); \
    o6 = vec_add(t5a, t7a); \
    t2 = vec_sub(t0, t2); \
    t3 = vec_sub(t1, t3); \
    t6 = vec_sub(t4a, t6a); \
    t7 = vec_sub(t5a, t7a); \
\
    i16x8 o7##o1 = vec_packs(o7, o1); \
    i16x8 o0##o6 = vec_packs(o0, o6); \
    t2t3 = vec_packs(t2, t3); \
    t6t7 = vec_packs(t6, t7); \
\
    UNPACK_4_I16_I32(t2, t3, t6, t7) \
    UNPACK_4_I16_I32(o7, o1, o0, o6) \
\
    o7 = -o7; \
    o1 = -o1; \
\
    o3 = vec_add(t2, t3); \
    o4 = vec_sub(t2, t3); \
    o5 = vec_sub(t6, t7); \
    o2 = vec_add(t6, t7); \
\
    i32x4 v181 = vec_splats(181); \
    i32x4 v128 = vec_splats(128); \
    u32x4 v8 = vec_splat_u32(8); \
\
    o2 = vec_mul(o2, v181); \
    o3 = vec_mul(o3, v181); \
    o4 = vec_mul(o4, v181); \
    o5 = vec_mul(o5, v181); \
\
    SCALE_ROUND_4(o2, o3, o4, o5, v128, v8) \
\
    o3 = -o3; \
    o5 = -o5; \
}

#define adst_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
{\
    ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \
                 c0, c1, c2, c3, c4, c5, c6, c7) \
    c01 = vec_packs(c0, c1); \
    c23 = vec_packs(c2, c3); \
    c45 = vec_packs(c4, c5); \
    c67 = vec_packs(c6, c7); \
    UNPACK_PAIR_I16_I32(c0, c1, c01) \
    UNPACK_PAIR_I16_I32(c2, c3, c23) \
    UNPACK_PAIR_I16_I32(c4, c5, c45) \
    UNPACK_PAIR_I16_I32(c6, c7, c67) \
}

#define adst_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
{\
    ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \
                 c0, c1, c2, c3, c4, c5, c6, c7) \
    c01 = vec_packs(c0, c1); \
    c23 = vec_packs(c2, c3); \
    c45 = vec_packs(c4, c5); \
    c67 = vec_packs(c6, c7); \
}

#define adst_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                    c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                    c0, c1, c2, c3, c4, c5, c6, c7) \
{ \
    ADST_INNER_8(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                 c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h) \
    ADST_INNER_8(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                 c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
}

#define adst_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                     c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                     c0, c1, c2, c3, c4, c5, c6, c7) \
{ \
    ADST_INNER_8(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                 c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h) \
    ADST_INNER_8(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                 c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
    PACK_8(c0, c1, c2, c3, c4, c5, c6, c7, \
           c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
           c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
}

#define flipadst_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
{\
    ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \
                 c7, c6, c5, c4, c3, c2, c1, c0) \
    c01 = vec_packs(c0, c1); \
    c23 = vec_packs(c2, c3); \
    c45 = vec_packs(c4, c5); \
    c67 = vec_packs(c6, c7); \
    UNPACK_PAIR_I16_I32(c0, c1, c01) \
    UNPACK_PAIR_I16_I32(c2, c3, c23) \
    UNPACK_PAIR_I16_I32(c4, c5, c45) \
    UNPACK_PAIR_I16_I32(c6, c7, c67) \
}

#define flipadst_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
{\
    ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \
                 c7, c6, c5, c4, c3, c2, c1, c0) \
    c01 = vec_packs(c0, c1); \
    c23 = vec_packs(c2, c3); \
    c45 = vec_packs(c4, c5); \
    c67 = vec_packs(c6, c7); \
}

#define flipadst_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                        c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                        c0, c1, c2, c3, c4, c5, c6, c7) \
{ \
    ADST_INNER_8(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                 c7h, c6h, c5h, c4h, c3h, c2h, c1h, c0h) \
    ADST_INNER_8(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                 c7l, c6l, c5l, c4l, c3l, c2l, c1l, c0l) \
}

#define flipadst_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                         c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                         c0, c1, c2, c3, c4, c5, c6, c7) \
{ \
    ADST_INNER_8(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                 c7h, c6h, c5h, c4h, c3h, c2h, c1h, c0h) \
    ADST_INNER_8(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                 c7l, c6l, c5l, c4l, c3l, c2l, c1l, c0l) \
    PACK_8(c0, c1, c2, c3, c4, c5, c6, c7, \
           c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
           c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
}
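
/* The rectangular 4x8/8x4 paths pre-scale all coefficients by 1/sqrt(2)
 * as AV1 requires for rect2 blocks: vec_mradds against 2896*8 computes
 * (x * 23168 + (1 << 14)) >> 15 ~= x * 0.7071 with saturation. */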

void dav1d_inv_txfm_add_dct_dct_4x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
                                              int16_t *const coeff, const int eob)
{
    i16x8 v = vec_splats((int16_t)(2896*8));

    if (eob < 1) {
        return dc_only_4xN(dst, stride, coeff, 2, 1, 0);
    }

    LOAD_SCALE_COEFF_4x8(coeff, v)

    dct_4_in(c0, c1, c2, c3, c01, c23)
    dct_4_in(c4, c5, c6, c7, c45, c67)


    memset(coeff, 0, sizeof(*coeff) * 4 * 8);

    TRANSPOSE4_I32(c0, c1, c2, c3);
    TRANSPOSE4_I32(c4, c5, c6, c7);

    dct_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67)

    LOAD_DECLARE_4(dst, stride, a, b, cc, d)
    LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, hh)

    APPLY_COEFF_4(a, b, cc, d, c01, c23)
    APPLY_COEFF_4(e, f, g, hh, c45, c67)

    STORE_4(dst, stride, a, b, cc, d)
    STORE_4(dst + 4 * stride, stride, e, f, g, hh)
}


#define inv_txfm_fn4x8(type1, type2) \
void dav1d_inv_txfm_add_##type1##_##type2##_4x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
                                                          int16_t *const coeff, const int eob) \
{ \
    i16x8 v = vec_splats((int16_t)(2896*8)); \
    LOAD_SCALE_COEFF_4x8(coeff, v) \
    type1##_4_in(c0, c1, c2, c3, c01, c23) \
    type1##_4_in(c4, c5, c6, c7, c45, c67) \
    memset(coeff, 0, sizeof(*coeff) * 4 * 8); \
    TRANSPOSE4_I32(c0, c1, c2, c3); \
    TRANSPOSE4_I32(c4, c5, c6, c7); \
    type2##_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
    LOAD_DECLARE_4(dst, stride, a, b, c, d) \
    LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, h) \
    APPLY_COEFF_4(a, b, c, d, c01, c23) \
    APPLY_COEFF_4(e, f, g, h, c45, c67) \
    STORE_4(dst, stride, a, b, c, d) \
    STORE_4(dst + 4 * stride, stride, e, f, g, h) \
}

inv_txfm_fn4x8(adst,     dct     )
inv_txfm_fn4x8(dct,      adst    )
inv_txfm_fn4x8(dct,      flipadst)
inv_txfm_fn4x8(flipadst, dct     )
inv_txfm_fn4x8(adst,     flipadst)
inv_txfm_fn4x8(flipadst, adst    )
inv_txfm_fn4x8(identity, dct     )
inv_txfm_fn4x8(dct,      identity)
inv_txfm_fn4x8(identity, flipadst)
inv_txfm_fn4x8(flipadst, identity)
inv_txfm_fn4x8(identity, adst    )
inv_txfm_fn4x8(adst,     identity)
inv_txfm_fn4x8(identity, identity)
inv_txfm_fn4x8(adst,     adst    )
inv_txfm_fn4x8(flipadst, flipadst)
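
/* In the 8x4 output stage each packed vector holds two 4-wide half-rows;
 * the u64x2 merges (c04, c15, ...) re-pair left and right halves into
 * full 8-wide rows before the add-and-store. */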

void dav1d_inv_txfm_add_dct_dct_8x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
                                              int16_t *const coeff, const int eob)
{
    i16x8 v = vec_splats((int16_t)(2896*8));

    if (eob < 1) {
        return dc_only_8xN(dst, stride, coeff, 1, 1, 0);
    }

    LOAD_SCALE_COEFF_8x4(coeff, v)

    dct_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67)

    memset(coeff, 0, sizeof(*coeff) * 8 * 4);

    TRANSPOSE4_I32(c0, c1, c2, c3)
    TRANSPOSE4_I32(c4, c5, c6, c7)

    dct_4_out(c0, c1, c2, c3, c01, c23)
    dct_4_out(c4, c5, c6, c7, c45, c67)

    LOAD_DECLARE_4(dst, stride, ae, bf, cg, dh)

    i16x8 c04 = (i16x8)vec_mergeh((u64x2)c01, (u64x2)c45);
    i16x8 c15 = (i16x8)vec_mergel((u64x2)c01, (u64x2)c45);
    i16x8 c26 = (i16x8)vec_mergeh((u64x2)c23, (u64x2)c67);
    i16x8 c37 = (i16x8)vec_mergel((u64x2)c23, (u64x2)c67);

    APPLY_COEFF_8x4(ae, bf, c04, c15)
    APPLY_COEFF_8x4(cg, dh, c26, c37)

    STORE_8(dst, stride, ae, bf, cg, dh)
}


#define inv_txfm_fn8x4(type1, type2) \
void dav1d_inv_txfm_add_##type1##_##type2##_8x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
                                                          int16_t *const coeff, const int eob) \
{ \
    i16x8 v = vec_splats((int16_t)(2896*8)); \
    LOAD_SCALE_COEFF_8x4(coeff, v) \
    type1##_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
    memset(coeff, 0, sizeof(*coeff) * 8 * 4); \
    TRANSPOSE4_I32(c0, c1, c2, c3) \
    TRANSPOSE4_I32(c4, c5, c6, c7) \
    type2##_4_out(c0, c1, c2, c3, c01, c23) \
    type2##_4_out(c4, c5, c6, c7, c45, c67) \
    LOAD_DECLARE_4(dst, stride, ae, bf, cg, dh) \
    i16x8 c04 = (i16x8)vec_mergeh((u64x2)c01, (u64x2)c45); \
    i16x8 c15 = (i16x8)vec_mergel((u64x2)c01, (u64x2)c45); \
    i16x8 c26 = (i16x8)vec_mergeh((u64x2)c23, (u64x2)c67); \
    i16x8 c37 = (i16x8)vec_mergel((u64x2)c23, (u64x2)c67); \
    APPLY_COEFF_8x4(ae, bf, c04, c15) \
    APPLY_COEFF_8x4(cg, dh, c26, c37) \
    STORE_8(dst, stride, ae, bf, cg, dh) \
}
inv_txfm_fn8x4(adst,     dct     )
inv_txfm_fn8x4(dct,      adst    )
inv_txfm_fn8x4(dct,      flipadst)
inv_txfm_fn8x4(flipadst, dct     )
inv_txfm_fn8x4(adst,     flipadst)
inv_txfm_fn8x4(flipadst, adst    )
inv_txfm_fn8x4(identity, dct     )
inv_txfm_fn8x4(dct,      identity)
inv_txfm_fn8x4(identity, flipadst)
inv_txfm_fn8x4(flipadst, identity)
inv_txfm_fn8x4(identity, adst    )
inv_txfm_fn8x4(adst,     identity)
inv_txfm_fn8x4(identity, identity)
inv_txfm_fn8x4(adst,     adst    )
inv_txfm_fn8x4(flipadst, flipadst)

void dav1d_inv_txfm_add_dct_dct_8x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
                                              int16_t *const coeff, const int eob)
{
    if (eob < 1) {
        return dc_only_8xN(dst, stride, coeff, 2, 0, 1);
    }

    LOAD_COEFF_8x8(coeff)

    dct_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h,
               c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l,
               c0, c1, c2, c3, c4, c5, c6, c7)

    memset(coeff, 0, sizeof(*coeff) * 8 * 8);

    SCALE_ROUND_4(c0h, c1h, c2h, c3h, vec_splat_s32(1), vec_splat_u32(1))
    SCALE_ROUND_4(c4h, c5h, c6h, c7h, vec_splat_s32(1), vec_splat_u32(1))
    SCALE_ROUND_4(c0l, c1l, c2l, c3l, vec_splat_s32(1), vec_splat_u32(1))
    SCALE_ROUND_4(c4l, c5l, c6l, c7l, vec_splat_s32(1), vec_splat_u32(1))

    TRANSPOSE8_I32(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h,
                   c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l)

    dct_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h,
                c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l,
                c0, c1, c2, c3, c4, c5, c6, c7)

    LOAD_DECLARE_4(dst, stride, a, b, cc, d)
    LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, hh)

    APPLY_COEFF_8x4(a, b, c0, c1)
    APPLY_COEFF_8x4(cc, d, c2, c3)
    APPLY_COEFF_8x4(e, f, c4, c5)
    APPLY_COEFF_8x4(g, hh, c6, c7)

    STORE_8(dst, stride, a, b, cc, d)
    STORE_8(dst + 4 * stride, stride, e, f, g, hh)
}
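
/* The larger sizes here (8x8, 4x16) apply a rounded (x + 1) >> 1 between
 * the two passes; the SCALE_ROUND_4(..., splat 1, splat 1) calls above
 * and in the macro below are that intermediate down-shift. */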

#define inv_txfm_fn8x8(type1, type2) \
void dav1d_inv_txfm_add_##type1##_##type2##_8x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
                                                          int16_t *const coeff, const int eob) \
{ \
    LOAD_COEFF_8x8(coeff) \
    type1##_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                   c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                   c0, c1, c2, c3, c4, c5, c6, c7) \
    SCALE_ROUND_4(c0h, c1h, c2h, c3h, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(c4h, c5h, c6h, c7h, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(c0l, c1l, c2l, c3l, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(c4l, c5l, c6l, c7l, vec_splat_s32(1), vec_splat_u32(1)) \
    memset(coeff, 0, sizeof(*coeff) * 8 * 8); \
    TRANSPOSE8_I32(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                   c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
    type2##_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                    c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                    c0, c1, c2, c3, c4, c5, c6, c7) \
    LOAD_DECLARE_4(dst, stride, a, b, c, d) \
    LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, h) \
    APPLY_COEFF_8x4(a, b, c0, c1) \
    APPLY_COEFF_8x4(c, d, c2, c3) \
    APPLY_COEFF_8x4(e, f, c4, c5) \
    APPLY_COEFF_8x4(g, h, c6, c7) \
    STORE_8(dst, stride, a, b, c, d) \
    STORE_8(dst + 4 * stride, stride, e, f, g, h) \
}
inv_txfm_fn8x8(adst,     dct     )
inv_txfm_fn8x8(dct,      adst    )
inv_txfm_fn8x8(dct,      flipadst)
inv_txfm_fn8x8(flipadst, dct     )
inv_txfm_fn8x8(adst,     flipadst)
inv_txfm_fn8x8(flipadst, adst    )
inv_txfm_fn8x8(dct,      identity)
inv_txfm_fn8x8(flipadst, identity)
inv_txfm_fn8x8(adst,     identity)
inv_txfm_fn8x8(adst,     adst    )
inv_txfm_fn8x8(flipadst, flipadst)

// identity + scale is a no op
#define inv_txfm_fn8x8_identity(type2) \
void dav1d_inv_txfm_add_identity_##type2##_8x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
                                                         int16_t *const coeff, const int eob) \
{ \
    LOAD_COEFF_8x8(coeff) \
    memset(coeff, 0, sizeof(*coeff) * 8 * 8); \
    TRANSPOSE8_I32(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                   c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
    type2##_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                    c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                    c0, c1, c2, c3, c4, c5, c6, c7) \
    LOAD_DECLARE_4(dst, stride, a, b, c, d) \
    LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, h) \
    APPLY_COEFF_8x4(a, b, c0, c1) \
    APPLY_COEFF_8x4(c, d, c2, c3) \
    APPLY_COEFF_8x4(e, f, c4, c5) \
    APPLY_COEFF_8x4(g, h, c6, c7) \
    STORE_8(dst, stride, a, b, c, d) \
    STORE_8(dst + 4 * stride, stride, e, f, g, h) \
}
inv_txfm_fn8x8_identity(dct     )
inv_txfm_fn8x8_identity(flipadst)
inv_txfm_fn8x8_identity(adst    )
inv_txfm_fn8x8_identity(identity)

#define CLIP16_I32_8(a, b, c, d, e, f, g, h, \
                     ab, cd, ef, gh) \
{ \
    ab = vec_packs(a, b); \
    cd = vec_packs(c, d); \
    ef = vec_packs(e, f); \
    gh = vec_packs(g, h); \
    UNPACK_PAIR_I16_I32(a, b, ab) \
    UNPACK_PAIR_I16_I32(c, d, cd) \
    UNPACK_PAIR_I16_I32(e, f, ef) \
    UNPACK_PAIR_I16_I32(g, h, gh) \
}

#define MUL_4_INPLACE(a, b, c, d, v) \
    a = vec_mul(a, v); \
    b = vec_mul(b, v); \
    c = vec_mul(c, v); \
    d = vec_mul(d, v); \

#define IDENTITY_16_V(v) \
{ \
    i16x8 v_ = vec_adds(v, v); \
    v = vec_mradds(v, v1697_16, v_); \
}
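
/* The 16-point identity scales by 2*sqrt(2): IDENTITY_16_V computes
 * 2*x + x * (1697*16) / 32768 ~= 2.8284 * x with vec_mradds rounding. */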

#define IDENTITY_16_INNER(c00c01, c02c03, c04c05, c06c07, \
                          c08c09, c10c11, c12c13, c14c15) \
{ \
    i16x8 v1697_16 = vec_splats((int16_t)(1697*16)); \
    IDENTITY_16_V(c00c01) \
    IDENTITY_16_V(c02c03) \
    IDENTITY_16_V(c04c05) \
    IDENTITY_16_V(c06c07) \
    IDENTITY_16_V(c08c09) \
    IDENTITY_16_V(c10c11) \
    IDENTITY_16_V(c12c13) \
    IDENTITY_16_V(c14c15) \
}

#define IDENTITY_16_4_I32(a, b, c, d) \
{ \
    i32x4 a2 = vec_add(a, a); \
    i32x4 b2 = vec_add(b, b); \
    i32x4 c2 = vec_add(c, c); \
    i32x4 d2 = vec_add(d, d); \
    MUL_4_INPLACE(a, b, c, d, v1697) \
    SCALE_ROUND_4(a, b, c, d, v1024, vec_splat_u32(11)); \
    a = vec_add(a2, a); \
    b = vec_add(b2, b); \
    c = vec_add(c2, c); \
    d = vec_add(d2, d); \
}


#define identity_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
                       c08, c09, c10, c11, c12, c13, c14, c15, \
                       c00c01, c02c03, c04c05, c06c07, \
                       c08c09, c10c11, c12c13, c14c15) \
{ \
    DECLARE_SPLAT_I32(1697) \
    DECLARE_SPLAT_I32(1024) \
    IDENTITY_16_4_I32(c00, c01, c02, c03) \
    IDENTITY_16_4_I32(c04, c05, c06, c07) \
    IDENTITY_16_4_I32(c08, c09, c10, c11) \
    IDENTITY_16_4_I32(c12, c13, c14, c15) \
}

#define identity_16_out(c00, c01, c02, c03, c04, c05, c06, c07, \
                        c08, c09, c10, c11, c12, c13, c14, c15, \
                        c00c01, c02c03, c04c05, c06c07, \
                        c08c09, c10c11, c12c13, c14c15) \
{ \
    PACK_8(c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15, \
           c00, c02, c04, c06, c08, c10, c12, c14, \
           c01, c03, c05, c07, c09, c11, c13, c15) \
    IDENTITY_16_INNER(c00c01, c02c03, c04c05, c06c07, \
                      c08c09, c10c11, c12c13, c14c15) \
}
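
/* 16-point inverse DCT: the even half reuses IDCT_8_INNER and the odd
 * half adds one more rotation layer (401/4076, 3166/2598, 1931/3612,
 * 3920/1189). CLIP16_I32_8 packs to i16 with saturation and widens back,
 * i.e. the 16-bit intermediate clamp. */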

#define IDCT_16_INNER(c00, c01, c02, c03, c04, c05, c06, c07, \
                      c08, c09, c10, c11, c12, c13, c14, c15, \
                      c00c03, c01c02, c07c04, c06c05, \
                      c08c11, c09c10, c14c13, c15c12) \
    IDCT_8_INNER(c00, c02, c04, c06, c08, c10, c12, c14, \
                 c00c03, c01c02, c07c04, c06c05) \
    DECLARE_SPLAT_I32(128) \
    DECLARE_SPLAT_I32(181) \
    DECLARE_SPLAT_I32(401) \
    DECLARE_SPLAT_I32(4076) \
    DECLARE_SPLAT_I32(3166) \
    DECLARE_SPLAT_I32(2598) \
    DECLARE_SPLAT_I32(1931) \
    DECLARE_SPLAT_I32(3612) \
    DECLARE_SPLAT_I32(3920) \
    DECLARE_SPLAT_I32(1189) \
    DECLARE_SPLAT_I32(1567) \
    DECLARE_SPLAT_I32(3784) \
\
    DECLARE_MUL_PAIR_I32(c01, c15, v401, v4076) \
    DECLARE_MUL_PAIR_I32(c09, c07, v3166, v2598) \
    DECLARE_MUL_PAIR_I32(c05, c11, v1931, v3612) \
    DECLARE_MUL_PAIR_I32(c13, c03, v3920, v1189) \
\
    DECLARE_ADD_SUB_PAIR(t15a, t08a, c01, c15, v4076, v401) \
    DECLARE_ADD_SUB_PAIR(t14a, t09a, c09, c07, v2598, v3166) \
    DECLARE_ADD_SUB_PAIR(t13a, t10a, c05, c11, v3612, v1931) \
    DECLARE_ADD_SUB_PAIR(t12a, t11a, c13, c03, v1189, v3920) \
\
    SCALE_ROUND_4(t15a, t08a, t14a, t09a, v2048, v12) \
    SCALE_ROUND_4(t13a, t10a, t12a, t11a, v2048, v12) \
\
    CLIP16_I32_8(t15a, t08a, t14a, t09a, \
                 t13a, t10a, t12a, t11a, \
                 c08c11, c09c10, c14c13, c15c12) \
    DECLARE_ADD_SUB_PAIR(t08, t09, t08a, t09a,,) \
    DECLARE_ADD_SUB_PAIR(t11, t10, t11a, t10a,,) \
    DECLARE_ADD_SUB_PAIR(t12, t13, t12a, t13a,,) \
    DECLARE_ADD_SUB_PAIR(t15, t14, t15a, t14a,,) \
\
    CLIP16_I32_8(t08, t09, t11, t10, \
                 t12, t13, t15, t14, \
                 c08c11, c09c10, c14c13, c15c12) \
\
    DECLARE_MUL_PAIR_I32(t14, t09, v1567, v3784) \
    DECLARE_MUL_PAIR_I32(t13, t10, v1567, v3784) \
\
    ADD_SUB_PAIR(t14a, t09a, t14, t09, v3784, v1567) \
    ADD_SUB_PAIR(t10a, t13a, t13, t10, v3784, v1567) \
    t10a = -t10a; \
\
    SCALE_ROUND_4(t14a, t09a, t13a, t10a, v2048, v12) \
\
    ADD_SUB_PAIR(t08a, t11a, t08, t11,,) \
    ADD_SUB_PAIR(t09, t10, t09a, t10a,,) \
    ADD_SUB_PAIR(t15a, t12a, t15, t12,,) \
    ADD_SUB_PAIR(t14, t13, t14a, t13a,,) \
\
    CLIP16_I32_8(t08a, t11a, t09, t10, \
                 t15a, t12a, t14, t13, \
                 c08c11, c09c10, c14c13, c15c12) \
    ADD_SUB_PAIR(t13a, t10a, t13, t10,,); \
    ADD_SUB_PAIR(t12, t11, t12a, t11a,,); \
\
    MUL_4_INPLACE(t13a, t10a, t12, t11, v181); \
    SCALE_ROUND_4(t13a, t10a, t12, t11, v128, vec_splat_u32(8)) \
\
    DECLARE_PACK_4(t15at12, t14t13a, t08at11, t09t10a, \
                   t15a, t14, t08a, t09, \
                   t12, t13a, t11, t10a) \
\
    c15c12 = vec_subs(c00c03, t15at12); \
    c14c13 = vec_subs(c01c02, t14t13a); \
    c08c11 = vec_subs(c07c04, t08at11); \
    c09c10 = vec_subs(c06c05, t09t10a); \
    c00c03 = vec_adds(c00c03, t15at12); \
    c01c02 = vec_adds(c01c02, t14t13a); \
    c07c04 = vec_adds(c07c04, t08at11); \
    c06c05 = vec_adds(c06c05, t09t10a); \

#define dct_16_out(c00, c01, c02, c03, c04, c05, c06, c07, \
                   c08, c09, c10, c11, c12, c13, c14, c15, \
                   c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
\
    i16x8 c00c03, c01c02, c07c04, c06c05, c08c11, c09c10, c14c13, c15c12; \
    IDCT_16_INNER(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
                  c00c03, c01c02, c07c04, c06c05, c08c11, c09c10, c14c13, c15c12) \
    c00c01 = (i16x8)vec_mergeh((u64x2)c00c03, (u64x2)c01c02); \
    c02c03 = (i16x8)vec_mergel((u64x2)c01c02, (u64x2)c00c03); \
    c04c05 = (i16x8)vec_mergel((u64x2)c07c04, (u64x2)c06c05); \
    c06c07 = (i16x8)vec_mergeh((u64x2)c06c05, (u64x2)c07c04); \
    c08c09 = (i16x8)vec_mergeh((u64x2)c08c11, (u64x2)c09c10); \
    c10c11 = (i16x8)vec_mergel((u64x2)c09c10, (u64x2)c08c11); \
    c12c13 = (i16x8)vec_mergel((u64x2)c15c12, (u64x2)c14c13); \
    c14c15 = (i16x8)vec_mergeh((u64x2)c14c13, (u64x2)c15c12); \

#define dct_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
                  c08, c09, c10, c11, c12, c13, c14, c15, \
                  c00c03, c01c02, c07c04, c06c05, c08c11, c09c10, c14c13, c15c12) \
\
    IDCT_16_INNER(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
                  c00c03, c01c02, c07c04, c06c05, c08c11, c09c10, c14c13, c15c12) \
    UNPACK_PAIR_I16_I32(c00, c03, c00c03) \
    UNPACK_PAIR_I16_I32(c01, c02, c01c02) \
    UNPACK_PAIR_I16_I32(c07, c04, c07c04) \
    UNPACK_PAIR_I16_I32(c06, c05, c06c05) \
    UNPACK_PAIR_I16_I32(c08, c11, c08c11) \
    UNPACK_PAIR_I16_I32(c09, c10, c09c10) \
    UNPACK_PAIR_I16_I32(c14, c13, c14c13) \
    UNPACK_PAIR_I16_I32(c15, c12, c15c12) \


#define dct_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
                   cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
                   a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
    dct_4_in(cA0, cA1, cA2, cA3, a0b0, c0d0) \
    dct_4_in(cB0, cB1, cB2, cB3, a1b1, c1d1) \
    dct_4_in(cC0, cC1, cC2, cC3, a2b2, c2d2) \
    dct_4_in(cD0, cD1, cD2, cD3, a3b3, c3d3)
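
/* PACK_4x4 packs the sixteen i32x4 vectors of four 4-point transforms
 * into eight saturated i16x8 vectors; the pairing appears to gather
 * matching rows of adjacent 4-column groups into output order. */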

#define PACK_4x4(c00, c01, c02, c03, \
                 c04, c05, c06, c07, \
                 c08, c09, c10, c11, \
                 c12, c13, c14, c15, \
                 c00c01, c02c03, c04c05, c06c07, \
                 c08c09, c10c11, c12c13, c14c15) \
{ \
    c00c01 = vec_packs(c00, c04); c02c03 = vec_packs(c08, c12); \
    c04c05 = vec_packs(c01, c05); c06c07 = vec_packs(c09, c13); \
    c08c09 = vec_packs(c02, c06); c10c11 = vec_packs(c10, c14); \
    c12c13 = vec_packs(c03, c07); c14c15 = vec_packs(c11, c15); \
}



#define dct_4x4_out(c00, c01, c02, c03, \
                    c04, c05, c06, c07, \
                    c08, c09, c10, c11, \
                    c12, c13, c14, c15, \
                    c00c01, c02c03, c04c05, c06c07, \
                    c08c09, c10c11, c12c13, c14c15) \
{ \
    IDCT_4_INNER(c00, c01, c02, c03) \
    IDCT_4_INNER(c04, c05, c06, c07) \
    IDCT_4_INNER(c08, c09, c10, c11) \
    IDCT_4_INNER(c12, c13, c14, c15) \
\
    PACK_4x4(c00, c01, c02, c03, \
             c04, c05, c06, c07, \
             c08, c09, c10, c11, \
             c12, c13, c14, c15, \
             c00c01, c02c03, c04c05, c06c07, \
             c08c09, c10c11, c12c13, c14c15) \
}

#define IDENTITY_4_I32(a, b, c, d) \
{ \
    DECLARE_SPLAT_I32(5793) \
    DECLARE_SPLAT_I32(2048) \
    MUL_4_INPLACE(a, b, c, d, v5793) \
    SCALE_ROUND_4(a, b, c, d, v2048, vec_splat_u32(12)) \
}

#define identity_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
                        cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
                        a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
{ \
    IDENTITY_4_I32(cA0, cA1, cA2, cA3) \
    IDENTITY_4_I32(cB0, cB1, cB2, cB3) \
    IDENTITY_4_I32(cC0, cC1, cC2, cC3) \
    IDENTITY_4_I32(cD0, cD1, cD2, cD3) \
}

#define identity_4x4_out(c00, c01, c02, c03, \
                         c04, c05, c06, c07, \
                         c08, c09, c10, c11, \
                         c12, c13, c14, c15, \
                         c00c01, c02c03, c04c05, c06c07, \
                         c08c09, c10c11, c12c13, c14c15) \
{ \
    PACK_4x4(c00, c01, c02, c03, \
             c04, c05, c06, c07, \
             c08, c09, c10, c11, \
             c12, c13, c14, c15, \
             c00c01, c02c03, c04c05, c06c07, \
             c08c09, c10c11, c12c13, c14c15) \
    IDENTITY_4(c00c01, c02c03) \
    IDENTITY_4(c04c05, c06c07) \
    IDENTITY_4(c08c09, c10c11) \
    IDENTITY_4(c12c13, c14c15) \
}

#define adst_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
                    cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
                    a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
    adst_4_in(cA0, cA1, cA2, cA3, a0b0, c0d0) \
    adst_4_in(cB0, cB1, cB2, cB3, a1b1, c1d1) \
    adst_4_in(cC0, cC1, cC2, cC3, a2b2, c2d2) \
    adst_4_in(cD0, cD1, cD2, cD3, a3b3, c3d3)

#define adst_4x4_out(c00, c01, c02, c03, \
                     c04, c05, c06, c07, \
                     c08, c09, c10, c11, \
                     c12, c13, c14, c15, \
                     c00c01, c02c03, c04c05, c06c07, \
                     c08c09, c10c11, c12c13, c14c15) \
{ \
    ADST_INNER_4(c00, c01, c02, c03, c00, c01, c02, c03) \
    ADST_INNER_4(c04, c05, c06, c07, c04, c05, c06, c07) \
    ADST_INNER_4(c08, c09, c10, c11, c08, c09, c10, c11) \
    ADST_INNER_4(c12, c13, c14, c15, c12, c13, c14, c15) \
\
    PACK_4x4(c00, c01, c02, c03, \
             c04, c05, c06, c07, \
             c08, c09, c10, c11, \
             c12, c13, c14, c15, \
             c00c01, c02c03, c04c05, c06c07, \
             c08c09, c10c11, c12c13, c14c15) \
}

#define flipadst_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
                        cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
                        a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
    flipadst_4_in(cA0, cA1, cA2, cA3, a0b0, c0d0) \
    flipadst_4_in(cB0, cB1, cB2, cB3, a1b1, c1d1) \
    flipadst_4_in(cC0, cC1, cC2, cC3, a2b2, c2d2) \
    flipadst_4_in(cD0, cD1, cD2, cD3, a3b3, c3d3)

#define flipadst_4x4_out(c00, c01, c02, c03, \
                         c04, c05, c06, c07, \
                         c08, c09, c10, c11, \
                         c12, c13, c14, c15, \
                         c00c01, c02c03, c04c05, c06c07, \
                         c08c09, c10c11, c12c13, c14c15) \
{ \
    ADST_INNER_4(c00, c01, c02, c03, c03, c02, c01, c00) \
    ADST_INNER_4(c04, c05, c06, c07, c07, c06, c05, c04) \
    ADST_INNER_4(c08, c09, c10, c11, c11, c10, c09, c08) \
    ADST_INNER_4(c12, c13, c14, c15, c15, c14, c13, c12) \
\
    PACK_4x4(c00, c01, c02, c03, \
             c04, c05, c06, c07, \
             c08, c09, c10, c11, \
             c12, c13, c14, c15, \
             c00c01, c02c03, c04c05, c06c07, \
             c08c09, c10c11, c12c13, c14c15) \
}
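
/* 16-point inverse ADST, structured like ADST_INNER_8 with one more
 * butterfly level. The first-stage pairs (4091/201 ... 601/4052) appear
 * to be 12-bit cos/sin values of odd multiples of pi/64, and the odd
 * outputs are negated at the end as in the reference. */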

#define ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, \
                      c08, c09, c10, c11, c12, c13, c14, c15, \
                      o00, o01, o02, o03, o04, o05, o06, o07, \
                      o08, o09, o10, o11, o12, o13, o14, o15, \
                      c00c01, c02c03, c04c05, c06c07) \
    DECLARE_SPLAT_I32(2048); \
    u32x4 v12 = vec_splat_u32(12); \
    DECLARE_SPLAT_I32(4091) \
    DECLARE_SPLAT_I32(201) \
    DECLARE_SPLAT_I32(3973) \
    DECLARE_SPLAT_I32(995) \
    DECLARE_SPLAT_I32(3703) \
    DECLARE_SPLAT_I32(1751) \
    DECLARE_SPLAT_I32(3290) \
    DECLARE_SPLAT_I32(2440) \
    DECLARE_SPLAT_I32(2751) \
    DECLARE_SPLAT_I32(3035) \
    DECLARE_SPLAT_I32(2106) \
    DECLARE_SPLAT_I32(3513) \
    DECLARE_SPLAT_I32(1380) \
    DECLARE_SPLAT_I32(3857) \
    DECLARE_SPLAT_I32(601) \
    DECLARE_SPLAT_I32(4052) \
\
    DECLARE_MUL_PAIR_I32(c15, c00, v4091, v201) \
    DECLARE_MUL_PAIR_I32(c13, c02, v3973, v995) \
    DECLARE_MUL_PAIR_I32(c11, c04, v3703, v1751) \
    DECLARE_MUL_PAIR_I32(c09, c06, v3290, v2440) \
    DECLARE_MUL_PAIR_I32(c07, c08, v2751, v3035) \
    DECLARE_MUL_PAIR_I32(c05, c10, v2106, v3513) \
    DECLARE_MUL_PAIR_I32(c03, c12, v1380, v3857) \
    DECLARE_MUL_PAIR_I32(c01, c14, v601, v4052) \
\
    DECLARE_ADD_SUB_PAIR(t00, t01, c15, c00, v4091, v201);\
    DECLARE_ADD_SUB_PAIR(t02, t03, c13, c02, v3973, v995) \
    DECLARE_ADD_SUB_PAIR(t04, t05, c11, c04, v3703, v1751) \
    DECLARE_ADD_SUB_PAIR(t06, t07, c09, c06, v3290, v2440) \
    DECLARE_ADD_SUB_PAIR(t08, t09, c07, c08, v2751, v3035) \
    DECLARE_ADD_SUB_PAIR(t10, t11, c05, c10, v2106, v3513) \
    DECLARE_ADD_SUB_PAIR(t12, t13, c03, c12, v1380, v3857) \
    DECLARE_ADD_SUB_PAIR(t14, t15, c01, c14, v601, v4052) \
\
    SCALE_ROUND_4(t00, t01, t02, t03, v2048, v12) \
    SCALE_ROUND_4(t04, t05, t06, t07, v2048, v12) \
    SCALE_ROUND_4(t08, t09, t10, t11, v2048, v12) \
    SCALE_ROUND_4(t12, t13, t14, t15, v2048, v12) \
\
    DECLARE_ADD_SUB_PAIR(t00a, t08a, t00, t08,,) \
    DECLARE_ADD_SUB_PAIR(t01a, t09a, t01, t09,,) \
    DECLARE_ADD_SUB_PAIR(t02a, t10a, t02, t10,,) \
    DECLARE_ADD_SUB_PAIR(t03a, t11a, t03, t11,,) \
    DECLARE_ADD_SUB_PAIR(t04a, t12a, t04, t12,,) \
    DECLARE_ADD_SUB_PAIR(t05a, t13a, t05, t13,,) \
    DECLARE_ADD_SUB_PAIR(t06a, t14a, t06, t14,,) \
    DECLARE_ADD_SUB_PAIR(t07a, t15a, t07, t15,,) \
\
    CLIP16_I32_8(t00a, t08a, t01a, t09a, t02a, t10a, t03a, t11a, \
                 c00c01, c02c03, c04c05, c06c07); \
    CLIP16_I32_8(t04a, t12a, t05a, t13a, t06a, t14a, t07a, t15a, \
                 c00c01, c02c03, c04c05, c06c07); \
\
    DECLARE_SPLAT_I32(4017) \
    DECLARE_SPLAT_I32(799) \
    DECLARE_SPLAT_I32(2276) \
    DECLARE_SPLAT_I32(3406) \
\
    DECLARE_MUL_PAIR_I32(t08a, t09a, v4017, v799); \
    DECLARE_MUL_PAIR_I32(t10a, t11a, v2276, v3406); \
    DECLARE_MUL_PAIR_I32(t13a, t12a, v799, v4017); \
    DECLARE_MUL_PAIR_I32(t15a, t14a, v3406, v2276); \
\
    ADD_SUB_PAIR(t08, t09, t08a, t09a, v4017, v799); \
    ADD_SUB_PAIR(t10, t11, t10a, t11a, v2276, v3406); \
    ADD_SUB_PAIR(t13, t12, t13a, t12a, v799, v4017); \
    ADD_SUB_PAIR(t15, t14, t15a, t14a, v3406, v2276); \
\
    SCALE_ROUND_4(t08, t09, t10, t11, v2048, v12) \
    SCALE_ROUND_4(t13, t12, t15, t14, v2048, v12) \
\
    ADD_SUB_PAIR(t00, t04, t00a, t04a,,); \
    ADD_SUB_PAIR(t01, t05, t01a, t05a,,); \
    ADD_SUB_PAIR(t02, t06, t02a, t06a,,); \
    ADD_SUB_PAIR(t03, t07, t03a, t07a,,); \
    ADD_SUB_PAIR(t08a, t12a, t08, t12,,); \
    ADD_SUB_PAIR(t09a, t13a, t09, t13,,); \
    ADD_SUB_PAIR(t10a, t14a, t10, t14,,); \
    ADD_SUB_PAIR(t11a, t15a, t11, t15,,); \
\
    CLIP16_I32_8(t00, t04, t01, t05, t02, t06, t03, t07, \
                 c00c01, c02c03, c04c05, c06c07) \
    CLIP16_I32_8(t08a, t12a, t09a, t13a, t10a, t14a, t11a, t15a, \
                 c00c01, c02c03, c04c05, c06c07) \
\
    DECLARE_SPLAT_I32(3784) \
    DECLARE_SPLAT_I32(1567) \
\
    DECLARE_MUL_PAIR_I32(t04, t05, v3784, v1567) \
    DECLARE_MUL_PAIR_I32(t07, t06, v1567, v3784) \
    DECLARE_MUL_PAIR_I32(t12a, t13a, v3784, v1567) \
    DECLARE_MUL_PAIR_I32(t15a, t14a, v1567, v3784) \
\
    ADD_SUB_PAIR(t04a, t05a, t04, t05, v3784, v1567) \
    ADD_SUB_PAIR(t07a, t06a, t07, t06, v1567, v3784) \
    ADD_SUB_PAIR(t12, t13, t12a, t13a, v3784, v1567) \
    ADD_SUB_PAIR(t15, t14, t15a, t14a, v1567, v3784) \
\
    SCALE_ROUND_4(t04a, t05a, t07a, t06a, v2048, v12) \
    SCALE_ROUND_4(t12, t13, t15, t14, v2048, v12) \
\
    ADD_SUB_PAIR(o00, t02a, t00, t02,,) \
    ADD_SUB_PAIR(o15, t03a, t01, t03,,) \
    ADD_SUB_PAIR(o03, t06, t04a, t06a,,) \
    ADD_SUB_PAIR(o12, t07, t05a, t07a,,) \
    ADD_SUB_PAIR(o01, t10, t08a, t10a,,) \
    ADD_SUB_PAIR(o14, t11, t09a, t11a,,) \
    ADD_SUB_PAIR(o02, t14a, t12, t14,,) \
    ADD_SUB_PAIR(o13, t15a, t13, t15,,) \
\
    CLIP16_I32_8(o00, t02a, o15, t03a, o03, t06, o12, t07, \
                 c00c01, c02c03, c04c05, c06c07) \
    CLIP16_I32_8(o01, t10, o14, t11, o02, t14a, o13, t15a, \
                 c00c01, c02c03, c04c05, c06c07) \
\
    DECLARE_SPLAT_I32(181) \
    DECLARE_SPLAT_I32(128) \
    u32x4 v8 = vec_splat_u32(8); \
\
    ADD_SUB_PAIR(o07, o08, t02a, t03a,,) \
    ADD_SUB_PAIR(o04, o11, t06, t07,,) \
    ADD_SUB_PAIR(o06, o09, t10, t11,,) \
    ADD_SUB_PAIR(o05, o10, t14a, t15a,,) \
\
    MUL_4_INPLACE(o07, o08, o04, o11, v181) \
    MUL_4_INPLACE(o06, o09, o05, o10, v181) \
\
    SCALE_ROUND_4(o07, o08, o04, o11, v128, v8) \
    SCALE_ROUND_4(o06, o09, o05, o10, v128, v8) \
\
    o01 = -o01; \
    o03 = -o03; \
    o05 = -o05; \
    o07 = -o07; \
    o09 = -o09; \
    o11 = -o11; \
    o13 = -o13; \
    o15 = -o15; \

#define adst_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
                   c08, c09, c10, c11, c12, c13, c14, c15, \
                   c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
{ \
    ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
                  c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
                  c00c01, c02c03, c04c05, c06c07) \
}

#define adst_16_out(c00, c01, c02, c03, c04, c05, c06, c07, \
                    c08, c09, c10, c11, c12, c13, c14, c15, \
                    c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
{ \
    ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
                  c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
                  c00c01, c02c03, c04c05, c06c07) \
    PACK_8(c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15, \
           c00, c02, c04, c06, c08, c10, c12, c14, \
           c01, c03, c05, c07, c09, c11, c13, c15) \
}
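
/* The flipadst variants below reuse ADST_INNER_16 with the output
 * argument list reversed, so the flipped transform needs no separate
 * kernel. */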
#define flipadst_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
                       c08, c09, c10, c11, c12, c13, c14, c15, \
                       c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
{ \
    ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
                  c15, c14, c13, c12, c11, c10, c09, c08, c07, c06, c05, c04, c03, c02, c01, c00, \
                  c00c01, c02c03, c04c05, c06c07) \
}

#define flipadst_16_out(c00, c01, c02, c03, c04, c05, c06, c07, \
                        c08, c09, c10, c11, c12, c13, c14, c15, \
                        c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
{ \
    ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
                  c15, c14, c13, c12, c11, c10, c09, c08, c07, c06, c05, c04, c03, c02, c01, c00, \
                  c00c01, c02c03, c04c05, c06c07) \
    PACK_8(c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15, \
           c00, c02, c04, c06, c08, c10, c12, c14, \
           c01, c03, c05, c07, c09, c11, c13, c15) \
}

void dav1d_inv_txfm_add_dct_dct_4x16_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
                                               int16_t *const coeff, const int eob
                                               HIGHBD_DECL_SUFFIX)
{
    if (eob < 1) {
        return dc_only_4xN(dst, stride, coeff, 4, 0, 1);
    }

    LOAD_COEFF_4x16(coeff)

    dct_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3,
               cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3,
               a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3)

    memset(coeff, 0, sizeof(*coeff) * 4 * 16);

    SCALE_ROUND_4(cA0, cB0, cC0, cD0, vec_splat_s32(1), vec_splat_u32(1))
    SCALE_ROUND_4(cA1, cB1, cC1, cD1, vec_splat_s32(1), vec_splat_u32(1))
    SCALE_ROUND_4(cA2, cB2, cC2, cD2, vec_splat_s32(1), vec_splat_u32(1))
    SCALE_ROUND_4(cA3, cB3, cC3, cD3, vec_splat_s32(1), vec_splat_u32(1))
    TRANSPOSE4x16_I32(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3,
                      cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3)

    dct_16_out(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3,
               cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3,
               a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3)

    LOAD_DECLARE_4(dst, stride, l00, l01, l02, l03)
    LOAD_DECLARE_4(dst + 4 * stride, stride, l04, l05, l06, l07)
    LOAD_DECLARE_4(dst + 8 * stride, stride, l08, l09, l10, l11)
    LOAD_DECLARE_4(dst + 12 * stride, stride, l12, l13, l14, l15)

    APPLY_COEFF_4(l00, l01, l02, l03, a0b0, c0d0);
    APPLY_COEFF_4(l04, l05, l06, l07, a1b1, c1d1);
    APPLY_COEFF_4(l08, l09, l10, l11, a2b2, c2d2);
    APPLY_COEFF_4(l12, l13, l14, l15, a3b3, c3d3);

    STORE_4(dst, stride, l00, l01, l02, l03);
    STORE_4(dst + 4 * stride, stride, l04, l05, l06, l07);
    STORE_4(dst + 8 * stride, stride, l08, l09, l10, l11);
    STORE_4(dst + 12 * stride, stride, l12, l13, l14, l15);
}
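
/* Generic 4x16 two-pass driver: a 4-point first pass, the AV1 intermediate
 * rounding, a 4x16 transpose, then a 16-point second pass and the add/clip
 * to dst. The SCALE_ROUND_4 calls with (vec_splat_s32(1), vec_splat_u32(1))
 * implement the intermediate downshift (x + 1) >> 1, assuming
 * SCALE_ROUND_4(..., add, shift) computes (x + add) >> shift per lane as in
 * its (v2048, v12) uses above. Unlike the dct_dct special case, these
 * combinations do not take the eob < 1 dc-only shortcut. */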
#define inv_txfm_fn4x16(type1, type2) \
void dav1d_inv_txfm_add_##type1##_##type2##_4x16_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
                                                           int16_t *const coeff, const int eob) \
{ \
    LOAD_COEFF_4x16(coeff) \
    type1##_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
                   cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
                   a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
    memset(coeff, 0, sizeof(*coeff) * 4 * 16); \
    SCALE_ROUND_4(cA0, cB0, cC0, cD0, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(cA1, cB1, cC1, cD1, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(cA2, cB2, cC2, cD2, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(cA3, cB3, cC3, cD3, vec_splat_s32(1), vec_splat_u32(1)) \
    TRANSPOSE4x16_I32(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
                      cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3) \
    type2##_16_out(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
                   cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
                   a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
    LOAD_DECLARE_4(dst, stride, l00, l01, l02, l03) \
    LOAD_DECLARE_4(dst + 4 * stride, stride, l04, l05, l06, l07) \
    LOAD_DECLARE_4(dst + 8 * stride, stride, l08, l09, l10, l11) \
    LOAD_DECLARE_4(dst + 12 * stride, stride, l12, l13, l14, l15) \
    APPLY_COEFF_4(l00, l01, l02, l03, a0b0, c0d0); \
    APPLY_COEFF_4(l04, l05, l06, l07, a1b1, c1d1); \
    APPLY_COEFF_4(l08, l09, l10, l11, a2b2, c2d2); \
    APPLY_COEFF_4(l12, l13, l14, l15, a3b3, c3d3); \
    STORE_4(dst, stride, l00, l01, l02, l03); \
    STORE_4(dst + 4 * stride, stride, l04, l05, l06, l07); \
    STORE_4(dst + 8 * stride, stride, l08, l09, l10, l11); \
    STORE_4(dst + 12 * stride, stride, l12, l13, l14, l15); \
}
inv_txfm_fn4x16(adst,     dct     )
inv_txfm_fn4x16(dct,      adst    )
inv_txfm_fn4x16(dct,      flipadst)
inv_txfm_fn4x16(flipadst, dct     )
inv_txfm_fn4x16(adst,     flipadst)
inv_txfm_fn4x16(flipadst, adst    )
inv_txfm_fn4x16(identity, dct     )
inv_txfm_fn4x16(dct,      identity)
inv_txfm_fn4x16(identity, flipadst)
inv_txfm_fn4x16(flipadst, identity)
inv_txfm_fn4x16(identity, adst    )
inv_txfm_fn4x16(adst,     identity)
inv_txfm_fn4x16(identity, identity)
inv_txfm_fn4x16(adst,     adst    )
inv_txfm_fn4x16(flipadst, flipadst)

void dav1d_inv_txfm_add_dct_dct_16x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
                                               int16_t *const coeff, const int eob)
{
    if (eob < 1) {
        return dc_only_16xN(dst, stride, coeff, 1, 0, 1);
    }

    LOAD_DECLARE_2_I16(coeff, c00c01, c02c03)
    LOAD_DECLARE_2_I16(coeff+16, c04c05, c06c07)
    LOAD_DECLARE_2_I16(coeff+32, c08c09, c10c11)
    LOAD_DECLARE_2_I16(coeff+48, c12c13, c14c15)
    UNPACK_DECLARE_4_I16_I32(c00c01, c02c03, c00, c01, c02, c03)
    UNPACK_DECLARE_4_I16_I32(c04c05, c06c07, c04, c05, c06, c07)
    UNPACK_DECLARE_4_I16_I32(c08c09, c10c11, c08, c09, c10, c11)
    UNPACK_DECLARE_4_I16_I32(c12c13, c14c15, c12, c13, c14, c15)

    dct_16_in(c00, c01, c02, c03, c04, c05, c06, c07,
              c08, c09, c10, c11, c12, c13, c14, c15,
              c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15)
    memset(coeff, 0, sizeof(*coeff) * 16 * 4);
    SCALE_ROUND_4(c00, c01, c02, c03, vec_splat_s32(1), vec_splat_u32(1))
    SCALE_ROUND_4(c04, c05, c06, c07, vec_splat_s32(1), vec_splat_u32(1))
    SCALE_ROUND_4(c08, c09, c10, c11, vec_splat_s32(1), vec_splat_u32(1))
    SCALE_ROUND_4(c12, c13, c14, c15, vec_splat_s32(1), vec_splat_u32(1))

    TRANSPOSE4_I32(c00, c01, c02, c03);
    TRANSPOSE4_I32(c04, c05, c06, c07);
    TRANSPOSE4_I32(c08, c09, c10, c11);
    TRANSPOSE4_I32(c12, c13, c14, c15);

    dct_4x4_out(c00, c01, c02, c03,
                c04, c05, c06, c07,
                c08, c09, c10, c11,
                c12, c13, c14, c15,
                c00c01, c02c03, c04c05, c06c07,
                c08c09, c10c11, c12c13, c14c15)

    LOAD_DECLARE_4(dst, stride, l0, l1, l2, l3)

    APPLY_COEFF_16x4(l0, l1, l2, l3,
                     c00c01, c02c03, c04c05, c06c07,
                     c08c09, c10c11, c12c13, c14c15)

    STORE_16(dst, stride, l0, l1, l2, l3)
}
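
/* Generic 16x4 driver, the transpose of the 4x16 case: a 16-point first
 * pass over four i32x4 column groups, the intermediate (x + 1) >> 1
 * rounding, four independent 4x4 transposes that together realize the
 * 16x4 -> 4x16 transpose, then a 4-point second pass and
 * APPLY_COEFF_16x4 to add the residual to a 16-wide row block. */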
#define inv_txfm_fn16x4(type1, type2) \
void dav1d_inv_txfm_add_##type1##_##type2##_16x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
                                                           int16_t *const coeff, const int eob) \
{ \
    LOAD_DECLARE_2_I16(coeff, c00c01, c02c03) \
    LOAD_DECLARE_2_I16(coeff+16, c04c05, c06c07) \
    LOAD_DECLARE_2_I16(coeff+32, c08c09, c10c11) \
    LOAD_DECLARE_2_I16(coeff+48, c12c13, c14c15) \
    UNPACK_DECLARE_4_I16_I32(c00c01, c02c03, c00, c01, c02, c03) \
    UNPACK_DECLARE_4_I16_I32(c04c05, c06c07, c04, c05, c06, c07) \
    UNPACK_DECLARE_4_I16_I32(c08c09, c10c11, c08, c09, c10, c11) \
    UNPACK_DECLARE_4_I16_I32(c12c13, c14c15, c12, c13, c14, c15) \
    type1##_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
                  c08, c09, c10, c11, c12, c13, c14, c15, \
                  c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
    memset(coeff, 0, sizeof(*coeff) * 16 * 4); \
    SCALE_ROUND_4(c00, c01, c02, c03, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(c04, c05, c06, c07, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(c08, c09, c10, c11, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(c12, c13, c14, c15, vec_splat_s32(1), vec_splat_u32(1)) \
    TRANSPOSE4_I32(c00, c01, c02, c03); \
    TRANSPOSE4_I32(c04, c05, c06, c07); \
    TRANSPOSE4_I32(c08, c09, c10, c11); \
    TRANSPOSE4_I32(c12, c13, c14, c15); \
    type2##_4x4_out(c00, c01, c02, c03, \
                    c04, c05, c06, c07, \
                    c08, c09, c10, c11, \
                    c12, c13, c14, c15, \
                    c00c01, c02c03, c04c05, c06c07, \
                    c08c09, c10c11, c12c13, c14c15); \
    LOAD_DECLARE_4(dst, stride, l0, l1, l2, l3) \
    APPLY_COEFF_16x4(l0, l1, l2, l3, \
                     c00c01, c02c03, c04c05, c06c07, \
                     c08c09, c10c11, c12c13, c14c15) \
    STORE_16(dst, stride, l0, l1, l2, l3) \
}

inv_txfm_fn16x4(adst,     dct     )
inv_txfm_fn16x4(dct,      adst    )
inv_txfm_fn16x4(dct,      flipadst)
inv_txfm_fn16x4(flipadst, dct     )
inv_txfm_fn16x4(adst,     flipadst)
inv_txfm_fn16x4(flipadst, adst    )
inv_txfm_fn16x4(dct,      identity)
inv_txfm_fn16x4(flipadst, identity)
inv_txfm_fn16x4(adst,     identity)
inv_txfm_fn16x4(identity, identity)
inv_txfm_fn16x4(adst,     adst    )
inv_txfm_fn16x4(flipadst, flipadst)
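
/* identity as the first transform gets its own 16x4 variant: unlike the
 * other 16-point kernels, which saturate internally, identity_16_in
 * leaves its results unclipped, so an explicit CLIP16_I32_8 pass is
 * inserted after the rounding and before the transposes. The rest of
 * the body matches inv_txfm_fn16x4. */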
#define inv_txfm_fn16x4_identity(type2) \
void dav1d_inv_txfm_add_identity_##type2##_16x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
                                                          int16_t *const coeff, const int eob) \
{ \
    LOAD_DECLARE_2_I16(coeff, c00c01, c02c03) \
    LOAD_DECLARE_2_I16(coeff+16, c04c05, c06c07) \
    LOAD_DECLARE_2_I16(coeff+32, c08c09, c10c11) \
    LOAD_DECLARE_2_I16(coeff+48, c12c13, c14c15) \
    UNPACK_DECLARE_4_I16_I32(c00c01, c02c03, c00, c01, c02, c03) \
    UNPACK_DECLARE_4_I16_I32(c04c05, c06c07, c04, c05, c06, c07) \
    UNPACK_DECLARE_4_I16_I32(c08c09, c10c11, c08, c09, c10, c11) \
    UNPACK_DECLARE_4_I16_I32(c12c13, c14c15, c12, c13, c14, c15) \
    identity_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
                   c08, c09, c10, c11, c12, c13, c14, c15, \
                   c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
    memset(coeff, 0, sizeof(*coeff) * 16 * 4); \
    SCALE_ROUND_4(c00, c01, c02, c03, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(c04, c05, c06, c07, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(c08, c09, c10, c11, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(c12, c13, c14, c15, vec_splat_s32(1), vec_splat_u32(1)) \
    CLIP16_I32_8(c00, c01, c02, c03, c04, c05, c06, c07, c00c01, c02c03, c04c05, c06c07) \
    CLIP16_I32_8(c08, c09, c10, c11, c12, c13, c14, c15, c08c09, c10c11, c12c13, c14c15) \
    TRANSPOSE4_I32(c00, c01, c02, c03); \
    TRANSPOSE4_I32(c04, c05, c06, c07); \
    TRANSPOSE4_I32(c08, c09, c10, c11); \
    TRANSPOSE4_I32(c12, c13, c14, c15); \
    type2##_4x4_out(c00, c01, c02, c03, \
                    c04, c05, c06, c07, \
                    c08, c09, c10, c11, \
                    c12, c13, c14, c15, \
                    c00c01, c02c03, c04c05, c06c07, \
                    c08c09, c10c11, c12c13, c14c15); \
    LOAD_DECLARE_4(dst, stride, l0, l1, l2, l3) \
    APPLY_COEFF_16x4(l0, l1, l2, l3, \
                     c00c01, c02c03, c04c05, c06c07, \
                     c08c09, c10c11, c12c13, c14c15) \
    STORE_16(dst, stride, l0, l1, l2, l3) \
}

inv_txfm_fn16x4_identity(dct)
inv_txfm_fn16x4_identity(adst)
inv_txfm_fn16x4_identity(flipadst)

#endif // BITDEPTH