tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

itx_tmpl.c (71466B)


/*
 * Copyright © 2024, VideoLAN and dav1d authors
 * Copyright © 2024, Luca Barbato
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <assert.h>
#include <string.h>

#include "src/ppc/dav1d_types.h"
#include "src/ppc/itx.h"
#include "src/ppc/utils.h"

#if BITDEPTH == 8

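/*
 * Helper macros: LOAD_* bring rows of pixels/coefficients into vector
 * registers, UNPACK_* widen the i16 halves to i32 lanes, and STORE_*
 * write 4/8/16-byte rows back to dst with the given stride.
 */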
#define LOAD_4(src, stride, a, b, c, d) \
{  \
    uint8_t *s = src; \
    a = vec_xl(0, s); \
    s += stride; \
    b = vec_xl(0, s); \
    s += stride; \
    c = vec_xl(0, s); \
    s += stride; \
    d = vec_xl(0, s); \
}

#define LOAD_DECLARE_2_I16(src, a, b) \
    i16x8 a = vec_xl(0, src); \
    i16x8 b = vec_xl(0, src + 8);

#define UNPACK_DECLARE_4_I16_I32(sa, sb, a, b, c, d) \
    i32x4 a = i16h_to_i32(sa); \
    i32x4 b = i16l_to_i32(sa); \
    i32x4 c = i16h_to_i32(sb); \
    i32x4 d = i16l_to_i32(sb);

#define LOAD_COEFF_4(coeff) \
    LOAD_DECLARE_2_I16(coeff, c01, c23) \
    UNPACK_DECLARE_4_I16_I32(c01, c23, c0, c1, c2, c3)

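/*
 * The *_SCALE_* loaders fold in the rectangular-transform scaling: with
 * scale = 2896*8, vec_mradds computes (x * 23168 + 0x4000) >> 15, i.e. a
 * multiply by ~1/sqrt(2) (2896/4096 in Q12).
 */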
#define LOAD_SCALE_COEFF_4x8(coeff, scale) \
    LOAD_DECLARE_2_I16(coeff, c04, c15) \
    LOAD_DECLARE_2_I16(coeff+16, c26, c37) \
    i16x8 c01 = (i16x8)vec_mergeh((i64x2)c04, (i64x2)c15); \
    i16x8 c23 = (i16x8)vec_mergeh((i64x2)c26, (i64x2)c37); \
    i16x8 c45 = (i16x8)vec_mergel((i64x2)c04, (i64x2)c15); \
    i16x8 c67 = (i16x8)vec_mergel((i64x2)c26, (i64x2)c37); \
    c01 = vec_mradds(c01, scale, vec_splat_s16(0)); \
    c23 = vec_mradds(c23, scale, vec_splat_s16(0)); \
    UNPACK_DECLARE_4_I16_I32(c01, c23, c0, c1, c2, c3) \
    c45 = vec_mradds(c45, scale, vec_splat_s16(0)); \
    c67 = vec_mradds(c67, scale, vec_splat_s16(0)); \
    UNPACK_DECLARE_4_I16_I32(c45, c67, c4, c5, c6, c7)

#define LOAD_SCALE_COEFF_8x4(coeff, scale) \
    LOAD_DECLARE_2_I16(coeff, c01, c23) \
    LOAD_DECLARE_2_I16(coeff+16, c45, c67) \
    c01 = vec_mradds(c01, scale, vec_splat_s16(0)); \
    c23 = vec_mradds(c23, scale, vec_splat_s16(0)); \
    UNPACK_DECLARE_4_I16_I32(c01, c23, c0, c1, c2, c3) \
    c45 = vec_mradds(c45, scale, vec_splat_s16(0)); \
    c67 = vec_mradds(c67, scale, vec_splat_s16(0)); \
    UNPACK_DECLARE_4_I16_I32(c45, c67, c4, c5, c6, c7)

#define LOAD_COEFF_8x8(coeff) \
    LOAD_DECLARE_2_I16(coeff, c0, c1) \
    LOAD_DECLARE_2_I16(coeff+16, c2, c3) \
    LOAD_DECLARE_2_I16(coeff+32, c4, c5) \
    LOAD_DECLARE_2_I16(coeff+48, c6, c7) \
    UNPACK_DECLARE_4_I16_I32(c0, c1, c0h, c0l, c1h, c1l) \
    UNPACK_DECLARE_4_I16_I32(c2, c3, c2h, c2l, c3h, c3l) \
    UNPACK_DECLARE_4_I16_I32(c4, c5, c4h, c4l, c5h, c5l) \
    UNPACK_DECLARE_4_I16_I32(c6, c7, c6h, c6l, c7h, c7l)

#define LOAD_COEFF_4x16(coeff) \
    LOAD_DECLARE_2_I16(coeff,    a0b0, c0d0) \
    LOAD_DECLARE_2_I16(coeff+16, a1b1, c1d1) \
    LOAD_DECLARE_2_I16(coeff+32, a2b2, c2d2) \
    LOAD_DECLARE_2_I16(coeff+48, a3b3, c3d3) \
    UNPACK_DECLARE_4_I16_I32(a0b0, c0d0, cA0, cB0, cC0, cD0) \
    UNPACK_DECLARE_4_I16_I32(a1b1, c1d1, cA1, cB1, cC1, cD1) \
    UNPACK_DECLARE_4_I16_I32(a2b2, c2d2, cA2, cB2, cC2, cD2) \
    UNPACK_DECLARE_4_I16_I32(a3b3, c3d3, cA3, cB3, cC3, cD3)

#define LOAD_DECLARE_4(src, stride, a, b, c, d) \
    u8x16 a, b, c, d; \
    LOAD_4(src, stride, a, b, c, d)

#define STORE_LEN(l, dst, stride, a, b, c, d) \
{ \
    uint8_t *dst2 = dst; \
    vec_xst_len(a, dst2, l); \
    dst2 += stride; \
    vec_xst_len(b, dst2, l); \
    dst2 += stride; \
    vec_xst_len(c, dst2, l); \
    dst2 += stride; \
    vec_xst_len(d, dst2, l); \
}

#define STORE_4(dst, stride, a, b, c, d) \
    STORE_LEN(4, dst, stride, a, b, c, d)

#define STORE_8(dst, stride, ab, cd, ef, gh) \
    STORE_LEN(8, dst, stride, ab, cd, ef, gh)

#define STORE_16(dst, stride, l0, l1, l2, l3) \
{ \
    uint8_t *dst2 = dst; \
    vec_xst(l0, 0, dst2); \
    dst2 += stride; \
    vec_xst(l1, 0, dst2); \
    dst2 += stride; \
    vec_xst(l2, 0, dst2); \
    dst2 += stride; \
    vec_xst(l3, 0, dst2); \
}

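/*
 * APPLY_COEFF_*: round the residual by (x + 8) >> 4, add it to the loaded
 * pixel rows and pack back to u8 with unsigned saturation.
 */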
#define APPLY_COEFF_4(a, b, c, d, c01, c23) \
{ \
    u8x16 ab = (u8x16)vec_mergeh((u32x4)a, (u32x4)b); \
    u8x16 cd = (u8x16)vec_mergeh((u32x4)c, (u32x4)d); \
\
    c01 = vec_adds(c01, vec_splat_s16(8)); \
    c23 = vec_adds(c23, vec_splat_s16(8)); \
    c01 = vec_sra(c01, vec_splat_u16(4)); \
    c23 = vec_sra(c23, vec_splat_u16(4)); \
\
    i16x8 abs = u8h_to_i16(ab); \
    i16x8 cds = u8h_to_i16(cd); \
\
    abs = vec_adds(abs, c01); \
    cds = vec_adds(cds, c23); \
\
    a = vec_packsu(abs, abs); \
    c = vec_packsu(cds, cds); \
\
    b = (u8x16)vec_mergeo((u32x4)a, (u32x4)a); \
    d = (u8x16)vec_mergeo((u32x4)c, (u32x4)c); \
}

#define APPLY_COEFF_8x4(ab, cd, c01, c23) \
{ \
    i16x8 abs = u8h_to_i16(ab); \
    i16x8 cds = u8h_to_i16(cd); \
    c01 = vec_adds(c01, vec_splat_s16(8)); \
    c23 = vec_adds(c23, vec_splat_s16(8)); \
    c01 = vec_sra(c01, vec_splat_u16(4)); \
    c23 = vec_sra(c23, vec_splat_u16(4)); \
\
    abs = vec_adds(abs, c01); \
    cds = vec_adds(cds, c23); \
\
    ab = vec_packsu(abs, abs); \
    cd = vec_packsu(cds, cds); \
}

#define APPLY_COEFF_16x4(a, b, c, d, \
                         c00c01, c02c03, c04c05, c06c07, \
                         c08c09, c10c11, c12c13, c14c15) \
{ \
    i16x8 ah = u8h_to_i16(a); \
    i16x8 al = u8l_to_i16(a); \
    i16x8 bh = u8h_to_i16(b); \
    i16x8 bl = u8l_to_i16(b); \
    i16x8 ch = u8h_to_i16(c); \
    i16x8 cl = u8l_to_i16(c); \
    i16x8 dh = u8h_to_i16(d); \
    i16x8 dl = u8l_to_i16(d); \
    SCALE_ROUND_4(c00c01, c02c03, c04c05, c06c07, vec_splat_s16(8), vec_splat_u16(4)) \
    SCALE_ROUND_4(c08c09, c10c11, c12c13, c14c15, vec_splat_s16(8), vec_splat_u16(4)) \
\
    ah = vec_adds(ah, c00c01); \
    al = vec_adds(al, c02c03); \
    bh = vec_adds(bh, c04c05); \
    bl = vec_adds(bl, c06c07); \
    ch = vec_adds(ch, c08c09); \
    cl = vec_adds(cl, c10c11); \
    dh = vec_adds(dh, c12c13); \
    dl = vec_adds(dl, c14c15); \
\
    a = vec_packsu(ah, al); \
    b = vec_packsu(bh, bl); \
    c = vec_packsu(ch, cl); \
    d = vec_packsu(dh, dl); \
}

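/*
 * 4-point inverse DCT. The multipliers are the AV1 Q12 cosine constants
 * (2896 ~ 4096/sqrt(2), 1567 ~ 4096*cos(3*pi/8), 3784 ~ 4096*cos(pi/8));
 * adding 2048 rounds the subsequent >> 12.
 */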
#define IDCT_4_INNER(c0, c1, c2, c3) \
{ \
    i32x4 o0 = vec_add(c0, c2); \
    i32x4 o1 = vec_sub(c0, c2); \
\
    i32x4 v2896 = vec_splats(2896); \
    i32x4 v1567 = vec_splats(1567); \
    i32x4 v3784 = vec_splats(3784); \
    i32x4 v2048 = vec_splats(2048); \
\
    o0 = vec_mul(o0, v2896); \
    o1 = vec_mul(o1, v2896); \
\
    i32x4 o2a = vec_mul(c1, v1567); \
    i32x4 o2b = vec_mul(c3, v3784); \
    i32x4 o3a = vec_mul(c1, v3784); \
    i32x4 o3b = vec_mul(c3, v1567); \
\
    i32x4 o2 = vec_sub(o2a, o2b); \
    i32x4 o3 = vec_add(o3a, o3b); \
\
    u32x4 v12 = vec_splat_u32(12); \
\
    o0 = vec_add(o0, v2048); \
    o1 = vec_add(o1, v2048); \
    o2 = vec_add(o2, v2048); \
    o3 = vec_add(o3, v2048); \
\
    o0 = vec_sra(o0, v12); \
    o1 = vec_sra(o1, v12); \
    o2 = vec_sra(o2, v12); \
    o3 = vec_sra(o3, v12); \
\
    c0 = vec_add(o0, o3); \
    c1 = vec_add(o1, o2); \
    c2 = vec_sub(o1, o2); \
    c3 = vec_sub(o0, o3); \
}

#define dct4_for_dct8(c0, c1, c2, c3, c03, c12) \
    IDCT_4_INNER(c0, c1, c2, c3) \
    c03 = vec_packs(c0, c3); \
    c12 = vec_packs(c1, c2);

#define dct_4_in(c0, c1, c2, c3, c01, c23) \
{ \
    IDCT_4_INNER(c0, c1, c2, c3) \
    c01 = vec_packs(c0, c1); \
    c23 = vec_packs(c2, c3); \
    c0 = i16h_to_i32(c01); \
    c1 = i16l_to_i32(c01); \
    c2 = i16h_to_i32(c23); \
    c3 = i16l_to_i32(c23); \
}

#define dct_4_out(c0, c1, c2, c3, c01, c23) \
    IDCT_4_INNER(c0, c1, c2, c3) \
    c01 = vec_packs(c0, c1); \
    c23 = vec_packs(c2, c3);

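/*
 * 4-point identity transform: a scale by sqrt(2). vec_mradds with 1697*8
 * yields x * (sqrt(2) - 1), so x + mradds(x) ~ x * sqrt(2).
 */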
#define IDENTITY_4(c01, c23) \
{ \
    i16x8 v1697 = vec_splats((int16_t)(1697*8)); \
    i16x8 o01 = vec_mradds(c01, v1697, vec_splat_s16(0)); \
    i16x8 o23 = vec_mradds(c23, v1697, vec_splat_s16(0)); \
    c01 = vec_adds(c01, o01); \
    c23 = vec_adds(c23, o23); \
}

#define identity_4_in(c0, c1, c2, c3, c01, c23) \
{ \
    IDENTITY_4(c01, c23) \
    c0 = i16h_to_i32(c01); \
    c1 = i16l_to_i32(c01); \
    c2 = i16h_to_i32(c23); \
    c3 = i16l_to_i32(c23); \
}

#define identity_4_out(c0, c1, c2, c3, c01, c23) \
{ \
    c01 = vec_packs(c0, c1); \
    c23 = vec_packs(c2, c3); \
    IDENTITY_4(c01, c23) \
}

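/*
 * 4-point inverse ADST; 1321, 2482, 3344 and 3803 are the Q12 sine
 * constants of the AV1 iadst4 basis, 2048 rounds the >> 12.
 */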
#define ADST_INNER_4(c0, c1, c2, c3, oc0, oc1, oc2, oc3) \
{ \
    i32x4 v1321 = vec_splats(1321); \
    i32x4 v3803 = vec_splats(3803); \
    i32x4 v2482 = vec_splats(2482); \
    i32x4 v3344 = vec_splats(3344); \
    i32x4 v2048 = vec_splats(2048); \
    i32x4 i0_v1321 = vec_mul(c0, v1321); \
    i32x4 i0_v2482 = vec_mul(c0, v2482); \
    i32x4 i0_v3803 = vec_mul(c0, v3803); \
    i32x4 i1 = vec_mul(c1, v3344); \
    i32x4 i2_v1321 = vec_mul(c2, v1321); \
    i32x4 i2_v2482 = vec_mul(c2, v2482); \
    i32x4 i2_v3803 = vec_mul(c2, v3803); \
    i32x4 i3_v1321 = vec_mul(c3, v1321); \
    i32x4 i3_v2482 = vec_mul(c3, v2482); \
    i32x4 i3_v3803 = vec_mul(c3, v3803); \
\
    i32x4 n1 = vec_sub(i1, v2048); \
    i1 = vec_add(i1, v2048); \
\
    i32x4 o0 = vec_add(i0_v1321, i2_v3803); \
    i32x4 o1 = vec_sub(i0_v2482, i2_v1321); \
    i32x4 o2 = vec_sub(c0, c2); \
    i32x4 o3 = vec_add(i0_v3803, i2_v2482); \
\
    o0 = vec_add(o0, i3_v2482); \
    o1 = vec_sub(o1, i3_v3803); \
    o2 = vec_add(o2, c3); \
    o3 = vec_sub(o3, i3_v1321); \
\
    o0 = vec_add(o0, i1); \
    o1 = vec_add(o1, i1); \
    o2 = vec_mul(o2, v3344); \
    o3 = vec_sub(o3, n1); \
\
    o2 = vec_add(o2, v2048); \
\
    oc0 = vec_sra(o0, vec_splat_u32(12)); \
    oc1 = vec_sra(o1, vec_splat_u32(12)); \
    oc2 = vec_sra(o2, vec_splat_u32(12)); \
    oc3 = vec_sra(o3, vec_splat_u32(12)); \
}

#define adst_4_in(c0, c1, c2, c3, c01, c23) \
{ \
    ADST_INNER_4(c0, c1, c2, c3, c0, c1, c2, c3) \
}

#define flipadst_4_in(c0, c1, c2, c3, c01, c23) \
{ \
    ADST_INNER_4(c0, c1, c2, c3, c3, c2, c1, c0) \
}

#define adst_4_out(c0, c1, c2, c3, c01, c23) \
{ \
    ADST_INNER_4(c0, c1, c2, c3, c0, c1, c2, c3) \
    c01 = vec_packs(c0, c1); \
    c23 = vec_packs(c2, c3); \
}

#define flipadst_4_out(c0, c1, c2, c3, c01, c23) \
{ \
    ADST_INNER_4(c0, c1, c2, c3, c3, c2, c1, c0) \
    c01 = vec_packs(c0, c1); \
    c23 = vec_packs(c2, c3); \
}

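/*
 * DC-only fast path: when eob < 1 only coeff[0] is set, so the whole
 * inverse transform collapses to a single DC value. 181/256 ~ 1/sqrt(2)
 * applies the rectangular and per-pass scalings before the splat-add.
 */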
static void dc_only_4xN(uint8_t *dst, const ptrdiff_t stride, int16_t *const coeff, int n, int is_rect2, int shift)
{
    int dc = coeff[0];
    const int rnd = (1 << shift) >> 1;
    if (is_rect2)
        dc = (dc * 181 + 128) >> 8;
    dc = (dc * 181 + 128) >> 8;
    dc = (dc + rnd) >> shift;
    dc = (dc * 181 + 128 + 2048) >> 12;

    i16x8 vdc = vec_splats((int16_t)dc);
    coeff[0] = 0;
    for (int i = 0; i < n; i++, dst += 4 * stride) {
        LOAD_DECLARE_4(dst, stride, a, b, c, d)

        i16x8 as = u8h_to_i16(a);
        i16x8 bs = u8h_to_i16(b);
        i16x8 cs = u8h_to_i16(c);
        i16x8 ds = u8h_to_i16(d);

        as = vec_adds(as, vdc);
        bs = vec_adds(bs, vdc);
        cs = vec_adds(cs, vdc);
        ds = vec_adds(ds, vdc);

        a = vec_packsu(as, as);
        b = vec_packsu(bs, bs);
        c = vec_packsu(cs, cs);
        d = vec_packsu(ds, ds);

        STORE_4(dst, stride, a, b, c, d)
    }
}

static void dc_only_8xN(uint8_t *dst, const ptrdiff_t stride, int16_t *const coeff, int n, int is_rect2, int shift)
{
    int dc = coeff[0];
    const int rnd = (1 << shift) >> 1;
    if (is_rect2)
        dc = (dc * 181 + 128) >> 8;
    dc = (dc * 181 + 128) >> 8;
    dc = (dc + rnd) >> shift;
    dc = (dc * 181 + 128 + 2048) >> 12;

    i16x8 vdc = vec_splats((int16_t)dc);
    coeff[0] = 0;

    for (int i = 0; i < n; i++, dst += 4 * stride) {
        LOAD_DECLARE_4(dst, stride, a, b, c, d)

        i16x8 as = u8h_to_i16(a);
        i16x8 bs = u8h_to_i16(b);
        i16x8 cs = u8h_to_i16(c);
        i16x8 ds = u8h_to_i16(d);

        as = vec_adds(as, vdc);
        bs = vec_adds(bs, vdc);
        cs = vec_adds(cs, vdc);
        ds = vec_adds(ds, vdc);

        a = vec_packsu(as, as);
        b = vec_packsu(bs, bs);
        c = vec_packsu(cs, cs);
        d = vec_packsu(ds, ds);

        STORE_8(dst, stride, a, b, c, d)
    }
}

static void dc_only_16xN(uint8_t *dst, const ptrdiff_t stride, int16_t *const coeff, int n, int is_rect2, int shift)
{
    int dc = coeff[0];
    const int rnd = (1 << shift) >> 1;
    if (is_rect2)
        dc = (dc * 181 + 128) >> 8;
    dc = (dc * 181 + 128) >> 8;
    dc = (dc + rnd) >> shift;
    dc = (dc * 181 + 128 + 2048) >> 12;

    i16x8 vdc = vec_splats((int16_t)dc);
    coeff[0] = 0;

    for (int i = 0; i < n; i++, dst += 4 * stride) {
        LOAD_DECLARE_4(dst, stride, a, b, c, d)

        i16x8 ah = u8h_to_i16(a);
        i16x8 bh = u8h_to_i16(b);
        i16x8 ch = u8h_to_i16(c);
        i16x8 dh = u8h_to_i16(d);
        i16x8 al = u8l_to_i16(a);
        i16x8 bl = u8l_to_i16(b);
        i16x8 cl = u8l_to_i16(c);
        i16x8 dl = u8l_to_i16(d);

        ah = vec_adds(ah, vdc);
        bh = vec_adds(bh, vdc);
        ch = vec_adds(ch, vdc);
        dh = vec_adds(dh, vdc);
        al = vec_adds(al, vdc);
        bl = vec_adds(bl, vdc);
        cl = vec_adds(cl, vdc);
        dl = vec_adds(dl, vdc);

        a = vec_packsu(ah, al);
        b = vec_packsu(bh, bl);
        c = vec_packsu(ch, cl);
        d = vec_packsu(dh, dl);

        STORE_16(dst, stride, a, b, c, d)
    }
}

void dav1d_inv_txfm_add_dct_dct_4x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
                                              int16_t *const coeff, const int eob)
{
    assert(eob >= 0);

    if (eob < 1) {
        return dc_only_4xN(dst, stride, coeff, 1, 0, 0);
    }

    LOAD_COEFF_4(coeff)

    dct_4_in(c0, c1, c2, c3, c01, c23)

    TRANSPOSE4_I32(c0, c1, c2, c3)

    memset(coeff, 0, sizeof(*coeff) * 4 * 4);

    dct_4_out(c0, c1, c2, c3, c01, c23)

    LOAD_DECLARE_4(dst, stride, a, b, c, d)

    APPLY_COEFF_4(a, b, c, d, c01, c23)

    STORE_4(dst, stride, a, b, c, d)
}

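/*
 * 4x4 Walsh-Hadamard inverse (lossless mode): inputs are pre-scaled by
 * >> 2 and the two butterfly passes need no rounding constants.
 */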
void dav1d_inv_txfm_add_wht_wht_4x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
                                              int16_t *const coeff, const int eob)
{
    LOAD_COEFF_4(coeff)

    u32x4 v2 = vec_splat_u32(2);

    c0 = vec_sra(c0, v2);
    c1 = vec_sra(c1, v2);
    c2 = vec_sra(c2, v2);
    c3 = vec_sra(c3, v2);

    i32x4 t0 = vec_add(c0, c1);
    i32x4 t2 = vec_sub(c2, c3);
    i32x4 t4 = vec_sra(vec_sub(t0, t2), vec_splat_u32(1));
    i32x4 t3 = vec_sub(t4, c3);
    i32x4 t1 = vec_sub(t4, c1);
    c0 = vec_sub(t0, t3);
    c1 = t3;
    c2 = t1;
    c3 = vec_add(t2, t1);

    memset(coeff, 0, sizeof(*coeff) * 4 * 4);

    TRANSPOSE4_I32(c0, c1, c2, c3)

    t0 = vec_add(c0, c1);
    t2 = vec_sub(c2, c3);
    t4 = vec_sra(vec_sub(t0, t2), vec_splat_u32(1));
    t3 = vec_sub(t4, c3);
    t1 = vec_sub(t4, c1);
    c0 = vec_sub(t0, t3);
    c1 = t3;
    c2 = t1;
    c3 = vec_add(t2, t1);

    c01 = vec_packs(c0, c1);
    c23 = vec_packs(c2, c3);

    LOAD_DECLARE_4(dst, stride, a, b, c, d)

    u8x16 ab = (u8x16)vec_mergeh((u32x4)a, (u32x4)b);
    u8x16 cd = (u8x16)vec_mergeh((u32x4)c, (u32x4)d);

    i16x8 abs = u8h_to_i16(ab);
    i16x8 cds = u8h_to_i16(cd);

    abs = vec_adds(abs, c01);
    cds = vec_adds(cds, c23);

    a = vec_packsu(abs, abs);
    c = vec_packsu(cds, cds);

    b = (u8x16)vec_mergeo((u32x4)a, (u32x4)a);
    d = (u8x16)vec_mergeo((u32x4)c, (u32x4)c);

    STORE_4(dst, stride, a, b, c, d)
}

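/*
 * Generates the 4x4 two-pass functions: type1 row transform, transpose,
 * type2 column transform, then add the residual to dst.
 */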
#define inv_txfm_fn4x4(type1, type2) \
void dav1d_inv_txfm_add_##type1##_##type2##_4x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
                                                          int16_t *const coeff, const int eob) \
{ \
    LOAD_COEFF_4(coeff) \
    type1##_4_in(c0, c1, c2, c3, c01, c23) \
    memset(coeff, 0, sizeof(*coeff) * 4 * 4); \
    TRANSPOSE4_I32(c0, c1, c2, c3) \
    type2##_4_out(c0, c1, c2, c3, c01, c23) \
    LOAD_DECLARE_4(dst, stride, a, b, c, d) \
    APPLY_COEFF_4(a, b, c, d, c01, c23) \
    STORE_4(dst, stride, a, b, c, d) \
}

inv_txfm_fn4x4(adst,     dct     )
inv_txfm_fn4x4(dct,      adst    )
inv_txfm_fn4x4(dct,      flipadst)
inv_txfm_fn4x4(flipadst, dct     )
inv_txfm_fn4x4(adst,     flipadst)
inv_txfm_fn4x4(flipadst, adst    )
inv_txfm_fn4x4(identity, dct     )
inv_txfm_fn4x4(dct,      identity)
inv_txfm_fn4x4(identity, flipadst)
inv_txfm_fn4x4(flipadst, identity)
inv_txfm_fn4x4(identity, adst    )
inv_txfm_fn4x4(adst,     identity)
inv_txfm_fn4x4(identity, identity)
inv_txfm_fn4x4(adst,     adst    )
inv_txfm_fn4x4(flipadst, flipadst)

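/*
 * 8-point inverse DCT: even coefficients reuse the 4-point DCT; the odd
 * ones go through Q12 rotations (799, 4017 ~ 4096*(sin, cos)(pi/16);
 * 2276, 3406 ~ 4096*(sin, cos)(3*pi/16)), with a final 181/256 ~
 * 1/sqrt(2) butterfly producing t5/t6.
 */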
#define IDCT_8_INNER(c0, c1, c2, c3, c4, c5, c6, c7, c03, c12, c74, c65) \
    dct4_for_dct8(c0, c2, c4, c6, c03, c12) \
\
    i32x4 v799 = vec_splats(799); \
    i32x4 v4017 = vec_splats(4017); \
    i32x4 v3406 = vec_splats(3406); \
    i32x4 v2276 = vec_splats(2276); \
    i32x4 v2048 = vec_splats(2048); \
    u32x4 v12 = vec_splat_u32(12); \
\
    i32x4 c1v799 = vec_mul(c1, v799); \
    i32x4 c7v4017 = vec_mul(c7, v4017); \
    i32x4 c5v3406 = vec_mul(c5, v3406); \
    i32x4 c3v2276 = vec_mul(c3, v2276); \
    i32x4 c5v2276 = vec_mul(c5, v2276); \
    i32x4 c3v3406 = vec_mul(c3, v3406); \
    i32x4 c1v4017 = vec_mul(c1, v4017); \
    i32x4 c7v799 = vec_mul(c7, v799); \
\
    i32x4 t4a = vec_subs(c1v799, c7v4017); \
    i32x4 t5a = vec_subs(c5v3406, c3v2276); \
    i32x4 t6a = vec_adds(c5v2276, c3v3406); \
    i32x4 t7a = vec_adds(c1v4017, c7v799); \
\
    t4a = vec_adds(t4a, v2048); \
    t5a = vec_adds(t5a, v2048); \
    t6a = vec_adds(t6a, v2048); \
    t7a = vec_adds(t7a, v2048); \
\
    t4a = vec_sra(t4a, v12); \
    t7a = vec_sra(t7a, v12); \
    t5a = vec_sra(t5a, v12); \
    t6a = vec_sra(t6a, v12); \
\
    i16x8 t7at4a = vec_packs(t7a, t4a); \
    i16x8 t6at5a = vec_packs(t6a, t5a); \
\
    i16x8 t7t4 = vec_adds(t7at4a, t6at5a); \
    t6at5a = vec_subs(t7at4a, t6at5a); \
\
    t6a = i16h_to_i32(t6at5a); \
    t5a = i16l_to_i32(t6at5a); \
\
    i32x4 t6 = vec_add(t6a, t5a); \
    i32x4 t5 = vec_sub(t6a, t5a); \
\
    t6 = vec_mul(t6, vec_splats(181)); \
    t5 = vec_mul(t5, vec_splats(181)); \
    t6 = vec_add(t6, vec_splats(128)); \
    t5 = vec_add(t5, vec_splats(128)); \
\
    t6 = vec_sra(t6, vec_splat_u32(8)); \
    t5 = vec_sra(t5, vec_splat_u32(8)); \
\
    i16x8 t6t5 = vec_packs(t6, t5); \
\
    c74 = vec_subs(c03, t7t4); \
    c65 = vec_subs(c12, t6t5); \
    c03 = vec_adds(c03, t7t4); \
    c12 = vec_adds(c12, t6t5);

#define UNPACK_4_I16_I32(t0, t1, t2, t3) \
    t0 = i16h_to_i32(t0##t1); \
    t1 = i16l_to_i32(t0##t1); \
    t2 = i16h_to_i32(t2##t3); \
    t3 = i16l_to_i32(t2##t3);

#define UNPACK_PAIR_I16_I32(hi, lo, v) \
    hi = i16h_to_i32(v); \
    lo = i16l_to_i32(v);

#define dct_8_in(c0, c1, c2, c3, c4, c5, c6, c7, ...) \
{ \
    i16x8 c0##c3, c1##c2, c7##c4, c6##c5; \
    IDCT_8_INNER(c0, c1, c2, c3, c4, c5, c6, c7, c0##c3, c1##c2, c7##c4, c6##c5) \
    UNPACK_4_I16_I32(c0, c3, c1, c2) \
    UNPACK_4_I16_I32(c7, c4, c6, c5) \
}

#define dct_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
{ \
    i16x8 c03, c12, c74, c65; \
    IDCT_8_INNER(c0, c1, c2, c3, c4, c5, c6, c7, c03, c12, c74, c65) \
    c01 = (i16x8)vec_mergeh((u64x2)c03, (u64x2)c12); \
    c23 = (i16x8)vec_mergel((u64x2)c12, (u64x2)c03); \
    c45 = (i16x8)vec_mergel((u64x2)c74, (u64x2)c65); \
    c67 = (i16x8)vec_mergeh((u64x2)c65, (u64x2)c74); \
}

#define dct_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                   c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                   c0, c1, c2, c3, c4, c5, c6, c7) \
{ \
    dct_8_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h,) \
    dct_8_in(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l,) \
}

#define dct_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                    c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                    c0, c1, c2, c3, c4, c5, c6, c7) \
{ \
    i16x8 c03h, c12h, c74h, c65h; \
    i16x8 c03l, c12l, c74l, c65l; \
    { \
        IDCT_8_INNER(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, c03h, c12h, c74h, c65h) \
    } \
    { \
        IDCT_8_INNER(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, c03l, c12l, c74l, c65l) \
    } \
    c0 = (i16x8)vec_mergeh((u64x2)c03h, (u64x2)c03l); \
    c3 = (i16x8)vec_mergel((u64x2)c03h, (u64x2)c03l); \
    c1 = (i16x8)vec_mergeh((u64x2)c12h, (u64x2)c12l); \
    c2 = (i16x8)vec_mergel((u64x2)c12h, (u64x2)c12l); \
    c7 = (i16x8)vec_mergeh((u64x2)c74h, (u64x2)c74l); \
    c4 = (i16x8)vec_mergel((u64x2)c74h, (u64x2)c74l); \
    c6 = (i16x8)vec_mergeh((u64x2)c65h, (u64x2)c65l); \
    c5 = (i16x8)vec_mergel((u64x2)c65h, (u64x2)c65l); \
}

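/* 8-point identity transform: a plain scale by 2. */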
#define IDENTITY_8(c01, c23, c45, c67) \
{ \
    c01 = vec_adds(c01, c01); \
    c23 = vec_adds(c23, c23); \
    c45 = vec_adds(c45, c45); \
    c67 = vec_adds(c67, c67); \
}

#define identity_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
{ \
    IDENTITY_8(c01, c23, c45, c67) \
    UNPACK_PAIR_I16_I32(c0, c1, c01) \
    UNPACK_PAIR_I16_I32(c2, c3, c23) \
    UNPACK_PAIR_I16_I32(c4, c5, c45) \
    UNPACK_PAIR_I16_I32(c6, c7, c67) \
}

#define identity_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
    c01 = vec_packs(c0, c1); \
    c23 = vec_packs(c2, c3); \
    c45 = vec_packs(c4, c5); \
    c67 = vec_packs(c6, c7); \
    IDENTITY_8(c01, c23, c45, c67)

#define identity_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                        c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                        c0, c1, c2, c3, c4, c5, c6, c7) \
{ \
    IDENTITY_8(c0, c1, c2, c3) \
    IDENTITY_8(c4, c5, c6, c7) \
    UNPACK_PAIR_I16_I32(c0h, c0l, c0) \
    UNPACK_PAIR_I16_I32(c1h, c1l, c1) \
    UNPACK_PAIR_I16_I32(c2h, c2l, c2) \
    UNPACK_PAIR_I16_I32(c3h, c3l, c3) \
    UNPACK_PAIR_I16_I32(c4h, c4l, c4) \
    UNPACK_PAIR_I16_I32(c5h, c5l, c5) \
    UNPACK_PAIR_I16_I32(c6h, c6l, c6) \
    UNPACK_PAIR_I16_I32(c7h, c7l, c7) \
}

#define PACK_4(c0, c1, c2, c3, \
               c0h, c1h, c2h, c3h, \
               c0l, c1l, c2l, c3l) \
{ \
    c0 = vec_packs(c0h, c0l); \
    c1 = vec_packs(c1h, c1l); \
    c2 = vec_packs(c2h, c2l); \
    c3 = vec_packs(c3h, c3l); \
}

#define DECLARE_PACK_4(c0, c1, c2, c3, \
                       c0h, c1h, c2h, c3h, \
                       c0l, c1l, c2l, c3l) \
    i16x8 c0, c1, c2, c3; \
    PACK_4(c0, c1, c2, c3, c0h, c1h, c2h, c3h, c0l, c1l, c2l, c3l);

#define PACK_8(c0, c1, c2, c3, c4, c5, c6, c7, \
               c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
               c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
{ \
    c0 = vec_packs(c0h, c0l); \
    c1 = vec_packs(c1h, c1l); \
    c2 = vec_packs(c2h, c2l); \
    c3 = vec_packs(c3h, c3l); \
    c4 = vec_packs(c4h, c4l); \
    c5 = vec_packs(c5h, c5l); \
    c6 = vec_packs(c6h, c6l); \
    c7 = vec_packs(c7h, c7l); \
}

#define identity_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                         c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                         c0, c1, c2, c3, c4, c5, c6, c7) \
{ \
    PACK_8(c0, c1, c2, c3, c4, c5, c6, c7, \
           c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
           c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
    IDENTITY_8(c0, c1, c2, c3) \
    IDENTITY_8(c4, c5, c6, c7) \
}

#define DECLARE_SPLAT_I32(val) \
    i32x4 v##val = vec_splats(val);

#define DECLARE_MUL_PAIR_I32(ca, cb, va, vb) \
    i32x4 ca##va = vec_mul(ca, va); \
    i32x4 cb##vb = vec_mul(cb, vb); \
    i32x4 ca##vb = vec_mul(ca, vb); \
    i32x4 cb##va = vec_mul(cb, va);

#define ADD_SUB_PAIR(r0, r1, ca, cb, va, vb) \
    r0 = vec_adds(ca##va, cb##vb); \
    r1 = vec_subs(ca##vb, cb##va);

#define DECLARE_ADD_SUB_PAIR(r0, r1, ca, cb, va, vb) \
    i32x4 r0, r1; \
    ADD_SUB_PAIR(r0, r1, ca, cb, va, vb)

#define SCALE_ROUND_4(a, b, c, d, rnd, shift) \
    a = vec_adds(a, rnd); \
    b = vec_adds(b, rnd); \
    c = vec_adds(c, rnd); \
    d = vec_adds(d, rnd); \
    a = vec_sra(a, shift); \
    b = vec_sra(b, shift); \
    c = vec_sra(c, shift); \
    d = vec_sra(d, shift);

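/*
 * 8-point inverse ADST: Q12 rotations on the four input pairs, a
 * 1567/3784 (~ 4096*sin/cos(pi/8)) second stage, then 181/256 ~
 * 1/sqrt(2) scalings for the middle outputs, with sign flips on
 * o1/o3/o5/o7.
 */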
#define ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \
                     o0, o1, o2, o3, o4, o5, o6, o7) \
{ \
    DECLARE_SPLAT_I32(4076) \
    DECLARE_SPLAT_I32(401) \
\
    DECLARE_SPLAT_I32(3612) \
    DECLARE_SPLAT_I32(1931) \
\
    DECLARE_SPLAT_I32(2598) \
    DECLARE_SPLAT_I32(3166) \
\
    DECLARE_SPLAT_I32(1189) \
    DECLARE_SPLAT_I32(3920) \
\
    DECLARE_SPLAT_I32(3784) \
    DECLARE_SPLAT_I32(1567) \
\
    DECLARE_SPLAT_I32(2048) \
    u32x4 v12 = vec_splat_u32(12); \
\
    DECLARE_MUL_PAIR_I32(c7, c0, v4076, v401) \
    DECLARE_MUL_PAIR_I32(c5, c2, v3612, v1931) \
    DECLARE_MUL_PAIR_I32(c3, c4, v2598, v3166) \
    DECLARE_MUL_PAIR_I32(c1, c6, v1189, v3920) \
\
    DECLARE_ADD_SUB_PAIR(t0a, t1a, c7, c0, v4076, v401) \
    DECLARE_ADD_SUB_PAIR(t2a, t3a, c5, c2, v3612, v1931) \
    DECLARE_ADD_SUB_PAIR(t4a, t5a, c3, c4, v2598, v3166) \
    DECLARE_ADD_SUB_PAIR(t6a, t7a, c1, c6, v1189, v3920) \
\
    SCALE_ROUND_4(t0a, t1a, t2a, t3a, v2048, v12) \
    SCALE_ROUND_4(t4a, t5a, t6a, t7a, v2048, v12) \
\
    i32x4 t0 = vec_add(t0a, t4a); \
    i32x4 t1 = vec_add(t1a, t5a); \
    i32x4 t2 = vec_add(t2a, t6a); \
    i32x4 t3 = vec_add(t3a, t7a); \
    i32x4 t4 = vec_sub(t0a, t4a); \
    i32x4 t5 = vec_sub(t1a, t5a); \
    i32x4 t6 = vec_sub(t2a, t6a); \
    i32x4 t7 = vec_sub(t3a, t7a); \
\
    i16x8 t0t1 = vec_packs(t0, t1); \
    i16x8 t2t3 = vec_packs(t2, t3); \
    i16x8 t4t5 = vec_packs(t4, t5); \
    i16x8 t6t7 = vec_packs(t6, t7); \
\
    UNPACK_4_I16_I32(t4, t5, t6, t7) \
    UNPACK_4_I16_I32(t0, t1, t2, t3) \
\
    DECLARE_MUL_PAIR_I32(t4, t5, v3784, v1567) \
    DECLARE_MUL_PAIR_I32(t7, t6, v3784, v1567) \
\
    ADD_SUB_PAIR(t4a, t5a, t4, t5, v3784, v1567) \
    ADD_SUB_PAIR(t7a, t6a, t7, t6, v1567, v3784) \
\
    SCALE_ROUND_4(t4a, t5a, t6a, t7a, v2048, v12) \
\
    o0 = vec_add(t0, t2); \
    o1 = vec_add(t4a, t6a); \
    o7 = vec_add(t1, t3); \
    o6 = vec_add(t5a, t7a); \
    t2 = vec_sub(t0, t2); \
    t3 = vec_sub(t1, t3); \
    t6 = vec_sub(t4a, t6a); \
    t7 = vec_sub(t5a, t7a); \
\
    i16x8 o7##o1 = vec_packs(o7, o1); \
    i16x8 o0##o6 = vec_packs(o0, o6); \
    t2t3 = vec_packs(t2, t3); \
    t6t7 = vec_packs(t6, t7); \
\
    UNPACK_4_I16_I32(t2, t3, t6, t7) \
    UNPACK_4_I16_I32(o7, o1, o0, o6) \
\
    o7 = -o7; \
    o1 = -o1; \
\
    o3 = vec_add(t2, t3); \
    o4 = vec_sub(t2, t3); \
    o5 = vec_sub(t6, t7); \
    o2 = vec_add(t6, t7); \
\
    i32x4 v181 = vec_splats(181); \
    i32x4 v128 = vec_splats(128); \
    u32x4 v8 = vec_splat_u32(8); \
\
    o2 = vec_mul(o2, v181); \
    o3 = vec_mul(o3, v181); \
    o4 = vec_mul(o4, v181); \
    o5 = vec_mul(o5, v181); \
\
    SCALE_ROUND_4(o2, o3, o4, o5, v128, v8) \
\
    o3 = -o3; \
    o5 = -o5; \
}

#define adst_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
{ \
    ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \
                 c0, c1, c2, c3, c4, c5, c6, c7) \
    c01 = vec_packs(c0, c1); \
    c23 = vec_packs(c2, c3); \
    c45 = vec_packs(c4, c5); \
    c67 = vec_packs(c6, c7); \
    UNPACK_PAIR_I16_I32(c0, c1, c01) \
    UNPACK_PAIR_I16_I32(c2, c3, c23) \
    UNPACK_PAIR_I16_I32(c4, c5, c45) \
    UNPACK_PAIR_I16_I32(c6, c7, c67) \
}

#define adst_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
{ \
    ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \
                 c0, c1, c2, c3, c4, c5, c6, c7) \
    c01 = vec_packs(c0, c1); \
    c23 = vec_packs(c2, c3); \
    c45 = vec_packs(c4, c5); \
    c67 = vec_packs(c6, c7); \
}

#define adst_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                    c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                    c0, c1, c2, c3, c4, c5, c6, c7) \
{ \
    ADST_INNER_8(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                 c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h) \
    ADST_INNER_8(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                 c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
}

#define adst_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                     c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                     c0, c1, c2, c3, c4, c5, c6, c7) \
{ \
    ADST_INNER_8(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                 c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h) \
    ADST_INNER_8(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                 c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
    PACK_8(c0, c1, c2, c3, c4, c5, c6, c7, \
           c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
           c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
}

#define flipadst_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
{ \
    ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \
                 c7, c6, c5, c4, c3, c2, c1, c0) \
    c01 = vec_packs(c0, c1); \
    c23 = vec_packs(c2, c3); \
    c45 = vec_packs(c4, c5); \
    c67 = vec_packs(c6, c7); \
    UNPACK_PAIR_I16_I32(c0, c1, c01) \
    UNPACK_PAIR_I16_I32(c2, c3, c23) \
    UNPACK_PAIR_I16_I32(c4, c5, c45) \
    UNPACK_PAIR_I16_I32(c6, c7, c67) \
}

#define flipadst_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
{ \
    ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \
                 c7, c6, c5, c4, c3, c2, c1, c0) \
    c01 = vec_packs(c0, c1); \
    c23 = vec_packs(c2, c3); \
    c45 = vec_packs(c4, c5); \
    c67 = vec_packs(c6, c7); \
}

#define flipadst_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                        c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                        c0, c1, c2, c3, c4, c5, c6, c7) \
{ \
    ADST_INNER_8(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                 c7h, c6h, c5h, c4h, c3h, c2h, c1h, c0h) \
    ADST_INNER_8(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                 c7l, c6l, c5l, c4l, c3l, c2l, c1l, c0l) \
}

#define flipadst_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                         c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                         c0, c1, c2, c3, c4, c5, c6, c7) \
{ \
    ADST_INNER_8(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                 c7h, c6h, c5h, c4h, c3h, c2h, c1h, c0h) \
    ADST_INNER_8(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                 c7l, c6l, c5l, c4l, c3l, c2l, c1l, c0l) \
    PACK_8(c0, c1, c2, c3, c4, c5, c6, c7, \
           c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
           c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
}

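/*
 * Rectangular 4x8: rows are pre-scaled by 1/sqrt(2) (rect2), then 4-point
 * row transforms, a transpose, and 8-point column transforms.
 */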
void dav1d_inv_txfm_add_dct_dct_4x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
                                              int16_t *const coeff, const int eob)
{
    i16x8 v = vec_splats((int16_t)(2896*8));

    if (eob < 1) {
        return dc_only_4xN(dst, stride, coeff, 2, 1, 0);
    }

    LOAD_SCALE_COEFF_4x8(coeff, v)

    dct_4_in(c0, c1, c2, c3, c01, c23)
    dct_4_in(c4, c5, c6, c7, c45, c67)

    memset(coeff, 0, sizeof(*coeff) * 4 * 8);

    TRANSPOSE4_I32(c0, c1, c2, c3);
    TRANSPOSE4_I32(c4, c5, c6, c7);

    dct_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67)

    LOAD_DECLARE_4(dst, stride, a, b, cc, d)
    LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, hh)

    APPLY_COEFF_4(a, b, cc, d, c01, c23)
    APPLY_COEFF_4(e, f, g, hh, c45, c67)

    STORE_4(dst, stride, a, b, cc, d)
    STORE_4(dst + 4 * stride, stride, e, f, g, hh)
}

#define inv_txfm_fn4x8(type1, type2) \
void dav1d_inv_txfm_add_##type1##_##type2##_4x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
                                                          int16_t *const coeff, const int eob) \
{ \
    i16x8 v = vec_splats((int16_t)(2896*8)); \
    LOAD_SCALE_COEFF_4x8(coeff, v) \
    type1##_4_in(c0, c1, c2, c3, c01, c23) \
    type1##_4_in(c4, c5, c6, c7, c45, c67) \
    memset(coeff, 0, sizeof(*coeff) * 4 * 8); \
    TRANSPOSE4_I32(c0, c1, c2, c3); \
    TRANSPOSE4_I32(c4, c5, c6, c7); \
    type2##_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
    LOAD_DECLARE_4(dst, stride, a, b, c, d) \
    LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, h) \
    APPLY_COEFF_4(a, b, c, d, c01, c23) \
    APPLY_COEFF_4(e, f, g, h, c45, c67) \
    STORE_4(dst, stride, a, b, c, d) \
    STORE_4(dst + 4 * stride, stride, e, f, g, h) \
}

inv_txfm_fn4x8(adst,     dct     )
inv_txfm_fn4x8(dct,      adst    )
inv_txfm_fn4x8(dct,      flipadst)
inv_txfm_fn4x8(flipadst, dct     )
inv_txfm_fn4x8(adst,     flipadst)
inv_txfm_fn4x8(flipadst, adst    )
inv_txfm_fn4x8(identity, dct     )
inv_txfm_fn4x8(dct,      identity)
inv_txfm_fn4x8(identity, flipadst)
inv_txfm_fn4x8(flipadst, identity)
inv_txfm_fn4x8(identity, adst    )
inv_txfm_fn4x8(adst,     identity)
inv_txfm_fn4x8(identity, identity)
inv_txfm_fn4x8(adst,     adst    )
inv_txfm_fn4x8(flipadst, flipadst)

void dav1d_inv_txfm_add_dct_dct_8x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
                                              int16_t *const coeff, const int eob)
{
    i16x8 v = vec_splats((int16_t)(2896*8));

    if (eob < 1) {
        return dc_only_8xN(dst, stride, coeff, 1, 1, 0);
    }

    LOAD_SCALE_COEFF_8x4(coeff, v)

    dct_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67)

    memset(coeff, 0, sizeof(*coeff) * 8 * 4);

    TRANSPOSE4_I32(c0, c1, c2, c3)
    TRANSPOSE4_I32(c4, c5, c6, c7)

    dct_4_out(c0, c1, c2, c3, c01, c23)
    dct_4_out(c4, c5, c6, c7, c45, c67)

    LOAD_DECLARE_4(dst, stride, ae, bf, cg, dh)

    i16x8 c04 = (i16x8)vec_mergeh((u64x2)c01, (u64x2)c45);
    i16x8 c15 = (i16x8)vec_mergel((u64x2)c01, (u64x2)c45);
    i16x8 c26 = (i16x8)vec_mergeh((u64x2)c23, (u64x2)c67);
    i16x8 c37 = (i16x8)vec_mergel((u64x2)c23, (u64x2)c67);

    APPLY_COEFF_8x4(ae, bf, c04, c15)
    APPLY_COEFF_8x4(cg, dh, c26, c37)

    STORE_8(dst, stride, ae, bf, cg, dh)
}

#define inv_txfm_fn8x4(type1, type2) \
void dav1d_inv_txfm_add_##type1##_##type2##_8x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
                                                          int16_t *const coeff, const int eob) \
{ \
    i16x8 v = vec_splats((int16_t)(2896*8)); \
    LOAD_SCALE_COEFF_8x4(coeff, v) \
    type1##_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
    memset(coeff, 0, sizeof(*coeff) * 8 * 4); \
    TRANSPOSE4_I32(c0, c1, c2, c3) \
    TRANSPOSE4_I32(c4, c5, c6, c7) \
    type2##_4_out(c0, c1, c2, c3, c01, c23) \
    type2##_4_out(c4, c5, c6, c7, c45, c67) \
    LOAD_DECLARE_4(dst, stride, ae, bf, cg, dh) \
    i16x8 c04 = (i16x8)vec_mergeh((u64x2)c01, (u64x2)c45); \
    i16x8 c15 = (i16x8)vec_mergel((u64x2)c01, (u64x2)c45); \
    i16x8 c26 = (i16x8)vec_mergeh((u64x2)c23, (u64x2)c67); \
    i16x8 c37 = (i16x8)vec_mergel((u64x2)c23, (u64x2)c67); \
    APPLY_COEFF_8x4(ae, bf, c04, c15) \
    APPLY_COEFF_8x4(cg, dh, c26, c37) \
    STORE_8(dst, stride, ae, bf, cg, dh) \
}
inv_txfm_fn8x4(adst,     dct     )
inv_txfm_fn8x4(dct,      adst    )
inv_txfm_fn8x4(dct,      flipadst)
inv_txfm_fn8x4(flipadst, dct     )
inv_txfm_fn8x4(adst,     flipadst)
inv_txfm_fn8x4(flipadst, adst    )
inv_txfm_fn8x4(identity, dct     )
inv_txfm_fn8x4(dct,      identity)
inv_txfm_fn8x4(identity, flipadst)
inv_txfm_fn8x4(flipadst, identity)
inv_txfm_fn8x4(identity, adst    )
inv_txfm_fn8x4(adst,     identity)
inv_txfm_fn8x4(identity, identity)
inv_txfm_fn8x4(adst,     adst    )
inv_txfm_fn8x4(flipadst, flipadst)

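/*
 * 8x8: both passes are 8-point; the intermediate SCALE_ROUND_4 by
 * (x + 1) >> 1 implements the inter-pass downshift of 1 for this size.
 */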
void dav1d_inv_txfm_add_dct_dct_8x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
                                              int16_t *const coeff, const int eob)
{
    if (eob < 1) {
        return dc_only_8xN(dst, stride, coeff, 2, 0, 1);
    }

    LOAD_COEFF_8x8(coeff)

    dct_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h,
               c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l,
               c0, c1, c2, c3, c4, c5, c6, c7)

    memset(coeff, 0, sizeof(*coeff) * 8 * 8);

    SCALE_ROUND_4(c0h, c1h, c2h, c3h, vec_splat_s32(1), vec_splat_u32(1))
    SCALE_ROUND_4(c4h, c5h, c6h, c7h, vec_splat_s32(1), vec_splat_u32(1))
    SCALE_ROUND_4(c0l, c1l, c2l, c3l, vec_splat_s32(1), vec_splat_u32(1))
    SCALE_ROUND_4(c4l, c5l, c6l, c7l, vec_splat_s32(1), vec_splat_u32(1))

    TRANSPOSE8_I32(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h,
                   c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l)

    dct_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h,
                c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l,
                c0, c1, c2, c3, c4, c5, c6, c7)

    LOAD_DECLARE_4(dst, stride, a, b, cc, d)
    LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, hh)

    APPLY_COEFF_8x4(a, b, c0, c1)
    APPLY_COEFF_8x4(cc, d, c2, c3)
    APPLY_COEFF_8x4(e, f, c4, c5)
    APPLY_COEFF_8x4(g, hh, c6, c7)

    STORE_8(dst, stride, a, b, cc, d)
    STORE_8(dst + 4 * stride, stride, e, f, g, hh)
}

#define inv_txfm_fn8x8(type1, type2) \
void dav1d_inv_txfm_add_##type1##_##type2##_8x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
                                                          int16_t *const coeff, const int eob) \
{ \
    LOAD_COEFF_8x8(coeff) \
    type1##_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                   c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                   c0, c1, c2, c3, c4, c5, c6, c7) \
    SCALE_ROUND_4(c0h, c1h, c2h, c3h, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(c4h, c5h, c6h, c7h, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(c0l, c1l, c2l, c3l, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(c4l, c5l, c6l, c7l, vec_splat_s32(1), vec_splat_u32(1)) \
    memset(coeff, 0, sizeof(*coeff) * 8 * 8); \
    TRANSPOSE8_I32(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                   c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
    type2##_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                    c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                    c0, c1, c2, c3, c4, c5, c6, c7) \
    LOAD_DECLARE_4(dst, stride, a, b, c, d) \
    LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, h) \
    APPLY_COEFF_8x4(a, b, c0, c1) \
    APPLY_COEFF_8x4(c, d, c2, c3) \
    APPLY_COEFF_8x4(e, f, c4, c5) \
    APPLY_COEFF_8x4(g, h, c6, c7) \
    STORE_8(dst, stride, a, b, c, d) \
    STORE_8(dst + 4 * stride, stride, e, f, g, h) \
}
inv_txfm_fn8x8(adst,     dct     )
inv_txfm_fn8x8(dct,      adst    )
inv_txfm_fn8x8(dct,      flipadst)
inv_txfm_fn8x8(flipadst, dct     )
inv_txfm_fn8x8(adst,     flipadst)
inv_txfm_fn8x8(flipadst, adst    )
inv_txfm_fn8x8(dct,      identity)
inv_txfm_fn8x8(flipadst, identity)
inv_txfm_fn8x8(adst,     identity)
inv_txfm_fn8x8(adst,     adst    )
inv_txfm_fn8x8(flipadst, flipadst)

// identity + scale is a no op: the first-pass identity (x2) and the
// intermediate (x + 1) >> 1 rounding cancel each other, so both are skipped
#define inv_txfm_fn8x8_identity(type2) \
void dav1d_inv_txfm_add_identity_##type2##_8x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
                                                         int16_t *const coeff, const int eob) \
{ \
    LOAD_COEFF_8x8(coeff) \
    memset(coeff, 0, sizeof(*coeff) * 8 * 8); \
    TRANSPOSE8_I32(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                   c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
    type2##_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
                    c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
                    c0, c1, c2, c3, c4, c5, c6, c7) \
    LOAD_DECLARE_4(dst, stride, a, b, c, d) \
    LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, h) \
    APPLY_COEFF_8x4(a, b, c0, c1) \
    APPLY_COEFF_8x4(c, d, c2, c3) \
    APPLY_COEFF_8x4(e, f, c4, c5) \
    APPLY_COEFF_8x4(g, h, c6, c7) \
    STORE_8(dst, stride, a, b, c, d) \
    STORE_8(dst + 4 * stride, stride, e, f, g, h) \
}
inv_txfm_fn8x8_identity(dct     )
inv_txfm_fn8x8_identity(flipadst)
inv_txfm_fn8x8_identity(adst    )
inv_txfm_fn8x8_identity(identity)

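/* Clamp eight i32 vectors to int16 range via a packs/unpack round trip. */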
#define CLIP16_I32_8(a, b, c, d, e, f, g, h, \
                     ab, cd, ef, gh) \
{ \
    ab = vec_packs(a, b); \
    cd = vec_packs(c, d); \
    ef = vec_packs(e, f); \
    gh = vec_packs(g, h); \
    UNPACK_PAIR_I16_I32(a, b, ab) \
    UNPACK_PAIR_I16_I32(c, d, cd) \
    UNPACK_PAIR_I16_I32(e, f, ef) \
    UNPACK_PAIR_I16_I32(g, h, gh) \
}

#define MUL_4_INPLACE(a, b, c, d, v) \
    a = vec_mul(a, v); \
    b = vec_mul(b, v); \
    c = vec_mul(c, v); \
    d = vec_mul(d, v);

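/*
 * 16-point identity transform: a scale by 2*sqrt(2). The i16 form
 * computes 2*x + ((x * 1697*16) >> 15), the i32 form
 * 2*x + ((x * 1697 + 1024) >> 11); both ~ x * 2*sqrt(2).
 */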
#define IDENTITY_16_V(v) \
{ \
    i16x8 v_ = vec_adds(v, v); \
    v = vec_mradds(v, v1697_16, v_); \
}

#define IDENTITY_16_INNER(c00c01, c02c03, c04c05, c06c07, \
                          c08c09, c10c11, c12c13, c14c15) \
{ \
    i16x8 v1697_16 = vec_splats((int16_t)(1697*16)); \
    IDENTITY_16_V(c00c01) \
    IDENTITY_16_V(c02c03) \
    IDENTITY_16_V(c04c05) \
    IDENTITY_16_V(c06c07) \
    IDENTITY_16_V(c08c09) \
    IDENTITY_16_V(c10c11) \
    IDENTITY_16_V(c12c13) \
    IDENTITY_16_V(c14c15) \
}

#define IDENTITY_16_4_I32(a, b, c, d) \
{ \
    i32x4 a2 = vec_add(a, a); \
    i32x4 b2 = vec_add(b, b); \
    i32x4 c2 = vec_add(c, c); \
    i32x4 d2 = vec_add(d, d); \
    MUL_4_INPLACE(a, b, c, d, v1697) \
    SCALE_ROUND_4(a, b, c, d, v1024, vec_splat_u32(11)); \
    a = vec_add(a2, a); \
    b = vec_add(b2, b); \
    c = vec_add(c2, c); \
    d = vec_add(d2, d); \
}

#define identity_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
                       c08, c09, c10, c11, c12, c13, c14, c15, \
                       c00c01, c02c03, c04c05, c06c07, \
                       c08c09, c10c11, c12c13, c14c15) \
{ \
    DECLARE_SPLAT_I32(1697) \
    DECLARE_SPLAT_I32(1024) \
    IDENTITY_16_4_I32(c00, c01, c02, c03) \
    IDENTITY_16_4_I32(c04, c05, c06, c07) \
    IDENTITY_16_4_I32(c08, c09, c10, c11) \
    IDENTITY_16_4_I32(c12, c13, c14, c15) \
}

#define identity_16_out(c00, c01, c02, c03, c04, c05, c06, c07, \
                        c08, c09, c10, c11, c12, c13, c14, c15, \
                        c00c01, c02c03, c04c05, c06c07, \
                        c08c09, c10c11, c12c13, c14c15) \
{ \
    PACK_8(c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15, \
           c00, c02, c04, c06, c08, c10, c12, c14, \
           c01, c03, c05, c07, c09, c11, c13, c15) \
    IDENTITY_16_INNER(c00c01, c02c03, c04c05, c06c07, \
                      c08c09, c10c11, c12c13, c14c15) \
}

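/*
 * 16-point inverse DCT: even coefficients reuse the 8-point DCT; the odd
 * half runs two extra rotation stages, clamping intermediates back to
 * i16 range (CLIP16_I32_8) between stages.
 */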
   1337 #define IDCT_16_INNER(c00, c01, c02, c03, c04, c05, c06, c07, \
   1338                      c08, c09, c10, c11, c12, c13, c14, c15, \
   1339                      c00c03, c01c02, c07c04, c06c05, \
   1340                      c08c11, c09c10, c14c13, c15c12) \
   1341    IDCT_8_INNER(c00, c02, c04, c06, c08, c10, c12, c14, \
   1342                 c00c03, c01c02, c07c04, c06c05) \
   1343    DECLARE_SPLAT_I32(128) \
   1344    DECLARE_SPLAT_I32(181) \
   1345    DECLARE_SPLAT_I32(401) \
   1346    DECLARE_SPLAT_I32(4076) \
   1347    DECLARE_SPLAT_I32(3166) \
   1348    DECLARE_SPLAT_I32(2598) \
   1349    DECLARE_SPLAT_I32(1931) \
   1350    DECLARE_SPLAT_I32(3612) \
   1351    DECLARE_SPLAT_I32(3920) \
   1352    DECLARE_SPLAT_I32(1189) \
   1353    DECLARE_SPLAT_I32(1567) \
   1354    DECLARE_SPLAT_I32(3784) \
   1355 \
   1356    DECLARE_MUL_PAIR_I32(c01, c15,  v401, v4076) \
   1357    DECLARE_MUL_PAIR_I32(c09, c07, v3166, v2598) \
   1358    DECLARE_MUL_PAIR_I32(c05, c11, v1931, v3612) \
   1359    DECLARE_MUL_PAIR_I32(c13, c03, v3920, v1189) \
   1360 \
   1361    DECLARE_ADD_SUB_PAIR(t15a, t08a, c01, c15, v4076,  v401) \
   1362    DECLARE_ADD_SUB_PAIR(t14a, t09a, c09, c07, v2598, v3166) \
   1363    DECLARE_ADD_SUB_PAIR(t13a, t10a, c05, c11, v3612, v1931) \
   1364    DECLARE_ADD_SUB_PAIR(t12a, t11a, c13, c03, v1189, v3920) \
   1365 \
   1366    SCALE_ROUND_4(t15a, t08a, t14a, t09a, v2048, v12) \
   1367    SCALE_ROUND_4(t13a, t10a, t12a, t11a, v2048, v12) \
   1368 \
   1369    CLIP16_I32_8(t15a, t08a, t14a, t09a, \
   1370                 t13a, t10a, t12a, t11a, \
   1371                 c08c11, c09c10, c14c13, c15c12) \
   1372    DECLARE_ADD_SUB_PAIR(t08, t09, t08a, t09a,,) \
   1373    DECLARE_ADD_SUB_PAIR(t11, t10, t11a, t10a,,) \
   1374    DECLARE_ADD_SUB_PAIR(t12, t13, t12a, t13a,,) \
   1375    DECLARE_ADD_SUB_PAIR(t15, t14, t15a, t14a,,) \
   1376 \
   1377    CLIP16_I32_8(t08, t09, t11, t10, \
   1378                 t12, t13, t15, t14, \
   1379                 c08c11, c09c10, c14c13, c15c12) \
   1380 \
   1381    DECLARE_MUL_PAIR_I32(t14, t09, v1567, v3784) \
   1382    DECLARE_MUL_PAIR_I32(t13, t10, v1567, v3784) \
   1383    \
   1384    ADD_SUB_PAIR(t14a, t09a, t14, t09, v3784, v1567) \
   1385    ADD_SUB_PAIR(t10a, t13a, t13, t10, v3784, v1567) \
   1386    t10a = -t10a; \
   1387 \
   1388    SCALE_ROUND_4(t14a, t09a, t13a, t10a, v2048, v12) \
   1389 \
   1390    ADD_SUB_PAIR(t08a, t11a, t08, t11,,) \
   1391    ADD_SUB_PAIR(t09, t10, t09a, t10a,,) \
   1392    ADD_SUB_PAIR(t15a, t12a, t15, t12,,) \
   1393    ADD_SUB_PAIR(t14, t13, t14a, t13a,,) \
   1394 \
   1395    CLIP16_I32_8(t08a, t11a, t09, t10, \
   1396                 t15a, t12a, t14, t13, \
   1397                 c08c11, c09c10, c14c13, c15c12) \
   1398    ADD_SUB_PAIR(t13a, t10a, t13, t10,,); \
   1399    ADD_SUB_PAIR(t12, t11, t12a, t11a,,); \
   1400 \
   1401    MUL_4_INPLACE(t13a, t10a, t12, t11, v181); \
   1402    SCALE_ROUND_4(t13a, t10a, t12, t11, v128, vec_splat_u32(8)) \
   1403 \
   1404    DECLARE_PACK_4(t15at12, t14t13a, t08at11, t09t10a, \
   1405                   t15a, t14, t08a, t09, \
   1406                   t12, t13a, t11,  t10a) \
   1407 \
   1408    c15c12 = vec_subs(c00c03, t15at12); \
   1409    c14c13 = vec_subs(c01c02, t14t13a); \
   1410    c08c11 = vec_subs(c07c04, t08at11); \
   1411    c09c10 = vec_subs(c06c05, t09t10a); \
   1412    c00c03 = vec_adds(c00c03, t15at12); \
   1413    c01c02 = vec_adds(c01c02, t14t13a); \
   1414    c07c04 = vec_adds(c07c04, t08at11); \
   1415    c06c05 = vec_adds(c06c05, t09t10a); \
   1416 
   1417 #define dct_16_out(c00, c01, c02, c03, c04, c05, c06, c07, \
   1418                   c08, c09, c10, c11, c12, c13, c14, c15, \
   1419                   c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
   1420 \
   1421    i16x8 c00c03, c01c02, c07c04, c06c05, c08c11, c09c10, c14c13, c15c12; \
   1422    IDCT_16_INNER(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
   1423                  c00c03, c01c02, c07c04, c06c05, c08c11, c09c10, c14c13, c15c12) \
   1424    c00c01 = (i16x8)vec_mergeh((u64x2)c00c03, (u64x2)c01c02); \
   1425    c02c03 = (i16x8)vec_mergel((u64x2)c01c02, (u64x2)c00c03); \
   1426    c04c05 = (i16x8)vec_mergel((u64x2)c07c04, (u64x2)c06c05); \
   1427    c06c07 = (i16x8)vec_mergeh((u64x2)c06c05, (u64x2)c07c04); \
   1428    c08c09 = (i16x8)vec_mergeh((u64x2)c08c11, (u64x2)c09c10); \
   1429    c10c11 = (i16x8)vec_mergel((u64x2)c09c10, (u64x2)c08c11); \
   1430    c12c13 = (i16x8)vec_mergel((u64x2)c15c12, (u64x2)c14c13); \
   1431    c14c15 = (i16x8)vec_mergeh((u64x2)c14c13, (u64x2)c15c12); \
   1432 
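        /* dct_16_in: the same 16-point inverse DCT, but unpacked back to
         * i32x4 halves so a following pass can keep 32-bit precision. */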
   1433 #define dct_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
   1434                  c08, c09, c10, c11, c12, c13, c14, c15, \
   1435                  c00c03, c01c02, c07c04, c06c05, c08c11, c09c10, c14c13, c15c12) \
   1436 \
   1437    IDCT_16_INNER(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
   1438                  c00c03, c01c02, c07c04, c06c05, c08c11, c09c10, c14c13, c15c12) \
   1439    UNPACK_PAIR_I16_I32(c00, c03, c00c03) \
   1440    UNPACK_PAIR_I16_I32(c01, c02, c01c02) \
   1441    UNPACK_PAIR_I16_I32(c07, c04, c07c04) \
   1442    UNPACK_PAIR_I16_I32(c06, c05, c06c05) \
   1443    UNPACK_PAIR_I16_I32(c08, c11, c08c11) \
   1444    UNPACK_PAIR_I16_I32(c09, c10, c09c10) \
   1445    UNPACK_PAIR_I16_I32(c14, c13, c14c13) \
   1446    UNPACK_PAIR_I16_I32(c15, c12, c15c12) \
   1447 
   1448 
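        /* dct_4x4_in: 4-point inverse DCT applied independently to each
         * of the four 4x4 sub-blocks of a 4x16 / 16x4 coefficient set. */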
   1449 #define dct_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
   1450                   cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
   1451                   a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
   1452    dct_4_in(cA0, cA1, cA2, cA3, a0b0, c0d0) \
   1453    dct_4_in(cB0, cB1, cB2, cB3, a1b1, c1d1) \
   1454    dct_4_in(cC0, cC1, cC2, cC3, a2b2, c2d2) \
   1455    dct_4_in(cD0, cD1, cD2, cD3, a3b3, c3d3)
   1456 
   1457 
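        /* PACK_4x4: saturating-pack the sixteen i32x4 sub-block vectors
         * into eight i16x8 vectors, pairing matching vectors of adjacent
         * sub-blocks so each output covers 8 contiguous samples of a
         * 16-wide row. */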
   1458 #define PACK_4x4(c00, c01, c02, c03, \
   1459                 c04, c05, c06, c07, \
   1460                 c08, c09, c10, c11, \
   1461                 c12, c13, c14, c15, \
   1462                 c00c01, c02c03, c04c05, c06c07, \
   1463                 c08c09, c10c11, c12c13, c14c15) \
   1464 { \
   1465    c00c01 = vec_packs(c00, c04); c02c03 = vec_packs(c08, c12); \
   1466    c04c05 = vec_packs(c01, c05); c06c07 = vec_packs(c09, c13); \
   1467    c08c09 = vec_packs(c02, c06); c10c11 = vec_packs(c10, c14); \
   1468    c12c13 = vec_packs(c03, c07); c14c15 = vec_packs(c11, c15); \
   1469 }
   1470 
   1471 
   1472 
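        /* dct_4x4_out: final-pass variant; runs the 4-point inverse DCT
         * on each sub-block, then packs to i16 via PACK_4x4. */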
   1473 #define dct_4x4_out(c00, c01, c02, c03, \
   1474                    c04, c05, c06, c07, \
   1475                    c08, c09, c10, c11, \
   1476                    c12, c13, c14, c15, \
   1477                    c00c01, c02c03, c04c05, c06c07, \
   1478                    c08c09, c10c11, c12c13, c14c15) \
   1479 { \
   1480    IDCT_4_INNER(c00, c01, c02, c03) \
   1481    IDCT_4_INNER(c04, c05, c06, c07) \
   1482    IDCT_4_INNER(c08, c09, c10, c11) \
   1483    IDCT_4_INNER(c12, c13, c14, c15) \
   1484 \
   1485    PACK_4x4(c00, c01, c02, c03, \
   1486             c04, c05, c06, c07, \
   1487             c08, c09, c10, c11, \
   1488             c12, c13, c14, c15, \
   1489             c00c01, c02c03, c04c05, c06c07, \
   1490             c08c09, c10c11, c12c13, c14c15) \
   1491 }
   1492 
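        /* IDENTITY_4_I32: AV1 4-point identity transform on i32 lanes,
         * i.e. scale by 5793 / 4096 (~sqrt(2)) with rounding:
         * (x * 5793 + 2048) >> 12. */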
   1493 #define IDENTITY_4_I32(a, b, c, d) \
   1494 { \
   1495    DECLARE_SPLAT_I32(5793) \
   1496    DECLARE_SPLAT_I32(2048) \
   1497    MUL_4_INPLACE(a, b, c, d, v5793) \
   1498    SCALE_ROUND_4(a, b, c, d, v2048, vec_splat_u32(12)) \
   1499 }
   1500 
   1501 #define identity_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
   1502                       cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
   1503                       a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
   1504 { \
   1505    IDENTITY_4_I32(cA0, cA1, cA2, cA3) \
   1506    IDENTITY_4_I32(cB0, cB1, cB2, cB3) \
   1507    IDENTITY_4_I32(cC0, cC1, cC2, cC3) \
   1508    IDENTITY_4_I32(cD0, cD1, cD2, cD3) \
   1509 }
   1510 
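        /* identity_4x4_out: for the final pass, pack to i16 first and
         * apply the 16-bit IDENTITY_4 variant (defined earlier in this
         * file) to the packed pairs. */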
   1511 #define identity_4x4_out(c00, c01, c02, c03, \
   1512                         c04, c05, c06, c07, \
   1513                         c08, c09, c10, c11, \
   1514                         c12, c13, c14, c15, \
   1515                         c00c01, c02c03, c04c05, c06c07, \
   1516                         c08c09, c10c11, c12c13, c14c15) \
   1517 { \
   1518    PACK_4x4(c00, c01, c02, c03, \
   1519             c04, c05, c06, c07, \
   1520             c08, c09, c10, c11, \
   1521             c12, c13, c14, c15, \
   1522             c00c01, c02c03, c04c05, c06c07, \
   1523             c08c09, c10c11, c12c13, c14c15) \
   1524    IDENTITY_4(c00c01, c02c03) \
   1525    IDENTITY_4(c04c05, c06c07) \
   1526    IDENTITY_4(c08c09, c10c11) \
   1527    IDENTITY_4(c12c13, c14c15) \
   1528 }
   1529 
   1530 #define adst_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
   1531                    cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
   1532                    a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
   1533    adst_4_in(cA0, cA1, cA2, cA3, a0b0, c0d0) \
   1534    adst_4_in(cB0, cB1, cB2, cB3, a1b1, c1d1) \
   1535    adst_4_in(cC0, cC1, cC2, cC3, a2b2, c2d2) \
   1536    adst_4_in(cD0, cD1, cD2, cD3, a3b3, c3d3)
   1537 
   1538 #define adst_4x4_out(c00, c01, c02, c03, \
   1539                     c04, c05, c06, c07, \
   1540                     c08, c09, c10, c11, \
   1541                     c12, c13, c14, c15, \
   1542                     c00c01, c02c03, c04c05, c06c07, \
   1543                     c08c09, c10c11, c12c13, c14c15) \
   1544 { \
   1545    ADST_INNER_4(c00, c01, c02, c03, c00, c01, c02, c03) \
   1546    ADST_INNER_4(c04, c05, c06, c07, c04, c05, c06, c07) \
   1547    ADST_INNER_4(c08, c09, c10, c11, c08, c09, c10, c11) \
   1548    ADST_INNER_4(c12, c13, c14, c15, c12, c13, c14, c15) \
   1549 \
   1550    PACK_4x4(c00, c01, c02, c03, \
   1551             c04, c05, c06, c07, \
   1552             c08, c09, c10, c11, \
   1553             c12, c13, c14, c15, \
   1554             c00c01, c02c03, c04c05, c06c07, \
   1555             c08c09, c10c11, c12c13, c14c15) \
   1556 }
   1557 
   1558 #define flipadst_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
   1559                        cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
   1560                        a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
   1561    flipadst_4_in(cA0, cA1, cA2, cA3, a0b0, c0d0) \
   1562    flipadst_4_in(cB0, cB1, cB2, cB3, a1b1, c1d1) \
   1563    flipadst_4_in(cC0, cC1, cC2, cC3, a2b2, c2d2) \
   1564    flipadst_4_in(cD0, cD1, cD2, cD3, a3b3, c3d3)
   1565 
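        /* flipadst_4x4_out: identical to adst_4x4_out except that the
         * output operands of ADST_INNER_4 are listed in reverse order,
         * which produces the flipped variant without extra shuffles. */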
   1566 #define flipadst_4x4_out(c00, c01, c02, c03, \
   1567                         c04, c05, c06, c07, \
   1568                         c08, c09, c10, c11, \
   1569                         c12, c13, c14, c15, \
   1570                         c00c01, c02c03, c04c05, c06c07, \
   1571                         c08c09, c10c11, c12c13, c14c15) \
   1572 { \
   1573    ADST_INNER_4(c00, c01, c02, c03, c03, c02, c01, c00) \
   1574    ADST_INNER_4(c04, c05, c06, c07, c07, c06, c05, c04) \
   1575    ADST_INNER_4(c08, c09, c10, c11, c11, c10, c09, c08) \
   1576    ADST_INNER_4(c12, c13, c14, c15, c15, c14, c13, c12) \
   1577 \
   1578    PACK_4x4(c00, c01, c02, c03, \
   1579             c04, c05, c06, c07, \
   1580             c08, c09, c10, c11, \
   1581             c12, c13, c14, c15, \
   1582             c00c01, c02c03, c04c05, c06c07, \
   1583             c08c09, c10c11, c12c13, c14c15) \
   1584 }
   1585 
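        /* ADST_INNER_16: 16-point inverse ADST. The splatted Q12
         * constants (201/4091, 995/3973, ...) are the sin/cos pairs used
         * by the AV1 reference inv_adst16, and as in the reference the
         * odd-indexed outputs are negated at the end. Callers choose the
         * output order (o00..o15) to get the plain or flipped variant. */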
   1586 #define ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, \
   1587                      c08, c09, c10, c11, c12, c13, c14, c15, \
   1588                      o00, o01, o02, o03, o04, o05, o06, o07, \
   1589                      o08, o09, o10, o11, o12, o13, o14, o15, \
   1590                      c00c01, c02c03, c04c05, c06c07) \
   1591    DECLARE_SPLAT_I32(2048) \
   1592    u32x4 v12 = vec_splat_u32(12); \
   1593    DECLARE_SPLAT_I32(4091) \
   1594    DECLARE_SPLAT_I32(201) \
   1595    DECLARE_SPLAT_I32(3973) \
   1596    DECLARE_SPLAT_I32(995) \
   1597    DECLARE_SPLAT_I32(3703) \
   1598    DECLARE_SPLAT_I32(1751) \
   1599    DECLARE_SPLAT_I32(3290) \
   1600    DECLARE_SPLAT_I32(2440) \
   1601    DECLARE_SPLAT_I32(2751) \
   1602    DECLARE_SPLAT_I32(3035) \
   1603    DECLARE_SPLAT_I32(2106) \
   1604    DECLARE_SPLAT_I32(3513) \
   1605    DECLARE_SPLAT_I32(1380) \
   1606    DECLARE_SPLAT_I32(3857) \
   1607    DECLARE_SPLAT_I32(601) \
   1608    DECLARE_SPLAT_I32(4052) \
   1609 \
   1610    DECLARE_MUL_PAIR_I32(c15, c00, v4091, v201) \
   1611    DECLARE_MUL_PAIR_I32(c13, c02, v3973, v995) \
   1612    DECLARE_MUL_PAIR_I32(c11, c04, v3703, v1751) \
   1613    DECLARE_MUL_PAIR_I32(c09, c06, v3290, v2440) \
   1614    DECLARE_MUL_PAIR_I32(c07, c08, v2751, v3035) \
   1615    DECLARE_MUL_PAIR_I32(c05, c10, v2106, v3513) \
   1616    DECLARE_MUL_PAIR_I32(c03, c12, v1380, v3857) \
   1617    DECLARE_MUL_PAIR_I32(c01, c14,  v601, v4052) \
   1618 \
   1619    DECLARE_ADD_SUB_PAIR(t00, t01, c15, c00, v4091, v201) \
   1620    DECLARE_ADD_SUB_PAIR(t02, t03, c13, c02, v3973, v995) \
   1621    DECLARE_ADD_SUB_PAIR(t04, t05, c11, c04, v3703, v1751) \
   1622    DECLARE_ADD_SUB_PAIR(t06, t07, c09, c06, v3290, v2440) \
   1623    DECLARE_ADD_SUB_PAIR(t08, t09, c07, c08, v2751, v3035) \
   1624    DECLARE_ADD_SUB_PAIR(t10, t11, c05, c10, v2106, v3513) \
   1625    DECLARE_ADD_SUB_PAIR(t12, t13, c03, c12, v1380, v3857) \
   1626    DECLARE_ADD_SUB_PAIR(t14, t15, c01, c14,  v601, v4052) \
   1627 \
   1628    SCALE_ROUND_4(t00, t01, t02, t03, v2048, v12) \
   1629    SCALE_ROUND_4(t04, t05, t06, t07, v2048, v12) \
   1630    SCALE_ROUND_4(t08, t09, t10, t11, v2048, v12) \
   1631    SCALE_ROUND_4(t12, t13, t14, t15, v2048, v12) \
   1632 \
   1633    DECLARE_ADD_SUB_PAIR(t00a, t08a, t00, t08,,) \
   1634    DECLARE_ADD_SUB_PAIR(t01a, t09a, t01, t09,,) \
   1635    DECLARE_ADD_SUB_PAIR(t02a, t10a, t02, t10,,) \
   1636    DECLARE_ADD_SUB_PAIR(t03a, t11a, t03, t11,,) \
   1637    DECLARE_ADD_SUB_PAIR(t04a, t12a, t04, t12,,) \
   1638    DECLARE_ADD_SUB_PAIR(t05a, t13a, t05, t13,,) \
   1639    DECLARE_ADD_SUB_PAIR(t06a, t14a, t06, t14,,) \
   1640    DECLARE_ADD_SUB_PAIR(t07a, t15a, t07, t15,,) \
   1641 \
   1642    CLIP16_I32_8(t00a, t08a, t01a, t09a, t02a, t10a, t03a, t11a, \
   1643                 c00c01, c02c03, c04c05, c06c07); \
   1644    CLIP16_I32_8(t04a, t12a, t05a, t13a, t06a, t14a, t07a, t15a, \
   1645                 c00c01, c02c03, c04c05, c06c07); \
   1646 \
   1647    DECLARE_SPLAT_I32(4017) \
   1648    DECLARE_SPLAT_I32(799) \
   1649    DECLARE_SPLAT_I32(2276) \
   1650    DECLARE_SPLAT_I32(3406) \
   1651 \
   1652    DECLARE_MUL_PAIR_I32(t08a, t09a, v4017,  v799); \
   1653    DECLARE_MUL_PAIR_I32(t10a, t11a, v2276, v3406); \
   1654    DECLARE_MUL_PAIR_I32(t13a, t12a,  v799, v4017); \
   1655    DECLARE_MUL_PAIR_I32(t15a, t14a, v3406, v2276); \
   1656 \
   1657    ADD_SUB_PAIR(t08, t09, t08a, t09a, v4017,  v799); \
   1658    ADD_SUB_PAIR(t10, t11, t10a, t11a, v2276, v3406); \
   1659    ADD_SUB_PAIR(t13, t12, t13a, t12a,  v799, v4017); \
   1660    ADD_SUB_PAIR(t15, t14, t15a, t14a, v3406, v2276); \
   1661 \
   1662    SCALE_ROUND_4(t08, t09, t10, t11, v2048, v12) \
   1663    SCALE_ROUND_4(t13, t12, t15, t14, v2048, v12) \
   1664 \
   1665    ADD_SUB_PAIR(t00, t04, t00a, t04a,,); \
   1666    ADD_SUB_PAIR(t01, t05, t01a, t05a,,); \
   1667    ADD_SUB_PAIR(t02, t06, t02a, t06a,,); \
   1668    ADD_SUB_PAIR(t03, t07, t03a, t07a,,); \
   1669    ADD_SUB_PAIR(t08a, t12a, t08, t12,,); \
   1670    ADD_SUB_PAIR(t09a, t13a, t09, t13,,); \
   1671    ADD_SUB_PAIR(t10a, t14a, t10, t14,,); \
   1672    ADD_SUB_PAIR(t11a, t15a, t11, t15,,); \
   1673 \
   1674    CLIP16_I32_8(t00, t04, t01, t05, t02, t06, t03, t07, \
   1675                 c00c01, c02c03, c04c05, c06c07) \
   1676    CLIP16_I32_8(t08a, t12a, t09a, t13a, t10a, t14a, t11a, t15a, \
   1677                 c00c01, c02c03, c04c05, c06c07) \
   1678 \
   1679    DECLARE_SPLAT_I32(3784) \
   1680    DECLARE_SPLAT_I32(1567) \
   1681 \
   1682    DECLARE_MUL_PAIR_I32(t04, t05, v3784, v1567) \
   1683    DECLARE_MUL_PAIR_I32(t07, t06, v1567, v3784) \
   1684    DECLARE_MUL_PAIR_I32(t12a, t13a, v3784, v1567) \
   1685    DECLARE_MUL_PAIR_I32(t15a, t14a, v1567, v3784) \
   1686 \
   1687    ADD_SUB_PAIR(t04a, t05a, t04, t05, v3784, v1567) \
   1688    ADD_SUB_PAIR(t07a, t06a, t07, t06, v1567, v3784) \
   1689    ADD_SUB_PAIR(t12, t13, t12a, t13a, v3784, v1567) \
   1690    ADD_SUB_PAIR(t15, t14, t15a, t14a, v1567, v3784) \
   1691 \
   1692    SCALE_ROUND_4(t04a, t05a, t07a, t06a, v2048, v12) \
   1693    SCALE_ROUND_4(t12, t13, t15, t14, v2048, v12) \
   1694 \
   1695    ADD_SUB_PAIR(o00, t02a, t00,  t02,,) \
   1696    ADD_SUB_PAIR(o15, t03a, t01,  t03,,) \
   1697    ADD_SUB_PAIR(o03, t06,  t04a, t06a,,) \
   1698    ADD_SUB_PAIR(o12, t07,  t05a, t07a,,) \
   1699    ADD_SUB_PAIR(o01, t10,  t08a, t10a,,) \
   1700    ADD_SUB_PAIR(o14, t11,  t09a, t11a,,) \
   1701    ADD_SUB_PAIR(o02, t14a, t12,  t14,,) \
   1702    ADD_SUB_PAIR(o13, t15a, t13,  t15,,) \
   1703 \
   1704    CLIP16_I32_8(o00, t02a, o15, t03a, o03, t06, o12, t07, \
   1705                 c00c01, c02c03, c04c05, c06c07) \
   1706    CLIP16_I32_8(o01, t10, o14, t11, o02, t14a, o13, t15a, \
   1707                 c00c01, c02c03, c04c05, c06c07) \
   1708 \
   1709    DECLARE_SPLAT_I32(181) \
   1710    DECLARE_SPLAT_I32(128) \
   1711    u32x4 v8 = vec_splat_u32(8); \
   1712 \
   1713    ADD_SUB_PAIR(o07, o08, t02a, t03a,,) \
   1714    ADD_SUB_PAIR(o04, o11, t06,  t07,,) \
   1715    ADD_SUB_PAIR(o06, o09, t10,  t11,,) \
   1716    ADD_SUB_PAIR(o05, o10, t14a, t15a,,) \
   1717 \
   1718    MUL_4_INPLACE(o07, o08, o04, o11, v181) \
   1719    MUL_4_INPLACE(o06, o09, o05, o10, v181) \
   1720 \
   1721    SCALE_ROUND_4(o07, o08, o04, o11, v128, v8) \
   1722    SCALE_ROUND_4(o06, o09, o05, o10, v128, v8) \
   1723 \
   1724    o01 = -o01; \
   1725    o03 = -o03; \
   1726    o05 = -o05; \
   1727    o07 = -o07; \
   1728    o09 = -o09; \
   1729    o11 = -o11; \
   1730    o13 = -o13; \
   1731    o15 = -o15; \
   1732 
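        /* adst_16_in / adst_16_out wrap ADST_INNER_16 with identity
         * output ordering: _in leaves the results in i32 lanes for a
         * following pass, _out packs them to i16 pairs via PACK_8. */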
   1733 #define adst_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
   1734                   c08, c09, c10, c11, c12, c13, c14, c15, \
   1735                   c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
   1736 { \
   1737    ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
   1738                  c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
   1739                  c00c01, c02c03, c04c05, c06c07) \
   1740 }
   1741 
   1742 #define adst_16_out(c00, c01, c02, c03, c04, c05, c06, c07, \
   1743                    c08, c09, c10, c11, c12, c13, c14, c15, \
   1744                    c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
   1745 { \
   1746    ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
   1747                  c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
   1748                  c00c01, c02c03, c04c05, c06c07) \
   1749    PACK_8(c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15, \
   1750           c00, c02, c04, c06, c08, c10, c12, c14, \
   1751           c01, c03, c05, c07, c09, c11, c13, c15) \
   1752 }
   1753 
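        /* flipadst_16_in / _out feed ADST_INNER_16 the output list in
         * reverse order (c15 .. c00) to obtain the flipped transform. */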
   1754 #define flipadst_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
   1755                       c08, c09, c10, c11, c12, c13, c14, c15, \
   1756                       c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
   1757 { \
   1758    ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
   1759                  c15, c14, c13, c12, c11, c10, c09, c08, c07, c06, c05, c04, c03, c02, c01, c00, \
   1760                  c00c01, c02c03, c04c05, c06c07) \
   1761 }
   1762 
   1763 #define flipadst_16_out(c00, c01, c02, c03, c04, c05, c06, c07, \
   1764                        c08, c09, c10, c11, c12, c13, c14, c15, \
   1765                        c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
   1766 { \
   1767    ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
   1768                  c15, c14, c13, c12, c11, c10, c09, c08, c07, c06, c05, c04, c03, c02, c01, c00, \
   1769                  c00c01, c02c03, c04c05, c06c07) \
   1770    PACK_8(c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15, \
   1771           c00, c02, c04, c06, c08, c10, c12, c14, \
   1772           c01, c03, c05, c07, c09, c11, c13, c15) \
   1773 }
   1774 
   1775 
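        /* 4x16 inverse DCT-DCT: DC-only fast path when eob < 1;
         * otherwise a 4-point row pass, clearing of the consumed
         * coefficients, intermediate rounding by (x + 1) >> 1, a 4x16
         * transpose, and a 16-point column pass whose output is added
         * to dst. */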
   1776 void dav1d_inv_txfm_add_dct_dct_4x16_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
   1777                                               int16_t *const coeff, const int eob
   1778                                               HIGHBD_DECL_SUFFIX)
   1779 {
   1780    if (eob < 1) {
   1781        return dc_only_4xN(dst, stride, coeff, 4, 0, 1);
   1782    }
   1783 
   1784    LOAD_COEFF_4x16(coeff)
   1785 
   1786    dct_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3,
   1787               cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3,
   1788               a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3)
   1789 
   1790    memset(coeff, 0, sizeof(*coeff) * 4 * 16);
   1791 
   1792    SCALE_ROUND_4(cA0, cB0, cC0, cD0, vec_splat_s32(1), vec_splat_u32(1))
   1793    SCALE_ROUND_4(cA1, cB1, cC1, cD1, vec_splat_s32(1), vec_splat_u32(1))
   1794    SCALE_ROUND_4(cA2, cB2, cC2, cD2, vec_splat_s32(1), vec_splat_u32(1))
   1795    SCALE_ROUND_4(cA3, cB3, cC3, cD3, vec_splat_s32(1), vec_splat_u32(1))
   1796    TRANSPOSE4x16_I32(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3,
   1797                      cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3)
   1798 
   1799    dct_16_out(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3,
   1800               cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3,
   1801               a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3)
   1802 
   1803    LOAD_DECLARE_4(dst, stride, l00, l01, l02, l03)
   1804    LOAD_DECLARE_4(dst + 4 * stride, stride, l04, l05, l06, l07)
   1805    LOAD_DECLARE_4(dst + 8 * stride, stride, l08, l09, l10, l11)
   1806    LOAD_DECLARE_4(dst + 12 * stride, stride, l12, l13, l14, l15)
   1807 
   1808    APPLY_COEFF_4(l00, l01, l02, l03, a0b0, c0d0);
   1809    APPLY_COEFF_4(l04, l05, l06, l07, a1b1, c1d1);
   1810    APPLY_COEFF_4(l08, l09, l10, l11, a2b2, c2d2);
   1811    APPLY_COEFF_4(l12, l13, l14, l15, a3b3, c3d3);
   1812 
   1813    STORE_4(dst, stride,               l00, l01, l02, l03);
   1814    STORE_4(dst + 4 * stride, stride,  l04, l05, l06, l07);
   1815    STORE_4(dst + 8 * stride, stride,  l08, l09, l10, l11);
   1816    STORE_4(dst + 12 * stride, stride, l12, l13, l14, l15);
   1817 }
   1818 
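        /* Instantiate the remaining 4x16 combinations. These take no
         * DC-only shortcut (eob goes unused) and otherwise follow the
         * same two-pass structure as the dct_dct version above. */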
   1819 #define inv_txfm_fn4x16(type1, type2) \
   1820 void dav1d_inv_txfm_add_##type1##_##type2##_4x16_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
   1821                                                          int16_t *const coeff, const int eob) \
   1822 { \
   1823    LOAD_COEFF_4x16(coeff) \
   1824    type1##_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
   1825                   cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
   1826                   a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
   1827    memset(coeff, 0, sizeof(*coeff) * 4 * 16); \
   1828    SCALE_ROUND_4(cA0, cB0, cC0, cD0, vec_splat_s32(1), vec_splat_u32(1)) \
   1829    SCALE_ROUND_4(cA1, cB1, cC1, cD1, vec_splat_s32(1), vec_splat_u32(1)) \
   1830    SCALE_ROUND_4(cA2, cB2, cC2, cD2, vec_splat_s32(1), vec_splat_u32(1)) \
   1831    SCALE_ROUND_4(cA3, cB3, cC3, cD3, vec_splat_s32(1), vec_splat_u32(1)) \
   1832    TRANSPOSE4x16_I32(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
   1833                      cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3) \
   1834    type2##_16_out(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
   1835                   cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
   1836                   a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
   1837    LOAD_DECLARE_4(dst, stride, l00, l01, l02, l03) \
   1838    LOAD_DECLARE_4(dst + 4 * stride, stride, l04, l05, l06, l07) \
   1839    LOAD_DECLARE_4(dst + 8 * stride, stride, l08, l09, l10, l11) \
   1840    LOAD_DECLARE_4(dst + 12 * stride, stride, l12, l13, l14, l15) \
   1841    APPLY_COEFF_4(l00, l01, l02, l03, a0b0, c0d0); \
   1842    APPLY_COEFF_4(l04, l05, l06, l07, a1b1, c1d1); \
   1843    APPLY_COEFF_4(l08, l09, l10, l11, a2b2, c2d2); \
   1844    APPLY_COEFF_4(l12, l13, l14, l15, a3b3, c3d3); \
   1845    STORE_4(dst, stride,               l00, l01, l02, l03); \
   1846    STORE_4(dst + 4 * stride, stride,  l04, l05, l06, l07); \
   1847    STORE_4(dst + 8 * stride, stride,  l08, l09, l10, l11); \
   1848    STORE_4(dst + 12 * stride, stride, l12, l13, l14, l15); \
   1849 }
   1850 inv_txfm_fn4x16(adst,     dct     )
   1851 inv_txfm_fn4x16(dct,      adst    )
   1852 inv_txfm_fn4x16(dct,      flipadst)
   1853 inv_txfm_fn4x16(flipadst, dct     )
   1854 inv_txfm_fn4x16(adst,     flipadst)
   1855 inv_txfm_fn4x16(flipadst, adst    )
   1856 inv_txfm_fn4x16(identity, dct     )
   1857 inv_txfm_fn4x16(dct,      identity)
   1858 inv_txfm_fn4x16(identity, flipadst)
   1859 inv_txfm_fn4x16(flipadst, identity)
   1860 inv_txfm_fn4x16(identity, adst    )
   1861 inv_txfm_fn4x16(adst,     identity)
   1862 inv_txfm_fn4x16(identity, identity)
   1863 inv_txfm_fn4x16(adst,     adst    )
   1864 inv_txfm_fn4x16(flipadst, flipadst)
   1865 
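        /* 16x4 inverse DCT-DCT: the mirror of the 4x16 case. The
         * 16-point pass runs on the unpacked i32 coefficients, then
         * TRANSPOSE4_I32 rotates each 4x4 sub-block and a 4-point pass
         * finishes before the add to dst. */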
   1866 void dav1d_inv_txfm_add_dct_dct_16x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
   1867                                               int16_t *const coeff, const int eob)
   1868 {
   1869 
   1870    if (eob < 1) {
   1871        return dc_only_16xN(dst, stride, coeff, 1, 0, 1);
   1872    }
   1873 
   1874    LOAD_DECLARE_2_I16(coeff, c00c01, c02c03)
   1875    LOAD_DECLARE_2_I16(coeff+16, c04c05, c06c07)
   1876    LOAD_DECLARE_2_I16(coeff+32, c08c09, c10c11)
   1877    LOAD_DECLARE_2_I16(coeff+48, c12c13, c14c15)
   1878    UNPACK_DECLARE_4_I16_I32(c00c01, c02c03, c00, c01, c02, c03)
   1879    UNPACK_DECLARE_4_I16_I32(c04c05, c06c07, c04, c05, c06, c07)
   1880    UNPACK_DECLARE_4_I16_I32(c08c09, c10c11, c08, c09, c10, c11)
   1881    UNPACK_DECLARE_4_I16_I32(c12c13, c14c15, c12, c13, c14, c15)
   1882 
   1883    dct_16_in(c00, c01, c02, c03, c04, c05, c06, c07,
   1884              c08, c09, c10, c11, c12, c13, c14, c15,
   1885              c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15)
   1886    memset(coeff, 0, sizeof(*coeff) * 16 * 4);
   1887    SCALE_ROUND_4(c00, c01, c02, c03, vec_splat_s32(1), vec_splat_u32(1))
   1888    SCALE_ROUND_4(c04, c05, c06, c07, vec_splat_s32(1), vec_splat_u32(1))
   1889    SCALE_ROUND_4(c08, c09, c10, c11, vec_splat_s32(1), vec_splat_u32(1))
   1890    SCALE_ROUND_4(c12, c13, c14, c15, vec_splat_s32(1), vec_splat_u32(1))
   1891 
   1892    TRANSPOSE4_I32(c00, c01, c02, c03);
   1893    TRANSPOSE4_I32(c04, c05, c06, c07);
   1894    TRANSPOSE4_I32(c08, c09, c10, c11);
   1895    TRANSPOSE4_I32(c12, c13, c14, c15);
   1896 
   1897    dct_4x4_out(c00, c01, c02, c03,
   1898                c04, c05, c06, c07,
   1899                c08, c09, c10, c11,
   1900                c12, c13, c14, c15,
   1901                c00c01, c02c03, c04c05, c06c07,
   1902                c08c09, c10c11, c12c13, c14c15)
   1903 
   1904    LOAD_DECLARE_4(dst, stride, l0, l1, l2, l3)
   1905 
   1906    APPLY_COEFF_16x4(l0, l1, l2, l3,
   1907                     c00c01, c02c03, c04c05, c06c07,
   1908                     c08c09, c10c11, c12c13, c14c15)
   1909 
   1910    STORE_16(dst, stride, l0, l1, l2, l3)
   1911 }
   1912 
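        /* Instantiate the 16x4 combinations whose first pass keeps the
         * intermediates within the 16-bit range on its own; identity as
         * a first transform (except identity_identity, which saturates
         * when packing) is handled by the variant below. */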
   1913 #define inv_txfm_fn16x4(type1, type2) \
   1914 void dav1d_inv_txfm_add_##type1##_##type2##_16x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
   1915                                                          int16_t *const coeff, const int eob) \
   1916 { \
   1917    LOAD_DECLARE_2_I16(coeff, c00c01, c02c03) \
   1918    LOAD_DECLARE_2_I16(coeff+16, c04c05, c06c07) \
   1919    LOAD_DECLARE_2_I16(coeff+32, c08c09, c10c11) \
   1920    LOAD_DECLARE_2_I16(coeff+48, c12c13, c14c15) \
   1921    UNPACK_DECLARE_4_I16_I32(c00c01, c02c03, c00, c01, c02, c03) \
   1922    UNPACK_DECLARE_4_I16_I32(c04c05, c06c07, c04, c05, c06, c07) \
   1923    UNPACK_DECLARE_4_I16_I32(c08c09, c10c11, c08, c09, c10, c11) \
   1924    UNPACK_DECLARE_4_I16_I32(c12c13, c14c15, c12, c13, c14, c15) \
   1925    type1##_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
   1926                  c08, c09, c10, c11, c12, c13, c14, c15, \
   1927                  c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
   1928    memset(coeff, 0, sizeof(*coeff) * 16 * 4); \
   1929    SCALE_ROUND_4(c00, c01, c02, c03, vec_splat_s32(1), vec_splat_u32(1)) \
   1930    SCALE_ROUND_4(c04, c05, c06, c07, vec_splat_s32(1), vec_splat_u32(1)) \
   1931    SCALE_ROUND_4(c08, c09, c10, c11, vec_splat_s32(1), vec_splat_u32(1)) \
   1932    SCALE_ROUND_4(c12, c13, c14, c15, vec_splat_s32(1), vec_splat_u32(1)) \
   1933    TRANSPOSE4_I32(c00, c01, c02, c03); \
   1934    TRANSPOSE4_I32(c04, c05, c06, c07); \
   1935    TRANSPOSE4_I32(c08, c09, c10, c11); \
   1936    TRANSPOSE4_I32(c12, c13, c14, c15); \
   1937    type2##_4x4_out(c00, c01, c02, c03, \
   1938                    c04, c05, c06, c07, \
   1939                    c08, c09, c10, c11, \
   1940                    c12, c13, c14, c15, \
   1941                    c00c01, c02c03, c04c05, c06c07, \
   1942                    c08c09, c10c11, c12c13, c14c15); \
   1943    LOAD_DECLARE_4(dst, stride, l0, l1, l2, l3) \
   1944    APPLY_COEFF_16x4(l0, l1, l2, l3, \
   1945                     c00c01, c02c03, c04c05, c06c07, \
   1946                     c08c09, c10c11, c12c13, c14c15) \
   1947    STORE_16(dst, stride, l0, l1, l2, l3) \
   1948 }
   1949 
   1950 inv_txfm_fn16x4(adst,     dct     )
   1951 inv_txfm_fn16x4(dct,      adst    )
   1952 inv_txfm_fn16x4(dct,      flipadst)
   1953 inv_txfm_fn16x4(flipadst, dct     )
   1954 inv_txfm_fn16x4(adst,     flipadst)
   1955 inv_txfm_fn16x4(flipadst, adst    )
   1956 inv_txfm_fn16x4(dct,      identity)
   1957 inv_txfm_fn16x4(flipadst, identity)
   1958 inv_txfm_fn16x4(adst,     identity)
   1959 inv_txfm_fn16x4(identity, identity)
   1960 inv_txfm_fn16x4(adst,     adst    )
   1961 inv_txfm_fn16x4(flipadst, flipadst)
   1962 
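        /* identity_16_in does no internal clipping, so these variants
         * clip the intermediates to the 16-bit range explicitly before
         * the transpose; the added CLIP16_I32_8 pair is the only
         * difference from inv_txfm_fn16x4. */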
   1963 #define inv_txfm_fn16x4_identity(type2) \
   1964 void dav1d_inv_txfm_add_identity_##type2##_16x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
   1965                                                          int16_t *const coeff, const int eob) \
   1966 { \
   1967    LOAD_DECLARE_2_I16(coeff, c00c01, c02c03) \
   1968    LOAD_DECLARE_2_I16(coeff+16, c04c05, c06c07) \
   1969    LOAD_DECLARE_2_I16(coeff+32, c08c09, c10c11) \
   1970    LOAD_DECLARE_2_I16(coeff+48, c12c13, c14c15) \
   1971    UNPACK_DECLARE_4_I16_I32(c00c01, c02c03, c00, c01, c02, c03) \
   1972    UNPACK_DECLARE_4_I16_I32(c04c05, c06c07, c04, c05, c06, c07) \
   1973    UNPACK_DECLARE_4_I16_I32(c08c09, c10c11, c08, c09, c10, c11) \
   1974    UNPACK_DECLARE_4_I16_I32(c12c13, c14c15, c12, c13, c14, c15) \
   1975    identity_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
   1976                  c08, c09, c10, c11, c12, c13, c14, c15, \
   1977                  c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
   1978    memset(coeff, 0, sizeof(*coeff) * 16 * 4); \
   1979    SCALE_ROUND_4(c00, c01, c02, c03, vec_splat_s32(1), vec_splat_u32(1)) \
   1980    SCALE_ROUND_4(c04, c05, c06, c07, vec_splat_s32(1), vec_splat_u32(1)) \
   1981    SCALE_ROUND_4(c08, c09, c10, c11, vec_splat_s32(1), vec_splat_u32(1)) \
   1982    SCALE_ROUND_4(c12, c13, c14, c15, vec_splat_s32(1), vec_splat_u32(1)) \
   1983    CLIP16_I32_8(c00, c01, c02, c03, c04, c05, c06, c07, c00c01, c02c03, c04c05, c06c07) \
   1984    CLIP16_I32_8(c08, c09, c10, c11, c12, c13, c14, c15, c08c09, c10c11, c12c13, c14c15) \
   1985    TRANSPOSE4_I32(c00, c01, c02, c03); \
   1986    TRANSPOSE4_I32(c04, c05, c06, c07); \
   1987    TRANSPOSE4_I32(c08, c09, c10, c11); \
   1988    TRANSPOSE4_I32(c12, c13, c14, c15); \
   1989    type2##_4x4_out(c00, c01, c02, c03, \
   1990                    c04, c05, c06, c07, \
   1991                    c08, c09, c10, c11, \
   1992                    c12, c13, c14, c15, \
   1993                    c00c01, c02c03, c04c05, c06c07, \
   1994                    c08c09, c10c11, c12c13, c14c15); \
   1995    LOAD_DECLARE_4(dst, stride, l0, l1, l2, l3) \
   1996    APPLY_COEFF_16x4(l0, l1, l2, l3, \
   1997                     c00c01, c02c03, c04c05, c06c07, \
   1998                     c08c09, c10c11, c12c13, c14c15) \
   1999    STORE_16(dst, stride, l0, l1, l2, l3) \
   2000 }
   2001 
   2002 inv_txfm_fn16x4_identity(dct)
   2003 inv_txfm_fn16x4_identity(adst)
   2004 inv_txfm_fn16x4_identity(flipadst)
   2005 
   2006 #endif // BITDEPTH