itx_tmpl.c (11401B)
1 /* 2 * Copyright © 2018-2019, VideoLAN and dav1d authors 3 * Copyright © 2018-2019, Two Orioles, LLC 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright notice, this 10 * list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright notice, 13 * this list of conditions and the following disclaimer in the documentation 14 * and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 #include "config.h" 29 30 #include <stddef.h> 31 #include <stdint.h> 32 #include <stdlib.h> 33 #include <string.h> 34 35 #include "common/attributes.h" 36 #include "common/intops.h" 37 38 #include "src/itx.h" 39 #include "src/itx_1d.h" 40 #include "src/scan.h" 41 #include "src/tables.h" 42 43 static NOINLINE void 44 inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff, 45 const int eob, const /*enum RectTxfmSize*/ int tx, const int shift, 46 const enum TxfmType txtp HIGHBD_DECL_SUFFIX) 47 { 48 const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx]; 49 const int w = 4 * t_dim->w, h = 4 * t_dim->h; 50 const int has_dconly = txtp == DCT_DCT; 51 assert(w >= 4 && w <= 64); 52 assert(h >= 4 && h <= 64); 53 assert(eob >= 0); 54 55 const int is_rect2 = w * 2 == h || h * 2 == w; 56 const int rnd = (1 << shift) >> 1; 57 58 if (eob < has_dconly) { 59 int dc = coeff[0]; 60 coeff[0] = 0; 61 if (is_rect2) 62 dc = (dc * 181 + 128) >> 8; 63 dc = (dc * 181 + 128) >> 8; 64 dc = (dc + rnd) >> shift; 65 dc = (dc * 181 + 128 + 2048) >> 12; 66 for (int y = 0; y < h; y++, dst += PXSTRIDE(stride)) 67 for (int x = 0; x < w; x++) 68 dst[x] = iclip_pixel(dst[x] + dc); 69 return; 70 } 71 72 const uint8_t *const txtps = dav1d_tx1d_types[txtp]; 73 const itx_1d_fn first_1d_fn = dav1d_tx1d_fns[t_dim->lw][txtps[0]]; 74 const itx_1d_fn second_1d_fn = dav1d_tx1d_fns[t_dim->lh][txtps[1]]; 75 const int sh = imin(h, 32), sw = imin(w, 32); 76 #if BITDEPTH == 8 77 const int row_clip_min = INT16_MIN; 78 const int col_clip_min = INT16_MIN; 79 #else 80 const int row_clip_min = (int) ((unsigned) ~bitdepth_max << 7); 81 const int col_clip_min = (int) ((unsigned) ~bitdepth_max << 5); 82 #endif 83 const int row_clip_max = ~row_clip_min; 84 const int col_clip_max = ~col_clip_min; 85 86 int32_t tmp[64 * 64], *c = tmp; 87 int last_nonzero_col; // in first 1d itx 88 if (txtps[1] == IDENTITY && txtps[0] != IDENTITY) { 89 last_nonzero_col = imin(sh - 1, eob); 90 } else if (txtps[0] == IDENTITY && txtps[1] != IDENTITY) { 91 last_nonzero_col = eob >> (t_dim->lw + 2); 92 } else { 93 last_nonzero_col = dav1d_last_nonzero_col_from_eob[tx][eob]; 94 } 95 assert(last_nonzero_col < sh); 96 for (int y = 0; y <= last_nonzero_col; y++, c += w) { 97 if (is_rect2) 98 for (int x = 0; x < sw; x++) 99 c[x] = (coeff[y + x * sh] * 181 + 128) >> 8; 100 else 101 for (int x = 0; x < sw; x++) 102 c[x] = coeff[y + x * sh]; 103 first_1d_fn(c, 1, row_clip_min, row_clip_max); 104 } 105 if (last_nonzero_col + 1 < sh) 106 memset(c, 0, sizeof(*c) * (sh - last_nonzero_col - 1) * w); 107 108 memset(coeff, 0, sizeof(*coeff) * sw * sh); 109 for (int i = 0; i < w * sh; i++) 110 tmp[i] = iclip((tmp[i] + rnd) >> shift, col_clip_min, col_clip_max); 111 112 for (int x = 0; x < w; x++) 113 second_1d_fn(&tmp[x], w, col_clip_min, col_clip_max); 114 115 c = tmp; 116 for (int y = 0; y < h; y++, dst += PXSTRIDE(stride)) 117 for (int x = 0; x < w; x++) 118 dst[x] = iclip_pixel(dst[x] + ((*c++ + 8) >> 4)); 119 } 120 121 #define inv_txfm_fn(type1, type2, type, pfx, w, h, shift) \ 122 static void \ 123 inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \ 124 const ptrdiff_t stride, \ 125 coef *const coeff, \ 126 const int eob \ 127 HIGHBD_DECL_SUFFIX) \ 128 { \ 129 inv_txfm_add_c(dst, stride, coeff, eob, pfx##TX_##w##X##h, shift, type \ 130 HIGHBD_TAIL_SUFFIX); \ 131 } 132 133 #define inv_txfm_fn64(pfx, w, h, shift) \ 134 inv_txfm_fn(dct, dct, DCT_DCT, pfx, w, h, shift) 135 136 #define inv_txfm_fn32(pfx, w, h, shift) \ 137 inv_txfm_fn64(pfx, w, h, shift) \ 138 inv_txfm_fn(identity, identity, IDTX, pfx, w, h, shift) 139 140 #define inv_txfm_fn16(pfx, w, h, shift) \ 141 inv_txfm_fn32(pfx, w, h, shift) \ 142 inv_txfm_fn(adst, dct, ADST_DCT, pfx, w, h, shift) \ 143 inv_txfm_fn(dct, adst, DCT_ADST, pfx, w, h, shift) \ 144 inv_txfm_fn(adst, adst, ADST_ADST, pfx, w, h, shift) \ 145 inv_txfm_fn(dct, flipadst, DCT_FLIPADST, pfx, w, h, shift) \ 146 inv_txfm_fn(flipadst, dct, FLIPADST_DCT, pfx, w, h, shift) \ 147 inv_txfm_fn(adst, flipadst, ADST_FLIPADST, pfx, w, h, shift) \ 148 inv_txfm_fn(flipadst, adst, FLIPADST_ADST, pfx, w, h, shift) \ 149 inv_txfm_fn(flipadst, flipadst, FLIPADST_FLIPADST, pfx, w, h, shift) \ 150 inv_txfm_fn(identity, dct, H_DCT, pfx, w, h, shift) \ 151 inv_txfm_fn(dct, identity, V_DCT, pfx, w, h, shift) \ 152 153 #define inv_txfm_fn84(pfx, w, h, shift) \ 154 inv_txfm_fn16(pfx, w, h, shift) \ 155 inv_txfm_fn(identity, flipadst, H_FLIPADST, pfx, w, h, shift) \ 156 inv_txfm_fn(flipadst, identity, V_FLIPADST, pfx, w, h, shift) \ 157 inv_txfm_fn(identity, adst, H_ADST, pfx, w, h, shift) \ 158 inv_txfm_fn(adst, identity, V_ADST, pfx, w, h, shift) \ 159 160 inv_txfm_fn84( , 4, 4, 0) 161 inv_txfm_fn84(R, 4, 8, 0) 162 inv_txfm_fn84(R, 4, 16, 1) 163 inv_txfm_fn84(R, 8, 4, 0) 164 inv_txfm_fn84( , 8, 8, 1) 165 inv_txfm_fn84(R, 8, 16, 1) 166 inv_txfm_fn32(R, 8, 32, 2) 167 inv_txfm_fn84(R, 16, 4, 1) 168 inv_txfm_fn84(R, 16, 8, 1) 169 inv_txfm_fn16( , 16, 16, 2) 170 inv_txfm_fn32(R, 16, 32, 1) 171 inv_txfm_fn64(R, 16, 64, 2) 172 inv_txfm_fn32(R, 32, 8, 2) 173 inv_txfm_fn32(R, 32, 16, 1) 174 inv_txfm_fn32( , 32, 32, 2) 175 inv_txfm_fn64(R, 32, 64, 1) 176 inv_txfm_fn64(R, 64, 16, 2) 177 inv_txfm_fn64(R, 64, 32, 1) 178 inv_txfm_fn64( , 64, 64, 2) 179 180 #if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \ 181 ARCH_AARCH64 || \ 182 (ARCH_ARM && (defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32))) \ 183 )) 184 static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride, 185 coef *const coeff, const int eob 186 HIGHBD_DECL_SUFFIX) 187 { 188 int32_t tmp[4 * 4], *c = tmp; 189 for (int y = 0; y < 4; y++, c += 4) { 190 for (int x = 0; x < 4; x++) 191 c[x] = coeff[y + x * 4] >> 2; 192 dav1d_inv_wht4_1d_c(c, 1); 193 } 194 memset(coeff, 0, sizeof(*coeff) * 4 * 4); 195 196 for (int x = 0; x < 4; x++) 197 dav1d_inv_wht4_1d_c(&tmp[x], 4); 198 199 c = tmp; 200 for (int y = 0; y < 4; y++, dst += PXSTRIDE(stride)) 201 for (int x = 0; x < 4; x++) 202 dst[x] = iclip_pixel(dst[x] + *c++); 203 } 204 #endif 205 206 #if HAVE_ASM 207 #if ARCH_AARCH64 || ARCH_ARM 208 #include "src/arm/itx.h" 209 #elif ARCH_LOONGARCH64 210 #include "src/loongarch/itx.h" 211 #elif ARCH_PPC64LE 212 #include "src/ppc/itx.h" 213 #elif ARCH_RISCV 214 #include "src/riscv/itx.h" 215 #elif ARCH_X86 216 #include "src/x86/itx.h" 217 #endif 218 #endif 219 220 COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) { 221 #define assign_itx_all_fn64(w, h, pfx) \ 222 c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT ] = \ 223 inv_txfm_add_dct_dct_##w##x##h##_c 224 225 #define assign_itx_all_fn32(w, h, pfx) \ 226 assign_itx_all_fn64(w, h, pfx); \ 227 c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \ 228 inv_txfm_add_identity_identity_##w##x##h##_c 229 230 #define assign_itx_all_fn16(w, h, pfx) \ 231 assign_itx_all_fn32(w, h, pfx); \ 232 c->itxfm_add[pfx##TX_##w##X##h][DCT_ADST ] = \ 233 inv_txfm_add_adst_dct_##w##x##h##_c; \ 234 c->itxfm_add[pfx##TX_##w##X##h][ADST_DCT ] = \ 235 inv_txfm_add_dct_adst_##w##x##h##_c; \ 236 c->itxfm_add[pfx##TX_##w##X##h][ADST_ADST] = \ 237 inv_txfm_add_adst_adst_##w##x##h##_c; \ 238 c->itxfm_add[pfx##TX_##w##X##h][ADST_FLIPADST] = \ 239 inv_txfm_add_flipadst_adst_##w##x##h##_c; \ 240 c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_ADST] = \ 241 inv_txfm_add_adst_flipadst_##w##x##h##_c; \ 242 c->itxfm_add[pfx##TX_##w##X##h][DCT_FLIPADST] = \ 243 inv_txfm_add_flipadst_dct_##w##x##h##_c; \ 244 c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_DCT] = \ 245 inv_txfm_add_dct_flipadst_##w##x##h##_c; \ 246 c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_FLIPADST] = \ 247 inv_txfm_add_flipadst_flipadst_##w##x##h##_c; \ 248 c->itxfm_add[pfx##TX_##w##X##h][H_DCT] = \ 249 inv_txfm_add_dct_identity_##w##x##h##_c; \ 250 c->itxfm_add[pfx##TX_##w##X##h][V_DCT] = \ 251 inv_txfm_add_identity_dct_##w##x##h##_c 252 253 #define assign_itx_all_fn84(w, h, pfx) \ 254 assign_itx_all_fn16(w, h, pfx); \ 255 c->itxfm_add[pfx##TX_##w##X##h][H_FLIPADST] = \ 256 inv_txfm_add_flipadst_identity_##w##x##h##_c; \ 257 c->itxfm_add[pfx##TX_##w##X##h][V_FLIPADST] = \ 258 inv_txfm_add_identity_flipadst_##w##x##h##_c; \ 259 c->itxfm_add[pfx##TX_##w##X##h][H_ADST] = \ 260 inv_txfm_add_adst_identity_##w##x##h##_c; \ 261 c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \ 262 inv_txfm_add_identity_adst_##w##x##h##_c; \ 263 264 #if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \ 265 ARCH_AARCH64 || \ 266 (ARCH_ARM && (defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32))) \ 267 )) 268 c->itxfm_add[TX_4X4][WHT_WHT] = inv_txfm_add_wht_wht_4x4_c; 269 #endif 270 assign_itx_all_fn84( 4, 4, ); 271 assign_itx_all_fn84( 4, 8, R); 272 assign_itx_all_fn84( 4, 16, R); 273 assign_itx_all_fn84( 8, 4, R); 274 assign_itx_all_fn84( 8, 8, ); 275 assign_itx_all_fn84( 8, 16, R); 276 assign_itx_all_fn32( 8, 32, R); 277 assign_itx_all_fn84(16, 4, R); 278 assign_itx_all_fn84(16, 8, R); 279 assign_itx_all_fn16(16, 16, ); 280 assign_itx_all_fn32(16, 32, R); 281 assign_itx_all_fn64(16, 64, R); 282 assign_itx_all_fn32(32, 8, R); 283 assign_itx_all_fn32(32, 16, R); 284 assign_itx_all_fn32(32, 32, ); 285 assign_itx_all_fn64(32, 64, R); 286 assign_itx_all_fn64(64, 16, R); 287 assign_itx_all_fn64(64, 32, R); 288 assign_itx_all_fn64(64, 64, ); 289 290 int all_simd = 0; 291 #if HAVE_ASM 292 #if ARCH_AARCH64 || ARCH_ARM 293 itx_dsp_init_arm(c, bpc, &all_simd); 294 #endif 295 #if ARCH_LOONGARCH64 296 itx_dsp_init_loongarch(c, bpc); 297 #endif 298 #if ARCH_PPC64LE 299 itx_dsp_init_ppc(c, bpc); 300 #endif 301 #if ARCH_RISCV 302 itx_dsp_init_riscv(c, bpc); 303 #endif 304 #if ARCH_X86 305 itx_dsp_init_x86(c, bpc, &all_simd); 306 #endif 307 #endif 308 309 if (!all_simd) 310 dav1d_init_last_nonzero_col_from_eob_tables(); 311 }