dct-inl.h (8167B)
1 // Copyright (c) the JPEG XL Project Authors. All rights reserved. 2 // 3 // Use of this source code is governed by a BSD-style 4 // license that can be found in the LICENSE file. 5 6 #if defined(LIB_JPEGLI_DCT_INL_H_) == defined(HWY_TARGET_TOGGLE) 7 #ifdef LIB_JPEGLI_DCT_INL_H_ 8 #undef LIB_JPEGLI_DCT_INL_H_ 9 #else 10 #define LIB_JPEGLI_DCT_INL_H_ 11 #endif 12 13 #include "lib/jpegli/transpose-inl.h" 14 #include "lib/jxl/base/compiler_specific.h" 15 16 HWY_BEFORE_NAMESPACE(); 17 namespace jpegli { 18 namespace HWY_NAMESPACE { 19 namespace { 20 21 // These templates are not found via ADL. 22 using hwy::HWY_NAMESPACE::Abs; 23 using hwy::HWY_NAMESPACE::Add; 24 using hwy::HWY_NAMESPACE::DemoteTo; 25 using hwy::HWY_NAMESPACE::Ge; 26 using hwy::HWY_NAMESPACE::IfThenElseZero; 27 using hwy::HWY_NAMESPACE::Mul; 28 using hwy::HWY_NAMESPACE::MulAdd; 29 using hwy::HWY_NAMESPACE::Rebind; 30 using hwy::HWY_NAMESPACE::Round; 31 using hwy::HWY_NAMESPACE::Sub; 32 using hwy::HWY_NAMESPACE::Vec; 33 34 using D = HWY_FULL(float); 35 using DI = HWY_FULL(int32_t); 36 37 template <size_t N> 38 void AddReverse(const float* JXL_RESTRICT a_in1, 39 const float* JXL_RESTRICT a_in2, float* JXL_RESTRICT a_out) { 40 HWY_CAPPED(float, 8) d8; 41 for (size_t i = 0; i < N; i++) { 42 auto in1 = Load(d8, a_in1 + i * 8); 43 auto in2 = Load(d8, a_in2 + (N - i - 1) * 8); 44 Store(Add(in1, in2), d8, a_out + i * 8); 45 } 46 } 47 48 template <size_t N> 49 void SubReverse(const float* JXL_RESTRICT a_in1, 50 const float* JXL_RESTRICT a_in2, float* JXL_RESTRICT a_out) { 51 HWY_CAPPED(float, 8) d8; 52 for (size_t i = 0; i < N; i++) { 53 auto in1 = Load(d8, a_in1 + i * 8); 54 auto in2 = Load(d8, a_in2 + (N - i - 1) * 8); 55 Store(Sub(in1, in2), d8, a_out + i * 8); 56 } 57 } 58 59 template <size_t N> 60 void B(float* JXL_RESTRICT coeff) { 61 HWY_CAPPED(float, 8) d8; 62 constexpr float kSqrt2 = 1.41421356237f; 63 auto sqrt2 = Set(d8, kSqrt2); 64 auto in1 = Load(d8, coeff); 65 auto in2 = Load(d8, coeff + 8); 66 Store(MulAdd(in1, sqrt2, in2), d8, coeff); 67 for (size_t i = 1; i + 1 < N; i++) { 68 auto in1 = Load(d8, coeff + i * 8); 69 auto in2 = Load(d8, coeff + (i + 1) * 8); 70 Store(Add(in1, in2), d8, coeff + i * 8); 71 } 72 } 73 74 // Ideally optimized away by compiler (except the multiply). 75 template <size_t N> 76 void InverseEvenOdd(const float* JXL_RESTRICT a_in, float* JXL_RESTRICT a_out) { 77 HWY_CAPPED(float, 8) d8; 78 for (size_t i = 0; i < N / 2; i++) { 79 auto in1 = Load(d8, a_in + i * 8); 80 Store(in1, d8, a_out + 2 * i * 8); 81 } 82 for (size_t i = N / 2; i < N; i++) { 83 auto in1 = Load(d8, a_in + i * 8); 84 Store(in1, d8, a_out + (2 * (i - N / 2) + 1) * 8); 85 } 86 } 87 88 // Constants for DCT implementation. Generated by the following snippet: 89 // for i in range(N // 2): 90 // print(1.0 / (2 * math.cos((i + 0.5) * math.pi / N)), end=", ") 91 template <size_t N> 92 struct WcMultipliers; 93 94 template <> 95 struct WcMultipliers<4> { 96 static constexpr float kMultipliers[] = { 97 0.541196100146197, 98 1.3065629648763764, 99 }; 100 }; 101 102 template <> 103 struct WcMultipliers<8> { 104 static constexpr float kMultipliers[] = { 105 0.5097955791041592, 106 0.6013448869350453, 107 0.8999762231364156, 108 2.5629154477415055, 109 }; 110 }; 111 112 #if JXL_CXX_LANG < JXL_CXX_17 113 constexpr float WcMultipliers<4>::kMultipliers[]; 114 constexpr float WcMultipliers<8>::kMultipliers[]; 115 #endif 116 117 // Invoked on full vector. 118 template <size_t N> 119 void Multiply(float* JXL_RESTRICT coeff) { 120 HWY_CAPPED(float, 8) d8; 121 for (size_t i = 0; i < N / 2; i++) { 122 auto in1 = Load(d8, coeff + (N / 2 + i) * 8); 123 auto mul = Set(d8, WcMultipliers<N>::kMultipliers[i]); 124 Store(Mul(in1, mul), d8, coeff + (N / 2 + i) * 8); 125 } 126 } 127 128 void LoadFromBlock(const float* JXL_RESTRICT pixels, size_t pixels_stride, 129 size_t off, float* JXL_RESTRICT coeff) { 130 HWY_CAPPED(float, 8) d8; 131 for (size_t i = 0; i < 8; i++) { 132 Store(LoadU(d8, pixels + i * pixels_stride + off), d8, coeff + i * 8); 133 } 134 } 135 136 void StoreToBlockAndScale(const float* JXL_RESTRICT coeff, float* output, 137 size_t off) { 138 HWY_CAPPED(float, 8) d8; 139 auto mul = Set(d8, 1.0f / 8); 140 for (size_t i = 0; i < 8; i++) { 141 StoreU(Mul(mul, Load(d8, coeff + i * 8)), d8, output + i * 8 + off); 142 } 143 } 144 145 template <size_t N> 146 struct DCT1DImpl; 147 148 template <> 149 struct DCT1DImpl<1> { 150 JXL_INLINE void operator()(float* JXL_RESTRICT mem) {} 151 }; 152 153 template <> 154 struct DCT1DImpl<2> { 155 JXL_INLINE void operator()(float* JXL_RESTRICT mem) { 156 HWY_CAPPED(float, 8) d8; 157 auto in1 = Load(d8, mem); 158 auto in2 = Load(d8, mem + 8); 159 Store(Add(in1, in2), d8, mem); 160 Store(Sub(in1, in2), d8, mem + 8); 161 } 162 }; 163 164 template <size_t N> 165 struct DCT1DImpl { 166 void operator()(float* JXL_RESTRICT mem) { 167 HWY_ALIGN float tmp[N * 8]; 168 AddReverse<N / 2>(mem, mem + N * 4, tmp); 169 DCT1DImpl<N / 2>()(tmp); 170 SubReverse<N / 2>(mem, mem + N * 4, tmp + N * 4); 171 Multiply<N>(tmp); 172 DCT1DImpl<N / 2>()(tmp + N * 4); 173 B<N / 2>(tmp + N * 4); 174 InverseEvenOdd<N>(tmp, mem); 175 } 176 }; 177 178 void DCT1D(const float* JXL_RESTRICT pixels, size_t pixels_stride, 179 float* JXL_RESTRICT output) { 180 HWY_CAPPED(float, 8) d8; 181 HWY_ALIGN float tmp[64]; 182 for (size_t i = 0; i < 8; i += Lanes(d8)) { 183 // TODO(veluca): consider removing the temporary memory here (as is done in 184 // IDCT), if it turns out that some compilers don't optimize away the loads 185 // and this is performance-critical. 186 LoadFromBlock(pixels, pixels_stride, i, tmp); 187 DCT1DImpl<8>()(tmp); 188 StoreToBlockAndScale(tmp, output, i); 189 } 190 } 191 192 JXL_INLINE JXL_MAYBE_UNUSED void TransformFromPixels( 193 const float* JXL_RESTRICT pixels, size_t pixels_stride, 194 float* JXL_RESTRICT coefficients, float* JXL_RESTRICT scratch_space) { 195 DCT1D(pixels, pixels_stride, scratch_space); 196 Transpose8x8Block(scratch_space, coefficients); 197 DCT1D(coefficients, 8, scratch_space); 198 Transpose8x8Block(scratch_space, coefficients); 199 } 200 201 JXL_INLINE JXL_MAYBE_UNUSED void StoreQuantizedValue(const Vec<DI>& ival, 202 int16_t* out) { 203 Rebind<int16_t, DI> di16; 204 Store(DemoteTo(di16, ival), di16, out); 205 } 206 207 JXL_INLINE JXL_MAYBE_UNUSED void StoreQuantizedValue(const Vec<DI>& ival, 208 int32_t* out) { 209 DI di; 210 Store(ival, di, out); 211 } 212 213 template <typename T> 214 void QuantizeBlock(const float* dct, const float* qmc, float aq_strength, 215 const float* zero_bias_offset, const float* zero_bias_mul, 216 T* block) { 217 D d; 218 DI di; 219 const auto aq_mul = Set(d, aq_strength); 220 for (size_t k = 0; k < DCTSIZE2; k += Lanes(d)) { 221 const auto val = Load(d, dct + k); 222 const auto q = Load(d, qmc + k); 223 const auto qval = Mul(val, q); 224 const auto zb_offset = Load(d, zero_bias_offset + k); 225 const auto zb_mul = Load(d, zero_bias_mul + k); 226 const auto threshold = Add(zb_offset, Mul(zb_mul, aq_mul)); 227 const auto nzero_mask = Ge(Abs(qval), threshold); 228 const auto ival = ConvertTo(di, IfThenElseZero(nzero_mask, Round(qval))); 229 StoreQuantizedValue(ival, block + k); 230 } 231 } 232 233 template <typename T> 234 void ComputeCoefficientBlock(const float* JXL_RESTRICT pixels, size_t stride, 235 const float* JXL_RESTRICT qmc, 236 int16_t last_dc_coeff, float aq_strength, 237 const float* zero_bias_offset, 238 const float* zero_bias_mul, 239 float* JXL_RESTRICT tmp, T* block) { 240 float* JXL_RESTRICT dct = tmp; 241 float* JXL_RESTRICT scratch_space = tmp + DCTSIZE2; 242 TransformFromPixels(pixels, stride, dct, scratch_space); 243 QuantizeBlock(dct, qmc, aq_strength, zero_bias_offset, zero_bias_mul, block); 244 // Center DC values around zero. 245 static constexpr float kDCBias = 128.0f; 246 const float dc = (dct[0] - kDCBias) * qmc[0]; 247 float dc_threshold = zero_bias_offset[0] + aq_strength * zero_bias_mul[0]; 248 if (std::abs(dc - last_dc_coeff) < dc_threshold) { 249 block[0] = last_dc_coeff; 250 } else { 251 block[0] = std::round(dc); 252 } 253 } 254 255 // NOLINTNEXTLINE(google-readability-namespace-comments) 256 } // namespace 257 } // namespace HWY_NAMESPACE 258 } // namespace jpegli 259 HWY_AFTER_NAMESPACE(); 260 #endif // LIB_JPEGLI_DCT_INL_H_