[ tor-browser ].git.dasho

input.cc (16428B)
      1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 
      6 #include "lib/jpegli/input.h"
      7 
      8 #undef HWY_TARGET_INCLUDE
      9 #define HWY_TARGET_INCLUDE "lib/jpegli/input.cc"
     10 #include <hwy/foreach_target.h>
     11 #include <hwy/highway.h>
     12 
     13 #include "lib/jpegli/encode_internal.h"
     14 #include "lib/jpegli/error.h"
     15 #include "lib/jxl/base/byte_order.h"
     16 #include "lib/jxl/base/compiler_specific.h"
     17 
     18 HWY_BEFORE_NAMESPACE();
     19 namespace jpegli {
     20 namespace HWY_NAMESPACE {
     21 
     22 using hwy::HWY_NAMESPACE::Mul;
     23 using hwy::HWY_NAMESPACE::Rebind;
     24 using hwy::HWY_NAMESPACE::Vec;
     25 
     26 using D = HWY_FULL(float);
     27 using DU = HWY_FULL(uint32_t);
     28 using DU8 = Rebind<uint8_t, D>;
     29 using DU16 = Rebind<uint16_t, D>;
     30 
     31 constexpr D d;
     32 constexpr DU du;
     33 constexpr DU8 du8;
     34 constexpr DU16 du16;
     35 
     // Scale factor applied to 16-bit samples by the readers below. Since
     // 65535 * (1 / 257) == 255, the full uint16_t range maps onto 0..255.
     36 static constexpr double kMul16 = 1.0 / 257.0;
     // Scale factor applied to float samples by the readers below.
     // NOTE(review): presumably float input is nominally in [0, 1] so that the
     // result lands in [0, 255] -- confirm against the public API contract.
     37 static constexpr double kMulFloat = 255.0;
     38 
     39 template <size_t C>
     40 void ReadUint8Row(const uint8_t* row_in, size_t x0, size_t len,
     41                  float* row_out[kMaxComponents]) {
     42  for (size_t x = x0; x < len; ++x) {
     43    for (size_t c = 0; c < C; ++c) {
     44      row_out[c][x] = row_in[C * x + c];
     45    }
     46  }
     47 }
     48 
     49 template <size_t C, bool swap_endianness = false>
     50 void ReadUint16Row(const uint8_t* row_in, size_t x0, size_t len,
     51                   float* row_out[kMaxComponents]) {
     52  const uint16_t* row16 = reinterpret_cast<const uint16_t*>(row_in);
     53  for (size_t x = x0; x < len; ++x) {
     54    for (size_t c = 0; c < C; ++c) {
     55      uint16_t val = row16[C * x + c];
     56      if (swap_endianness) val = JXL_BSWAP16(val);
     57      row_out[c][x] = val * kMul16;
     58    }
     59  }
     60 }
     61 
     62 template <size_t C, bool swap_endianness = false>
     63 void ReadFloatRow(const uint8_t* row_in, size_t x0, size_t len,
     64                  float* row_out[kMaxComponents]) {
     65  const float* rowf = reinterpret_cast<const float*>(row_in);
     66  for (size_t x = x0; x < len; ++x) {
     67    for (size_t c = 0; c < C; ++c) {
     68      float val = rowf[C * x + c];
     69      if (swap_endianness) val = BSwapFloat(val);
     70      row_out[c][x] = val * kMulFloat;
     71    }
     72  }
     73 }
     74 
     75 void ReadUint8RowSingle(const uint8_t* row_in, size_t len,
     76                        float* row_out[kMaxComponents]) {
     77  const size_t N = Lanes(d);
     78  const size_t simd_len = len & (~(N - 1));
     79  float* JXL_RESTRICT const row0 = row_out[0];
     80  for (size_t x = 0; x < simd_len; x += N) {
     81    Store(ConvertTo(d, PromoteTo(du, LoadU(du8, row_in + x))), d, row0 + x);
     82  }
     83  ReadUint8Row<1>(row_in, simd_len, len, row_out);
     84 }
     85 
     86 void ReadUint8RowInterleaved2(const uint8_t* row_in, size_t len,
     87                              float* row_out[kMaxComponents]) {
     88  const size_t N = Lanes(d);
     89  const size_t simd_len = len & (~(N - 1));
     90  float* JXL_RESTRICT const row0 = row_out[0];
     91  float* JXL_RESTRICT const row1 = row_out[1];
     92  Vec<DU8> out0, out1;  // NOLINT
     93  for (size_t x = 0; x < simd_len; x += N) {
     94    LoadInterleaved2(du8, row_in + 2 * x, out0, out1);
     95    Store(ConvertTo(d, PromoteTo(du, out0)), d, row0 + x);
     96    Store(ConvertTo(d, PromoteTo(du, out1)), d, row1 + x);
     97  }
     98  ReadUint8Row<2>(row_in, simd_len, len, row_out);
     99 }
    100 
    101 void ReadUint8RowInterleaved3(const uint8_t* row_in, size_t len,
    102                              float* row_out[kMaxComponents]) {
    103  const size_t N = Lanes(d);
    104  const size_t simd_len = len & (~(N - 1));
    105  float* JXL_RESTRICT const row0 = row_out[0];
    106  float* JXL_RESTRICT const row1 = row_out[1];
    107  float* JXL_RESTRICT const row2 = row_out[2];
    108  Vec<DU8> out0, out1, out2;  // NOLINT
    109  for (size_t x = 0; x < simd_len; x += N) {
    110    LoadInterleaved3(du8, row_in + 3 * x, out0, out1, out2);
    111    Store(ConvertTo(d, PromoteTo(du, out0)), d, row0 + x);
    112    Store(ConvertTo(d, PromoteTo(du, out1)), d, row1 + x);
    113    Store(ConvertTo(d, PromoteTo(du, out2)), d, row2 + x);
    114  }
    115  ReadUint8Row<3>(row_in, simd_len, len, row_out);
    116 }
    117 
    118 void ReadUint8RowInterleaved4(const uint8_t* row_in, size_t len,
    119                              float* row_out[kMaxComponents]) {
    120  const size_t N = Lanes(d);
    121  const size_t simd_len = len & (~(N - 1));
    122  float* JXL_RESTRICT const row0 = row_out[0];
    123  float* JXL_RESTRICT const row1 = row_out[1];
    124  float* JXL_RESTRICT const row2 = row_out[2];
    125  float* JXL_RESTRICT const row3 = row_out[3];
    126  Vec<DU8> out0, out1, out2, out3;  // NOLINT
    127  for (size_t x = 0; x < simd_len; x += N) {
    128    LoadInterleaved4(du8, row_in + 4 * x, out0, out1, out2, out3);
    129    Store(ConvertTo(d, PromoteTo(du, out0)), d, row0 + x);
    130    Store(ConvertTo(d, PromoteTo(du, out1)), d, row1 + x);
    131    Store(ConvertTo(d, PromoteTo(du, out2)), d, row2 + x);
    132    Store(ConvertTo(d, PromoteTo(du, out3)), d, row3 + x);
    133  }
    134  ReadUint8Row<4>(row_in, simd_len, len, row_out);
    135 }
    136 
    137 void ReadUint16RowSingle(const uint8_t* row_in, size_t len,
    138                         float* row_out[kMaxComponents]) {
    139  const size_t N = Lanes(d);
    140  const size_t simd_len = len & (~(N - 1));
    141  const auto mul = Set(d, kMul16);
    142  const uint16_t* JXL_RESTRICT const row =
    143      reinterpret_cast<const uint16_t*>(row_in);
    144  float* JXL_RESTRICT const row0 = row_out[0];
    145  for (size_t x = 0; x < simd_len; x += N) {
    146    Store(Mul(mul, ConvertTo(d, PromoteTo(du, LoadU(du16, row + x)))), d,
    147          row0 + x);
    148  }
    149  ReadUint16Row<1>(row_in, simd_len, len, row_out);
    150 }
    151 
    152 void ReadUint16RowInterleaved2(const uint8_t* row_in, size_t len,
    153                               float* row_out[kMaxComponents]) {
    154  const size_t N = Lanes(d);
    155  const size_t simd_len = len & (~(N - 1));
    156  const auto mul = Set(d, kMul16);
    157  const uint16_t* JXL_RESTRICT const row =
    158      reinterpret_cast<const uint16_t*>(row_in);
    159  float* JXL_RESTRICT const row0 = row_out[0];
    160  float* JXL_RESTRICT const row1 = row_out[1];
    161  Vec<DU16> out0, out1;  // NOLINT
    162  for (size_t x = 0; x < simd_len; x += N) {
    163    LoadInterleaved2(du16, row + 2 * x, out0, out1);
    164    Store(Mul(mul, ConvertTo(d, PromoteTo(du, out0))), d, row0 + x);
    165    Store(Mul(mul, ConvertTo(d, PromoteTo(du, out1))), d, row1 + x);
    166  }
    167  ReadUint16Row<2>(row_in, simd_len, len, row_out);
    168 }
    169 
    170 void ReadUint16RowInterleaved3(const uint8_t* row_in, size_t len,
    171                               float* row_out[kMaxComponents]) {
    172  const size_t N = Lanes(d);
    173  const size_t simd_len = len & (~(N - 1));
    174  const auto mul = Set(d, kMul16);
    175  const uint16_t* JXL_RESTRICT const row =
    176      reinterpret_cast<const uint16_t*>(row_in);
    177  float* JXL_RESTRICT const row0 = row_out[0];
    178  float* JXL_RESTRICT const row1 = row_out[1];
    179  float* JXL_RESTRICT const row2 = row_out[2];
    180  Vec<DU16> out0, out1, out2;  // NOLINT
    181  for (size_t x = 0; x < simd_len; x += N) {
    182    LoadInterleaved3(du16, row + 3 * x, out0, out1, out2);
    183    Store(Mul(mul, ConvertTo(d, PromoteTo(du, out0))), d, row0 + x);
    184    Store(Mul(mul, ConvertTo(d, PromoteTo(du, out1))), d, row1 + x);
    185    Store(Mul(mul, ConvertTo(d, PromoteTo(du, out2))), d, row2 + x);
    186  }
    187  ReadUint16Row<3>(row_in, simd_len, len, row_out);
    188 }
    189 
    190 void ReadUint16RowInterleaved4(const uint8_t* row_in, size_t len,
    191                               float* row_out[kMaxComponents]) {
    192  const size_t N = Lanes(d);
    193  const size_t simd_len = len & (~(N - 1));
    194  const auto mul = Set(d, kMul16);
    195  const uint16_t* JXL_RESTRICT const row =
    196      reinterpret_cast<const uint16_t*>(row_in);
    197  float* JXL_RESTRICT const row0 = row_out[0];
    198  float* JXL_RESTRICT const row1 = row_out[1];
    199  float* JXL_RESTRICT const row2 = row_out[2];
    200  float* JXL_RESTRICT const row3 = row_out[3];
    201  Vec<DU16> out0, out1, out2, out3;  // NOLINT
    202  for (size_t x = 0; x < simd_len; x += N) {
    203    LoadInterleaved4(du16, row + 4 * x, out0, out1, out2, out3);
    204    Store(Mul(mul, ConvertTo(d, PromoteTo(du, out0))), d, row0 + x);
    205    Store(Mul(mul, ConvertTo(d, PromoteTo(du, out1))), d, row1 + x);
    206    Store(Mul(mul, ConvertTo(d, PromoteTo(du, out2))), d, row2 + x);
    207    Store(Mul(mul, ConvertTo(d, PromoteTo(du, out3))), d, row3 + x);
    208  }
    209  ReadUint16Row<4>(row_in, simd_len, len, row_out);
    210 }
    211 
    212 void ReadUint16RowSingleSwap(const uint8_t* row_in, size_t len,
    213                             float* row_out[kMaxComponents]) {
    214  ReadUint16Row<1, true>(row_in, 0, len, row_out);
    215 }
    216 
    217 void ReadUint16RowInterleaved2Swap(const uint8_t* row_in, size_t len,
    218                                   float* row_out[kMaxComponents]) {
    219  ReadUint16Row<2, true>(row_in, 0, len, row_out);
    220 }
    221 
    222 void ReadUint16RowInterleaved3Swap(const uint8_t* row_in, size_t len,
    223                                   float* row_out[kMaxComponents]) {
    224  ReadUint16Row<3, true>(row_in, 0, len, row_out);
    225 }
    226 
    227 void ReadUint16RowInterleaved4Swap(const uint8_t* row_in, size_t len,
    228                                   float* row_out[kMaxComponents]) {
    229  ReadUint16Row<4, true>(row_in, 0, len, row_out);
    230 }
    231 
    232 void ReadFloatRowSingle(const uint8_t* row_in, size_t len,
    233                        float* row_out[kMaxComponents]) {
    234  const size_t N = Lanes(d);
    235  const size_t simd_len = len & (~(N - 1));
    236  const auto mul = Set(d, kMulFloat);
    237  const float* JXL_RESTRICT const row = reinterpret_cast<const float*>(row_in);
    238  float* JXL_RESTRICT const row0 = row_out[0];
    239  for (size_t x = 0; x < simd_len; x += N) {
    240    Store(Mul(mul, LoadU(d, row + x)), d, row0 + x);
    241  }
    242  ReadFloatRow<1>(row_in, simd_len, len, row_out);
    243 }
    244 
    245 void ReadFloatRowInterleaved2(const uint8_t* row_in, size_t len,
    246                              float* row_out[kMaxComponents]) {
    247  const size_t N = Lanes(d);
    248  const size_t simd_len = len & (~(N - 1));
    249  const auto mul = Set(d, kMulFloat);
    250  const float* JXL_RESTRICT const row = reinterpret_cast<const float*>(row_in);
    251  float* JXL_RESTRICT const row0 = row_out[0];
    252  float* JXL_RESTRICT const row1 = row_out[1];
    253  Vec<D> out0, out1;  // NOLINT
    254  for (size_t x = 0; x < simd_len; x += N) {
    255    LoadInterleaved2(d, row + 2 * x, out0, out1);
    256    Store(Mul(mul, out0), d, row0 + x);
    257    Store(Mul(mul, out1), d, row1 + x);
    258  }
    259  ReadFloatRow<2>(row_in, simd_len, len, row_out);
    260 }
    261 
    262 void ReadFloatRowInterleaved3(const uint8_t* row_in, size_t len,
    263                              float* row_out[kMaxComponents]) {
    264  const size_t N = Lanes(d);
    265  const size_t simd_len = len & (~(N - 1));
    266  const auto mul = Set(d, kMulFloat);
    267  const float* JXL_RESTRICT const row = reinterpret_cast<const float*>(row_in);
    268  float* JXL_RESTRICT const row0 = row_out[0];
    269  float* JXL_RESTRICT const row1 = row_out[1];
    270  float* JXL_RESTRICT const row2 = row_out[2];
    271  Vec<D> out0, out1, out2;  // NOLINT
    272  for (size_t x = 0; x < simd_len; x += N) {
    273    LoadInterleaved3(d, row + 3 * x, out0, out1, out2);
    274    Store(Mul(mul, out0), d, row0 + x);
    275    Store(Mul(mul, out1), d, row1 + x);
    276    Store(Mul(mul, out2), d, row2 + x);
    277  }
    278  ReadFloatRow<3>(row_in, simd_len, len, row_out);
    279 }
    280 
    281 void ReadFloatRowInterleaved4(const uint8_t* row_in, size_t len,
    282                              float* row_out[kMaxComponents]) {
    283  const size_t N = Lanes(d);
    284  const size_t simd_len = len & (~(N - 1));
    285  const auto mul = Set(d, kMulFloat);
    286  const float* JXL_RESTRICT const row = reinterpret_cast<const float*>(row_in);
    287  float* JXL_RESTRICT const row0 = row_out[0];
    288  float* JXL_RESTRICT const row1 = row_out[1];
    289  float* JXL_RESTRICT const row2 = row_out[2];
    290  float* JXL_RESTRICT const row3 = row_out[3];
    291  Vec<D> out0, out1, out2, out3;  // NOLINT
    292  for (size_t x = 0; x < simd_len; x += N) {
    293    LoadInterleaved4(d, row + 4 * x, out0, out1, out2, out3);
    294    Store(Mul(mul, out0), d, row0 + x);
    295    Store(Mul(mul, out1), d, row1 + x);
    296    Store(Mul(mul, out2), d, row2 + x);
    297    Store(Mul(mul, out3), d, row3 + x);
    298  }
    299  ReadFloatRow<4>(row_in, simd_len, len, row_out);
    300 }
    301 
    302 void ReadFloatRowSingleSwap(const uint8_t* row_in, size_t len,
    303                            float* row_out[kMaxComponents]) {
    304  ReadFloatRow<1, true>(row_in, 0, len, row_out);
    305 }
    306 
    307 void ReadFloatRowInterleaved2Swap(const uint8_t* row_in, size_t len,
    308                                  float* row_out[kMaxComponents]) {
    309  ReadFloatRow<2, true>(row_in, 0, len, row_out);
    310 }
    311 
    312 void ReadFloatRowInterleaved3Swap(const uint8_t* row_in, size_t len,
    313                                  float* row_out[kMaxComponents]) {
    314  ReadFloatRow<3, true>(row_in, 0, len, row_out);
    315 }
    316 
    317 void ReadFloatRowInterleaved4Swap(const uint8_t* row_in, size_t len,
    318                                  float* row_out[kMaxComponents]) {
    319  ReadFloatRow<4, true>(row_in, 0, len, row_out);
    320 }
    321 
    322 // NOLINTNEXTLINE(google-readability-namespace-comments)
    323 }  // namespace HWY_NAMESPACE
    324 }  // namespace jpegli
    325 HWY_AFTER_NAMESPACE();
    326 
    327 #if HWY_ONCE
    328 namespace jpegli {
    329 
    // Row readers made available for runtime dispatch. Every function listed
    // here is selected via HWY_DYNAMIC_DISPATCH in ChooseInputMethod.
    330 HWY_EXPORT(ReadUint8RowSingle);
    331 HWY_EXPORT(ReadUint8RowInterleaved2);
    332 HWY_EXPORT(ReadUint8RowInterleaved3);
    333 HWY_EXPORT(ReadUint8RowInterleaved4);
    334 HWY_EXPORT(ReadUint16RowSingle);
    335 HWY_EXPORT(ReadUint16RowInterleaved2);
    336 HWY_EXPORT(ReadUint16RowInterleaved3);
    337 HWY_EXPORT(ReadUint16RowInterleaved4);
    338 HWY_EXPORT(ReadUint16RowSingleSwap);
    339 HWY_EXPORT(ReadUint16RowInterleaved2Swap);
    340 HWY_EXPORT(ReadUint16RowInterleaved3Swap);
    341 HWY_EXPORT(ReadUint16RowInterleaved4Swap);
    342 HWY_EXPORT(ReadFloatRowSingle);
    343 HWY_EXPORT(ReadFloatRowInterleaved2);
    344 HWY_EXPORT(ReadFloatRowInterleaved3);
    345 HWY_EXPORT(ReadFloatRowInterleaved4);
    346 HWY_EXPORT(ReadFloatRowSingleSwap);
    347 HWY_EXPORT(ReadFloatRowInterleaved2Swap);
    348 HWY_EXPORT(ReadFloatRowInterleaved3Swap);
    349 HWY_EXPORT(ReadFloatRowInterleaved4Swap);
    350 
    351 void ChooseInputMethod(j_compress_ptr cinfo) {
    352  jpeg_comp_master* m = cinfo->master;
    353  bool swap_endianness =
    354      (m->endianness == JPEGLI_LITTLE_ENDIAN && !IsLittleEndian()) ||
    355      (m->endianness == JPEGLI_BIG_ENDIAN && IsLittleEndian());
    356  m->input_method = nullptr;
    357  if (m->data_type == JPEGLI_TYPE_UINT8) {
    358    if (cinfo->raw_data_in || cinfo->input_components == 1) {
    359      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint8RowSingle);
    360    } else if (cinfo->input_components == 2) {
    361      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint8RowInterleaved2);
    362    } else if (cinfo->input_components == 3) {
    363      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint8RowInterleaved3);
    364    } else if (cinfo->input_components == 4) {
    365      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint8RowInterleaved4);
    366    }
    367  } else if (m->data_type == JPEGLI_TYPE_UINT16 && !swap_endianness) {
    368    if (cinfo->raw_data_in || cinfo->input_components == 1) {
    369      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowSingle);
    370    } else if (cinfo->input_components == 2) {
    371      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved2);
    372    } else if (cinfo->input_components == 3) {
    373      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved3);
    374    } else if (cinfo->input_components == 4) {
    375      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved4);
    376    }
    377  } else if (m->data_type == JPEGLI_TYPE_UINT16 && swap_endianness) {
    378    if (cinfo->raw_data_in || cinfo->input_components == 1) {
    379      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowSingleSwap);
    380    } else if (cinfo->input_components == 2) {
    381      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved2Swap);
    382    } else if (cinfo->input_components == 3) {
    383      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved3Swap);
    384    } else if (cinfo->input_components == 4) {
    385      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved4Swap);
    386    }
    387  } else if (m->data_type == JPEGLI_TYPE_FLOAT && !swap_endianness) {
    388    if (cinfo->raw_data_in || cinfo->input_components == 1) {
    389      m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowSingle);
    390    } else if (cinfo->input_components == 2) {
    391      m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved2);
    392    } else if (cinfo->input_components == 3) {
    393      m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved3);
    394    } else if (cinfo->input_components == 4) {
    395      m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved4);
    396    }
    397  } else if (m->data_type == JPEGLI_TYPE_FLOAT && swap_endianness) {
    398    if (cinfo->raw_data_in || cinfo->input_components == 1) {
    399      m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowSingleSwap);
    400    } else if (cinfo->input_components == 2) {
    401      m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved2Swap);
    402    } else if (cinfo->input_components == 3) {
    403      m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved3Swap);
    404    } else if (cinfo->input_components == 4) {
    405      m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved4Swap);
    406    }
    407  }
    408  if (m->input_method == nullptr) {
    409    JPEGLI_ERROR("Could not find input method.");
    410  }
    411 }
    412 
    413 }  // namespace jpegli
    414 #endif  // HWY_ONCE
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE