tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

downsample.cc (12332B)


      1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 
      6 #include "lib/jpegli/downsample.h"
      7 
      8 #undef HWY_TARGET_INCLUDE
      9 #define HWY_TARGET_INCLUDE "lib/jpegli/downsample.cc"
     10 #include <hwy/foreach_target.h>
     11 #include <hwy/highway.h>
     12 
     13 #include "lib/jpegli/encode_internal.h"
     14 #include "lib/jpegli/error.h"
     15 
     16 HWY_BEFORE_NAMESPACE();
     17 namespace jpegli {
     18 namespace HWY_NAMESPACE {
     19 
     20 // These templates are not found via ADL.
     21 using hwy::HWY_NAMESPACE::Add;
     22 using hwy::HWY_NAMESPACE::Mul;
     23 using hwy::HWY_NAMESPACE::Vec;
     24 
     25 using D = HWY_CAPPED(float, 8);
     26 constexpr D d;
     27 
     28 void DownsampleRow2x1(const float* row_in, size_t len, float* row_out) {
     29  const size_t N = Lanes(d);
     30  const size_t len_out = len / 2;
     31  const auto mul = Set(d, 0.5f);
     32  Vec<D> v0, v1;  // NOLINT
     33  for (size_t x = 0; x < len_out; x += N) {
     34    LoadInterleaved2(d, row_in + 2 * x, v0, v1);
     35    Store(Mul(mul, Add(v0, v1)), d, row_out + x);
     36  }
     37 }
     38 
     39 void DownsampleRow3x1(const float* row_in, size_t len, float* row_out) {
     40  const size_t N = Lanes(d);
     41  const size_t len_out = len / 3;
     42  const auto mul = Set(d, 1.0f / 3);
     43  Vec<D> v0, v1, v2;  // NOLINT
     44  for (size_t x = 0; x < len_out; x += N) {
     45    LoadInterleaved3(d, row_in + 3 * x, v0, v1, v2);
     46    Store(Mul(mul, Add(Add(v0, v1), v2)), d, row_out + x);
     47  }
     48 }
     49 
     50 void DownsampleRow4x1(const float* row_in, size_t len, float* row_out) {
     51  const size_t N = Lanes(d);
     52  const size_t len_out = len / 4;
     53  const auto mul = Set(d, 0.25f);
     54  Vec<D> v0, v1, v2, v3;  // NOLINT
     55  for (size_t x = 0; x < len_out; x += N) {
     56    LoadInterleaved4(d, row_in + 4 * x, v0, v1, v2, v3);
     57    Store(Mul(mul, Add(Add(v0, v1), Add(v2, v3))), d, row_out + x);
     58  }
     59 }
     60 
     61 void Downsample2x1(float* rows_in[MAX_SAMP_FACTOR], size_t len,
     62                   float* row_out) {
     63  DownsampleRow2x1(rows_in[0], len, row_out);
     64 }
     65 
     66 void Downsample3x1(float* rows_in[MAX_SAMP_FACTOR], size_t len,
     67                   float* row_out) {
     68  DownsampleRow3x1(rows_in[0], len, row_out);
     69 }
     70 
     71 void Downsample4x1(float* rows_in[MAX_SAMP_FACTOR], size_t len,
     72                   float* row_out) {
     73  DownsampleRow4x1(rows_in[0], len, row_out);
     74 }
     75 
     76 void Downsample1x2(float* rows_in[MAX_SAMP_FACTOR], size_t len,
     77                   float* row_out) {
     78  const size_t N = Lanes(d);
     79  const auto mul = Set(d, 0.5f);
     80  float* row0 = rows_in[0];
     81  float* row1 = rows_in[1];
     82  for (size_t x = 0; x < len; x += N) {
     83    Store(Mul(mul, Add(Load(d, row0 + x), Load(d, row1 + x))), d, row_out + x);
     84  }
     85 }
     86 
     87 void Downsample2x2(float* rows_in[MAX_SAMP_FACTOR], size_t len,
     88                   float* row_out) {
     89  const size_t N = Lanes(d);
     90  const size_t len_out = len / 2;
     91  const auto mul = Set(d, 0.25f);
     92  float* row0 = rows_in[0];
     93  float* row1 = rows_in[1];
     94  Vec<D> v0, v1, v2, v3;  // NOLINT
     95  for (size_t x = 0; x < len_out; x += N) {
     96    LoadInterleaved2(d, row0 + 2 * x, v0, v1);
     97    LoadInterleaved2(d, row1 + 2 * x, v2, v3);
     98    Store(Mul(mul, Add(Add(v0, v1), Add(v2, v3))), d, row_out + x);
     99  }
    100 }
    101 
    102 void Downsample3x2(float* rows_in[MAX_SAMP_FACTOR], size_t len,
    103                   float* row_out) {
    104  DownsampleRow3x1(rows_in[0], len, rows_in[0]);
    105  DownsampleRow3x1(rows_in[1], len, rows_in[1]);
    106  Downsample1x2(rows_in, len / 3, row_out);
    107 }
    108 
    109 void Downsample4x2(float* rows_in[MAX_SAMP_FACTOR], size_t len,
    110                   float* row_out) {
    111  DownsampleRow4x1(rows_in[0], len, rows_in[0]);
    112  DownsampleRow4x1(rows_in[1], len, rows_in[1]);
    113  Downsample1x2(rows_in, len / 4, row_out);
    114 }
    115 
    116 void Downsample1x3(float* rows_in[MAX_SAMP_FACTOR], size_t len,
    117                   float* row_out) {
    118  const size_t N = Lanes(d);
    119  const auto mul = Set(d, 1.0f / 3);
    120  float* row0 = rows_in[0];
    121  float* row1 = rows_in[1];
    122  float* row2 = rows_in[2];
    123  for (size_t x = 0; x < len; x += N) {
    124    const auto in0 = Load(d, row0 + x);
    125    const auto in1 = Load(d, row1 + x);
    126    const auto in2 = Load(d, row2 + x);
    127    Store(Mul(mul, Add(Add(in0, in1), in2)), d, row_out + x);
    128  }
    129 }
    130 
    131 void Downsample2x3(float* rows_in[MAX_SAMP_FACTOR], size_t len,
    132                   float* row_out) {
    133  DownsampleRow2x1(rows_in[0], len, rows_in[0]);
    134  DownsampleRow2x1(rows_in[1], len, rows_in[1]);
    135  DownsampleRow2x1(rows_in[2], len, rows_in[2]);
    136  Downsample1x3(rows_in, len / 2, row_out);
    137 }
    138 
    139 void Downsample3x3(float* rows_in[MAX_SAMP_FACTOR], size_t len,
    140                   float* row_out) {
    141  DownsampleRow3x1(rows_in[0], len, rows_in[0]);
    142  DownsampleRow3x1(rows_in[1], len, rows_in[1]);
    143  DownsampleRow3x1(rows_in[2], len, rows_in[2]);
    144  Downsample1x3(rows_in, len / 3, row_out);
    145 }
    146 
    147 void Downsample4x3(float* rows_in[MAX_SAMP_FACTOR], size_t len,
    148                   float* row_out) {
    149  DownsampleRow4x1(rows_in[0], len, rows_in[0]);
    150  DownsampleRow4x1(rows_in[1], len, rows_in[1]);
    151  DownsampleRow4x1(rows_in[2], len, rows_in[2]);
    152  Downsample1x3(rows_in, len / 4, row_out);
    153 }
    154 
    155 void Downsample1x4(float* rows_in[MAX_SAMP_FACTOR], size_t len,
    156                   float* row_out) {
    157  const size_t N = Lanes(d);
    158  const auto mul = Set(d, 0.25f);
    159  float* row0 = rows_in[0];
    160  float* row1 = rows_in[1];
    161  float* row2 = rows_in[2];
    162  float* row3 = rows_in[3];
    163  for (size_t x = 0; x < len; x += N) {
    164    const auto in0 = Load(d, row0 + x);
    165    const auto in1 = Load(d, row1 + x);
    166    const auto in2 = Load(d, row2 + x);
    167    const auto in3 = Load(d, row3 + x);
    168    Store(Mul(mul, Add(Add(in0, in1), Add(in2, in3))), d, row_out + x);
    169  }
    170 }
    171 
    172 void Downsample2x4(float* rows_in[MAX_SAMP_FACTOR], size_t len,
    173                   float* row_out) {
    174  DownsampleRow2x1(rows_in[0], len, rows_in[0]);
    175  DownsampleRow2x1(rows_in[1], len, rows_in[1]);
    176  DownsampleRow2x1(rows_in[2], len, rows_in[2]);
    177  DownsampleRow2x1(rows_in[3], len, rows_in[3]);
    178  Downsample1x4(rows_in, len / 2, row_out);
    179 }
    180 
    181 void Downsample3x4(float* rows_in[MAX_SAMP_FACTOR], size_t len,
    182                   float* row_out) {
    183  DownsampleRow3x1(rows_in[0], len, rows_in[0]);
    184  DownsampleRow3x1(rows_in[1], len, rows_in[1]);
    185  DownsampleRow3x1(rows_in[2], len, rows_in[2]);
    186  DownsampleRow3x1(rows_in[3], len, rows_in[3]);
    187  Downsample1x4(rows_in, len / 3, row_out);
    188 }
    189 
    190 void Downsample4x4(float* rows_in[MAX_SAMP_FACTOR], size_t len,
    191                   float* row_out) {
    192  DownsampleRow4x1(rows_in[0], len, rows_in[0]);
    193  DownsampleRow4x1(rows_in[1], len, rows_in[1]);
    194  DownsampleRow4x1(rows_in[2], len, rows_in[2]);
    195  DownsampleRow4x1(rows_in[3], len, rows_in[3]);
    196  Downsample1x4(rows_in, len / 4, row_out);
    197 }
    198 
    199 // NOLINTNEXTLINE(google-readability-namespace-comments)
    200 }  // namespace HWY_NAMESPACE
    201 }  // namespace jpegli
    202 HWY_AFTER_NAMESPACE();
    203 
    204 #if HWY_ONCE
    205 namespace jpegli {
    206 
    207 HWY_EXPORT(Downsample1x2);
    208 HWY_EXPORT(Downsample1x3);
    209 HWY_EXPORT(Downsample1x4);
    210 HWY_EXPORT(Downsample2x1);
    211 HWY_EXPORT(Downsample2x2);
    212 HWY_EXPORT(Downsample2x3);
    213 HWY_EXPORT(Downsample2x4);
    214 HWY_EXPORT(Downsample3x1);
    215 HWY_EXPORT(Downsample3x2);
    216 HWY_EXPORT(Downsample3x3);
    217 HWY_EXPORT(Downsample3x4);
    218 HWY_EXPORT(Downsample4x1);
    219 HWY_EXPORT(Downsample4x2);
    220 HWY_EXPORT(Downsample4x3);
    221 HWY_EXPORT(Downsample4x4);
    222 
    223 void NullDownsample(float* rows_in[MAX_SAMP_FACTOR], size_t len,
    224                    float* row_out) {}
    225 
    226 void ChooseDownsampleMethods(j_compress_ptr cinfo) {
    227  jpeg_comp_master* m = cinfo->master;
    228  for (int c = 0; c < cinfo->num_components; c++) {
    229    m->downsample_method[c] = nullptr;
    230    jpeg_component_info* comp = &cinfo->comp_info[c];
    231    const int h_factor = cinfo->max_h_samp_factor / comp->h_samp_factor;
    232    const int v_factor = cinfo->max_v_samp_factor / comp->v_samp_factor;
    233    if (v_factor == 1) {
    234      if (h_factor == 1) {
    235        m->downsample_method[c] = NullDownsample;
    236      } else if (h_factor == 2) {
    237        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample2x1);
    238      } else if (h_factor == 3) {
    239        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample3x1);
    240      } else if (h_factor == 4) {
    241        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample4x1);
    242      }
    243    } else if (v_factor == 2) {
    244      if (h_factor == 1) {
    245        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample1x2);
    246      } else if (h_factor == 2) {
    247        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample2x2);
    248      } else if (h_factor == 3) {
    249        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample3x2);
    250      } else if (h_factor == 4) {
    251        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample4x2);
    252      }
    253    } else if (v_factor == 3) {
    254      if (h_factor == 1) {
    255        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample1x2);
    256      } else if (h_factor == 2) {
    257        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample2x2);
    258      } else if (h_factor == 3) {
    259        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample3x2);
    260      } else if (h_factor == 4) {
    261        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample4x2);
    262      }
    263    } else if (v_factor == 4) {
    264      if (h_factor == 1) {
    265        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample1x4);
    266      } else if (h_factor == 2) {
    267        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample2x4);
    268      } else if (h_factor == 3) {
    269        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample3x4);
    270      } else if (h_factor == 4) {
    271        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample4x4);
    272      }
    273    }
    274    if (m->downsample_method[c] == nullptr) {
    275      JPEGLI_ERROR("Unsupported downsampling ratio %dx%d", h_factor, v_factor);
    276    }
    277  }
    278 }
    279 
    280 void DownsampleInputBuffer(j_compress_ptr cinfo) {
    281  if (cinfo->max_h_samp_factor == 1 && cinfo->max_v_samp_factor == 1) {
    282    return;
    283  }
    284  jpeg_comp_master* m = cinfo->master;
    285  const size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor;
    286  const size_t y0 = m->next_iMCU_row * iMCU_height;
    287  const size_t y1 = y0 + iMCU_height;
    288  const size_t xsize_padded = m->xsize_blocks * DCTSIZE;
    289  for (int c = 0; c < cinfo->num_components; c++) {
    290    jpeg_component_info* comp = &cinfo->comp_info[c];
    291    const int h_factor = cinfo->max_h_samp_factor / comp->h_samp_factor;
    292    const int v_factor = cinfo->max_v_samp_factor / comp->v_samp_factor;
    293    if (h_factor == 1 && v_factor == 1) {
    294      continue;
    295    }
    296    auto& input = *m->smooth_input[c];
    297    auto& output = *m->raw_data[c];
    298    const size_t y_out0 = y0 / v_factor;
    299    float* rows_in[MAX_SAMP_FACTOR];
    300    for (size_t y_in = y0, y_out = y_out0; y_in < y1;
    301         y_in += v_factor, ++y_out) {
    302      for (int iy = 0; iy < v_factor; ++iy) {
    303        rows_in[iy] = input.Row(y_in + iy);
    304      }
    305      float* row_out = output.Row(y_out);
    306      (*m->downsample_method[c])(rows_in, xsize_padded, row_out);
    307    }
    308  }
    309 }
    310 
    311 void ApplyInputSmoothing(j_compress_ptr cinfo) {
    312  if (!cinfo->smoothing_factor) {
    313    return;
    314  }
    315  jpeg_comp_master* m = cinfo->master;
    316  const float kW1 = cinfo->smoothing_factor / 1024.0;
    317  const float kW0 = 1.0f - 8.0f * kW1;
    318  const size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor;
    319  const ssize_t y0 = m->next_iMCU_row * iMCU_height;
    320  const ssize_t y1 = y0 + iMCU_height;
    321  const ssize_t xsize_padded = m->xsize_blocks * DCTSIZE;
    322  for (int c = 0; c < cinfo->num_components; c++) {
    323    auto& input = m->input_buffer[c];
    324    auto& output = *m->smooth_input[c];
    325    if (m->next_iMCU_row == 0) {
    326      input.CopyRow(-1, 0, 1);
    327    }
    328    if (m->next_iMCU_row + 1 == cinfo->total_iMCU_rows) {
    329      size_t last_row = m->ysize_blocks * DCTSIZE - 1;
    330      input.CopyRow(last_row + 1, last_row, 1);
    331    }
    332    // TODO(szabadka) SIMDify this.
    333    for (ssize_t y = y0; y < y1; ++y) {
    334      const float* row_t = input.Row(y - 1);
    335      const float* row_m = input.Row(y);
    336      const float* row_b = input.Row(y + 1);
    337      float* row_out = output.Row(y);
    338      for (ssize_t x = 0; x < xsize_padded; ++x) {
    339        float val_tl = row_t[x - 1];
    340        float val_tm = row_t[x];
    341        float val_tr = row_t[x + 1];
    342        float val_ml = row_m[x - 1];
    343        float val_mm = row_m[x];
    344        float val_mr = row_m[x + 1];
    345        float val_bl = row_b[x - 1];
    346        float val_bm = row_b[x];
    347        float val_br = row_b[x + 1];
    348        float val1 = (val_tl + val_tm + val_tr + val_ml + val_mr + val_bl +
    349                      val_bm + val_br);
    350        row_out[x] = val_mm * kW0 + val1 * kW1;
    351      }
    352    }
    353  }
    354 }
    355 
    356 }  // namespace jpegli
    357 #endif  // HWY_ONCE