tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jdsample-altivec.c (14201B)


      1 /*
      2 * AltiVec optimizations for libjpeg-turbo
      3 *
      4 * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
      5 *
      6 * This software is provided 'as-is', without any express or implied
      7 * warranty.  In no event will the authors be held liable for any damages
      8 * arising from the use of this software.
      9 *
     10 * Permission is granted to anyone to use this software for any purpose,
     11 * including commercial applications, and to alter it and redistribute it
     12 * freely, subject to the following restrictions:
     13 *
     14 * 1. The origin of this software must not be misrepresented; you must not
     15 *    claim that you wrote the original software. If you use this software
     16 *    in a product, an acknowledgment in the product documentation would be
     17 *    appreciated but is not required.
     18 * 2. Altered source versions must be plainly marked as such, and must not be
     19 *    misrepresented as being the original software.
     20 * 3. This notice may not be removed or altered from any source distribution.
     21 */
     22 
     23 /* CHROMA UPSAMPLING */
     24 
     25 #include "jsimd_altivec.h"
     26 
     27 /* Fancy h2v1 upsampling: doubles each row horizontally; each output sample is (3 * nearest input + next-nearest input + rounding) >> 2. */
     28 void jsimd_h2v1_fancy_upsample_altivec(int max_v_samp_factor,  /* number of rows to process */
     29                                       JDIMENSION downsampled_width,  /* samples in each input row */
     30                                       JSAMPARRAY input_data,  /* input rows; the byte just past a ragged row end is overwritten below */
     31                                       JSAMPARRAY *output_data_ptr)  /* address of the output row array */
     32 {
     33  JSAMPARRAY output_data = *output_data_ptr;
     34  JSAMPROW inptr, outptr;
     35  int inrow, incol;
     36  /* Naming: this0 / last0 / next0 = current / previous / following 16 input bytes; p_* = neighbour vectors built with vec_perm. */
     37  __vector unsigned char this0, last0, p_last0, next0 = { 0 }, p_next0,
     38    out;
     39  __vector short this0e, this0o, this0l, this0h, last0l, last0h,
     40    next0l, next0h, outle, outhe, outlo, outho;
     41  /* 16-bit vectors: suffix l / h = first / second 8 samples of the current 16; out*e / out*o = even / odd output columns. */
     42  /* Constants (pb_zero is never named in this function; the VEC_UNPACK* macros presumably expand to it -- confirm in jsimd_altivec.h before removing) */
     43  __vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) },
     44    last_index_col0 =  /* first vector only: sample 0 acts as its own left neighbour */
     45      {  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14 },
     46    last_index =  /* byte 15 of the previous vector followed by bytes 0..14 of the current one */
     47      { 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 },
     48    next_index =  /* bytes 1..15 of the current vector followed by byte 0 of the following one */
     49      {  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16 },
     50    next_index_lastcol =  /* last vector only: byte 15 acts as its own right neighbour */
     51      {  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15 },
     52 #if __BIG_ENDIAN__
     53    merge_pack_index =  /* takes the second byte of every 16-bit lane, alternating between the two vec_perm operands */
     54      {  1, 17,  3, 19,  5, 21,  7, 23,  9, 25, 11, 27, 13, 29, 15, 31 };
     55 #else
     56    merge_pack_index =  /* same interleave, taking the first byte of every 16-bit lane instead */
     57      {  0, 16,  2, 18,  4, 20,  6, 22,  8, 24, 10, 26, 12, 28, 14, 30 };
     58 #endif
     59  __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) };
     60  /* pw_one / pw_two are the rounding terms for even / odd outputs; pw_two is also reused as the shift count. */
     61  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
     62    inptr = input_data[inrow];
     63    outptr = output_data[inrow];
     64    /* Width not a multiple of 16: duplicate the last sample one slot to the right so that it is its own right neighbour. */
     65    if (downsampled_width & 15)
     66      inptr[downsampled_width] = inptr[downsampled_width - 1];
     67    /* Load the first 16 samples and build their left neighbours (sample 0 is paired with itself). */
     68    this0 = vec_ld(0, inptr);
     69    p_last0 = vec_perm(this0, this0, last_index_col0);
     70    last0 = this0;
     71    /* Each pass consumes up to 16 input samples and stores 16 or 32 output bytes. */
     72    for (incol = downsampled_width; incol > 0;
     73         incol -= 16, inptr += 16, outptr += 32) {
     74      /* Every pass after the first: the left neighbours start with the previous vector's last byte. */
     75      if (downsampled_width - incol > 0) {
     76        p_last0 = vec_perm(last0, this0, last_index);
     77        last0 = this0;
     78      }
     79      /* Right neighbours: on the final pass no further vector is loaded; otherwise the next 16 bytes supply the last one. */
     80      if (incol <= 16)
     81        p_next0 = vec_perm(this0, this0, next_index_lastcol);
     82      else {
     83        next0 = vec_ld(16, inptr);
     84        p_next0 = vec_perm(this0, next0, next_index);
     85      }
     86      /* Multiply the current samples by 3 as 16-bit products (even / odd bytes separately), then merge them into this0l and this0h. */
     87      this0e = (__vector short)vec_mule(this0, pb_three);
     88      this0o = (__vector short)vec_mulo(this0, pb_three);
     89      this0l = vec_mergeh(this0e, this0o);
     90      this0h = vec_mergel(this0e, this0o);
     91      /* VEC_UNPACKHU / VEC_UNPACKLU come from jsimd_altivec.h; presumably they zero-extend the first / last 8 bytes to 16 bits -- confirm there. */
     92      last0l = (__vector short)VEC_UNPACKHU(p_last0);
     93      last0h = (__vector short)VEC_UNPACKLU(p_last0);
     94      last0l = vec_add(last0l, pw_one);
     95 
     96      next0l = (__vector short)VEC_UNPACKHU(p_next0);
     97      next0h = (__vector short)VEC_UNPACKLU(p_next0);
     98      next0l = vec_add(next0l, pw_two);
     99      /* Even outputs = (3 * this + left + 1) >> 2, odd outputs = (3 * this + right + 2) >> 2. */
    100      outle = vec_add(this0l, last0l);
    101      outlo = vec_add(this0l, next0l);
    102      outle = vec_sr(outle, (__vector unsigned short)pw_two);
    103      outlo = vec_sr(outlo, (__vector unsigned short)pw_two);
    104      /* Interleave one byte per lane from the even and odd results and store the first 16 output bytes. */
    105      out = vec_perm((__vector unsigned char)outle,
    106                     (__vector unsigned char)outlo, merge_pack_index);
    107      vec_st(out, 0, outptr);
    108      /* The second 16 output bytes are computed and stored only when more than 8 input samples remain. */
    109      if (incol > 8) {
    110        last0h = vec_add(last0h, pw_one);
    111        next0h = vec_add(next0h, pw_two);
    112 
    113        outhe = vec_add(this0h, last0h);
    114        outho = vec_add(this0h, next0h);
    115        outhe = vec_sr(outhe, (__vector unsigned short)pw_two);
    116        outho = vec_sr(outho, (__vector unsigned short)pw_two);
    117 
    118        out = vec_perm((__vector unsigned char)outhe,
    119                       (__vector unsigned char)outho, merge_pack_index);
    120        vec_st(out, 16, outptr);
    121      }
    122      /* Advance: the following vector becomes current (when nothing was loaded on this pass the value is unused, because the loop ends). */
    123      this0 = next0;
    124    }
    125  }
    126 }
    127 
    128 /* Fancy h2v2 upsampling: each input row yields two output rows of twice the width, weighting 3:1 against the row above / below and then 3:1 against the left / right column. */
    129 void jsimd_h2v2_fancy_upsample_altivec(int max_v_samp_factor,  /* bound on the output row index; two output rows are written per pass */
    130                                       JDIMENSION downsampled_width,  /* samples in each input row */
    131                                       JSAMPARRAY input_data,  /* input rows; rows inrow - 1 and inrow + 1 are read as well */
    132                                       JSAMPARRAY *output_data_ptr)  /* address of the output row array */
    133 {
    134  JSAMPARRAY output_data = *output_data_ptr;
    135  JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
    136  int inrow, outrow, incol;
    137  /* Naming: _1 = row above, 0 = current row, 1 = row below; suffix l / h = first / second 8 samples of a 16-sample group. */
    138  __vector unsigned char this_1, this0, this1, out;
    139  __vector short this_1l, this_1h, this0l, this0h, this1l, this1h,
    140    lastcolsum_1h, lastcolsum1h,
    141    p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h,
    142    thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h,
    143    nextcolsum_1l = { 0 }, nextcolsum_1h = { 0 },
    144    nextcolsum1l = { 0 }, nextcolsum1h = { 0 },
    145    p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
    146    tmpl, tmph, outle, outhe, outlo, outho;
    147  /* thiscolsum* = 3 * current row + neighbouring row for the current group; lastcolsum* / nextcolsum* = the same sums for the adjacent groups; p_* = per-sample left / right neighbour sums. */
    148  /* Constants (pb_zero is never named in this function; the VEC_UNPACK* macros presumably expand to it -- confirm in jsimd_altivec.h before removing) */
    149  __vector unsigned char pb_zero = { __16X(0) },
    150    last_index_col0 =  /* first group only: 16-bit lane 0 as its own left neighbour, then lanes 0..6 */
    151      {  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13 },
    152    last_index =  /* lane 7 of the first operand followed by lanes 0..6 of the second */
    153      { 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 },
    154    next_index =  /* lanes 1..7 of the first operand followed by lane 0 of the second */
    155      {  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17 },
    156    next_index_lastcol =  /* last group only: lanes 1..7, then lane 7 again as its own right neighbour */
    157      {  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 14, 15 },
    158 #if __BIG_ENDIAN__
    159    merge_pack_index =  /* takes the second byte of every 16-bit lane, alternating between the two vec_perm operands */
    160      {  1, 17,  3, 19,  5, 21,  7, 23,  9, 25, 11, 27, 13, 29, 15, 31 };
    161 #else
    162    merge_pack_index =  /* same interleave, taking the first byte of every 16-bit lane instead */
    163      {  0, 16,  2, 18,  4, 20,  6, 22,  8, 24, 10, 26, 12, 28, 14, 30 };
    164 #endif
    165  __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) },
    166    pw_seven = { __8X(7) }, pw_eight = { __8X(8) };
    167  __vector unsigned short pw_four = { __8X(4) };
    168  /* pw_eight / pw_seven are the rounding terms for even / odd output columns; pw_four is the final shift count. */
    169  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
    170    /* NOTE(review): the first pass reads input_data[-1]; callers presumably provide a valid row pointer there -- confirm. */
    171    inptr_1 = input_data[inrow - 1];
    172    inptr0 = input_data[inrow];
    173    inptr1 = input_data[inrow + 1];
    174    outptr0 = output_data[outrow++];
    175    outptr1 = output_data[outrow++];
    176    /* Width not a multiple of 16: duplicate the last sample of all three rows one slot to the right. */
    177    if (downsampled_width & 15) {
    178      inptr_1[downsampled_width] = inptr_1[downsampled_width - 1];
    179      inptr0[downsampled_width] = inptr0[downsampled_width - 1];
    180      inptr1[downsampled_width] = inptr1[downsampled_width - 1];
    181    }
    182    /* Current row: unpack the first 16 samples to 16-bit lanes and multiply by 3 (vec_mladd with a zero addend). */
    183    this0 = vec_ld(0, inptr0);
    184    this0l = (__vector short)VEC_UNPACKHU(this0);
    185    this0h = (__vector short)VEC_UNPACKLU(this0);
    186    this0l = vec_mladd(this0l, pw_three, pw_zero);
    187    this0h = vec_mladd(this0h, pw_three, pw_zero);
    188    /* Column sums against the row above, plus the left-neighbour sums for the first group. */
    189    this_1 = vec_ld(0, inptr_1);
    190    this_1l = (__vector short)VEC_UNPACKHU(this_1);
    191    this_1h = (__vector short)VEC_UNPACKLU(this_1);
    192    thiscolsum_1l = vec_add(this0l, this_1l);
    193    thiscolsum_1h = vec_add(this0h, this_1h);
    194    lastcolsum_1h = thiscolsum_1h;
    195    p_lastcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1l, last_index_col0);
    196    p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
    197    /* The same against the row below. */
    198    this1 = vec_ld(0, inptr1);
    199    this1l = (__vector short)VEC_UNPACKHU(this1);
    200    this1h = (__vector short)VEC_UNPACKLU(this1);
    201    thiscolsum1l = vec_add(this0l, this1l);
    202    thiscolsum1h = vec_add(this0h, this1h);
    203    lastcolsum1h = thiscolsum1h;
    204    p_lastcolsum1l = vec_perm(thiscolsum1l, thiscolsum1l, last_index_col0);
    205    p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
    206    /* Each pass handles one 16-sample group and stores 16 or 32 bytes into each of the two output rows. */
    207    for (incol = downsampled_width; incol > 0;
    208         incol -= 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16,
    209         outptr0 += 32, outptr1 += 32) {
    210      /* Every pass after the first: the left neighbours start with the last sum of the previous group. */
    211      if (downsampled_width - incol > 0) {
    212        p_lastcolsum_1l = vec_perm(lastcolsum_1h, thiscolsum_1l, last_index);
    213        p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
    214        p_lastcolsum1l = vec_perm(lastcolsum1h, thiscolsum1l, last_index);
    215        p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
    216        lastcolsum_1h = thiscolsum_1h;  lastcolsum1h = thiscolsum1h;
    217      }
    218      /* Right-neighbour sums: on the final pass nothing more is loaded and the last lane is its own neighbour. */
    219      if (incol <= 16) {
    220        p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
    221        p_nextcolsum_1h = vec_perm(thiscolsum_1h, thiscolsum_1h,
    222                                   next_index_lastcol);
    223        p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
    224        p_nextcolsum1h = vec_perm(thiscolsum1h, thiscolsum1h,
    225                                  next_index_lastcol);
    226      } else {
    227        this0 = vec_ld(16, inptr0);
    228        this0l = (__vector short)VEC_UNPACKHU(this0);
    229        this0h = (__vector short)VEC_UNPACKLU(this0);
    230        this0l = vec_mladd(this0l, pw_three, pw_zero);
    231        this0h = vec_mladd(this0h, pw_three, pw_zero);
    232        /* Sums for the following group against the row above; lane 0 is used now, the full sums become current on the next pass. */
    233        this_1 = vec_ld(16, inptr_1);
    234        this_1l = (__vector short)VEC_UNPACKHU(this_1);
    235        this_1h = (__vector short)VEC_UNPACKLU(this_1);
    236        nextcolsum_1l = vec_add(this0l, this_1l);
    237        nextcolsum_1h = vec_add(this0h, this_1h);
    238        p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
    239        p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index);
    240        /* The same against the row below. */
    241        this1 = vec_ld(16, inptr1);
    242        this1l = (__vector short)VEC_UNPACKHU(this1);
    243        this1h = (__vector short)VEC_UNPACKLU(this1);
    244        nextcolsum1l = vec_add(this0l, this1l);
    245        nextcolsum1h = vec_add(this0h, this1h);
    246        p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
    247        p_nextcolsum1h = vec_perm(thiscolsum1h, nextcolsum1l, next_index);
    248      }
    249 
    250      /* Process the upper row (sums taken against the row above, stored through outptr0) */
    251      /* Even outputs = (3 * sum + left sum + 8) >> 4. */
    252      tmpl = vec_mladd(thiscolsum_1l, pw_three, pw_zero);
    253      outle = vec_add(tmpl, p_lastcolsum_1l);
    254      outle = vec_add(outle, pw_eight);
    255      outle = vec_sr(outle, pw_four);
    256      /* Odd outputs = (3 * sum + right sum + 7) >> 4. */
    257      outlo = vec_add(tmpl, p_nextcolsum_1l);
    258      outlo = vec_add(outlo, pw_seven);
    259      outlo = vec_sr(outlo, pw_four);
    260      /* Interleave one byte per lane from the even and odd results and store the first 16 output bytes. */
    261      out = vec_perm((__vector unsigned char)outle,
    262                     (__vector unsigned char)outlo, merge_pack_index);
    263      vec_st(out, 0, outptr0);
    264      /* The second 16 output bytes are computed and stored only when more than 8 input samples remain. */
    265      if (incol > 8) {
    266        tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero);
    267        outhe = vec_add(tmph, p_lastcolsum_1h);
    268        outhe = vec_add(outhe, pw_eight);
    269        outhe = vec_sr(outhe, pw_four);
    270 
    271        outho = vec_add(tmph, p_nextcolsum_1h);
    272        outho = vec_add(outho, pw_seven);
    273        outho = vec_sr(outho, pw_four);
    274 
    275        out = vec_perm((__vector unsigned char)outhe,
    276                       (__vector unsigned char)outho, merge_pack_index);
    277        vec_st(out, 16, outptr0);
    278      }
    279 
    280      /* Process the lower row (sums taken against the row below, stored through outptr1) */
    281      /* Same arithmetic as for the upper row. */
    282      tmpl = vec_mladd(thiscolsum1l, pw_three, pw_zero);
    283      outle = vec_add(tmpl, p_lastcolsum1l);
    284      outle = vec_add(outle, pw_eight);
    285      outle = vec_sr(outle, pw_four);
    286 
    287      outlo = vec_add(tmpl, p_nextcolsum1l);
    288      outlo = vec_add(outlo, pw_seven);
    289      outlo = vec_sr(outlo, pw_four);
    290 
    291      out = vec_perm((__vector unsigned char)outle,
    292                     (__vector unsigned char)outlo, merge_pack_index);
    293      vec_st(out, 0, outptr1);
    294 
    295      if (incol > 8) {
    296        tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero);
    297        outhe = vec_add(tmph, p_lastcolsum1h);
    298        outhe = vec_add(outhe, pw_eight);
    299        outhe = vec_sr(outhe, pw_four);
    300 
    301        outho = vec_add(tmph, p_nextcolsum1h);
    302        outho = vec_add(outho, pw_seven);
    303        outho = vec_sr(outho, pw_four);
    304 
    305        out = vec_perm((__vector unsigned char)outhe,
    306                       (__vector unsigned char)outho, merge_pack_index);
    307        vec_st(out, 16, outptr1);
    308      }
    309      /* Advance: the following group's sums become current (when nothing was loaded on this pass they are unused, because the loop ends). */
    310      thiscolsum_1l = nextcolsum_1l;  thiscolsum_1h = nextcolsum_1h;
    311      thiscolsum1l = nextcolsum1l;  thiscolsum1h = nextcolsum1h;
    312    }
    313  }
    314 }
    315 
    316 
    317 /* These are rarely used (mainly just for decompressing YCCK images) */
    318 /* Plain h2v1 upsampling: every input sample is written twice, side by side, with no filtering. */
    319 void jsimd_h2v1_upsample_altivec(int max_v_samp_factor,
    320                                 JDIMENSION output_width,
    321                                 JSAMPARRAY input_data,
    322                                 JSAMPARRAY *output_data_ptr)
    323 {
    324  JSAMPARRAY dst_rows = *output_data_ptr;
    325  __vector unsigned char samples, dup_first, dup_second;
    326  JSAMPROW src, dst;
    327  int row, remaining;
    328 
    329  for (row = 0; row < max_v_samp_factor; row++) {
    330    src = input_data[row];
    331    dst = dst_rows[row];
    332    /* Output columns still to write, rounded up to a multiple of 32. */
    333    remaining = (output_width + 31) & (~31);
    334 
    335    while (remaining > 0) {
    336      /* Input bytes 0..15 become output bytes 0..31. */
    337      samples = vec_ld(0, src);
    338      dup_first = vec_mergeh(samples, samples);
    339      dup_second = vec_mergel(samples, samples);
    340      vec_st(dup_first, 0, dst);
    341      vec_st(dup_second, 16, dst);
    342 
    343      /* Input bytes 16..31 become output bytes 32..63, but only if still needed. */
    344      if (remaining > 32) {
    345        samples = vec_ld(16, src);
    346        dup_first = vec_mergeh(samples, samples);
    347        dup_second = vec_mergel(samples, samples);
    348        vec_st(dup_first, 32, dst);
    349        vec_st(dup_second, 48, dst);
    350      }
    351      remaining -= 64;  src += 32;  dst += 64;
    352    }
    353  }
    354 }
    355 
    356 /* Plain h2v2 upsampling: every input sample is written twice side by side, and each widened row is stored into two consecutive output rows. */
    357 void jsimd_h2v2_upsample_altivec(int max_v_samp_factor,
    358                                 JDIMENSION output_width,
    359                                 JSAMPARRAY input_data,
    360                                 JSAMPARRAY *output_data_ptr)
    361 {
    362  JSAMPARRAY dst_rows = *output_data_ptr;
    363  __vector unsigned char samples, dup_first, dup_second;
    364  JSAMPROW src, upper, lower;
    365  int src_row, dst_row, remaining;
    366 
    367  /* One input row feeds two consecutive output rows. */
    368  for (src_row = 0, dst_row = 0; dst_row < max_v_samp_factor;
    369       src_row++, dst_row += 2) {
    370    src = input_data[src_row];
    371    upper = dst_rows[dst_row];
    372    lower = dst_rows[dst_row + 1];
    373 
    374    /* Output columns still to write, rounded up to a multiple of 32. */
    375    remaining = (output_width + 31) & (~31);
    376 
    377    while (remaining > 0) {
    378      /* Input bytes 0..15 become output bytes 0..31 of both rows. */
    379      samples = vec_ld(0, src);
    380      dup_first = vec_mergeh(samples, samples);
    381      dup_second = vec_mergel(samples, samples);
    382      vec_st(dup_first, 0, upper);
    383      vec_st(dup_first, 0, lower);
    384      vec_st(dup_second, 16, upper);
    385      vec_st(dup_second, 16, lower);
    386 
    387      /* Input bytes 16..31 become output bytes 32..63, but only if still needed. */
    388      if (remaining > 32) {
    389        samples = vec_ld(16, src);
    390        dup_first = vec_mergeh(samples, samples);
    391        dup_second = vec_mergel(samples, samples);
    392        vec_st(dup_first, 32, upper);
    393        vec_st(dup_first, 32, lower);
    394        vec_st(dup_second, 48, upper);
    395        vec_st(dup_second, 48, lower);
    396      }
    397      remaining -= 64;  src += 32;  upper += 64;  lower += 64;
    398    }
    399  }
    400 }