tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jquanti-altivec.c (8121B)


      1 /*
      2 * AltiVec optimizations for libjpeg-turbo
      3 *
      4 * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
      5 *
      6 * This software is provided 'as-is', without any express or implied
      7 * warranty.  In no event will the authors be held liable for any damages
      8 * arising from the use of this software.
      9 *
     10 * Permission is granted to anyone to use this software for any purpose,
     11 * including commercial applications, and to alter it and redistribute it
     12 * freely, subject to the following restrictions:
     13 *
     14 * 1. The origin of this software must not be misrepresented; you must not
     15 *    claim that you wrote the original software. If you use this software
     16 *    in a product, an acknowledgment in the product documentation would be
     17 *    appreciated but is not required.
     18 * 2. Altered source versions must be plainly marked as such, and must not be
     19 *    misrepresented as being the original software.
     20 * 3. This notice may not be removed or altered from any source distribution.
     21 */
     22 
     23 /* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
     24 
     25 #include "jsimd_altivec.h"
     26 
     27 
     28 /* NOTE: The address will either be aligned or offset by 8 bytes, so we can
     29 * always get the data we want by using a single vector load (although we may
     30 * have to permute the result.)
     31 */
     32 #if __BIG_ENDIAN__
     33 
     34 #define LOAD_ROW(row) { \
     35  elemptr = sample_data[row] + start_col; \
     36  in##row = vec_ld(0, elemptr); \
     37  if ((size_t)elemptr & 15) \
     38    in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr)); \
     39 }
     40 
     41 #else
     42 
     43 #define LOAD_ROW(row) { \
     44  elemptr = sample_data[row] + start_col; \
     45  in##row = vec_vsx_ld(0, elemptr); \
     46 }
     47 
     48 #endif
     49 
     50 
/* Convert an 8x8 block of JSAMPLEs (unsigned bytes) into DCTELEMs (signed
 * 16-bit words) and center them around zero by subtracting CENTERJSAMPLE
 * from each sample.
 *
 * sample_data - array of row pointers into the source image
 * start_col   - column offset of the 8x8 block within each row
 * workspace   - output: 64 DCTELEMs (8 vectors of 8 shorts), assumed
 *               16-byte aligned (stored with vec_st)
 */
void jsimd_convsamp_altivec(JSAMPARRAY sample_data, JDIMENSION start_col,
                           DCTELEM *workspace)
{
 JSAMPROW elemptr;

 __vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7;
 __vector short out0, out1, out2, out3, out4, out5, out6, out7;

 /* Constants */
 __vector short pw_centerjsamp = { __8X(CENTERJSAMPLE) };
 /* NOTE(review): pb_zero is not referenced directly below; presumably the
  * VEC_UNPACKHU macro (from jsimd_altivec.h) expands to a merge with it on
  * some platforms -- confirm against the header before removing. */
 __vector unsigned char pb_zero = { __16X(0) };

 /* Load each of the 8 rows into a byte vector (see LOAD_ROW above). */
 LOAD_ROW(0);
 LOAD_ROW(1);
 LOAD_ROW(2);
 LOAD_ROW(3);
 LOAD_ROW(4);
 LOAD_ROW(5);
 LOAD_ROW(6);
 LOAD_ROW(7);

 /* Widen the 8 high bytes of each vector to unsigned 16-bit words
  * (zero-extension -- project macro, assumed; TODO confirm in
  * jsimd_altivec.h). */
 out0 = (__vector short)VEC_UNPACKHU(in0);
 out1 = (__vector short)VEC_UNPACKHU(in1);
 out2 = (__vector short)VEC_UNPACKHU(in2);
 out3 = (__vector short)VEC_UNPACKHU(in3);
 out4 = (__vector short)VEC_UNPACKHU(in4);
 out5 = (__vector short)VEC_UNPACKHU(in5);
 out6 = (__vector short)VEC_UNPACKHU(in6);
 out7 = (__vector short)VEC_UNPACKHU(in7);

 /* Level-shift: sample - CENTERJSAMPLE, so the DCT input is signed and
  * centered on zero. */
 out0 = vec_sub(out0, pw_centerjsamp);
 out1 = vec_sub(out1, pw_centerjsamp);
 out2 = vec_sub(out2, pw_centerjsamp);
 out3 = vec_sub(out3, pw_centerjsamp);
 out4 = vec_sub(out4, pw_centerjsamp);
 out5 = vec_sub(out5, pw_centerjsamp);
 out6 = vec_sub(out6, pw_centerjsamp);
 out7 = vec_sub(out7, pw_centerjsamp);

 /* Store the 8 result rows contiguously (16 bytes per row). */
 vec_st(out0, 0, workspace);
 vec_st(out1, 16, workspace);
 vec_st(out2, 32, workspace);
 vec_st(out3, 48, workspace);
 vec_st(out4, 64, workspace);
 vec_st(out5, 80, workspace);
 vec_st(out6, 96, workspace);
 vec_st(out7, 112, workspace);
}
     99 
    100 
    101 #define WORD_BIT  16
    102 
    103 /* There is no AltiVec 16-bit unsigned multiply instruction, hence this.
    104   We basically need an unsigned equivalent of vec_madds(). */
    105 
    106 #define MULTIPLY(vs0, vs1, out) { \
    107  tmpe = vec_mule((__vector unsigned short)vs0, \
    108                  (__vector unsigned short)vs1); \
    109  tmpo = vec_mulo((__vector unsigned short)vs0, \
    110                  (__vector unsigned short)vs1); \
    111  out = (__vector short)vec_perm((__vector unsigned short)tmpe, \
    112                                 (__vector unsigned short)tmpo, \
    113                                 shift_pack_index); \
    114 }
    115 
/* Quantize an 8x8 block of DCT coefficients using precomputed reciprocals.
 *
 * coef_block - output: 64 quantized JCOEFs (16-byte-aligned stores)
 * divisors   - table of 4 x 64 DCTELEMs laid out as (byte offsets used
 *              below): reciprocals at 0, corrections at DCTSIZE2*2,
 *              scales at DCTSIZE2*4 (a fourth section, presumably shifts,
 *              is not used by this implementation -- see the scalar/SIMD
 *              divisor layout in jcdctmgr.c; verify there)
 * workspace  - input: 64 DCTELEMs from the forward DCT (16-byte aligned)
 *
 * Per coefficient this computes, branch-free:
 *   sign = coef >> 15;  abs = (coef ^ sign) - sign;       (absolute value)
 *   q = ((abs + corr) * recip >> 16) * scale >> 16;       (via MULTIPLY)
 *   result = (q ^ sign) - sign;                           (restore sign)
 */
void jsimd_quantize_altivec(JCOEFPTR coef_block, DCTELEM *divisors,
                           DCTELEM *workspace)
{
 __vector short row0, row1, row2, row3, row4, row5, row6, row7,
   row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s,
   corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7,
   recip0, recip1, recip2, recip3, recip4, recip5, recip6, recip7,
   scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7;
 __vector unsigned int tmpe, tmpo;  /* scratch for MULTIPLY() */

 /* Constants */
 /* Shift count 15 (= WORD_BIT - 1) used to smear each lane's sign bit. */
 __vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) };
 /* Permute index for MULTIPLY(): selects the high 16 bits of each 32-bit
  * product from the even (tmpe) and odd (tmpo) lanes, in lane order. */
#if __BIG_ENDIAN__
 __vector unsigned char shift_pack_index =
   {  0,  1, 16, 17,  4,  5, 20, 21,  8,  9, 24, 25, 12, 13, 28, 29 };
#else
 __vector unsigned char shift_pack_index =
   {  2,  3, 18, 19,  6,  7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31 };
#endif

 /* Load the 8 rows of DCT coefficients produced by the forward DCT. */
 row0 = vec_ld(0, workspace);
 row1 = vec_ld(16, workspace);
 row2 = vec_ld(32, workspace);
 row3 = vec_ld(48, workspace);
 row4 = vec_ld(64, workspace);
 row5 = vec_ld(80, workspace);
 row6 = vec_ld(96, workspace);
 row7 = vec_ld(112, workspace);

 /* Branch-less absolute value:
  * rowNs = rowN >> 15 (arithmetic) = 0x0000 or 0xFFFF sign mask per lane;
  * abs(x) = (x ^ mask) - mask.  The masks are kept to restore the sign
  * after quantization. */
 row0s = vec_sra(row0, pw_word_bit_m1);
 row1s = vec_sra(row1, pw_word_bit_m1);
 row2s = vec_sra(row2, pw_word_bit_m1);
 row3s = vec_sra(row3, pw_word_bit_m1);
 row4s = vec_sra(row4, pw_word_bit_m1);
 row5s = vec_sra(row5, pw_word_bit_m1);
 row6s = vec_sra(row6, pw_word_bit_m1);
 row7s = vec_sra(row7, pw_word_bit_m1);
 row0 = vec_xor(row0, row0s);
 row1 = vec_xor(row1, row1s);
 row2 = vec_xor(row2, row2s);
 row3 = vec_xor(row3, row3s);
 row4 = vec_xor(row4, row4s);
 row5 = vec_xor(row5, row5s);
 row6 = vec_xor(row6, row6s);
 row7 = vec_xor(row7, row7s);
 row0 = vec_sub(row0, row0s);
 row1 = vec_sub(row1, row1s);
 row2 = vec_sub(row2, row2s);
 row3 = vec_sub(row3, row3s);
 row4 = vec_sub(row4, row4s);
 row5 = vec_sub(row5, row5s);
 row6 = vec_sub(row6, row6s);
 row7 = vec_sub(row7, row7s);

 /* Pre-multiply correction terms (second section of the divisor table,
  * starting DCTSIZE2 elements = DCTSIZE2*2 bytes in). */
 corr0 = vec_ld(DCTSIZE2 * 2, divisors);
 corr1 = vec_ld(DCTSIZE2 * 2 + 16, divisors);
 corr2 = vec_ld(DCTSIZE2 * 2 + 32, divisors);
 corr3 = vec_ld(DCTSIZE2 * 2 + 48, divisors);
 corr4 = vec_ld(DCTSIZE2 * 2 + 64, divisors);
 corr5 = vec_ld(DCTSIZE2 * 2 + 80, divisors);
 corr6 = vec_ld(DCTSIZE2 * 2 + 96, divisors);
 corr7 = vec_ld(DCTSIZE2 * 2 + 112, divisors);

 row0 = vec_add(row0, corr0);
 row1 = vec_add(row1, corr1);
 row2 = vec_add(row2, corr2);
 row3 = vec_add(row3, corr3);
 row4 = vec_add(row4, corr4);
 row5 = vec_add(row5, corr5);
 row6 = vec_add(row6, corr6);
 row7 = vec_add(row7, corr7);

 /* Reciprocals (first section of the divisor table). */
 recip0 = vec_ld(0, divisors);
 recip1 = vec_ld(16, divisors);
 recip2 = vec_ld(32, divisors);
 recip3 = vec_ld(48, divisors);
 recip4 = vec_ld(64, divisors);
 recip5 = vec_ld(80, divisors);
 recip6 = vec_ld(96, divisors);
 recip7 = vec_ld(112, divisors);

 /* row = (row * recip) >> 16  (unsigned multiply-high; see MULTIPLY) */
 MULTIPLY(row0, recip0, row0);
 MULTIPLY(row1, recip1, row1);
 MULTIPLY(row2, recip2, row2);
 MULTIPLY(row3, recip3, row3);
 MULTIPLY(row4, recip4, row4);
 MULTIPLY(row5, recip5, row5);
 MULTIPLY(row6, recip6, row6);
 MULTIPLY(row7, recip7, row7);

 /* Scale factors (third section of the divisor table). */
 scale0 = vec_ld(DCTSIZE2 * 4, divisors);
 scale1 = vec_ld(DCTSIZE2 * 4 + 16, divisors);
 scale2 = vec_ld(DCTSIZE2 * 4 + 32, divisors);
 scale3 = vec_ld(DCTSIZE2 * 4 + 48, divisors);
 scale4 = vec_ld(DCTSIZE2 * 4 + 64, divisors);
 scale5 = vec_ld(DCTSIZE2 * 4 + 80, divisors);
 scale6 = vec_ld(DCTSIZE2 * 4 + 96, divisors);
 scale7 = vec_ld(DCTSIZE2 * 4 + 112, divisors);

 /* row = (row * scale) >> 16 */
 MULTIPLY(row0, scale0, row0);
 MULTIPLY(row1, scale1, row1);
 MULTIPLY(row2, scale2, row2);
 MULTIPLY(row3, scale3, row3);
 MULTIPLY(row4, scale4, row4);
 MULTIPLY(row5, scale5, row5);
 MULTIPLY(row6, scale6, row6);
 MULTIPLY(row7, scale7, row7);

 /* Restore the original signs: (q ^ mask) - mask negates lanes whose
  * input coefficient was negative, and is a no-op for the rest. */
 row0 = vec_xor(row0, row0s);
 row1 = vec_xor(row1, row1s);
 row2 = vec_xor(row2, row2s);
 row3 = vec_xor(row3, row3s);
 row4 = vec_xor(row4, row4s);
 row5 = vec_xor(row5, row5s);
 row6 = vec_xor(row6, row6s);
 row7 = vec_xor(row7, row7s);
 row0 = vec_sub(row0, row0s);
 row1 = vec_sub(row1, row1s);
 row2 = vec_sub(row2, row2s);
 row3 = vec_sub(row3, row3s);
 row4 = vec_sub(row4, row4s);
 row5 = vec_sub(row5, row5s);
 row6 = vec_sub(row6, row6s);
 row7 = vec_sub(row7, row7s);

 /* Store the quantized coefficients. */
 vec_st(row0, 0, coef_block);
 vec_st(row1, 16, coef_block);
 vec_st(row2, 32, coef_block);
 vec_st(row3, 48, coef_block);
 vec_st(row4, 64, coef_block);
 vec_st(row5, 80, coef_block);
 vec_st(row6, 96, coef_block);
 vec_st(row7, 112, coef_block);
}