tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jidctfst-altivec.c (8109B)


      1 /*
      2 * AltiVec optimizations for libjpeg-turbo
      3 *
      4 * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
      5 *
      6 * This software is provided 'as-is', without any express or implied
      7 * warranty.  In no event will the authors be held liable for any damages
      8 * arising from the use of this software.
      9 *
     10 * Permission is granted to anyone to use this software for any purpose,
     11 * including commercial applications, and to alter it and redistribute it
     12 * freely, subject to the following restrictions:
     13 *
     14 * 1. The origin of this software must not be misrepresented; you must not
     15 *    claim that you wrote the original software. If you use this software
     16 *    in a product, an acknowledgment in the product documentation would be
     17 *    appreciated but is not required.
     18 * 2. Altered source versions must be plainly marked as such, and must not be
     19 *    misrepresented as being the original software.
     20 * 3. This notice may not be removed or altered from any source distribution.
     21 */
     22 
     23 /* FAST INTEGER INVERSE DCT
     24 *
     25 * This is similar to the SSE2 implementation, except that we left-shift the
     26 * constants by 1 less bit (the -1 in CONST_SHIFT.)  This is because
     27 * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
     28 *   the elements in arg3 + the most significant 17 bits of
     29 *     (the elements in arg1 * the elements in arg2).
     30 */
     31 
     32 #include "jsimd_altivec.h"
     33 
     34 
     35 #define F_1_082  277              /* FIX(1.082392200) */
     36 #define F_1_414  362              /* FIX(1.414213562) */
     37 #define F_1_847  473              /* FIX(1.847759065) */
     38 #define F_2_613  669              /* FIX(2.613125930) */
     39 #define F_1_613  (F_2_613 - 256)  /* FIX(2.613125930) - FIX(1) */
     40 
     41 #define CONST_BITS  8
     42 #define PASS1_BITS  2
     43 #define PRE_MULTIPLY_SCALE_BITS  2
     44 #define CONST_SHIFT  (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
     45 
     46 
     47 #define DO_IDCT(in) { \
     48  /* Even part */ \
     49  \
     50  tmp10 = vec_add(in##0, in##4); \
     51  tmp11 = vec_sub(in##0, in##4); \
     52  tmp13 = vec_add(in##2, in##6); \
     53  \
     54  tmp12 = vec_sub(in##2, in##6); \
     55  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
     56  tmp12 = vec_madds(tmp12, pw_F1414, pw_zero); \
     57  tmp12 = vec_sub(tmp12, tmp13); \
     58  \
     59  tmp0 = vec_add(tmp10, tmp13); \
     60  tmp3 = vec_sub(tmp10, tmp13); \
     61  tmp1 = vec_add(tmp11, tmp12); \
     62  tmp2 = vec_sub(tmp11, tmp12); \
     63  \
     64  /* Odd part */ \
     65  \
     66  z13 = vec_add(in##5, in##3); \
     67  z10 = vec_sub(in##5, in##3); \
     68  z10s = vec_sl(z10, pre_multiply_scale_bits); \
     69  z11 = vec_add(in##1, in##7); \
     70  z12s = vec_sub(in##1, in##7); \
     71  z12s = vec_sl(z12s, pre_multiply_scale_bits); \
     72  \
     73  tmp11 = vec_sub(z11, z13); \
     74  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
     75  tmp11 = vec_madds(tmp11, pw_F1414, pw_zero); \
     76  \
     77  tmp7 = vec_add(z11, z13); \
     78  \
     79  /* To avoid overflow... \
     80   * \
     81   * (Original) \
     82   * tmp12 = -2.613125930 * z10 + z5; \
     83   * \
     84   * (This implementation) \
     85   * tmp12 = (-1.613125930 - 1) * z10 + z5; \
     86   *       = -1.613125930 * z10 - z10 + z5; \
     87   */ \
     88  \
     89  z5 = vec_add(z10s, z12s); \
     90  z5 = vec_madds(z5, pw_F1847, pw_zero); \
     91  \
     92  tmp10 = vec_madds(z12s, pw_F1082, pw_zero); \
     93  tmp10 = vec_sub(tmp10, z5); \
     94  tmp12 = vec_madds(z10s, pw_MF1613, z5); \
     95  tmp12 = vec_sub(tmp12, z10); \
     96  \
     97  tmp6 = vec_sub(tmp12, tmp7); \
     98  tmp5 = vec_sub(tmp11, tmp6); \
     99  tmp4 = vec_add(tmp10, tmp5); \
    100  \
    101  out0 = vec_add(tmp0, tmp7); \
    102  out1 = vec_add(tmp1, tmp6); \
    103  out2 = vec_add(tmp2, tmp5); \
    104  out3 = vec_sub(tmp3, tmp4); \
    105  out4 = vec_add(tmp3, tmp4); \
    106  out5 = vec_sub(tmp2, tmp5); \
    107  out6 = vec_sub(tmp1, tmp6); \
    108  out7 = vec_sub(tmp0, tmp7); \
    109 }
    110 
    111 
    112 void jsimd_idct_ifast_altivec(void *dct_table_, JCOEFPTR coef_block,
    113                              JSAMPARRAY output_buf, JDIMENSION output_col)
    114 {
    115  short *dct_table = (short *)dct_table_;
    116  int *outptr;
    117 
    118  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    119    col0, col1, col2, col3, col4, col5, col6, col7,
    120    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
    121    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
    122    z5, z10, z10s, z11, z12s, z13,
    123    out0, out1, out2, out3, out4, out5, out6, out7;
    124  __vector signed char outb;
    125 
    126  /* Constants */
    127  __vector short pw_zero = { __8X(0) },
    128    pw_F1414 = { __8X(F_1_414 << CONST_SHIFT) },
    129    pw_F1847 = { __8X(F_1_847 << CONST_SHIFT) },
    130    pw_MF1613 = { __8X(-F_1_613 << CONST_SHIFT) },
    131    pw_F1082 = { __8X(F_1_082 << CONST_SHIFT) };
    132  __vector unsigned short
    133    pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) },
    134    pass1_bits3 = { __8X(PASS1_BITS + 3) };
    135  __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };
    136 
    137  /* Pass 1: process columns */
    138 
    139  col0 = vec_ld(0, coef_block);
    140  col1 = vec_ld(16, coef_block);
    141  col2 = vec_ld(32, coef_block);
    142  col3 = vec_ld(48, coef_block);
    143  col4 = vec_ld(64, coef_block);
    144  col5 = vec_ld(80, coef_block);
    145  col6 = vec_ld(96, coef_block);
    146  col7 = vec_ld(112, coef_block);
    147 
    148  tmp1 = vec_or(col1, col2);
    149  tmp2 = vec_or(col3, col4);
    150  tmp1 = vec_or(tmp1, tmp2);
    151  tmp3 = vec_or(col5, col6);
    152  tmp3 = vec_or(tmp3, col7);
    153  tmp1 = vec_or(tmp1, tmp3);
    154 
    155  quant0 = vec_ld(0, dct_table);
    156  col0 = vec_mladd(col0, quant0, pw_zero);
    157 
    158  if (vec_all_eq(tmp1, pw_zero)) {
    159    /* AC terms all zero */
    160 
    161    row0 = vec_splat(col0, 0);
    162    row1 = vec_splat(col0, 1);
    163    row2 = vec_splat(col0, 2);
    164    row3 = vec_splat(col0, 3);
    165    row4 = vec_splat(col0, 4);
    166    row5 = vec_splat(col0, 5);
    167    row6 = vec_splat(col0, 6);
    168    row7 = vec_splat(col0, 7);
    169 
    170  } else {
    171 
    172    quant1 = vec_ld(16, dct_table);
    173    quant2 = vec_ld(32, dct_table);
    174    quant3 = vec_ld(48, dct_table);
    175    quant4 = vec_ld(64, dct_table);
    176    quant5 = vec_ld(80, dct_table);
    177    quant6 = vec_ld(96, dct_table);
    178    quant7 = vec_ld(112, dct_table);
    179 
    180    col1 = vec_mladd(col1, quant1, pw_zero);
    181    col2 = vec_mladd(col2, quant2, pw_zero);
    182    col3 = vec_mladd(col3, quant3, pw_zero);
    183    col4 = vec_mladd(col4, quant4, pw_zero);
    184    col5 = vec_mladd(col5, quant5, pw_zero);
    185    col6 = vec_mladd(col6, quant6, pw_zero);
    186    col7 = vec_mladd(col7, quant7, pw_zero);
    187 
    188    DO_IDCT(col);
    189 
    190    TRANSPOSE(out, row);
    191  }
    192 
    193  /* Pass 2: process rows */
    194 
    195  DO_IDCT(row);
    196 
    197  out0 = vec_sra(out0, pass1_bits3);
    198  out1 = vec_sra(out1, pass1_bits3);
    199  out2 = vec_sra(out2, pass1_bits3);
    200  out3 = vec_sra(out3, pass1_bits3);
    201  out4 = vec_sra(out4, pass1_bits3);
    202  out5 = vec_sra(out5, pass1_bits3);
    203  out6 = vec_sra(out6, pass1_bits3);
    204  out7 = vec_sra(out7, pass1_bits3);
    205 
    206  TRANSPOSE(out, col);
    207 
    208  outb = vec_packs(col0, col0);
    209  outb = vec_add(outb, pb_centerjsamp);
    210  outptr = (int *)(output_buf[0] + output_col);
    211  vec_ste((__vector int)outb, 0, outptr);
    212  vec_ste((__vector int)outb, 4, outptr);
    213 
    214  outb = vec_packs(col1, col1);
    215  outb = vec_add(outb, pb_centerjsamp);
    216  outptr = (int *)(output_buf[1] + output_col);
    217  vec_ste((__vector int)outb, 0, outptr);
    218  vec_ste((__vector int)outb, 4, outptr);
    219 
    220  outb = vec_packs(col2, col2);
    221  outb = vec_add(outb, pb_centerjsamp);
    222  outptr = (int *)(output_buf[2] + output_col);
    223  vec_ste((__vector int)outb, 0, outptr);
    224  vec_ste((__vector int)outb, 4, outptr);
    225 
    226  outb = vec_packs(col3, col3);
    227  outb = vec_add(outb, pb_centerjsamp);
    228  outptr = (int *)(output_buf[3] + output_col);
    229  vec_ste((__vector int)outb, 0, outptr);
    230  vec_ste((__vector int)outb, 4, outptr);
    231 
    232  outb = vec_packs(col4, col4);
    233  outb = vec_add(outb, pb_centerjsamp);
    234  outptr = (int *)(output_buf[4] + output_col);
    235  vec_ste((__vector int)outb, 0, outptr);
    236  vec_ste((__vector int)outb, 4, outptr);
    237 
    238  outb = vec_packs(col5, col5);
    239  outb = vec_add(outb, pb_centerjsamp);
    240  outptr = (int *)(output_buf[5] + output_col);
    241  vec_ste((__vector int)outb, 0, outptr);
    242  vec_ste((__vector int)outb, 4, outptr);
    243 
    244  outb = vec_packs(col6, col6);
    245  outb = vec_add(outb, pb_centerjsamp);
    246  outptr = (int *)(output_buf[6] + output_col);
    247  vec_ste((__vector int)outb, 0, outptr);
    248  vec_ste((__vector int)outb, 4, outptr);
    249 
    250  outb = vec_packs(col7, col7);
    251  outb = vec_add(outb, pb_centerjsamp);
    252  outptr = (int *)(output_buf[7] + output_col);
    253  vec_ste((__vector int)outb, 0, outptr);
    254  vec_ste((__vector int)outb, 4, outptr);
    255 }