tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jfdctfst-neon.c (7969B)


      1 /*
      2 * jfdctfst-neon.c - fast integer FDCT (Arm Neon)
      3 *
      4 * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
      5 *
      6 * This software is provided 'as-is', without any express or implied
      7 * warranty.  In no event will the authors be held liable for any damages
      8 * arising from the use of this software.
      9 *
     10 * Permission is granted to anyone to use this software for any purpose,
     11 * including commercial applications, and to alter it and redistribute it
     12 * freely, subject to the following restrictions:
     13 *
     14 * 1. The origin of this software must not be misrepresented; you must not
     15 *    claim that you wrote the original software. If you use this software
     16 *    in a product, an acknowledgment in the product documentation would be
     17 *    appreciated but is not required.
     18 * 2. Altered source versions must be plainly marked as such, and must not be
     19 *    misrepresented as being the original software.
     20 * 3. This notice may not be removed or altered from any source distribution.
     21 */
     22 
     23 #define JPEG_INTERNALS
     24 #include "../../jinclude.h"
     25 #include "../../jpeglib.h"
     26 #include "../../jsimd.h"
     27 #include "../../jdct.h"
     28 #include "../../jsimddct.h"
     29 #include "../jsimd.h"
     30 #include "align.h"
     31 
     32 #include <arm_neon.h>
     33 
     34 
     35 /* jsimd_fdct_ifast_neon() performs a fast, not so accurate forward DCT
     36 * (Discrete Cosine Transform) on one block of samples.  It uses the same
     37 * calculations and produces exactly the same output as IJG's original
     38 * jpeg_fdct_ifast() function, which can be found in jfdctfst.c.
     39 *
     40 * Scaled integer constants are used to avoid floating-point arithmetic:
     41 *    0.382683433 = 12544 * 2^-15
      42 *    0.541196100 = 17792 * 2^-15
     43 *    0.707106781 = 23168 * 2^-15
     44 *    0.306562965 =  9984 * 2^-15
     45 *
     46 * See jfdctfst.c for further details of the DCT algorithm.  Where possible,
     47 * the variable names and comments here in jsimd_fdct_ifast_neon() match up
     48 * with those in jpeg_fdct_ifast().
     49 */
     50 
/* DCT rotation constants, scaled by 2^15 so they can be applied with
 * vqdmulhq_lane_s16(), which computes roughly (a * b) >> 15 via a saturating
 * doubling multiply-high.  Values approximate (constant * 32768); see the
 * header comment above for the corresponding real-valued factors.
 * NOTE(review): F_0_541 is 17792, not round(0.541196100 * 32768) == 17734 —
 * presumably chosen to reproduce jpeg_fdct_ifast()'s exact integer results;
 * do not "correct" it without bit-exact comparison against jfdctfst.c.
 */
#define F_0_382  12544
#define F_0_541  17792
#define F_0_707  23168
#define F_0_306  9984


/* Constant table kept 16-byte aligned for efficient vld1 access.  Element
 * order matters: the vqdmulhq_lane_s16() calls in jsimd_fdct_ifast_neon()
 * select lanes 0-3 as F_0_382, F_0_541, F_0_707 and F_0_306 respectively. */
ALIGN(16) static const int16_t jsimd_fdct_ifast_neon_consts[] = {
  F_0_382, F_0_541, F_0_707, F_0_306
};
     60 
/* Perform a fast (reduced-accuracy) forward DCT on one 8x8 block of samples,
 * in place.  `data` points to 64 DCTELEM values in row-major order; on return
 * it holds the (scaled) DCT coefficients.  Bit-exact with jpeg_fdct_ifast()
 * in jfdctfst.c — the phase comments below mirror that function. */
void jsimd_fdct_ifast_neon(DCTELEM *data)
{
  /* Load an 8x8 block of samples into Neon registers.  De-interleaving loads
   * are used, followed by vuzp to transpose the block such that we have a
   * column of samples per vector - allowing all rows to be processed at once.
   */
  int16x8x4_t data1 = vld4q_s16(data);
  int16x8x4_t data2 = vld4q_s16(data + 4 * DCTSIZE);

  /* vld4 interleaved by 4, so unzipping pairs (data1.val[i], data2.val[i])
   * yields original columns i and i+4 of the block. */
  int16x8x2_t cols_04 = vuzpq_s16(data1.val[0], data2.val[0]);
  int16x8x2_t cols_15 = vuzpq_s16(data1.val[1], data2.val[1]);
  int16x8x2_t cols_26 = vuzpq_s16(data1.val[2], data2.val[2]);
  int16x8x2_t cols_37 = vuzpq_s16(data1.val[3], data2.val[3]);

  int16x8_t col0 = cols_04.val[0];
  int16x8_t col1 = cols_15.val[0];
  int16x8_t col2 = cols_26.val[0];
  int16x8_t col3 = cols_37.val[0];
  int16x8_t col4 = cols_04.val[1];
  int16x8_t col5 = cols_15.val[1];
  int16x8_t col6 = cols_26.val[1];
  int16x8_t col7 = cols_37.val[1];

  /* Pass 1: process rows.  (The block is currently transposed, so operating
   * on "columns" here processes all eight original rows in parallel.) */

  /* Load DCT conversion constants; lanes 0-3 = F_0_382, F_0_541, F_0_707,
   * F_0_306.  vqdmulhq_lane_s16(x, consts, k) ~= (x * consts[k]) >> 15. */
  const int16x4_t consts = vld1_s16(jsimd_fdct_ifast_neon_consts);

  /* Butterfly: sums feed the even part, differences feed the odd part. */
  int16x8_t tmp0 = vaddq_s16(col0, col7);
  int16x8_t tmp7 = vsubq_s16(col0, col7);
  int16x8_t tmp1 = vaddq_s16(col1, col6);
  int16x8_t tmp6 = vsubq_s16(col1, col6);
  int16x8_t tmp2 = vaddq_s16(col2, col5);
  int16x8_t tmp5 = vsubq_s16(col2, col5);
  int16x8_t tmp3 = vaddq_s16(col3, col4);
  int16x8_t tmp4 = vsubq_s16(col3, col4);

  /* Even part */
  int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);    /* phase 2 */
  int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
  int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
  int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);

  col0 = vaddq_s16(tmp10, tmp11);             /* phase 3 */
  col4 = vsubq_s16(tmp10, tmp11);

  /* z1 = (tmp12 + tmp13) * 0.707106781 */
  int16x8_t z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
  col2 = vaddq_s16(tmp13, z1);                /* phase 5 */
  col6 = vsubq_s16(tmp13, z1);

  /* Odd part */
  tmp10 = vaddq_s16(tmp4, tmp5);              /* phase 2 */
  tmp11 = vaddq_s16(tmp5, tmp6);
  tmp12 = vaddq_s16(tmp6, tmp7);

  /* Rotations: z5 = (tmp10 - tmp12) * 0.382683433,
   * z2 = tmp10 * 0.541196100 + z5, z4 uses F_0_306 plus tmp12 + z5
   * (equivalent to tmp12 * 1.306562965 + z5), z3 = tmp11 * 0.707106781. */
  int16x8_t z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
  int16x8_t z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
  z2 = vaddq_s16(z2, z5);
  int16x8_t z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
  z5 = vaddq_s16(tmp12, z5);
  z4 = vaddq_s16(z4, z5);
  int16x8_t z3 = vqdmulhq_lane_s16(tmp11, consts, 2);

  int16x8_t z11 = vaddq_s16(tmp7, z3);        /* phase 5 */
  int16x8_t z13 = vsubq_s16(tmp7, z3);

  col5 = vaddq_s16(z13, z2);                  /* phase 6 */
  col3 = vsubq_s16(z13, z2);
  col1 = vaddq_s16(z11, z4);
  col7 = vsubq_s16(z11, z4);

  /* Transpose to work on columns in pass 2: a 16-bit trn, then a 32-bit trn,
   * then a 32-bit zip together effect a full 8x8 transpose in registers. */
  int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
  int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
  int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
  int16x8x2_t cols_67 = vtrnq_s16(col6, col7);

  int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
                                      vreinterpretq_s32_s16(cols_45.val[0]));
  int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
                                      vreinterpretq_s32_s16(cols_45.val[1]));
  int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
                                      vreinterpretq_s32_s16(cols_67.val[0]));
  int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
                                      vreinterpretq_s32_s16(cols_67.val[1]));

  int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
  int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
  int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
  int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);

  int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
  int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
  int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
  int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
  int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
  int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
  int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
  int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);

  /* Pass 2: process columns.  Identical arithmetic to pass 1, now applied to
   * the transposed data so it operates down the original columns. */

  tmp0 = vaddq_s16(row0, row7);
  tmp7 = vsubq_s16(row0, row7);
  tmp1 = vaddq_s16(row1, row6);
  tmp6 = vsubq_s16(row1, row6);
  tmp2 = vaddq_s16(row2, row5);
  tmp5 = vsubq_s16(row2, row5);
  tmp3 = vaddq_s16(row3, row4);
  tmp4 = vsubq_s16(row3, row4);

  /* Even part */
  tmp10 = vaddq_s16(tmp0, tmp3);              /* phase 2 */
  tmp13 = vsubq_s16(tmp0, tmp3);
  tmp11 = vaddq_s16(tmp1, tmp2);
  tmp12 = vsubq_s16(tmp1, tmp2);

  row0 = vaddq_s16(tmp10, tmp11);             /* phase 3 */
  row4 = vsubq_s16(tmp10, tmp11);

  z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
  row2 = vaddq_s16(tmp13, z1);                /* phase 5 */
  row6 = vsubq_s16(tmp13, z1);

  /* Odd part */
  tmp10 = vaddq_s16(tmp4, tmp5);              /* phase 2 */
  tmp11 = vaddq_s16(tmp5, tmp6);
  tmp12 = vaddq_s16(tmp6, tmp7);

  z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
  z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
  z2 = vaddq_s16(z2, z5);
  z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
  z5 = vaddq_s16(tmp12, z5);
  z4 = vaddq_s16(z4, z5);
  z3 = vqdmulhq_lane_s16(tmp11, consts, 2);

  z11 = vaddq_s16(tmp7, z3);                  /* phase 5 */
  z13 = vsubq_s16(tmp7, z3);

  row5 = vaddq_s16(z13, z2);                  /* phase 6 */
  row3 = vsubq_s16(z13, z2);
  row1 = vaddq_s16(z11, z4);
  row7 = vsubq_s16(z11, z4);

  /* Store the coefficients back to the block, one row per store. */
  vst1q_s16(data + 0 * DCTSIZE, row0);
  vst1q_s16(data + 1 * DCTSIZE, row1);
  vst1q_s16(data + 2 * DCTSIZE, row2);
  vst1q_s16(data + 3 * DCTSIZE, row3);
  vst1q_s16(data + 4 * DCTSIZE, row4);
  vst1q_s16(data + 5 * DCTSIZE, row5);
  vst1q_s16(data + 6 * DCTSIZE, row6);
  vst1q_s16(data + 7 * DCTSIZE, row7);
}