tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jfdctint-neon.c (15054B)


      1 /*
      2 * jfdctint-neon.c - accurate integer FDCT (Arm Neon)
      3 *
      4 * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
      5 * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
      6 *
      7 * This software is provided 'as-is', without any express or implied
      8 * warranty.  In no event will the authors be held liable for any damages
      9 * arising from the use of this software.
     10 *
     11 * Permission is granted to anyone to use this software for any purpose,
     12 * including commercial applications, and to alter it and redistribute it
     13 * freely, subject to the following restrictions:
     14 *
     15 * 1. The origin of this software must not be misrepresented; you must not
     16 *    claim that you wrote the original software. If you use this software
     17 *    in a product, an acknowledgment in the product documentation would be
     18 *    appreciated but is not required.
     19 * 2. Altered source versions must be plainly marked as such, and must not be
     20 *    misrepresented as being the original software.
     21 * 3. This notice may not be removed or altered from any source distribution.
     22 */
     23 
     24 #define JPEG_INTERNALS
     25 #include "../../jinclude.h"
     26 #include "../../jpeglib.h"
     27 #include "../../jsimd.h"
     28 #include "../../jdct.h"
     29 #include "../../jsimddct.h"
     30 #include "../jsimd.h"
     31 #include "align.h"
     32 #include "neon-compat.h"
     33 
     34 #include <arm_neon.h>
     35 
     36 
     37 /* jsimd_fdct_islow_neon() performs a slower but more accurate forward DCT
     38 * (Discrete Cosine Transform) on one block of samples.  It uses the same
     39 * calculations and produces exactly the same output as IJG's original
     40 * jpeg_fdct_islow() function, which can be found in jfdctint.c.
     41 *
     42 * Scaled integer constants are used to avoid floating-point arithmetic:
     43 *    0.298631336 =  2446 * 2^-13
     44 *    0.390180644 =  3196 * 2^-13
     45 *    0.541196100 =  4433 * 2^-13
     46 *    0.765366865 =  6270 * 2^-13
     47 *    0.899976223 =  7373 * 2^-13
     48 *    1.175875602 =  9633 * 2^-13
     49 *    1.501321110 = 12299 * 2^-13
     50 *    1.847759065 = 15137 * 2^-13
     51 *    1.961570560 = 16069 * 2^-13
     52 *    2.053119869 = 16819 * 2^-13
     53 *    2.562915447 = 20995 * 2^-13
     54 *    3.072711026 = 25172 * 2^-13
     55 *
     56 * See jfdctint.c for further details of the DCT algorithm.  Where possible,
     57 * the variable names and comments here in jsimd_fdct_islow_neon() match up
     58 * with those in jpeg_fdct_islow().
     59 */
     60 
/* Fixed-point scaling: every rotation constant below is premultiplied by
 * 2^CONST_BITS, and intermediate results carry PASS1_BITS of extra precision
 * between the two passes (matching jpeg_fdct_islow() in jfdctint.c).
 */
#define CONST_BITS  13
#define PASS1_BITS  2

/* Rounding right-shift amounts used when narrowing the 32-bit products back
 * to 16 bits: after pass 1 (rows) the PASS1_BITS of extra precision is kept,
 * after pass 2 (columns) it is removed along with the constant scaling.
 */
#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
#define DESCALE_P2  (CONST_BITS + PASS1_BITS)

/* DCT rotation constants, scaled by 2^13 (see the conversion table in the
 * header comment above); e.g. F_0_298 == round(0.298631336 * 2^13).
 */
#define F_0_298  2446
#define F_0_390  3196
#define F_0_541  4433
#define F_0_765  6270
#define F_0_899  7373
#define F_1_175  9633
#define F_1_501  12299
#define F_1_847  15137
#define F_1_961  16069
#define F_2_053  16819
#define F_2_562  20995
#define F_3_072  25172


/* Constant table laid out as three 4-lane int16 vectors for use with
 * vmull_lane_s16()/vmlal_lane_s16(); the sign of each entry bakes the
 * subtraction into the multiply-accumulate, so the kernel only ever adds.
 * Lane mapping (do not reorder -- lane indices in the kernel depend on it):
 *   val[0] = {  F_0_298, -F_0_390,  F_0_541,  F_0_765 }   lanes 0-3
 *   val[1] = { -F_0_899,  F_1_175,  F_1_501, -F_1_847 }   lanes 0-3
 *   val[2] = { -F_1_961,  F_2_053, -F_2_562,  F_3_072 }   lanes 0-3
 */
ALIGN(16) static const int16_t jsimd_fdct_islow_neon_consts[] = {
  F_0_298, -F_0_390,  F_0_541,  F_0_765,
 -F_0_899,  F_1_175,  F_1_501, -F_1_847,
 -F_1_961,  F_2_053, -F_2_562,  F_3_072
};
     86 
/* Perform the accurate ("islow") forward DCT on one 8x8 block of samples,
 * in place.
 *
 * data: pointer to an 8x8 (DCTSIZE x DCTSIZE) row-major block of DCTELEM
 *       (int16_t) samples; on return it holds the DCT coefficients, with
 *       the same overall scaling as IJG's jpeg_fdct_islow().
 *
 * The block is transposed on load so that each vector holds one column per
 * lane; pass 1 then processes all eight rows at once, the data is transposed
 * back, and pass 2 processes all eight columns at once.
 */
void jsimd_fdct_islow_neon(DCTELEM *data)
{
  /* Load DCT constants. */
#ifdef HAVE_VLD1_S16_X3
  const int16x4x3_t consts = vld1_s16_x3(jsimd_fdct_islow_neon_consts);
#else
  /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
  const int16x4_t consts1 = vld1_s16(jsimd_fdct_islow_neon_consts);
  const int16x4_t consts2 = vld1_s16(jsimd_fdct_islow_neon_consts + 4);
  const int16x4_t consts3 = vld1_s16(jsimd_fdct_islow_neon_consts + 8);
  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
#endif

  /* Load an 8x8 block of samples into Neon registers.  De-interleaving loads
   * are used, followed by vuzp to transpose the block such that we have a
   * column of samples per vector - allowing all rows to be processed at once.
   */
  int16x8x4_t s_rows_0123 = vld4q_s16(data);
  int16x8x4_t s_rows_4567 = vld4q_s16(data + 4 * DCTSIZE);

  /* vld4q de-interleaved by 4; vuzp on matching val[] slots separates the
   * even-indexed from the odd-indexed elements, completing the transpose.
   */
  int16x8x2_t cols_04 = vuzpq_s16(s_rows_0123.val[0], s_rows_4567.val[0]);
  int16x8x2_t cols_15 = vuzpq_s16(s_rows_0123.val[1], s_rows_4567.val[1]);
  int16x8x2_t cols_26 = vuzpq_s16(s_rows_0123.val[2], s_rows_4567.val[2]);
  int16x8x2_t cols_37 = vuzpq_s16(s_rows_0123.val[3], s_rows_4567.val[3]);

  int16x8_t col0 = cols_04.val[0];
  int16x8_t col1 = cols_15.val[0];
  int16x8_t col2 = cols_26.val[0];
  int16x8_t col3 = cols_37.val[0];
  int16x8_t col4 = cols_04.val[1];
  int16x8_t col5 = cols_15.val[1];
  int16x8_t col6 = cols_26.val[1];
  int16x8_t col7 = cols_37.val[1];

  /* Pass 1: process rows.
   * Outputs are scaled up by a factor of 2^PASS1_BITS, exactly as in
   * jpeg_fdct_islow(), to gain precision for pass 2.
   */

  /* Butterfly: sums feed the even part, differences feed the odd part. */
  int16x8_t tmp0 = vaddq_s16(col0, col7);
  int16x8_t tmp7 = vsubq_s16(col0, col7);
  int16x8_t tmp1 = vaddq_s16(col1, col6);
  int16x8_t tmp6 = vsubq_s16(col1, col6);
  int16x8_t tmp2 = vaddq_s16(col2, col5);
  int16x8_t tmp5 = vsubq_s16(col2, col5);
  int16x8_t tmp3 = vaddq_s16(col3, col4);
  int16x8_t tmp4 = vsubq_s16(col3, col4);

  /* Even part */
  int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);
  int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
  int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
  int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);

  /* DC and Nyquist terms need no rotation; just apply the PASS1_BITS scale. */
  col0 = vshlq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
  col4 = vshlq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);

  /* z1 = (tmp12 + tmp13) * 0.541196100; widening multiplies keep the full
   * 32-bit products, which vrshrn_n_s32 later rounds and narrows.
   */
  int16x8_t tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
  int32x4_t z1_l =
    vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
  int32x4_t z1_h =
    vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);

  int32x4_t col2_scaled_l =
    vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
  int32x4_t col2_scaled_h =
    vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
  /* vrshrn_n_s32 = rounding right shift + narrow: DESCALE(x, DESCALE_P1). */
  col2 = vcombine_s16(vrshrn_n_s32(col2_scaled_l, DESCALE_P1),
                      vrshrn_n_s32(col2_scaled_h, DESCALE_P1));

  /* consts.val[1] lane 3 is -F_1_847, so this accumulate is a subtraction. */
  int32x4_t col6_scaled_l =
    vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
  int32x4_t col6_scaled_h =
    vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
  col6 = vcombine_s16(vrshrn_n_s32(col6_scaled_l, DESCALE_P1),
                      vrshrn_n_s32(col6_scaled_h, DESCALE_P1));

  /* Odd part */
  int16x8_t z1 = vaddq_s16(tmp4, tmp7);
  int16x8_t z2 = vaddq_s16(tmp5, tmp6);
  int16x8_t z3 = vaddq_s16(tmp4, tmp6);
  int16x8_t z4 = vaddq_s16(tmp5, tmp7);
  /* sqrt(2) * c3 */
  int32x4_t z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
  int32x4_t z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
  z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
  z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);

  /* sqrt(2) * (-c1+c3+c5-c7) */
  int32x4_t tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
  int32x4_t tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
  /* sqrt(2) * ( c1+c3-c5+c7) */
  int32x4_t tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
  int32x4_t tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
  /* sqrt(2) * ( c1+c3+c5-c7) */
  int32x4_t tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
  int32x4_t tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
  /* sqrt(2) * ( c1+c3-c5-c7) */
  int32x4_t tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
  int32x4_t tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);

  /* sqrt(2) * (c7-c3) */
  z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
  z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
  /* sqrt(2) * (-c1-c3) */
  int32x4_t z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
  int32x4_t z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
  /* sqrt(2) * (-c3-c5) */
  int32x4_t z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
  int32x4_t z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
  /* sqrt(2) * (c5-c3) */
  int32x4_t z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
  int32x4_t z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);

  /* z3 += z5, z4 += z5 (see jfdctint.c). */
  z3_l = vaddq_s32(z3_l, z5_l);
  z3_h = vaddq_s32(z3_h, z5_h);
  z4_l = vaddq_s32(z4_l, z5_l);
  z4_h = vaddq_s32(z4_h, z5_h);

  /* dataptr[7] = DESCALE(tmp4 + z1 + z3, DESCALE_P1) */
  tmp4_l = vaddq_s32(tmp4_l, z1_l);
  tmp4_h = vaddq_s32(tmp4_h, z1_h);
  tmp4_l = vaddq_s32(tmp4_l, z3_l);
  tmp4_h = vaddq_s32(tmp4_h, z3_h);
  col7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P1),
                      vrshrn_n_s32(tmp4_h, DESCALE_P1));

  /* dataptr[5] = DESCALE(tmp5 + z2 + z4, DESCALE_P1) */
  tmp5_l = vaddq_s32(tmp5_l, z2_l);
  tmp5_h = vaddq_s32(tmp5_h, z2_h);
  tmp5_l = vaddq_s32(tmp5_l, z4_l);
  tmp5_h = vaddq_s32(tmp5_h, z4_h);
  col5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P1),
                      vrshrn_n_s32(tmp5_h, DESCALE_P1));

  /* dataptr[3] = DESCALE(tmp6 + z2 + z3, DESCALE_P1) */
  tmp6_l = vaddq_s32(tmp6_l, z2_l);
  tmp6_h = vaddq_s32(tmp6_h, z2_h);
  tmp6_l = vaddq_s32(tmp6_l, z3_l);
  tmp6_h = vaddq_s32(tmp6_h, z3_h);
  col3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P1),
                      vrshrn_n_s32(tmp6_h, DESCALE_P1));

  /* dataptr[1] = DESCALE(tmp7 + z1 + z4, DESCALE_P1) */
  tmp7_l = vaddq_s32(tmp7_l, z1_l);
  tmp7_h = vaddq_s32(tmp7_h, z1_h);
  tmp7_l = vaddq_s32(tmp7_l, z4_l);
  tmp7_h = vaddq_s32(tmp7_h, z4_h);
  col1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P1),
                      vrshrn_n_s32(tmp7_h, DESCALE_P1));

  /* Transpose to work on columns in pass 2.
   * Classic Neon 8x8 transpose: vtrn on 16-bit pairs, then on 32-bit pairs,
   * then vzip on 32-bit halves completes the permutation.
   */
  int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
  int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
  int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
  int16x8x2_t cols_67 = vtrnq_s16(col6, col7);

  int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
                                      vreinterpretq_s32_s16(cols_45.val[0]));
  int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
                                      vreinterpretq_s32_s16(cols_45.val[1]));
  int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
                                      vreinterpretq_s32_s16(cols_67.val[0]));
  int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
                                      vreinterpretq_s32_s16(cols_67.val[1]));

  int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
  int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
  int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
  int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);

  int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
  int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
  int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
  int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
  int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
  int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
  int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
  int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);

  /* Pass 2: process columns.
   * Same dataflow as pass 1, but descaled by DESCALE_P2 to remove both the
   * constant scaling and the PASS1_BITS carried over from pass 1.
   */

  tmp0 = vaddq_s16(row0, row7);
  tmp7 = vsubq_s16(row0, row7);
  tmp1 = vaddq_s16(row1, row6);
  tmp6 = vsubq_s16(row1, row6);
  tmp2 = vaddq_s16(row2, row5);
  tmp5 = vsubq_s16(row2, row5);
  tmp3 = vaddq_s16(row3, row4);
  tmp4 = vsubq_s16(row3, row4);

  /* Even part */
  tmp10 = vaddq_s16(tmp0, tmp3);
  tmp13 = vsubq_s16(tmp0, tmp3);
  tmp11 = vaddq_s16(tmp1, tmp2);
  tmp12 = vsubq_s16(tmp1, tmp2);

  /* Here the PASS1_BITS scale is removed by a rounding shift (vs. the left
   * shift that added it in pass 1.)
   */
  row0 = vrshrq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
  row4 = vrshrq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);

  tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
  z1_l = vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
  z1_h = vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);

  int32x4_t row2_scaled_l =
    vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
  int32x4_t row2_scaled_h =
    vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
  row2 = vcombine_s16(vrshrn_n_s32(row2_scaled_l, DESCALE_P2),
                      vrshrn_n_s32(row2_scaled_h, DESCALE_P2));

  int32x4_t row6_scaled_l =
    vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
  int32x4_t row6_scaled_h =
    vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
  row6 = vcombine_s16(vrshrn_n_s32(row6_scaled_l, DESCALE_P2),
                      vrshrn_n_s32(row6_scaled_h, DESCALE_P2));

  /* Odd part */
  z1 = vaddq_s16(tmp4, tmp7);
  z2 = vaddq_s16(tmp5, tmp6);
  z3 = vaddq_s16(tmp4, tmp6);
  z4 = vaddq_s16(tmp5, tmp7);
  /* sqrt(2) * c3 */
  z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
  z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
  z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
  z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);

  /* sqrt(2) * (-c1+c3+c5-c7) */
  tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
  tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
  /* sqrt(2) * ( c1+c3-c5+c7) */
  tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
  tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
  /* sqrt(2) * ( c1+c3+c5-c7) */
  tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
  tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
  /* sqrt(2) * ( c1+c3-c5-c7) */
  tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
  tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);

  /* sqrt(2) * (c7-c3) */
  z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
  z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
  /* sqrt(2) * (-c1-c3) */
  z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
  z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
  /* sqrt(2) * (-c3-c5) */
  z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
  z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
  /* sqrt(2) * (c5-c3) */
  z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
  z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);

  z3_l = vaddq_s32(z3_l, z5_l);
  z3_h = vaddq_s32(z3_h, z5_h);
  z4_l = vaddq_s32(z4_l, z5_l);
  z4_h = vaddq_s32(z4_h, z5_h);

  /* dataptr[DCTSIZE*7] = DESCALE(tmp4 + z1 + z3, DESCALE_P2) */
  tmp4_l = vaddq_s32(tmp4_l, z1_l);
  tmp4_h = vaddq_s32(tmp4_h, z1_h);
  tmp4_l = vaddq_s32(tmp4_l, z3_l);
  tmp4_h = vaddq_s32(tmp4_h, z3_h);
  row7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P2),
                      vrshrn_n_s32(tmp4_h, DESCALE_P2));

  /* dataptr[DCTSIZE*5] = DESCALE(tmp5 + z2 + z4, DESCALE_P2) */
  tmp5_l = vaddq_s32(tmp5_l, z2_l);
  tmp5_h = vaddq_s32(tmp5_h, z2_h);
  tmp5_l = vaddq_s32(tmp5_l, z4_l);
  tmp5_h = vaddq_s32(tmp5_h, z4_h);
  row5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P2),
                      vrshrn_n_s32(tmp5_h, DESCALE_P2));

  /* dataptr[DCTSIZE*3] = DESCALE(tmp6 + z2 + z3, DESCALE_P2) */
  tmp6_l = vaddq_s32(tmp6_l, z2_l);
  tmp6_h = vaddq_s32(tmp6_h, z2_h);
  tmp6_l = vaddq_s32(tmp6_l, z3_l);
  tmp6_h = vaddq_s32(tmp6_h, z3_h);
  row3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P2),
                      vrshrn_n_s32(tmp6_h, DESCALE_P2));

  /* dataptr[DCTSIZE*1] = DESCALE(tmp7 + z1 + z4, DESCALE_P2) */
  tmp7_l = vaddq_s32(tmp7_l, z1_l);
  tmp7_h = vaddq_s32(tmp7_h, z1_h);
  tmp7_l = vaddq_s32(tmp7_l, z4_l);
  tmp7_h = vaddq_s32(tmp7_h, z4_h);
  row1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P2),
                      vrshrn_n_s32(tmp7_h, DESCALE_P2));

  /* Store the transformed coefficients back to the block, row by row. */
  vst1q_s16(data + 0 * DCTSIZE, row0);
  vst1q_s16(data + 1 * DCTSIZE, row1);
  vst1q_s16(data + 2 * DCTSIZE, row2);
  vst1q_s16(data + 3 * DCTSIZE, row3);
  vst1q_s16(data + 4 * DCTSIZE, row4);
  vst1q_s16(data + 5 * DCTSIZE, row5);
  vst1q_s16(data + 6 * DCTSIZE, row6);
  vst1q_s16(data + 7 * DCTSIZE, row7);
}