/*
 * jfdctfst-neon.c - fast integer FDCT (Arm Neon)
 *
 * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"

#include <arm_neon.h>


/* jsimd_fdct_ifast_neon() performs a fast, not so accurate forward DCT
 * (Discrete Cosine Transform) on one block of samples.  It uses the same
 * calculations and produces exactly the same output as IJG's original
 * jpeg_fdct_ifast() function, which can be found in jfdctfst.c.
 *
 * Scaled integer constants are used to avoid floating-point arithmetic:
 *    0.382683433 = 12544 * 2^-15
 *    0.541196100 = 17792 * 2^-15
 *    0.707106781 = 23168 * 2^-15
 *    0.306562965 =  9984 * 2^-15
 *
 * See jfdctfst.c for further details of the DCT algorithm.  Where possible,
 * the variable names and comments here in jsimd_fdct_ifast_neon() match up
 * with those in jpeg_fdct_ifast().
 */

#define F_0_382  12544
#define F_0_541  17792
#define F_0_707  23168
#define F_0_306  9984


ALIGN(16) static const int16_t jsimd_fdct_ifast_neon_consts[] = {
  F_0_382, F_0_541, F_0_707, F_0_306
};

void jsimd_fdct_ifast_neon(DCTELEM *data)
{
  /* Load an 8x8 block of samples into Neon registers.  De-interleaving loads
   * are used, followed by vuzp to transpose the block such that we have a
   * column of samples per vector - allowing all rows to be processed at once.
   */
  int16x8x4_t data1 = vld4q_s16(data);
  int16x8x4_t data2 = vld4q_s16(data + 4 * DCTSIZE);

  int16x8x2_t cols_04 = vuzpq_s16(data1.val[0], data2.val[0]);
  int16x8x2_t cols_15 = vuzpq_s16(data1.val[1], data2.val[1]);
  int16x8x2_t cols_26 = vuzpq_s16(data1.val[2], data2.val[2]);
  int16x8x2_t cols_37 = vuzpq_s16(data1.val[3], data2.val[3]);

  int16x8_t col0 = cols_04.val[0];
  int16x8_t col1 = cols_15.val[0];
  int16x8_t col2 = cols_26.val[0];
  int16x8_t col3 = cols_37.val[0];
  int16x8_t col4 = cols_04.val[1];
  int16x8_t col5 = cols_15.val[1];
  int16x8_t col6 = cols_26.val[1];
  int16x8_t col7 = cols_37.val[1];

  /* Pass 1: process rows. */
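  /* The fixed-point multiplications below use vqdmulhq_lane_s16(), which
   * computes a saturating doubling multiply returning the high half, i.e.
   * roughly (2 * a * b) >> 16.  Multiplying by one of the 2^-15 scaled
   * constants above therefore approximates the corresponding real-valued
   * factor; e.g. multiplying by F_0_707 yields a * 23168 / 32768, which is
   * approximately a * 0.707106781.
   */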
  /* Load DCT conversion constants. */
  const int16x4_t consts = vld1_s16(jsimd_fdct_ifast_neon_consts);

  int16x8_t tmp0 = vaddq_s16(col0, col7);
  int16x8_t tmp7 = vsubq_s16(col0, col7);
  int16x8_t tmp1 = vaddq_s16(col1, col6);
  int16x8_t tmp6 = vsubq_s16(col1, col6);
  int16x8_t tmp2 = vaddq_s16(col2, col5);
  int16x8_t tmp5 = vsubq_s16(col2, col5);
  int16x8_t tmp3 = vaddq_s16(col3, col4);
  int16x8_t tmp4 = vsubq_s16(col3, col4);

  /* Even part */
  int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);    /* phase 2 */
  int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
  int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
  int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);

  col0 = vaddq_s16(tmp10, tmp11);             /* phase 3 */
  col4 = vsubq_s16(tmp10, tmp11);

  int16x8_t z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
  col2 = vaddq_s16(tmp13, z1);                /* phase 5 */
  col6 = vsubq_s16(tmp13, z1);

  /* Odd part */
  tmp10 = vaddq_s16(tmp4, tmp5);              /* phase 2 */
  tmp11 = vaddq_s16(tmp5, tmp6);
  tmp12 = vaddq_s16(tmp6, tmp7);

  int16x8_t z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
  int16x8_t z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
  z2 = vaddq_s16(z2, z5);
  int16x8_t z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
  z5 = vaddq_s16(tmp12, z5);
  z4 = vaddq_s16(z4, z5);
  int16x8_t z3 = vqdmulhq_lane_s16(tmp11, consts, 2);

  int16x8_t z11 = vaddq_s16(tmp7, z3);        /* phase 5 */
  int16x8_t z13 = vsubq_s16(tmp7, z3);

  col5 = vaddq_s16(z13, z2);                  /* phase 6 */
  col3 = vsubq_s16(z13, z2);
  col1 = vaddq_s16(z11, z4);
  col7 = vsubq_s16(z11, z4);

  /* Transpose to work on columns in pass 2. */
  int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
  int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
  int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
  int16x8x2_t cols_67 = vtrnq_s16(col6, col7);

  int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
                                      vreinterpretq_s32_s16(cols_45.val[0]));
  int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
                                      vreinterpretq_s32_s16(cols_45.val[1]));
  int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
                                      vreinterpretq_s32_s16(cols_67.val[0]));
  int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
                                      vreinterpretq_s32_s16(cols_67.val[1]));

  int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
  int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
  int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
  int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);

  int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
  int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
  int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
  int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
  int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
  int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
  int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
  int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
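  /* The 8x8 transpose above is built in three stages: vtrnq_s16 transposes
   * 2x2 blocks of adjacent 16-bit elements, vtrnq_s32 transposes 2x2 blocks
   * of 32-bit element pairs, and vzipq_s32 interleaves the results so that
   * each of row0-row7 now holds one row of the intermediate block, ready for
   * the column pass below.
   */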
  /* Pass 2: process columns. */

  tmp0 = vaddq_s16(row0, row7);
  tmp7 = vsubq_s16(row0, row7);
  tmp1 = vaddq_s16(row1, row6);
  tmp6 = vsubq_s16(row1, row6);
  tmp2 = vaddq_s16(row2, row5);
  tmp5 = vsubq_s16(row2, row5);
  tmp3 = vaddq_s16(row3, row4);
  tmp4 = vsubq_s16(row3, row4);

  /* Even part */
  tmp10 = vaddq_s16(tmp0, tmp3);              /* phase 2 */
  tmp13 = vsubq_s16(tmp0, tmp3);
  tmp11 = vaddq_s16(tmp1, tmp2);
  tmp12 = vsubq_s16(tmp1, tmp2);

  row0 = vaddq_s16(tmp10, tmp11);             /* phase 3 */
  row4 = vsubq_s16(tmp10, tmp11);

  z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
  row2 = vaddq_s16(tmp13, z1);                /* phase 5 */
  row6 = vsubq_s16(tmp13, z1);

  /* Odd part */
  tmp10 = vaddq_s16(tmp4, tmp5);              /* phase 2 */
  tmp11 = vaddq_s16(tmp5, tmp6);
  tmp12 = vaddq_s16(tmp6, tmp7);

  z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
  z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
  z2 = vaddq_s16(z2, z5);
  z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
  z5 = vaddq_s16(tmp12, z5);
  z4 = vaddq_s16(z4, z5);
  z3 = vqdmulhq_lane_s16(tmp11, consts, 2);

  z11 = vaddq_s16(tmp7, z3);                  /* phase 5 */
  z13 = vsubq_s16(tmp7, z3);

  row5 = vaddq_s16(z13, z2);                  /* phase 6 */
  row3 = vsubq_s16(z13, z2);
  row1 = vaddq_s16(z11, z4);
  row7 = vsubq_s16(z11, z4);

  vst1q_s16(data + 0 * DCTSIZE, row0);
  vst1q_s16(data + 1 * DCTSIZE, row1);
  vst1q_s16(data + 2 * DCTSIZE, row2);
  vst1q_s16(data + 3 * DCTSIZE, row3);
  vst1q_s16(data + 4 * DCTSIZE, row4);
  vst1q_s16(data + 5 * DCTSIZE, row5);
  vst1q_s16(data + 6 * DCTSIZE, row6);
  vst1q_s16(data + 7 * DCTSIZE, row7);
}
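/* A minimal usage sketch (illustrative only, not part of the original
 * source): like jpeg_fdct_ifast(), the transform operates in place on a
 * DCTSIZE2-element block of level-shifted samples, so a caller would do
 * something like
 *
 *   DCTELEM block[DCTSIZE2];
 *   ... fill block[i] with (sample value - CENTERJSAMPLE) ...
 *   jsimd_fdct_ifast_neon(block);
 *   ... block now holds the scaled DCT coefficients in row-major order ...
 */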