jfdctfst-mmi.c (10432B)
1 /* 2 * Loongson MMI optimizations for libjpeg-turbo 3 * 4 * Copyright (C) 2014, 2018-2019, D. R. Commander. All Rights Reserved. 5 * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing. 6 * All Rights Reserved. 7 * Authors: LiuQingfa <liuqingfa-hf@loongson.cn> 8 * 9 * Based on the x86 SIMD extension for IJG JPEG library 10 * Copyright (C) 1999-2006, MIYASAKA Masaru. 11 * 12 * This software is provided 'as-is', without any express or implied 13 * warranty. In no event will the authors be held liable for any damages 14 * arising from the use of this software. 15 * 16 * Permission is granted to anyone to use this software for any purpose, 17 * including commercial applications, and to alter it and redistribute it 18 * freely, subject to the following restrictions: 19 * 20 * 1. The origin of this software must not be misrepresented; you must not 21 * claim that you wrote the original software. If you use this software 22 * in a product, an acknowledgment in the product documentation would be 23 * appreciated but is not required. 24 * 2. Altered source versions must be plainly marked as such, and must not be 25 * misrepresented as being the original software. 26 * 3. This notice may not be removed or altered from any source distribution. 27 */ 28 29 /* FAST INTEGER FORWARD DCT */ 30 31 #include "jsimd_mmi.h" 32 33 34 #define CONST_BITS 8 35 36 #define F_0_382 ((short)98) /* FIX(0.382683433) */ 37 #define F_0_541 ((short)139) /* FIX(0.541196100) */ 38 #define F_0_707 ((short)181) /* FIX(0.707106781) */ 39 #define F_1_306 ((short)334) /* FIX(1.306562965) */ 40 41 #define PRE_MULTIPLY_SCALE_BITS 2 42 #define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) 43 44 enum const_index { 45 index_PW_F0707, 46 index_PW_F0382, 47 index_PW_F0541, 48 index_PW_F1306 49 }; 50 51 static uint64_t const_value[] = { 52 _uint64_set1_pi16(F_0_707), 53 _uint64_set1_pi16(F_0_382), 54 _uint64_set1_pi16(F_0_541), 55 _uint64_set1_pi16(F_1_306) 56 }; 57 58 #define PW_F0707 get_const_value(index_PW_F0707) 59 #define PW_F0382 get_const_value(index_PW_F0382) 60 #define PW_F0541 get_const_value(index_PW_F0541) 61 #define PW_F1306 get_const_value(index_PW_F1306) 62 63 64 #define DO_FDCT_MULTIPLY(out, in, multiplier) { \ 65 __m64 mulhi, mullo, mul12, mul34; \ 66 \ 67 mullo = _mm_mullo_pi16(in, multiplier); \ 68 mulhi = _mm_mulhi_pi16(in, multiplier); \ 69 mul12 = _mm_unpacklo_pi16(mullo, mulhi); \ 70 mul34 = _mm_unpackhi_pi16(mullo, mulhi); \ 71 mul12 = _mm_srai_pi32(mul12, CONST_BITS); \ 72 mul34 = _mm_srai_pi32(mul34, CONST_BITS); \ 73 out = _mm_packs_pi32(mul12, mul34); \ 74 } 75 76 #define DO_FDCT_COMMON() { \ 77 \ 78 /* Even part */ \ 79 \ 80 tmp10 = _mm_add_pi16(tmp0, tmp3); \ 81 tmp13 = _mm_sub_pi16(tmp0, tmp3); \ 82 tmp11 = _mm_add_pi16(tmp1, tmp2); \ 83 tmp12 = _mm_sub_pi16(tmp1, tmp2); \ 84 \ 85 out0 = _mm_add_pi16(tmp10, tmp11); \ 86 out4 = _mm_sub_pi16(tmp10, tmp11); \ 87 \ 88 z1 = _mm_add_pi16(tmp12, tmp13); \ 89 DO_FDCT_MULTIPLY(z1, z1, PW_F0707) \ 90 \ 91 out2 = _mm_add_pi16(tmp13, z1); \ 92 out6 = _mm_sub_pi16(tmp13, z1); \ 93 \ 94 /* Odd part */ \ 95 \ 96 tmp10 = _mm_add_pi16(tmp4, tmp5); \ 97 tmp11 = _mm_add_pi16(tmp5, tmp6); \ 98 tmp12 = _mm_add_pi16(tmp6, tmp7); \ 99 \ 100 z5 = _mm_sub_pi16(tmp10, tmp12); \ 101 DO_FDCT_MULTIPLY(z5, z5, PW_F0382) \ 102 \ 103 DO_FDCT_MULTIPLY(z2, tmp10, PW_F0541) \ 104 z2 = _mm_add_pi16(z2, z5); \ 105 \ 106 DO_FDCT_MULTIPLY(z4, tmp12, PW_F1306) \ 107 z4 = _mm_add_pi16(z4, z5); \ 108 \ 109 DO_FDCT_MULTIPLY(z3, tmp11, PW_F0707) \ 110 \ 111 z11 = _mm_add_pi16(tmp7, z3); \ 112 z13 = _mm_sub_pi16(tmp7, z3); \ 113 \ 114 out5 = _mm_add_pi16(z13, z2); \ 115 out3 = _mm_sub_pi16(z13, z2); \ 116 out1 = _mm_add_pi16(z11, z4); \ 117 out7 = _mm_sub_pi16(z11, z4); \ 118 } 119 120 #define DO_FDCT_PASS1() { \ 121 __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \ 122 __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \ 123 __m64 col0, col1, col2, col3, col4, col5, col6, col7; \ 124 \ 125 row0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]); /* (00 01 02 03) */ \ 126 row0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4]); /* (04 05 06 07) */ \ 127 row1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]); /* (10 11 12 13) */ \ 128 row1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4]); /* (14 15 16 17) */ \ 129 row2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]); /* (20 21 22 23) */ \ 130 row2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4]); /* (24 25 26 27) */ \ 131 row3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]); /* (30 31 32 33) */ \ 132 row3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4]); /* (34 35 36 37) */ \ 133 \ 134 /* Transpose coefficients */ \ 135 \ 136 row23a = _mm_unpacklo_pi16(row2l, row3l); /* row23a=(20 30 21 31) */ \ 137 row23b = _mm_unpackhi_pi16(row2l, row3l); /* row23b=(22 32 23 33) */ \ 138 row23c = _mm_unpacklo_pi16(row2h, row3h); /* row23c=(24 34 25 35) */ \ 139 row23d = _mm_unpackhi_pi16(row2h, row3h); /* row23d=(26 36 27 37) */ \ 140 \ 141 row01a = _mm_unpacklo_pi16(row0l, row1l); /* row01a=(00 10 01 11) */ \ 142 row01b = _mm_unpackhi_pi16(row0l, row1l); /* row01b=(02 12 03 13) */ \ 143 row01c = _mm_unpacklo_pi16(row0h, row1h); /* row01c=(04 14 05 15) */ \ 144 row01d = _mm_unpackhi_pi16(row0h, row1h); /* row01d=(06 16 07 17) */ \ 145 \ 146 col0 = _mm_unpacklo_pi32(row01a, row23a); /* col0=(00 10 20 30) */ \ 147 col1 = _mm_unpackhi_pi32(row01a, row23a); /* col1=(01 11 21 31) */ \ 148 col6 = _mm_unpacklo_pi32(row01d, row23d); /* col6=(06 16 26 36) */ \ 149 col7 = _mm_unpackhi_pi32(row01d, row23d); /* col7=(07 17 27 37) */ \ 150 \ 151 tmp6 = _mm_sub_pi16(col1, col6); /* tmp6=col1-col6 */ \ 152 tmp7 = _mm_sub_pi16(col0, col7); /* tmp7=col0-col7 */ \ 153 tmp1 = _mm_add_pi16(col1, col6); /* tmp1=col1+col6 */ \ 154 tmp0 = _mm_add_pi16(col0, col7); /* tmp0=col0+col7 */ \ 155 \ 156 col2 = _mm_unpacklo_pi32(row01b, row23b); /* col2=(02 12 22 32) */ \ 157 col3 = _mm_unpackhi_pi32(row01b, row23b); /* col3=(03 13 23 33) */ \ 158 col4 = _mm_unpacklo_pi32(row01c, row23c); /* col4=(04 14 24 34) */ \ 159 col5 = _mm_unpackhi_pi32(row01c, row23c); /* col5=(05 15 25 35) */ \ 160 \ 161 tmp3 = _mm_add_pi16(col3, col4); /* tmp3=col3+col4 */ \ 162 tmp2 = _mm_add_pi16(col2, col5); /* tmp2=col2+col5 */ \ 163 tmp4 = _mm_sub_pi16(col3, col4); /* tmp4=col3-col4 */ \ 164 tmp5 = _mm_sub_pi16(col2, col5); /* tmp5=col2-col5 */ \ 165 \ 166 DO_FDCT_COMMON() \ 167 \ 168 _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \ 169 _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4], out4); \ 170 _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \ 171 _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4], out5); \ 172 _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \ 173 _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4], out6); \ 174 _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \ 175 _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4], out7); \ 176 } 177 178 #define DO_FDCT_PASS2() { \ 179 __m64 col0l, col0h, col1l, col1h, col2l, col2h, col3l, col3h; \ 180 __m64 col01a, col01b, col01c, col01d, col23a, col23b, col23c, col23d; \ 181 __m64 row0, row1, row2, row3, row4, row5, row6, row7; \ 182 \ 183 col0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]); /* (00 10 20 30) */ \ 184 col1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]); /* (01 11 21 31) */ \ 185 col2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]); /* (02 12 22 32) */ \ 186 col3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]); /* (03 13 23 33) */ \ 187 col0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 4]); /* (40 50 60 70) */ \ 188 col1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 5]); /* (41 51 61 71) */ \ 189 col2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 6]); /* (42 52 62 72) */ \ 190 col3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 7]); /* (43 53 63 73) */ \ 191 \ 192 /* Transpose coefficients */ \ 193 \ 194 col23a = _mm_unpacklo_pi16(col2l, col3l); /* col23a=(02 03 12 13) */ \ 195 col23b = _mm_unpackhi_pi16(col2l, col3l); /* col23b=(22 23 32 33) */ \ 196 col23c = _mm_unpacklo_pi16(col2h, col3h); /* col23c=(42 43 52 53) */ \ 197 col23d = _mm_unpackhi_pi16(col2h, col3h); /* col23d=(62 63 72 73) */ \ 198 \ 199 col01a = _mm_unpacklo_pi16(col0l, col1l); /* col01a=(00 01 10 11) */ \ 200 col01b = _mm_unpackhi_pi16(col0l, col1l); /* col01b=(20 21 30 31) */ \ 201 col01c = _mm_unpacklo_pi16(col0h, col1h); /* col01c=(40 41 50 51) */ \ 202 col01d = _mm_unpackhi_pi16(col0h, col1h); /* col01d=(60 61 70 71) */ \ 203 \ 204 row0 = _mm_unpacklo_pi32(col01a, col23a); /* row0=(00 01 02 03) */ \ 205 row1 = _mm_unpackhi_pi32(col01a, col23a); /* row1=(10 11 12 13) */ \ 206 row6 = _mm_unpacklo_pi32(col01d, col23d); /* row6=(60 61 62 63) */ \ 207 row7 = _mm_unpackhi_pi32(col01d, col23d); /* row7=(70 71 72 73) */ \ 208 \ 209 tmp6 = _mm_sub_pi16(row1, row6); /* tmp6=row1-row6 */ \ 210 tmp7 = _mm_sub_pi16(row0, row7); /* tmp7=row0-row7 */ \ 211 tmp1 = _mm_add_pi16(row1, row6); /* tmp1=row1+row6 */ \ 212 tmp0 = _mm_add_pi16(row0, row7); /* tmp0=row0+row7 */ \ 213 \ 214 row2 = _mm_unpacklo_pi32(col01b, col23b); /* row2=(20 21 22 23) */ \ 215 row3 = _mm_unpackhi_pi32(col01b, col23b); /* row3=(30 31 32 33) */ \ 216 row4 = _mm_unpacklo_pi32(col01c, col23c); /* row4=(40 41 42 43) */ \ 217 row5 = _mm_unpackhi_pi32(col01c, col23c); /* row5=(50 51 52 53) */ \ 218 \ 219 tmp3 = _mm_add_pi16(row3, row4); /* tmp3=row3+row4 */ \ 220 tmp2 = _mm_add_pi16(row2, row5); /* tmp2=row2+row5 */ \ 221 tmp4 = _mm_sub_pi16(row3, row4); /* tmp4=row3-row4 */ \ 222 tmp5 = _mm_sub_pi16(row2, row5); /* tmp5=row2-row5 */ \ 223 \ 224 DO_FDCT_COMMON() \ 225 \ 226 _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \ 227 _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \ 228 _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \ 229 _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \ 230 _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 4], out4); \ 231 _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 5], out5); \ 232 _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 6], out6); \ 233 _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 7], out7); \ 234 } 235 236 void jsimd_fdct_ifast_mmi(DCTELEM *data) 237 { 238 __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 239 __m64 out0, out1, out2, out3, out4, out5, out6, out7; 240 __m64 tmp10, tmp11, tmp12, tmp13, z1, z2, z3, z4, z5, z11, z13; 241 DCTELEM *dataptr = data; 242 243 /* Pass 1: process rows. */ 244 245 DO_FDCT_PASS1() 246 dataptr += DCTSIZE * 4; 247 DO_FDCT_PASS1() 248 249 /* Pass 2: process columns. */ 250 251 dataptr = data; 252 DO_FDCT_PASS2() 253 dataptr += 4; 254 DO_FDCT_PASS2() 255 }