jcphuff-neon.c (26878B)
1 /* 2 * jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon) 3 * 4 * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved. 5 * Copyright (C) 2022, Matthieu Darbois. All Rights Reserved. 6 * Copyright (C) 2022, D. R. Commander. All Rights Reserved. 7 * 8 * This software is provided 'as-is', without any express or implied 9 * warranty. In no event will the authors be held liable for any damages 10 * arising from the use of this software. 11 * 12 * Permission is granted to anyone to use this software for any purpose, 13 * including commercial applications, and to alter it and redistribute it 14 * freely, subject to the following restrictions: 15 * 16 * 1. The origin of this software must not be misrepresented; you must not 17 * claim that you wrote the original software. If you use this software 18 * in a product, an acknowledgment in the product documentation would be 19 * appreciated but is not required. 20 * 2. Altered source versions must be plainly marked as such, and must not be 21 * misrepresented as being the original software. 22 * 3. This notice may not be removed or altered from any source distribution. 23 */ 24 25 #define JPEG_INTERNALS 26 #include "../../jinclude.h" 27 #include "../../jpeglib.h" 28 #include "../../jsimd.h" 29 #include "../../jdct.h" 30 #include "../../jsimddct.h" 31 #include "../jsimd.h" 32 #include "neon-compat.h" 33 34 #include <arm_neon.h> 35 36 37 /* Data preparation for encode_mcu_AC_first(). 38 * 39 * The equivalent scalar C function (encode_mcu_AC_first_prepare()) can be 40 * found in jcphuff.c. 41 */ 42 43 void jsimd_encode_mcu_AC_first_prepare_neon 44 (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, 45 UJCOEF *values, size_t *zerobits) 46 { 47 UJCOEF *values_ptr = values; 48 UJCOEF *diff_values_ptr = values + DCTSIZE2; 49 50 /* Rows of coefficients to zero (since they haven't been processed) */ 51 int i, rows_to_zero = 8; 52 53 for (i = 0; i < Sl / 16; i++) { 54 int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]); 55 coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1); 56 coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2); 57 coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3); 58 coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4); 59 coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5); 60 coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6); 61 coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7); 62 int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]); 63 coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1); 64 coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2); 65 coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3); 66 coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4); 67 coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5); 68 coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6); 69 coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7); 70 71 /* Isolate sign of coefficients. */ 72 uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)); 73 uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)); 74 /* Compute absolute value of coefficients and apply point transform Al. */ 75 uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1)); 76 uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2)); 77 abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al)); 78 abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al)); 79 80 /* Compute diff values. */ 81 uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1); 82 uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2); 83 84 /* Store transformed coefficients and diff values. */ 85 vst1q_u16(values_ptr, abs_coefs1); 86 vst1q_u16(values_ptr + DCTSIZE, abs_coefs2); 87 vst1q_u16(diff_values_ptr, diff1); 88 vst1q_u16(diff_values_ptr + DCTSIZE, diff2); 89 values_ptr += 16; 90 diff_values_ptr += 16; 91 jpeg_natural_order_start += 16; 92 rows_to_zero -= 2; 93 } 94 95 /* Same operation but for remaining partial vector */ 96 int remaining_coefs = Sl % 16; 97 if (remaining_coefs > 8) { 98 int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]); 99 coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1); 100 coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2); 101 coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3); 102 coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4); 103 coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5); 104 coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6); 105 coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7); 106 int16x8_t coefs2 = vdupq_n_s16(0); 107 switch (remaining_coefs) { 108 case 15: 109 coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6); 110 FALLTHROUGH /*FALLTHROUGH*/ 111 case 14: 112 coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5); 113 FALLTHROUGH /*FALLTHROUGH*/ 114 case 13: 115 coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4); 116 FALLTHROUGH /*FALLTHROUGH*/ 117 case 12: 118 coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3); 119 FALLTHROUGH /*FALLTHROUGH*/ 120 case 11: 121 coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2); 122 FALLTHROUGH /*FALLTHROUGH*/ 123 case 10: 124 coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1); 125 FALLTHROUGH /*FALLTHROUGH*/ 126 case 9: 127 coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0); 128 FALLTHROUGH /*FALLTHROUGH*/ 129 default: 130 break; 131 } 132 133 /* Isolate sign of coefficients. */ 134 uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)); 135 uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)); 136 /* Compute absolute value of coefficients and apply point transform Al. */ 137 uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1)); 138 uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2)); 139 abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al)); 140 abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al)); 141 142 /* Compute diff values. */ 143 uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1); 144 uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2); 145 146 /* Store transformed coefficients and diff values. */ 147 vst1q_u16(values_ptr, abs_coefs1); 148 vst1q_u16(values_ptr + DCTSIZE, abs_coefs2); 149 vst1q_u16(diff_values_ptr, diff1); 150 vst1q_u16(diff_values_ptr + DCTSIZE, diff2); 151 values_ptr += 16; 152 diff_values_ptr += 16; 153 rows_to_zero -= 2; 154 155 } else if (remaining_coefs > 0) { 156 int16x8_t coefs = vdupq_n_s16(0); 157 158 switch (remaining_coefs) { 159 case 8: 160 coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7); 161 FALLTHROUGH /*FALLTHROUGH*/ 162 case 7: 163 coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6); 164 FALLTHROUGH /*FALLTHROUGH*/ 165 case 6: 166 coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5); 167 FALLTHROUGH /*FALLTHROUGH*/ 168 case 5: 169 coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4); 170 FALLTHROUGH /*FALLTHROUGH*/ 171 case 4: 172 coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3); 173 FALLTHROUGH /*FALLTHROUGH*/ 174 case 3: 175 coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2); 176 FALLTHROUGH /*FALLTHROUGH*/ 177 case 2: 178 coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1); 179 FALLTHROUGH /*FALLTHROUGH*/ 180 case 1: 181 coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0); 182 FALLTHROUGH /*FALLTHROUGH*/ 183 default: 184 break; 185 } 186 187 /* Isolate sign of coefficients. */ 188 uint16x8_t sign_coefs = vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15)); 189 /* Compute absolute value of coefficients and apply point transform Al. */ 190 uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs)); 191 abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al)); 192 193 /* Compute diff values. */ 194 uint16x8_t diff = veorq_u16(abs_coefs, sign_coefs); 195 196 /* Store transformed coefficients and diff values. */ 197 vst1q_u16(values_ptr, abs_coefs); 198 vst1q_u16(diff_values_ptr, diff); 199 values_ptr += 8; 200 diff_values_ptr += 8; 201 rows_to_zero--; 202 } 203 204 /* Zero remaining memory in the values and diff_values blocks. */ 205 for (i = 0; i < rows_to_zero; i++) { 206 vst1q_u16(values_ptr, vdupq_n_u16(0)); 207 vst1q_u16(diff_values_ptr, vdupq_n_u16(0)); 208 values_ptr += 8; 209 diff_values_ptr += 8; 210 } 211 212 /* Construct zerobits bitmap. A set bit means that the corresponding 213 * coefficient != 0. 214 */ 215 uint16x8_t row0 = vld1q_u16(values + 0 * DCTSIZE); 216 uint16x8_t row1 = vld1q_u16(values + 1 * DCTSIZE); 217 uint16x8_t row2 = vld1q_u16(values + 2 * DCTSIZE); 218 uint16x8_t row3 = vld1q_u16(values + 3 * DCTSIZE); 219 uint16x8_t row4 = vld1q_u16(values + 4 * DCTSIZE); 220 uint16x8_t row5 = vld1q_u16(values + 5 * DCTSIZE); 221 uint16x8_t row6 = vld1q_u16(values + 6 * DCTSIZE); 222 uint16x8_t row7 = vld1q_u16(values + 7 * DCTSIZE); 223 224 uint8x8_t row0_eq0 = vmovn_u16(vceqq_u16(row0, vdupq_n_u16(0))); 225 uint8x8_t row1_eq0 = vmovn_u16(vceqq_u16(row1, vdupq_n_u16(0))); 226 uint8x8_t row2_eq0 = vmovn_u16(vceqq_u16(row2, vdupq_n_u16(0))); 227 uint8x8_t row3_eq0 = vmovn_u16(vceqq_u16(row3, vdupq_n_u16(0))); 228 uint8x8_t row4_eq0 = vmovn_u16(vceqq_u16(row4, vdupq_n_u16(0))); 229 uint8x8_t row5_eq0 = vmovn_u16(vceqq_u16(row5, vdupq_n_u16(0))); 230 uint8x8_t row6_eq0 = vmovn_u16(vceqq_u16(row6, vdupq_n_u16(0))); 231 uint8x8_t row7_eq0 = vmovn_u16(vceqq_u16(row7, vdupq_n_u16(0))); 232 233 /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */ 234 const uint8x8_t bitmap_mask = 235 vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201)); 236 237 row0_eq0 = vand_u8(row0_eq0, bitmap_mask); 238 row1_eq0 = vand_u8(row1_eq0, bitmap_mask); 239 row2_eq0 = vand_u8(row2_eq0, bitmap_mask); 240 row3_eq0 = vand_u8(row3_eq0, bitmap_mask); 241 row4_eq0 = vand_u8(row4_eq0, bitmap_mask); 242 row5_eq0 = vand_u8(row5_eq0, bitmap_mask); 243 row6_eq0 = vand_u8(row6_eq0, bitmap_mask); 244 row7_eq0 = vand_u8(row7_eq0, bitmap_mask); 245 246 uint8x8_t bitmap_rows_01 = vpadd_u8(row0_eq0, row1_eq0); 247 uint8x8_t bitmap_rows_23 = vpadd_u8(row2_eq0, row3_eq0); 248 uint8x8_t bitmap_rows_45 = vpadd_u8(row4_eq0, row5_eq0); 249 uint8x8_t bitmap_rows_67 = vpadd_u8(row6_eq0, row7_eq0); 250 uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23); 251 uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67); 252 uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567); 253 254 #if defined(__aarch64__) || defined(_M_ARM64) 255 /* Move bitmap to a 64-bit scalar register. */ 256 uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0); 257 /* Store zerobits bitmap. */ 258 *zerobits = ~bitmap; 259 #else 260 /* Move bitmap to two 32-bit scalar registers. */ 261 uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0); 262 uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1); 263 /* Store zerobits bitmap. */ 264 zerobits[0] = ~bitmap0; 265 zerobits[1] = ~bitmap1; 266 #endif 267 } 268 269 270 /* Data preparation for encode_mcu_AC_refine(). 271 * 272 * The equivalent scalar C function (encode_mcu_AC_refine_prepare()) can be 273 * found in jcphuff.c. 274 */ 275 276 int jsimd_encode_mcu_AC_refine_prepare_neon 277 (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, 278 UJCOEF *absvalues, size_t *bits) 279 { 280 /* Temporary storage buffers for data used to compute the signbits bitmap and 281 * the end-of-block (EOB) position 282 */ 283 uint8_t coef_sign_bits[64]; 284 uint8_t coef_eq1_bits[64]; 285 286 UJCOEF *absvalues_ptr = absvalues; 287 uint8_t *coef_sign_bits_ptr = coef_sign_bits; 288 uint8_t *eq1_bits_ptr = coef_eq1_bits; 289 290 /* Rows of coefficients to zero (since they haven't been processed) */ 291 int i, rows_to_zero = 8; 292 293 for (i = 0; i < Sl / 16; i++) { 294 int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]); 295 coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1); 296 coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2); 297 coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3); 298 coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4); 299 coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5); 300 coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6); 301 coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7); 302 int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]); 303 coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1); 304 coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2); 305 coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3); 306 coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4); 307 coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5); 308 coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6); 309 coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7); 310 311 /* Compute and store data for signbits bitmap. */ 312 uint8x8_t sign_coefs1 = 313 vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15))); 314 uint8x8_t sign_coefs2 = 315 vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15))); 316 vst1_u8(coef_sign_bits_ptr, sign_coefs1); 317 vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2); 318 319 /* Compute absolute value of coefficients and apply point transform Al. */ 320 uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1)); 321 uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2)); 322 abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al)); 323 abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al)); 324 vst1q_u16(absvalues_ptr, abs_coefs1); 325 vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2); 326 327 /* Test whether transformed coefficient values == 1 (used to find EOB 328 * position.) 329 */ 330 uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1))); 331 uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1))); 332 vst1_u8(eq1_bits_ptr, coefs_eq11); 333 vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12); 334 335 absvalues_ptr += 16; 336 coef_sign_bits_ptr += 16; 337 eq1_bits_ptr += 16; 338 jpeg_natural_order_start += 16; 339 rows_to_zero -= 2; 340 } 341 342 /* Same operation but for remaining partial vector */ 343 int remaining_coefs = Sl % 16; 344 if (remaining_coefs > 8) { 345 int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]); 346 coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1); 347 coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2); 348 coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3); 349 coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4); 350 coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5); 351 coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6); 352 coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7); 353 int16x8_t coefs2 = vdupq_n_s16(0); 354 switch (remaining_coefs) { 355 case 15: 356 coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6); 357 FALLTHROUGH /*FALLTHROUGH*/ 358 case 14: 359 coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5); 360 FALLTHROUGH /*FALLTHROUGH*/ 361 case 13: 362 coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4); 363 FALLTHROUGH /*FALLTHROUGH*/ 364 case 12: 365 coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3); 366 FALLTHROUGH /*FALLTHROUGH*/ 367 case 11: 368 coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2); 369 FALLTHROUGH /*FALLTHROUGH*/ 370 case 10: 371 coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1); 372 FALLTHROUGH /*FALLTHROUGH*/ 373 case 9: 374 coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0); 375 FALLTHROUGH /*FALLTHROUGH*/ 376 default: 377 break; 378 } 379 380 /* Compute and store data for signbits bitmap. */ 381 uint8x8_t sign_coefs1 = 382 vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15))); 383 uint8x8_t sign_coefs2 = 384 vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15))); 385 vst1_u8(coef_sign_bits_ptr, sign_coefs1); 386 vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2); 387 388 /* Compute absolute value of coefficients and apply point transform Al. */ 389 uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1)); 390 uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2)); 391 abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al)); 392 abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al)); 393 vst1q_u16(absvalues_ptr, abs_coefs1); 394 vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2); 395 396 /* Test whether transformed coefficient values == 1 (used to find EOB 397 * position.) 398 */ 399 uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1))); 400 uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1))); 401 vst1_u8(eq1_bits_ptr, coefs_eq11); 402 vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12); 403 404 absvalues_ptr += 16; 405 coef_sign_bits_ptr += 16; 406 eq1_bits_ptr += 16; 407 jpeg_natural_order_start += 16; 408 rows_to_zero -= 2; 409 410 } else if (remaining_coefs > 0) { 411 int16x8_t coefs = vdupq_n_s16(0); 412 413 switch (remaining_coefs) { 414 case 8: 415 coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7); 416 FALLTHROUGH /*FALLTHROUGH*/ 417 case 7: 418 coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6); 419 FALLTHROUGH /*FALLTHROUGH*/ 420 case 6: 421 coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5); 422 FALLTHROUGH /*FALLTHROUGH*/ 423 case 5: 424 coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4); 425 FALLTHROUGH /*FALLTHROUGH*/ 426 case 4: 427 coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3); 428 FALLTHROUGH /*FALLTHROUGH*/ 429 case 3: 430 coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2); 431 FALLTHROUGH /*FALLTHROUGH*/ 432 case 2: 433 coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1); 434 FALLTHROUGH /*FALLTHROUGH*/ 435 case 1: 436 coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0); 437 FALLTHROUGH /*FALLTHROUGH*/ 438 default: 439 break; 440 } 441 442 /* Compute and store data for signbits bitmap. */ 443 uint8x8_t sign_coefs = 444 vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15))); 445 vst1_u8(coef_sign_bits_ptr, sign_coefs); 446 447 /* Compute absolute value of coefficients and apply point transform Al. */ 448 uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs)); 449 abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al)); 450 vst1q_u16(absvalues_ptr, abs_coefs); 451 452 /* Test whether transformed coefficient values == 1 (used to find EOB 453 * position.) 454 */ 455 uint8x8_t coefs_eq1 = vmovn_u16(vceqq_u16(abs_coefs, vdupq_n_u16(1))); 456 vst1_u8(eq1_bits_ptr, coefs_eq1); 457 458 absvalues_ptr += 8; 459 coef_sign_bits_ptr += 8; 460 eq1_bits_ptr += 8; 461 rows_to_zero--; 462 } 463 464 /* Zero remaining memory in blocks. */ 465 for (i = 0; i < rows_to_zero; i++) { 466 vst1q_u16(absvalues_ptr, vdupq_n_u16(0)); 467 vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0)); 468 vst1_u8(eq1_bits_ptr, vdup_n_u8(0)); 469 absvalues_ptr += 8; 470 coef_sign_bits_ptr += 8; 471 eq1_bits_ptr += 8; 472 } 473 474 /* Construct zerobits bitmap. */ 475 uint16x8_t abs_row0 = vld1q_u16(absvalues + 0 * DCTSIZE); 476 uint16x8_t abs_row1 = vld1q_u16(absvalues + 1 * DCTSIZE); 477 uint16x8_t abs_row2 = vld1q_u16(absvalues + 2 * DCTSIZE); 478 uint16x8_t abs_row3 = vld1q_u16(absvalues + 3 * DCTSIZE); 479 uint16x8_t abs_row4 = vld1q_u16(absvalues + 4 * DCTSIZE); 480 uint16x8_t abs_row5 = vld1q_u16(absvalues + 5 * DCTSIZE); 481 uint16x8_t abs_row6 = vld1q_u16(absvalues + 6 * DCTSIZE); 482 uint16x8_t abs_row7 = vld1q_u16(absvalues + 7 * DCTSIZE); 483 484 uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_u16(abs_row0, vdupq_n_u16(0))); 485 uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_u16(abs_row1, vdupq_n_u16(0))); 486 uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_u16(abs_row2, vdupq_n_u16(0))); 487 uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_u16(abs_row3, vdupq_n_u16(0))); 488 uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_u16(abs_row4, vdupq_n_u16(0))); 489 uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_u16(abs_row5, vdupq_n_u16(0))); 490 uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_u16(abs_row6, vdupq_n_u16(0))); 491 uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_u16(abs_row7, vdupq_n_u16(0))); 492 493 /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */ 494 const uint8x8_t bitmap_mask = 495 vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201)); 496 497 abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask); 498 abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask); 499 abs_row2_eq0 = vand_u8(abs_row2_eq0, bitmap_mask); 500 abs_row3_eq0 = vand_u8(abs_row3_eq0, bitmap_mask); 501 abs_row4_eq0 = vand_u8(abs_row4_eq0, bitmap_mask); 502 abs_row5_eq0 = vand_u8(abs_row5_eq0, bitmap_mask); 503 abs_row6_eq0 = vand_u8(abs_row6_eq0, bitmap_mask); 504 abs_row7_eq0 = vand_u8(abs_row7_eq0, bitmap_mask); 505 506 uint8x8_t bitmap_rows_01 = vpadd_u8(abs_row0_eq0, abs_row1_eq0); 507 uint8x8_t bitmap_rows_23 = vpadd_u8(abs_row2_eq0, abs_row3_eq0); 508 uint8x8_t bitmap_rows_45 = vpadd_u8(abs_row4_eq0, abs_row5_eq0); 509 uint8x8_t bitmap_rows_67 = vpadd_u8(abs_row6_eq0, abs_row7_eq0); 510 uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23); 511 uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67); 512 uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567); 513 514 #if defined(__aarch64__) || defined(_M_ARM64) 515 /* Move bitmap to a 64-bit scalar register. */ 516 uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0); 517 /* Store zerobits bitmap. */ 518 bits[0] = ~bitmap; 519 #else 520 /* Move bitmap to two 32-bit scalar registers. */ 521 uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0); 522 uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1); 523 /* Store zerobits bitmap. */ 524 bits[0] = ~bitmap0; 525 bits[1] = ~bitmap1; 526 #endif 527 528 /* Construct signbits bitmap. */ 529 uint8x8_t signbits_row0 = vld1_u8(coef_sign_bits + 0 * DCTSIZE); 530 uint8x8_t signbits_row1 = vld1_u8(coef_sign_bits + 1 * DCTSIZE); 531 uint8x8_t signbits_row2 = vld1_u8(coef_sign_bits + 2 * DCTSIZE); 532 uint8x8_t signbits_row3 = vld1_u8(coef_sign_bits + 3 * DCTSIZE); 533 uint8x8_t signbits_row4 = vld1_u8(coef_sign_bits + 4 * DCTSIZE); 534 uint8x8_t signbits_row5 = vld1_u8(coef_sign_bits + 5 * DCTSIZE); 535 uint8x8_t signbits_row6 = vld1_u8(coef_sign_bits + 6 * DCTSIZE); 536 uint8x8_t signbits_row7 = vld1_u8(coef_sign_bits + 7 * DCTSIZE); 537 538 signbits_row0 = vand_u8(signbits_row0, bitmap_mask); 539 signbits_row1 = vand_u8(signbits_row1, bitmap_mask); 540 signbits_row2 = vand_u8(signbits_row2, bitmap_mask); 541 signbits_row3 = vand_u8(signbits_row3, bitmap_mask); 542 signbits_row4 = vand_u8(signbits_row4, bitmap_mask); 543 signbits_row5 = vand_u8(signbits_row5, bitmap_mask); 544 signbits_row6 = vand_u8(signbits_row6, bitmap_mask); 545 signbits_row7 = vand_u8(signbits_row7, bitmap_mask); 546 547 bitmap_rows_01 = vpadd_u8(signbits_row0, signbits_row1); 548 bitmap_rows_23 = vpadd_u8(signbits_row2, signbits_row3); 549 bitmap_rows_45 = vpadd_u8(signbits_row4, signbits_row5); 550 bitmap_rows_67 = vpadd_u8(signbits_row6, signbits_row7); 551 bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23); 552 bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67); 553 bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567); 554 555 #if defined(__aarch64__) || defined(_M_ARM64) 556 /* Move bitmap to a 64-bit scalar register. */ 557 bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0); 558 /* Store signbits bitmap. */ 559 bits[1] = ~bitmap; 560 #else 561 /* Move bitmap to two 32-bit scalar registers. */ 562 bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0); 563 bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1); 564 /* Store signbits bitmap. */ 565 bits[2] = ~bitmap0; 566 bits[3] = ~bitmap1; 567 #endif 568 569 /* Construct bitmap to find EOB position (the index of the last coefficient 570 * equal to 1.) 571 */ 572 uint8x8_t row0_eq1 = vld1_u8(coef_eq1_bits + 0 * DCTSIZE); 573 uint8x8_t row1_eq1 = vld1_u8(coef_eq1_bits + 1 * DCTSIZE); 574 uint8x8_t row2_eq1 = vld1_u8(coef_eq1_bits + 2 * DCTSIZE); 575 uint8x8_t row3_eq1 = vld1_u8(coef_eq1_bits + 3 * DCTSIZE); 576 uint8x8_t row4_eq1 = vld1_u8(coef_eq1_bits + 4 * DCTSIZE); 577 uint8x8_t row5_eq1 = vld1_u8(coef_eq1_bits + 5 * DCTSIZE); 578 uint8x8_t row6_eq1 = vld1_u8(coef_eq1_bits + 6 * DCTSIZE); 579 uint8x8_t row7_eq1 = vld1_u8(coef_eq1_bits + 7 * DCTSIZE); 580 581 row0_eq1 = vand_u8(row0_eq1, bitmap_mask); 582 row1_eq1 = vand_u8(row1_eq1, bitmap_mask); 583 row2_eq1 = vand_u8(row2_eq1, bitmap_mask); 584 row3_eq1 = vand_u8(row3_eq1, bitmap_mask); 585 row4_eq1 = vand_u8(row4_eq1, bitmap_mask); 586 row5_eq1 = vand_u8(row5_eq1, bitmap_mask); 587 row6_eq1 = vand_u8(row6_eq1, bitmap_mask); 588 row7_eq1 = vand_u8(row7_eq1, bitmap_mask); 589 590 bitmap_rows_01 = vpadd_u8(row0_eq1, row1_eq1); 591 bitmap_rows_23 = vpadd_u8(row2_eq1, row3_eq1); 592 bitmap_rows_45 = vpadd_u8(row4_eq1, row5_eq1); 593 bitmap_rows_67 = vpadd_u8(row6_eq1, row7_eq1); 594 bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23); 595 bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67); 596 bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567); 597 598 #if defined(__aarch64__) || defined(_M_ARM64) 599 /* Move bitmap to a 64-bit scalar register. */ 600 bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0); 601 602 /* Return EOB position. */ 603 if (bitmap == 0) { 604 /* EOB position is defined to be 0 if all coefficients != 1. */ 605 return 0; 606 } else { 607 return 63 - BUILTIN_CLZLL(bitmap); 608 } 609 #else 610 /* Move bitmap to two 32-bit scalar registers. */ 611 bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0); 612 bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1); 613 614 /* Return EOB position. */ 615 if (bitmap0 == 0 && bitmap1 == 0) { 616 return 0; 617 } else if (bitmap1 != 0) { 618 return 63 - BUILTIN_CLZ(bitmap1); 619 } else { 620 return 31 - BUILTIN_CLZ(bitmap0); 621 } 622 #endif 623 }