jdcolext-neon.c (16425B)
1 /* 2 * jdcolext-neon.c - colorspace conversion (Arm Neon) 3 * 4 * Copyright (C) 2020, Arm Limited. All Rights Reserved. 5 * Copyright (C) 2020, D. R. Commander. All Rights Reserved. 6 * 7 * This software is provided 'as-is', without any express or implied 8 * warranty. In no event will the authors be held liable for any damages 9 * arising from the use of this software. 10 * 11 * Permission is granted to anyone to use this software for any purpose, 12 * including commercial applications, and to alter it and redistribute it 13 * freely, subject to the following restrictions: 14 * 15 * 1. The origin of this software must not be misrepresented; you must not 16 * claim that you wrote the original software. If you use this software 17 * in a product, an acknowledgment in the product documentation would be 18 * appreciated but is not required. 19 * 2. Altered source versions must be plainly marked as such, and must not be 20 * misrepresented as being the original software. 21 * 3. This notice may not be removed or altered from any source distribution. 22 */ 23 24 /* This file is included by jdcolor-neon.c. */ 25 26 27 /* YCbCr -> RGB conversion is defined by the following equations: 28 * R = Y + 1.40200 * (Cr - 128) 29 * G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) 30 * B = Y + 1.77200 * (Cb - 128) 31 * 32 * Scaled integer constants are used to avoid floating-point arithmetic: 33 * 0.3441467 = 11277 * 2^-15 34 * 0.7141418 = 23401 * 2^-15 35 * 1.4020386 = 22971 * 2^-14 36 * 1.7720337 = 29033 * 2^-14 37 * These constants are defined in jdcolor-neon.c. 38 * 39 * To ensure correct results, rounding is used when descaling. 40 */ 41 42 /* Notes on safe memory access for YCbCr -> RGB conversion routines: 43 * 44 * Input memory buffers can be safely overread up to the next multiple of 45 * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in 46 * jmemmgr.c. 47 * 48 * The output buffer cannot safely be written beyond output_width, since 49 * output_buf points to a possibly unpadded row in the decompressed image 50 * buffer allocated by the calling program. 51 */ 52 53 void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width, JSAMPIMAGE input_buf, 54 JDIMENSION input_row, JSAMPARRAY output_buf, 55 int num_rows) 56 { 57 JSAMPROW outptr; 58 /* Pointers to Y, Cb, and Cr data */ 59 JSAMPROW inptr0, inptr1, inptr2; 60 61 const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts); 62 const int16x8_t neg_128 = vdupq_n_s16(-128); 63 64 while (--num_rows >= 0) { 65 inptr0 = input_buf[0][input_row]; 66 inptr1 = input_buf[1][input_row]; 67 inptr2 = input_buf[2][input_row]; 68 input_row++; 69 outptr = *output_buf++; 70 int cols_remaining = output_width; 71 for (; cols_remaining >= 16; cols_remaining -= 16) { 72 uint8x16_t y = vld1q_u8(inptr0); 73 uint8x16_t cb = vld1q_u8(inptr1); 74 uint8x16_t cr = vld1q_u8(inptr2); 75 /* Subtract 128 from Cb and Cr. */ 76 int16x8_t cr_128_l = 77 vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), 78 vget_low_u8(cr))); 79 int16x8_t cr_128_h = 80 vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), 81 vget_high_u8(cr))); 82 int16x8_t cb_128_l = 83 vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), 84 vget_low_u8(cb))); 85 int16x8_t cb_128_h = 86 vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), 87 vget_high_u8(cb))); 88 /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */ 89 int32x4_t g_sub_y_ll = vmull_lane_s16(vget_low_s16(cb_128_l), consts, 0); 90 int32x4_t g_sub_y_lh = vmull_lane_s16(vget_high_s16(cb_128_l), 91 consts, 0); 92 int32x4_t g_sub_y_hl = vmull_lane_s16(vget_low_s16(cb_128_h), consts, 0); 93 int32x4_t g_sub_y_hh = vmull_lane_s16(vget_high_s16(cb_128_h), 94 consts, 0); 95 g_sub_y_ll = vmlsl_lane_s16(g_sub_y_ll, vget_low_s16(cr_128_l), 96 consts, 1); 97 g_sub_y_lh = vmlsl_lane_s16(g_sub_y_lh, vget_high_s16(cr_128_l), 98 consts, 1); 99 g_sub_y_hl = vmlsl_lane_s16(g_sub_y_hl, vget_low_s16(cr_128_h), 100 consts, 1); 101 g_sub_y_hh = vmlsl_lane_s16(g_sub_y_hh, vget_high_s16(cr_128_h), 102 consts, 1); 103 /* Descale G components: shift right 15, round, and narrow to 16-bit. */ 104 int16x8_t g_sub_y_l = vcombine_s16(vrshrn_n_s32(g_sub_y_ll, 15), 105 vrshrn_n_s32(g_sub_y_lh, 15)); 106 int16x8_t g_sub_y_h = vcombine_s16(vrshrn_n_s32(g_sub_y_hl, 15), 107 vrshrn_n_s32(g_sub_y_hh, 15)); 108 /* Compute R-Y: 1.40200 * (Cr - 128) */ 109 int16x8_t r_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_l, 1), 110 consts, 2); 111 int16x8_t r_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_h, 1), 112 consts, 2); 113 /* Compute B-Y: 1.77200 * (Cb - 128) */ 114 int16x8_t b_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_l, 1), 115 consts, 3); 116 int16x8_t b_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_h, 1), 117 consts, 3); 118 /* Add Y. */ 119 int16x8_t r_l = 120 vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_l), 121 vget_low_u8(y))); 122 int16x8_t r_h = 123 vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_h), 124 vget_high_u8(y))); 125 int16x8_t b_l = 126 vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_l), 127 vget_low_u8(y))); 128 int16x8_t b_h = 129 vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_h), 130 vget_high_u8(y))); 131 int16x8_t g_l = 132 vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_l), 133 vget_low_u8(y))); 134 int16x8_t g_h = 135 vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_h), 136 vget_high_u8(y))); 137 138 #if RGB_PIXELSIZE == 4 139 uint8x16x4_t rgba; 140 /* Convert each component to unsigned and narrow, clamping to [0-255]. */ 141 rgba.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h)); 142 rgba.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h)); 143 rgba.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h)); 144 /* Set alpha channel to opaque (0xFF). */ 145 rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF); 146 /* Store RGBA pixel data to memory. */ 147 vst4q_u8(outptr, rgba); 148 #elif RGB_PIXELSIZE == 3 149 uint8x16x3_t rgb; 150 /* Convert each component to unsigned and narrow, clamping to [0-255]. */ 151 rgb.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h)); 152 rgb.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h)); 153 rgb.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h)); 154 /* Store RGB pixel data to memory. */ 155 vst3q_u8(outptr, rgb); 156 #else 157 /* Pack R, G, and B values in ratio 5:6:5. */ 158 uint16x8_t rgb565_l = vqshluq_n_s16(r_l, 8); 159 rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(g_l, 8), 5); 160 rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(b_l, 8), 11); 161 uint16x8_t rgb565_h = vqshluq_n_s16(r_h, 8); 162 rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(g_h, 8), 5); 163 rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(b_h, 8), 11); 164 /* Store RGB pixel data to memory. */ 165 vst1q_u16((uint16_t *)outptr, rgb565_l); 166 vst1q_u16(((uint16_t *)outptr) + 8, rgb565_h); 167 #endif 168 169 /* Increment pointers. */ 170 inptr0 += 16; 171 inptr1 += 16; 172 inptr2 += 16; 173 outptr += (RGB_PIXELSIZE * 16); 174 } 175 176 if (cols_remaining >= 8) { 177 uint8x8_t y = vld1_u8(inptr0); 178 uint8x8_t cb = vld1_u8(inptr1); 179 uint8x8_t cr = vld1_u8(inptr2); 180 /* Subtract 128 from Cb and Cr. */ 181 int16x8_t cr_128 = 182 vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr)); 183 int16x8_t cb_128 = 184 vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb)); 185 /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */ 186 int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0); 187 int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0); 188 g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1); 189 g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1); 190 /* Descale G components: shift right 15, round, and narrow to 16-bit. */ 191 int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15), 192 vrshrn_n_s32(g_sub_y_h, 15)); 193 /* Compute R-Y: 1.40200 * (Cr - 128) */ 194 int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), 195 consts, 2); 196 /* Compute B-Y: 1.77200 * (Cb - 128) */ 197 int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), 198 consts, 3); 199 /* Add Y. */ 200 int16x8_t r = 201 vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y)); 202 int16x8_t b = 203 vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y)); 204 int16x8_t g = 205 vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y)); 206 207 #if RGB_PIXELSIZE == 4 208 uint8x8x4_t rgba; 209 /* Convert each component to unsigned and narrow, clamping to [0-255]. */ 210 rgba.val[RGB_RED] = vqmovun_s16(r); 211 rgba.val[RGB_GREEN] = vqmovun_s16(g); 212 rgba.val[RGB_BLUE] = vqmovun_s16(b); 213 /* Set alpha channel to opaque (0xFF). */ 214 rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF); 215 /* Store RGBA pixel data to memory. */ 216 vst4_u8(outptr, rgba); 217 #elif RGB_PIXELSIZE == 3 218 uint8x8x3_t rgb; 219 /* Convert each component to unsigned and narrow, clamping to [0-255]. */ 220 rgb.val[RGB_RED] = vqmovun_s16(r); 221 rgb.val[RGB_GREEN] = vqmovun_s16(g); 222 rgb.val[RGB_BLUE] = vqmovun_s16(b); 223 /* Store RGB pixel data to memory. */ 224 vst3_u8(outptr, rgb); 225 #else 226 /* Pack R, G, and B values in ratio 5:6:5. */ 227 uint16x8_t rgb565 = vqshluq_n_s16(r, 8); 228 rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5); 229 rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11); 230 /* Store RGB pixel data to memory. */ 231 vst1q_u16((uint16_t *)outptr, rgb565); 232 #endif 233 234 /* Increment pointers. */ 235 inptr0 += 8; 236 inptr1 += 8; 237 inptr2 += 8; 238 outptr += (RGB_PIXELSIZE * 8); 239 cols_remaining -= 8; 240 } 241 242 /* Handle the tail elements. */ 243 if (cols_remaining > 0) { 244 uint8x8_t y = vld1_u8(inptr0); 245 uint8x8_t cb = vld1_u8(inptr1); 246 uint8x8_t cr = vld1_u8(inptr2); 247 /* Subtract 128 from Cb and Cr. */ 248 int16x8_t cr_128 = 249 vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr)); 250 int16x8_t cb_128 = 251 vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb)); 252 /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */ 253 int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0); 254 int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0); 255 g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1); 256 g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1); 257 /* Descale G components: shift right 15, round, and narrow to 16-bit. */ 258 int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15), 259 vrshrn_n_s32(g_sub_y_h, 15)); 260 /* Compute R-Y: 1.40200 * (Cr - 128) */ 261 int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), 262 consts, 2); 263 /* Compute B-Y: 1.77200 * (Cb - 128) */ 264 int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), 265 consts, 3); 266 /* Add Y. */ 267 int16x8_t r = 268 vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y)); 269 int16x8_t b = 270 vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y)); 271 int16x8_t g = 272 vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y)); 273 274 #if RGB_PIXELSIZE == 4 275 uint8x8x4_t rgba; 276 /* Convert each component to unsigned and narrow, clamping to [0-255]. */ 277 rgba.val[RGB_RED] = vqmovun_s16(r); 278 rgba.val[RGB_GREEN] = vqmovun_s16(g); 279 rgba.val[RGB_BLUE] = vqmovun_s16(b); 280 /* Set alpha channel to opaque (0xFF). */ 281 rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF); 282 /* Store RGBA pixel data to memory. */ 283 switch (cols_remaining) { 284 case 7: 285 vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba, 6); 286 FALLTHROUGH /*FALLTHROUGH*/ 287 case 6: 288 vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba, 5); 289 FALLTHROUGH /*FALLTHROUGH*/ 290 case 5: 291 vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba, 4); 292 FALLTHROUGH /*FALLTHROUGH*/ 293 case 4: 294 vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba, 3); 295 FALLTHROUGH /*FALLTHROUGH*/ 296 case 3: 297 vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba, 2); 298 FALLTHROUGH /*FALLTHROUGH*/ 299 case 2: 300 vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba, 1); 301 FALLTHROUGH /*FALLTHROUGH*/ 302 case 1: 303 vst4_lane_u8(outptr, rgba, 0); 304 FALLTHROUGH /*FALLTHROUGH*/ 305 default: 306 break; 307 } 308 #elif RGB_PIXELSIZE == 3 309 uint8x8x3_t rgb; 310 /* Convert each component to unsigned and narrow, clamping to [0-255]. */ 311 rgb.val[RGB_RED] = vqmovun_s16(r); 312 rgb.val[RGB_GREEN] = vqmovun_s16(g); 313 rgb.val[RGB_BLUE] = vqmovun_s16(b); 314 /* Store RGB pixel data to memory. */ 315 switch (cols_remaining) { 316 case 7: 317 vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb, 6); 318 FALLTHROUGH /*FALLTHROUGH*/ 319 case 6: 320 vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb, 5); 321 FALLTHROUGH /*FALLTHROUGH*/ 322 case 5: 323 vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb, 4); 324 FALLTHROUGH /*FALLTHROUGH*/ 325 case 4: 326 vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb, 3); 327 FALLTHROUGH /*FALLTHROUGH*/ 328 case 3: 329 vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb, 2); 330 FALLTHROUGH /*FALLTHROUGH*/ 331 case 2: 332 vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb, 1); 333 FALLTHROUGH /*FALLTHROUGH*/ 334 case 1: 335 vst3_lane_u8(outptr, rgb, 0); 336 FALLTHROUGH /*FALLTHROUGH*/ 337 default: 338 break; 339 } 340 #else 341 /* Pack R, G, and B values in ratio 5:6:5. */ 342 uint16x8_t rgb565 = vqshluq_n_s16(r, 8); 343 rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5); 344 rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11); 345 /* Store RGB565 pixel data to memory. */ 346 switch (cols_remaining) { 347 case 7: 348 vst1q_lane_u16((uint16_t *)(outptr + 6 * RGB_PIXELSIZE), rgb565, 6); 349 FALLTHROUGH /*FALLTHROUGH*/ 350 case 6: 351 vst1q_lane_u16((uint16_t *)(outptr + 5 * RGB_PIXELSIZE), rgb565, 5); 352 FALLTHROUGH /*FALLTHROUGH*/ 353 case 5: 354 vst1q_lane_u16((uint16_t *)(outptr + 4 * RGB_PIXELSIZE), rgb565, 4); 355 FALLTHROUGH /*FALLTHROUGH*/ 356 case 4: 357 vst1q_lane_u16((uint16_t *)(outptr + 3 * RGB_PIXELSIZE), rgb565, 3); 358 FALLTHROUGH /*FALLTHROUGH*/ 359 case 3: 360 vst1q_lane_u16((uint16_t *)(outptr + 2 * RGB_PIXELSIZE), rgb565, 2); 361 FALLTHROUGH /*FALLTHROUGH*/ 362 case 2: 363 vst1q_lane_u16((uint16_t *)(outptr + RGB_PIXELSIZE), rgb565, 1); 364 FALLTHROUGH /*FALLTHROUGH*/ 365 case 1: 366 vst1q_lane_u16((uint16_t *)outptr, rgb565, 0); 367 FALLTHROUGH /*FALLTHROUGH*/ 368 default: 369 break; 370 } 371 #endif 372 } 373 } 374 }