/*
 * jdmrgext-neon.c - merged upsampling/color conversion (Arm Neon)
 *
 * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
 * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* This file is included by jdmerge-neon.c. */


/* These routines combine simple (non-fancy, i.e. non-smooth) h2v1 or h2v2
 * chroma upsampling and YCbCr -> RGB color conversion into a single function.
 *
 * As with the standalone functions, YCbCr -> RGB conversion is defined by the
 * following equations:
 *    R = Y                        + 1.40200 * (Cr - 128)
 *    G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
 *    B = Y + 1.77200 * (Cb - 128)
 *
 * Scaled integer constants are used to avoid floating-point arithmetic:
 *    0.3441467 = 11277 * 2^-15
 *    0.7141418 = 23401 * 2^-15
 *    1.4020386 = 22971 * 2^-14
 *    1.7720337 = 29033 * 2^-14
 * These constants are defined in jdmerge-neon.c.
 *
 * To ensure correct results, rounding is used when descaling.
 */
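
/* Worked example of rounded descaling (illustrative): for Cb = 255,
 * Cb - 128 = 127, and
 *    0.34414 * (Cb - 128) ~= (11277 * 127 + (1 << 14)) >> 15
 *                          = 1448563 >> 15 = 44
 * which matches the true value (43.71) rounded to the nearest integer,
 * whereas truncation would yield 43.  The 2^-14-scaled constants are applied
 * below using vqrdmulhq_lane_s16(), which computes a rounded
 * (2 * a * b) >> 16; pre-doubling Cr - 128 (or Cb - 128) therefore yields a
 * rounded (Cr - 128) * 22971 * 2^-14.  For example, with Cr = 255:
 *    (2 * (2 * 127) * 22971 + (1 << 15)) >> 16 = 178  (true value: 178.05)
 */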

/* Notes on safe memory access for merged upsampling/YCbCr -> RGB conversion
 * routines:
 *
 * Input memory buffers can be safely overread up to the next multiple of
 * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
 * jmemmgr.c.
 *
 * The output buffer cannot safely be written beyond output_width, since
 * output_buf points to a possibly unpadded row in the decompressed image
 * buffer allocated by the calling program.
 */
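
/* For reference, a minimal scalar sketch of what the vectorized kernels
 * below compute for each pair of output pixels that shares one chroma
 * sample (illustrative; clamp() denotes saturation to [0, 255], and the
 * signs of the constants follow from the equations above):
 *
 *    int cb_128 = cb[i] - 128, cr_128 = cr[i] - 128;
 *    int r_sub_y = (22971 * cr_128 + (1 << 13)) >> 14;
 *    int g_sub_y = (-11277 * cb_128 - 23401 * cr_128 + (1 << 14)) >> 15;
 *    int b_sub_y = (29033 * cb_128 + (1 << 13)) >> 14;
 *    r[2 * i]     = clamp(y[2 * i]     + r_sub_y);
 *    r[2 * i + 1] = clamp(y[2 * i + 1] + r_sub_y);
 *    (and likewise for g and b)
 */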

/* Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
 */

void jsimd_h2v1_merged_upsample_neon(JDIMENSION output_width,
                                     JSAMPIMAGE input_buf,
                                     JDIMENSION in_row_group_ctr,
                                     JSAMPARRAY output_buf)
{
  JSAMPROW outptr;
  /* Pointers to Y, Cb, and Cr data */
  JSAMPROW inptr0, inptr1, inptr2;

  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
  const int16x8_t neg_128 = vdupq_n_s16(-128);

  inptr0 = input_buf[0][in_row_group_ctr];
  inptr1 = input_buf[1][in_row_group_ctr];
  inptr2 = input_buf[2][in_row_group_ctr];
  outptr = output_buf[0];

  int cols_remaining = output_width;
  for (; cols_remaining >= 16; cols_remaining -= 16) {
    /* De-interleave Y component values into two separate vectors, one
     * containing the component values with even-numbered indices and one
     * containing the component values with odd-numbered indices.
     */
    uint8x8x2_t y = vld2_u8(inptr0);
    uint8x8_t cb = vld1_u8(inptr1);
    uint8x8_t cr = vld1_u8(inptr2);
    /* Subtract 128 from Cb and Cr. */
    int16x8_t cr_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
    int16x8_t cb_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
                                     vrshrn_n_s32(g_sub_y_h, 15));
    /* Compute R-Y: 1.40200 * (Cr - 128) */
    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
    /* Compute B-Y: 1.77200 * (Cb - 128) */
    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
    /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even"
     * and "odd" Y component values.  This effectively upsamples the chroma
     * components horizontally.
     */
    int16x8_t g_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y.val[0]));
    int16x8_t r_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y.val[0]));
    int16x8_t b_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y.val[0]));
    int16x8_t g_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y.val[1]));
    int16x8_t r_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y.val[1]));
    int16x8_t b_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y.val[1]));
    /* Convert each component to unsigned and narrow, clamping to [0-255].
     * Re-interleave the "even" and "odd" component values.
     */
    uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
    uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
    uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));

#ifdef RGB_ALPHA
    uint8x16x4_t rgba;
    rgba.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
    rgba.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
    rgba.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
    /* Set alpha channel to opaque (0xFF). */
    rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
    /* Store RGBA pixel data to memory. */
    vst4q_u8(outptr, rgba);
#else
    uint8x16x3_t rgb;
    rgb.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
    rgb.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
    rgb.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
    /* Store RGB pixel data to memory. */
    vst3q_u8(outptr, rgb);
#endif

    /* Increment pointers. */
    inptr0 += 16;
    inptr1 += 8;
    inptr2 += 8;
    outptr += (RGB_PIXELSIZE * 16);
  }
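
  /* If any pixels remain, compute a full 16-pixel group anyway (the input
   * buffers may safely be overread; see the notes above), but store only
   * cols_remaining pixels, using lane stores so that the output buffer is
   * never written beyond output_width.
   */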
  if (cols_remaining > 0) {
    /* De-interleave Y component values into two separate vectors, one
     * containing the component values with even-numbered indices and one
     * containing the component values with odd-numbered indices.
     */
    uint8x8x2_t y = vld2_u8(inptr0);
    uint8x8_t cb = vld1_u8(inptr1);
    uint8x8_t cr = vld1_u8(inptr2);
    /* Subtract 128 from Cb and Cr. */
    int16x8_t cr_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
    int16x8_t cb_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
                                     vrshrn_n_s32(g_sub_y_h, 15));
    /* Compute R-Y: 1.40200 * (Cr - 128) */
    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
    /* Compute B-Y: 1.77200 * (Cb - 128) */
    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
    /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even"
     * and "odd" Y component values.  This effectively upsamples the chroma
     * components horizontally.
     */
    int16x8_t g_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y.val[0]));
    int16x8_t r_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y.val[0]));
    int16x8_t b_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y.val[0]));
    int16x8_t g_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y.val[1]));
    int16x8_t r_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y.val[1]));
    int16x8_t b_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y.val[1]));
    /* Convert each component to unsigned and narrow, clamping to [0-255].
     * Re-interleave the "even" and "odd" component values.
     */
    uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
    uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
    uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));

#ifdef RGB_ALPHA
    uint8x8x4_t rgba_h;
    rgba_h.val[RGB_RED] = r.val[1];
    rgba_h.val[RGB_GREEN] = g.val[1];
    rgba_h.val[RGB_BLUE] = b.val[1];
    /* Set alpha channel to opaque (0xFF). */
    rgba_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    uint8x8x4_t rgba_l;
    rgba_l.val[RGB_RED] = r.val[0];
    rgba_l.val[RGB_GREEN] = g.val[0];
    rgba_l.val[RGB_BLUE] = b.val[0];
    /* Set alpha channel to opaque (0xFF). */
    rgba_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    /* Store RGBA pixel data to memory. */
    switch (cols_remaining) {
    case 15:
      vst4_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgba_h, 6);
      FALLTHROUGH /*FALLTHROUGH*/
    case 14:
      vst4_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgba_h, 5);
      FALLTHROUGH /*FALLTHROUGH*/
    case 13:
      vst4_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgba_h, 4);
      FALLTHROUGH /*FALLTHROUGH*/
    case 12:
      vst4_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgba_h, 3);
      FALLTHROUGH /*FALLTHROUGH*/
    case 11:
      vst4_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgba_h, 2);
      FALLTHROUGH /*FALLTHROUGH*/
    case 10:
      vst4_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgba_h, 1);
      FALLTHROUGH /*FALLTHROUGH*/
    case 9:
      vst4_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgba_h, 0);
      FALLTHROUGH /*FALLTHROUGH*/
    case 8:
      vst4_u8(outptr, rgba_l);
      break;
    case 7:
      vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba_l, 6);
      FALLTHROUGH /*FALLTHROUGH*/
    case 6:
      vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba_l, 5);
      FALLTHROUGH /*FALLTHROUGH*/
    case 5:
      vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba_l, 4);
      FALLTHROUGH /*FALLTHROUGH*/
    case 4:
      vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba_l, 3);
      FALLTHROUGH /*FALLTHROUGH*/
    case 3:
      vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba_l, 2);
      FALLTHROUGH /*FALLTHROUGH*/
    case 2:
      vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba_l, 1);
      FALLTHROUGH /*FALLTHROUGH*/
    case 1:
      vst4_lane_u8(outptr, rgba_l, 0);
      FALLTHROUGH /*FALLTHROUGH*/
    default:
      break;
    }
#else
    uint8x8x3_t rgb_h;
    rgb_h.val[RGB_RED] = r.val[1];
    rgb_h.val[RGB_GREEN] = g.val[1];
    rgb_h.val[RGB_BLUE] = b.val[1];
    uint8x8x3_t rgb_l;
    rgb_l.val[RGB_RED] = r.val[0];
    rgb_l.val[RGB_GREEN] = g.val[0];
    rgb_l.val[RGB_BLUE] = b.val[0];
    /* Store RGB pixel data to memory. */
    switch (cols_remaining) {
    case 15:
      vst3_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgb_h, 6);
      FALLTHROUGH /*FALLTHROUGH*/
    case 14:
      vst3_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgb_h, 5);
      FALLTHROUGH /*FALLTHROUGH*/
    case 13:
      vst3_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgb_h, 4);
      FALLTHROUGH /*FALLTHROUGH*/
    case 12:
      vst3_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgb_h, 3);
      FALLTHROUGH /*FALLTHROUGH*/
    case 11:
      vst3_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgb_h, 2);
      FALLTHROUGH /*FALLTHROUGH*/
    case 10:
      vst3_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgb_h, 1);
      FALLTHROUGH /*FALLTHROUGH*/
    case 9:
      vst3_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgb_h, 0);
      FALLTHROUGH /*FALLTHROUGH*/
    case 8:
      vst3_u8(outptr, rgb_l);
      break;
    case 7:
      vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb_l, 6);
      FALLTHROUGH /*FALLTHROUGH*/
    case 6:
      vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb_l, 5);
      FALLTHROUGH /*FALLTHROUGH*/
    case 5:
      vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb_l, 4);
      FALLTHROUGH /*FALLTHROUGH*/
    case 4:
      vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb_l, 3);
      FALLTHROUGH /*FALLTHROUGH*/
    case 3:
      vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb_l, 2);
      FALLTHROUGH /*FALLTHROUGH*/
    case 2:
      vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb_l, 1);
      FALLTHROUGH /*FALLTHROUGH*/
    case 1:
      vst3_lane_u8(outptr, rgb_l, 0);
      FALLTHROUGH /*FALLTHROUGH*/
    default:
      break;
    }
#endif
  }
}


/* Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
 *
 * See comments above for details regarding color conversion and safe memory
 * access.
 */
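
/* Note on indexing: in an h2v2 row group, two luma rows share a single Cb
 * row and a single Cr row, so input_buf[0] is indexed by in_row_group_ctr * 2
 * (and * 2 + 1) while input_buf[1] and input_buf[2] are indexed by
 * in_row_group_ctr alone.  Both output rows are converted using the same
 * chroma-derived values, which is what upsamples the chroma vertically.
 */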

void jsimd_h2v2_merged_upsample_neon(JDIMENSION output_width,
                                     JSAMPIMAGE input_buf,
                                     JDIMENSION in_row_group_ctr,
                                     JSAMPARRAY output_buf)
{
  JSAMPROW outptr0, outptr1;
  /* Pointers to Y (both rows), Cb, and Cr data */
  JSAMPROW inptr0_0, inptr0_1, inptr1, inptr2;

  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
  const int16x8_t neg_128 = vdupq_n_s16(-128);

  inptr0_0 = input_buf[0][in_row_group_ctr * 2];
  inptr0_1 = input_buf[0][in_row_group_ctr * 2 + 1];
  inptr1 = input_buf[1][in_row_group_ctr];
  inptr2 = input_buf[2][in_row_group_ctr];
  outptr0 = output_buf[0];
  outptr1 = output_buf[1];

  int cols_remaining = output_width;
  for (; cols_remaining >= 16; cols_remaining -= 16) {
    /* For each row, de-interleave Y component values into two separate
     * vectors, one containing the component values with even-numbered indices
     * and one containing the component values with odd-numbered indices.
     */
    uint8x8x2_t y0 = vld2_u8(inptr0_0);
    uint8x8x2_t y1 = vld2_u8(inptr0_1);
    uint8x8_t cb = vld1_u8(inptr1);
    uint8x8_t cr = vld1_u8(inptr2);
    /* Subtract 128 from Cb and Cr. */
    int16x8_t cr_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
    int16x8_t cb_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
                                     vrshrn_n_s32(g_sub_y_h, 15));
    /* Compute R-Y: 1.40200 * (Cr - 128) */
    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
    /* Compute B-Y: 1.77200 * (Cb - 128) */
    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
    /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
     * the "even" and "odd" Y component values.  This effectively upsamples
     * the chroma components both horizontally and vertically.
     */
    int16x8_t g0_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y0.val[0]));
    int16x8_t r0_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y0.val[0]));
    int16x8_t b0_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y0.val[0]));
    int16x8_t g0_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y0.val[1]));
    int16x8_t r0_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y0.val[1]));
    int16x8_t b0_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y0.val[1]));
    int16x8_t g1_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y1.val[0]));
    int16x8_t r1_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y1.val[0]));
    int16x8_t b1_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y1.val[0]));
    int16x8_t g1_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y1.val[1]));
    int16x8_t r1_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y1.val[1]));
    int16x8_t b1_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y1.val[1]));
    /* Convert each component to unsigned and narrow, clamping to [0-255].
     * Re-interleave the "even" and "odd" component values.
     */
    uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
    uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
    uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
    uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
    uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
    uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));

#ifdef RGB_ALPHA
    uint8x16x4_t rgba0, rgba1;
    rgba0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
    rgba1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
    rgba0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
    rgba1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
    rgba0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
    rgba1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
    /* Set alpha channel to opaque (0xFF). */
    rgba0.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
    rgba1.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
    /* Store RGBA pixel data to memory. */
    vst4q_u8(outptr0, rgba0);
    vst4q_u8(outptr1, rgba1);
#else
    uint8x16x3_t rgb0, rgb1;
    rgb0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
    rgb1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
    rgb0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
    rgb1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
    rgb0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
    rgb1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
    /* Store RGB pixel data to memory. */
    vst3q_u8(outptr0, rgb0);
    vst3q_u8(outptr1, rgb1);
#endif

    /* Increment pointers. */
    inptr0_0 += 16;
    inptr0_1 += 16;
    inptr1 += 8;
    inptr2 += 8;
    outptr0 += (RGB_PIXELSIZE * 16);
    outptr1 += (RGB_PIXELSIZE * 16);
  }
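
  /* The tail handling below mirrors the h2v1 routine above (compute a full
   * 16-pixel group, then store only cols_remaining pixels per row), except
   * that two output rows are stored.
   */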
  if (cols_remaining > 0) {
    /* For each row, de-interleave Y component values into two separate
     * vectors, one containing the component values with even-numbered indices
     * and one containing the component values with odd-numbered indices.
     */
    uint8x8x2_t y0 = vld2_u8(inptr0_0);
    uint8x8x2_t y1 = vld2_u8(inptr0_1);
    uint8x8_t cb = vld1_u8(inptr1);
    uint8x8_t cr = vld1_u8(inptr2);
    /* Subtract 128 from Cb and Cr. */
    int16x8_t cr_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
    int16x8_t cb_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
                                     vrshrn_n_s32(g_sub_y_h, 15));
    /* Compute R-Y: 1.40200 * (Cr - 128) */
    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
    /* Compute B-Y: 1.77200 * (Cb - 128) */
    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
    /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
     * the "even" and "odd" Y component values.  This effectively upsamples
     * the chroma components both horizontally and vertically.
     */
    int16x8_t g0_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y0.val[0]));
    int16x8_t r0_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y0.val[0]));
    int16x8_t b0_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y0.val[0]));
    int16x8_t g0_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y0.val[1]));
    int16x8_t r0_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y0.val[1]));
    int16x8_t b0_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y0.val[1]));
    int16x8_t g1_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y1.val[0]));
    int16x8_t r1_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y1.val[0]));
    int16x8_t b1_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y1.val[0]));
    int16x8_t g1_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y1.val[1]));
    int16x8_t r1_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y1.val[1]));
    int16x8_t b1_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y1.val[1]));
    /* Convert each component to unsigned and narrow, clamping to [0-255].
     * Re-interleave the "even" and "odd" component values.
     */
    uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
    uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
    uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
    uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
    uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
    uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));

#ifdef RGB_ALPHA
    uint8x8x4_t rgba0_h, rgba1_h;
    rgba0_h.val[RGB_RED] = r0.val[1];
    rgba1_h.val[RGB_RED] = r1.val[1];
    rgba0_h.val[RGB_GREEN] = g0.val[1];
    rgba1_h.val[RGB_GREEN] = g1.val[1];
    rgba0_h.val[RGB_BLUE] = b0.val[1];
    rgba1_h.val[RGB_BLUE] = b1.val[1];
    /* Set alpha channel to opaque (0xFF). */
    rgba0_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    rgba1_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);

    uint8x8x4_t rgba0_l, rgba1_l;
    rgba0_l.val[RGB_RED] = r0.val[0];
    rgba1_l.val[RGB_RED] = r1.val[0];
    rgba0_l.val[RGB_GREEN] = g0.val[0];
    rgba1_l.val[RGB_GREEN] = g1.val[0];
    rgba0_l.val[RGB_BLUE] = b0.val[0];
    rgba1_l.val[RGB_BLUE] = b1.val[0];
    /* Set alpha channel to opaque (0xFF). */
    rgba0_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    rgba1_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    /* Store RGBA pixel data to memory. */
    switch (cols_remaining) {
    case 15:
      vst4_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgba0_h, 6);
      vst4_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgba1_h, 6);
      FALLTHROUGH /*FALLTHROUGH*/
    case 14:
      vst4_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgba0_h, 5);
      vst4_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgba1_h, 5);
      FALLTHROUGH /*FALLTHROUGH*/
    case 13:
      vst4_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgba0_h, 4);
      vst4_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgba1_h, 4);
      FALLTHROUGH /*FALLTHROUGH*/
    case 12:
      vst4_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgba0_h, 3);
      vst4_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgba1_h, 3);
      FALLTHROUGH /*FALLTHROUGH*/
    case 11:
      vst4_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgba0_h, 2);
      vst4_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgba1_h, 2);
      FALLTHROUGH /*FALLTHROUGH*/
    case 10:
      vst4_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgba0_h, 1);
      vst4_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgba1_h, 1);
      FALLTHROUGH /*FALLTHROUGH*/
    case 9:
      vst4_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgba0_h, 0);
      vst4_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgba1_h, 0);
      FALLTHROUGH /*FALLTHROUGH*/
    case 8:
      vst4_u8(outptr0, rgba0_l);
      vst4_u8(outptr1, rgba1_l);
      break;
    case 7:
      vst4_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgba0_l, 6);
      vst4_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgba1_l, 6);
      FALLTHROUGH /*FALLTHROUGH*/
    case 6:
      vst4_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgba0_l, 5);
      vst4_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgba1_l, 5);
      FALLTHROUGH /*FALLTHROUGH*/
    case 5:
      vst4_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgba0_l, 4);
      vst4_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgba1_l, 4);
      FALLTHROUGH /*FALLTHROUGH*/
    case 4:
      vst4_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgba0_l, 3);
      vst4_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgba1_l, 3);
      FALLTHROUGH /*FALLTHROUGH*/
    case 3:
      vst4_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgba0_l, 2);
      vst4_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgba1_l, 2);
      FALLTHROUGH /*FALLTHROUGH*/
    case 2:
      vst4_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgba0_l, 1);
      vst4_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgba1_l, 1);
      FALLTHROUGH /*FALLTHROUGH*/
    case 1:
      vst4_lane_u8(outptr0, rgba0_l, 0);
      vst4_lane_u8(outptr1, rgba1_l, 0);
      FALLTHROUGH /*FALLTHROUGH*/
    default:
      break;
    }
#else
    uint8x8x3_t rgb0_h, rgb1_h;
    rgb0_h.val[RGB_RED] = r0.val[1];
    rgb1_h.val[RGB_RED] = r1.val[1];
    rgb0_h.val[RGB_GREEN] = g0.val[1];
    rgb1_h.val[RGB_GREEN] = g1.val[1];
    rgb0_h.val[RGB_BLUE] = b0.val[1];
    rgb1_h.val[RGB_BLUE] = b1.val[1];

    uint8x8x3_t rgb0_l, rgb1_l;
    rgb0_l.val[RGB_RED] = r0.val[0];
    rgb1_l.val[RGB_RED] = r1.val[0];
    rgb0_l.val[RGB_GREEN] = g0.val[0];
    rgb1_l.val[RGB_GREEN] = g1.val[0];
    rgb0_l.val[RGB_BLUE] = b0.val[0];
    rgb1_l.val[RGB_BLUE] = b1.val[0];
    /* Store RGB pixel data to memory. */
    switch (cols_remaining) {
    case 15:
      vst3_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgb0_h, 6);
      vst3_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgb1_h, 6);
      FALLTHROUGH /*FALLTHROUGH*/
    case 14:
      vst3_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgb0_h, 5);
      vst3_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgb1_h, 5);
      FALLTHROUGH /*FALLTHROUGH*/
    case 13:
      vst3_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgb0_h, 4);
      vst3_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgb1_h, 4);
      FALLTHROUGH /*FALLTHROUGH*/
    case 12:
      vst3_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgb0_h, 3);
      vst3_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgb1_h, 3);
      FALLTHROUGH /*FALLTHROUGH*/
    case 11:
      vst3_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgb0_h, 2);
      vst3_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgb1_h, 2);
      FALLTHROUGH /*FALLTHROUGH*/
    case 10:
      vst3_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgb0_h, 1);
      vst3_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgb1_h, 1);
      FALLTHROUGH /*FALLTHROUGH*/
    case 9:
      vst3_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgb0_h, 0);
      vst3_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgb1_h, 0);
      FALLTHROUGH /*FALLTHROUGH*/
    case 8:
      vst3_u8(outptr0, rgb0_l);
      vst3_u8(outptr1, rgb1_l);
      break;
    case 7:
      vst3_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgb0_l, 6);
      vst3_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgb1_l, 6);
      FALLTHROUGH /*FALLTHROUGH*/
    case 6:
      vst3_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgb0_l, 5);
      vst3_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgb1_l, 5);
      FALLTHROUGH /*FALLTHROUGH*/
    case 5:
      vst3_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgb0_l, 4);
      vst3_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgb1_l, 4);
      FALLTHROUGH /*FALLTHROUGH*/
    case 4:
      vst3_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgb0_l, 3);
      vst3_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgb1_l, 3);
      FALLTHROUGH /*FALLTHROUGH*/
    case 3:
      vst3_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgb0_l, 2);
      vst3_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgb1_l, 2);
      FALLTHROUGH /*FALLTHROUGH*/
    case 2:
      vst3_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgb0_l, 1);
      vst3_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgb1_l, 1);
      FALLTHROUGH /*FALLTHROUGH*/
    case 1:
      vst3_lane_u8(outptr0, rgb0_l, 0);
      vst3_lane_u8(outptr1, rgb1_l, 0);
      FALLTHROUGH /*FALLTHROUGH*/
    default:
      break;
    }
#endif
  }
}
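
/* A sketch of how jdmerge-neon.c is presumed to instantiate these routines
 * once per output colorspace, following the pattern of the other "ext"
 * source files (illustrative; the exact macro and function names for the
 * RGBX variant shown here are assumptions):
 *
 *    #define RGB_RED  EXT_RGBX_RED
 *    #define RGB_GREEN  EXT_RGBX_GREEN
 *    #define RGB_BLUE  EXT_RGBX_BLUE
 *    #define RGB_ALPHA  3
 *    #define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
 *    #define jsimd_h2v1_merged_upsample_neon \
 *            jsimd_h2v1_extrgbx_merged_upsample_neon
 *    #define jsimd_h2v2_merged_upsample_neon \
 *            jsimd_h2v2_extrgbx_merged_upsample_neon
 *    #include "jdmrgext-neon.c"
 *    #undef RGB_RED
 *    ...and so on for each supported pixel layout.
 */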