jdsample-neon.c (25451B)
/*
 * jdsample-neon.c - upsampling (Arm Neon)
 *
 * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
 * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software.  If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"

#include <arm_neon.h>


/* The diagram below shows a row of samples produced by h2v1 downsampling.
 *
 *                s0        s1        s2
 *            +---------+---------+---------+
 *            |         |         |         |
 *            | p0   p1 | p2   p3 | p4   p5 |
 *            |         |         |         |
 *            +---------+---------+---------+
 *
 * Samples s0-s2 were created by averaging the original pixel component values
 * centered at positions p0-p5 above.  To approximate those original pixel
 * component values, we proportionally blend the adjacent samples in each row.
 *
 * An upsampled pixel component value is computed by blending the sample
 * containing the pixel center with the nearest neighboring sample, in the
 * ratio 3:1.  For example:
 *     p1(upsampled) = 3/4 * s0 + 1/4 * s1
 *     p2(upsampled) = 3/4 * s1 + 1/4 * s0
 * When computing the first and last pixel component values in the row, there
 * is no adjacent sample to blend, so:
 *     p0(upsampled) = s0
 *     p5(upsampled) = s2
 */

void jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
                                    JDIMENSION downsampled_width,
                                    JSAMPARRAY input_data,
                                    JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr, outptr;
  int inrow;
  unsigned colctr;
  /* Set up constants. */
  const uint16x8_t one_u16 = vdupq_n_u16(1);
  const uint8x8_t three_u8 = vdup_n_u8(3);

  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
    inptr = input_data[inrow];
    outptr = output_data[inrow];
    /* First pixel component value in this row of the original image */
    *outptr = (JSAMPLE)GETJSAMPLE(*inptr);

    /*    3/4 * containing sample + 1/4 * nearest neighboring sample
     * For p1: containing sample = s0, nearest neighboring sample = s1
     * For p2: containing sample = s1, nearest neighboring sample = s0
     */
    uint8x16_t s0 = vld1q_u8(inptr);
    uint8x16_t s1 = vld1q_u8(inptr + 1);
    /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
     * denote low half and high half respectively.
     */
    uint16x8_t s1_add_3s0_l =
      vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
    uint16x8_t s1_add_3s0_h =
      vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
    uint16x8_t s0_add_3s1_l =
      vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
    uint16x8_t s0_add_3s1_h =
      vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
    /* Add ordered dithering bias to odd pixel values.  (The even-indexed
     * outputs are rounded with vrshrn below, i.e. bias 2 before >>2; the
     * odd-indexed outputs get bias 1 here and then a truncating vshrn.)
     */
    s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
    s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);

    /* The offset is initially 1, because the first pixel component has already
     * been stored.  However, in subsequent iterations of the SIMD loop, this
     * offset is (2 * colctr - 1) to stay within the bounds of the sample
     * buffers without having to resort to a slow scalar tail case for the last
     * (downsampled_width % 16) samples.  See "Creation of 2-D sample arrays"
     * in jmemmgr.c for more details.
     */
    unsigned outptr_offset = 1;
    uint8x16x2_t output_pixels;

    /* We use software pipelining to maximise performance.  The code indented
     * an extra two spaces begins the next iteration of the loop.
     */
    for (colctr = 16; colctr < downsampled_width; colctr += 16) {

      s0 = vld1q_u8(inptr + colctr - 1);
      s1 = vld1q_u8(inptr + colctr);

      /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
      output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
                                         vrshrn_n_u16(s1_add_3s0_h, 2));
      output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
                                         vshrn_n_u16(s0_add_3s1_h, 2));

      /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
       * denote low half and high half respectively.
       */
      s1_add_3s0_l =
        vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
      s1_add_3s0_h =
        vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
      s0_add_3s1_l =
        vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
      s0_add_3s1_h =
        vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
      /* Add ordered dithering bias to odd pixel values. */
      s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
      s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);

      /* Store pixel component values to memory. */
      vst2q_u8(outptr + outptr_offset, output_pixels);
      outptr_offset = 2 * colctr - 1;
    }

    /* Complete the last iteration of the loop. */

    /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
    output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
                                       vrshrn_n_u16(s1_add_3s0_h, 2));
    output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
                                       vshrn_n_u16(s0_add_3s1_h, 2));
    /* Store pixel component values to memory. */
    vst2q_u8(outptr + outptr_offset, output_pixels);

    /* Last pixel component value in this row of the original image */
    outptr[2 * downsampled_width - 1] =
      GETJSAMPLE(inptr[downsampled_width - 1]);
  }
}


/* The diagram below shows an array of samples produced by h2v2 downsampling.
 *
 *                s0        s1        s2
 *            +---------+---------+---------+
 *            | p0   p1 | p2   p3 | p4   p5 |
 *       sA   |         |         |         |
 *            | p6   p7 | p8   p9 | p10  p11|
 *            +---------+---------+---------+
 *            | p12  p13| p14  p15| p16  p17|
 *       sB   |         |         |         |
 *            | p18  p19| p20  p21| p22  p23|
 *            +---------+---------+---------+
 *            | p24  p25| p26  p27| p28  p29|
 *       sC   |         |         |         |
 *            | p30  p31| p32  p33| p34  p35|
 *            +---------+---------+---------+
 *
 * Samples s0A-s2C were created by averaging the original pixel component
 * values centered at positions p0-p35 above.  To approximate one of those
 * original pixel component values, we proportionally blend the sample
 * containing the pixel center with the nearest neighboring samples in each
 * row, column, and diagonal.
 *
 * An upsampled pixel component value is computed by first blending the sample
 * containing the pixel center with the nearest neighboring samples in the
 * same column, in the ratio 3:1, and then blending each column sum with the
 * nearest neighboring column sum, in the ratio 3:1.
 * For example:
 *     p14(upsampled) = 3/4 * (3/4 * s1B + 1/4 * s1A) +
 *                      1/4 * (3/4 * s0B + 1/4 * s0A)
 *                    = 9/16 * s1B + 3/16 * s1A + 3/16 * s0B + 1/16 * s0A
 * When computing the first and last pixel component values in the row, there
 * is no horizontally adjacent sample to blend, so:
 *     p12(upsampled) = 3/4 * s0B + 1/4 * s0A
 *     p23(upsampled) = 3/4 * s2B + 1/4 * s2C
 * When computing the first and last pixel component values in the column,
 * there is no vertically adjacent sample to blend, so:
 *     p2(upsampled) = 3/4 * s1A + 1/4 * s0A
 *     p33(upsampled) = 3/4 * s1C + 1/4 * s2C
 * When computing the corner pixel component values, there is no adjacent
 * sample to blend, so:
 *     p0(upsampled) = s0A
 *     p35(upsampled) = s2C
 */

void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
                                    JDIMENSION downsampled_width,
                                    JSAMPARRAY input_data,
                                    JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
  int inrow, outrow;
  unsigned colctr;
  /* Set up constants. */
  const uint16x8_t seven_u16 = vdupq_n_u16(7);
  const uint8x8_t three_u8 = vdup_n_u8(3);
  const uint16x8_t three_u16 = vdupq_n_u16(3);

  inrow = outrow = 0;
  while (outrow < max_v_samp_factor) {
    /* NOTE(review): inptr0/inptr2 index the rows above and below the current
     * one, including input_data[-1] on the first iteration -- this assumes
     * the caller supplies valid context rows; confirm against the upsample
     * driver that invokes this routine.
     */
    inptr0 = input_data[inrow - 1];
    inptr1 = input_data[inrow];
    inptr2 = input_data[inrow + 1];
    /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
     * respectively.
     */
    outptr0 = output_data[outrow++];
    outptr1 = output_data[outrow++];

    /* First pixel component value in this row of the original image */
    int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0);
    *outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4);
    int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2);
    *outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4);

    /* Step 1: Blend samples vertically in columns s0 and s1.
     * Leave the divide by 4 until the end, when it can be done for both
     * dimensions at once, right-shifting by 4.
     */

    /* Load and compute s0colsum0 and s0colsum1. */
    uint8x16_t s0A = vld1q_u8(inptr0);
    uint8x16_t s0B = vld1q_u8(inptr1);
    uint8x16_t s0C = vld1q_u8(inptr2);
    /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
     * denote low half and high half respectively.
     */
    uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)),
                                      vget_low_u8(s0B), three_u8);
    uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)),
                                      vget_high_u8(s0B), three_u8);
    uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)),
                                      vget_low_u8(s0B), three_u8);
    uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)),
                                      vget_high_u8(s0B), three_u8);
    /* Load and compute s1colsum0 and s1colsum1. */
    uint8x16_t s1A = vld1q_u8(inptr0 + 1);
    uint8x16_t s1B = vld1q_u8(inptr1 + 1);
    uint8x16_t s1C = vld1q_u8(inptr2 + 1);
    uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)),
                                      vget_low_u8(s1B), three_u8);
    uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)),
                                      vget_high_u8(s1B), three_u8);
    uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)),
                                      vget_low_u8(s1B), three_u8);
    uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)),
                                      vget_high_u8(s1B), three_u8);

    /* Step 2: Blend the already-blended columns. */

    uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
    uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
    uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
    uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
    uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
    uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
    uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
    uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
    /* Add ordered dithering bias to odd pixel values.  (Bias 7 then a
     * truncating vshrn below; even pixels use the rounding vrshrn, i.e.
     * bias 8 before >>4.)
     */
    output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
    output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
    output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
    output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
    /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
    uint8x16x2_t output_pixels0 = { {
      vcombine_u8(vshrn_n_u16(output0_p1_l, 4), vshrn_n_u16(output0_p1_h, 4)),
      vcombine_u8(vrshrn_n_u16(output0_p2_l, 4), vrshrn_n_u16(output0_p2_h, 4))
    } };
    uint8x16x2_t output_pixels1 = { {
      vcombine_u8(vshrn_n_u16(output1_p1_l, 4), vshrn_n_u16(output1_p1_h, 4)),
      vcombine_u8(vrshrn_n_u16(output1_p2_l, 4), vrshrn_n_u16(output1_p2_h, 4))
    } };

    /* Store pixel component values to memory.
     * The minimum size of the output buffer for each row is 64 bytes => no
     * need to worry about buffer overflow here.  See "Creation of 2-D sample
     * arrays" in jmemmgr.c for more details.
     */
    vst2q_u8(outptr0 + 1, output_pixels0);
    vst2q_u8(outptr1 + 1, output_pixels1);

    /* The first pixel of the image shifted our loads and stores by one byte.
     * We have to re-align on a 32-byte boundary at some point before the end
     * of the row (we do it now on the 32/33 pixel boundary) to stay within the
     * bounds of the sample buffers without having to resort to a slow scalar
     * tail case for the last (downsampled_width % 16) samples.  See "Creation
     * of 2-D sample arrays" in jmemmgr.c for more details.
     */
    for (colctr = 16; colctr < downsampled_width; colctr += 16) {
      /* Step 1: Blend samples vertically in columns s0 and s1. */

      /* Load and compute s0colsum0 and s0colsum1. */
      s0A = vld1q_u8(inptr0 + colctr - 1);
      s0B = vld1q_u8(inptr1 + colctr - 1);
      s0C = vld1q_u8(inptr2 + colctr - 1);
      s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)), vget_low_u8(s0B),
                             three_u8);
      s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)), vget_high_u8(s0B),
                             three_u8);
      s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)), vget_low_u8(s0B),
                             three_u8);
      s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)), vget_high_u8(s0B),
                             three_u8);
      /* Load and compute s1colsum0 and s1colsum1. */
      s1A = vld1q_u8(inptr0 + colctr);
      s1B = vld1q_u8(inptr1 + colctr);
      s1C = vld1q_u8(inptr2 + colctr);
      s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)), vget_low_u8(s1B),
                             three_u8);
      s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)), vget_high_u8(s1B),
                             three_u8);
      s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)), vget_low_u8(s1B),
                             three_u8);
      s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)), vget_high_u8(s1B),
                             three_u8);

      /* Step 2: Blend the already-blended columns. */

      output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
      output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
      output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
      output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
      output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
      output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
      output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
      output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
      /* Add ordered dithering bias to odd pixel values. */
      output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
      output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
      output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
      output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
      /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
      output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
                                          vshrn_n_u16(output0_p1_h, 4));
      output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
                                          vrshrn_n_u16(output0_p2_h, 4));
      output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
                                          vshrn_n_u16(output1_p1_h, 4));
      output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
                                          vrshrn_n_u16(output1_p2_h, 4));
      /* Store pixel component values to memory. */
      vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0);
      vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1);
    }

    /* Last pixel component value in this row of the original image */
    int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
                    GETJSAMPLE(inptr0[downsampled_width - 1]);
    outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4);
    int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
                    GETJSAMPLE(inptr2[downsampled_width - 1]);
    outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4);
    inrow++;
  }
}


/* The diagram below shows a column of samples produced by h1v2 downsampling
 * (or by losslessly rotating or transposing an h2v1-downsampled image.)
 *
 *            +---------+
 *            |   p0    |
 *     sA     |         |
 *            |   p1    |
 *            +---------+
 *            |   p2    |
 *     sB     |         |
 *            |   p3    |
 *            +---------+
 *            |   p4    |
 *     sC     |         |
 *            |   p5    |
 *            +---------+
 *
 * Samples sA-sC were created by averaging the original pixel component values
 * centered at positions p0-p5 above.  To approximate those original pixel
 * component values, we proportionally blend the adjacent samples in each
 * column.
 *
 * An upsampled pixel component value is computed by blending the sample
 * containing the pixel center with the nearest neighboring sample, in the
 * ratio 3:1.
 * For example:
 *     p1(upsampled) = 3/4 * sA + 1/4 * sB
 *     p2(upsampled) = 3/4 * sB + 1/4 * sA
 * When computing the first and last pixel component values in the column,
 * there is no adjacent sample to blend, so:
 *     p0(upsampled) = sA
 *     p5(upsampled) = sC
 */

void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
                                    JDIMENSION downsampled_width,
                                    JSAMPARRAY input_data,
                                    JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
  int inrow, outrow;
  unsigned colctr;
  /* Set up constants. */
  const uint16x8_t one_u16 = vdupq_n_u16(1);
  const uint8x8_t three_u8 = vdup_n_u8(3);

  inrow = outrow = 0;
  while (outrow < max_v_samp_factor) {
    /* NOTE(review): indexes the rows above and below the current one,
     * including input_data[-1] on the first iteration -- assumes the caller
     * provides valid context rows; confirm against the upsample driver.
     */
    inptr0 = input_data[inrow - 1];
    inptr1 = input_data[inrow];
    inptr2 = input_data[inrow + 1];
    /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
     * respectively.
     */
    outptr0 = output_data[outrow++];
    outptr1 = output_data[outrow++];
    inrow++;

    /* The size of the input and output buffers is always a multiple of 32
     * bytes => no need to worry about buffer overflow when reading/writing
     * memory.  See "Creation of 2-D sample arrays" in jmemmgr.c for more
     * details.
     */
    for (colctr = 0; colctr < downsampled_width; colctr += 16) {
      /* Load samples. */
      uint8x16_t sA = vld1q_u8(inptr0 + colctr);
      uint8x16_t sB = vld1q_u8(inptr1 + colctr);
      uint8x16_t sC = vld1q_u8(inptr2 + colctr);
      /* Blend samples vertically. */
      uint16x8_t colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(sA)),
                                      vget_low_u8(sB), three_u8);
      uint16x8_t colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(sA)),
                                      vget_high_u8(sB), three_u8);
      uint16x8_t colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(sC)),
                                      vget_low_u8(sB), three_u8);
      uint16x8_t colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(sC)),
                                      vget_high_u8(sB), three_u8);
      /* Add ordered dithering bias to pixel values in even output rows.
       * (Bias 1 then a truncating vshrn; odd output rows use the rounding
       * vrshrn, i.e. bias 2 before >>2.)
       */
      colsum0_l = vaddq_u16(colsum0_l, one_u16);
      colsum0_h = vaddq_u16(colsum0_h, one_u16);
      /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
      uint8x16_t output_pixels0 = vcombine_u8(vshrn_n_u16(colsum0_l, 2),
                                              vshrn_n_u16(colsum0_h, 2));
      uint8x16_t output_pixels1 = vcombine_u8(vrshrn_n_u16(colsum1_l, 2),
                                              vrshrn_n_u16(colsum1_h, 2));
      /* Store pixel component values to memory. */
      vst1q_u8(outptr0 + colctr, output_pixels0);
      vst1q_u8(outptr1 + colctr, output_pixels1);
    }
  }
}


/* The diagram below shows a row of samples produced by h2v1 downsampling.
 *
 *                s0        s1
 *            +---------+---------+
 *            |         |         |
 *            | p0   p1 | p2   p3 |
 *            |         |         |
 *            +---------+---------+
 *
 * Samples s0 and s1 were created by averaging the original pixel component
 * values centered at positions p0-p3 above.
To approximate those original 481 * pixel component values, we duplicate the samples horizontally: 482 * p0(upsampled) = p1(upsampled) = s0 483 * p2(upsampled) = p3(upsampled) = s1 484 */ 485 486 void jsimd_h2v1_upsample_neon(int max_v_samp_factor, JDIMENSION output_width, 487 JSAMPARRAY input_data, 488 JSAMPARRAY *output_data_ptr) 489 { 490 JSAMPARRAY output_data = *output_data_ptr; 491 JSAMPROW inptr, outptr; 492 int inrow; 493 unsigned colctr; 494 495 for (inrow = 0; inrow < max_v_samp_factor; inrow++) { 496 inptr = input_data[inrow]; 497 outptr = output_data[inrow]; 498 for (colctr = 0; 2 * colctr < output_width; colctr += 16) { 499 uint8x16_t samples = vld1q_u8(inptr + colctr); 500 /* Duplicate the samples. The store operation below interleaves them so 501 * that adjacent pixel component values take on the same sample value, 502 * per above. 503 */ 504 uint8x16x2_t output_pixels = { { samples, samples } }; 505 /* Store pixel component values to memory. 506 * Due to the way sample buffers are allocated, we don't need to worry 507 * about tail cases when output_width is not a multiple of 32. See 508 * "Creation of 2-D sample arrays" in jmemmgr.c for details. 509 */ 510 vst2q_u8(outptr + 2 * colctr, output_pixels); 511 } 512 } 513 } 514 515 516 /* The diagram below shows an array of samples produced by h2v2 downsampling. 517 * 518 * s0 s1 519 * +---------+---------+ 520 * | p0 p1 | p2 p3 | 521 * sA | | | 522 * | p4 p5 | p6 p7 | 523 * +---------+---------+ 524 * | p8 p9 | p10 p11| 525 * sB | | | 526 * | p12 p13| p14 p15| 527 * +---------+---------+ 528 * 529 * Samples s0A-s1B were created by averaging the original pixel component 530 * values centered at positions p0-p15 above. 
To approximate those original 531 * pixel component values, we duplicate the samples both horizontally and 532 * vertically: 533 * p0(upsampled) = p1(upsampled) = p4(upsampled) = p5(upsampled) = s0A 534 * p2(upsampled) = p3(upsampled) = p6(upsampled) = p7(upsampled) = s1A 535 * p8(upsampled) = p9(upsampled) = p12(upsampled) = p13(upsampled) = s0B 536 * p10(upsampled) = p11(upsampled) = p14(upsampled) = p15(upsampled) = s1B 537 */ 538 539 void jsimd_h2v2_upsample_neon(int max_v_samp_factor, JDIMENSION output_width, 540 JSAMPARRAY input_data, 541 JSAMPARRAY *output_data_ptr) 542 { 543 JSAMPARRAY output_data = *output_data_ptr; 544 JSAMPROW inptr, outptr0, outptr1; 545 int inrow, outrow; 546 unsigned colctr; 547 548 for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) { 549 inptr = input_data[inrow]; 550 outptr0 = output_data[outrow++]; 551 outptr1 = output_data[outrow++]; 552 553 for (colctr = 0; 2 * colctr < output_width; colctr += 16) { 554 uint8x16_t samples = vld1q_u8(inptr + colctr); 555 /* Duplicate the samples. The store operation below interleaves them so 556 * that adjacent pixel component values take on the same sample value, 557 * per above. 558 */ 559 uint8x16x2_t output_pixels = { { samples, samples } }; 560 /* Store pixel component values for both output rows to memory. 561 * Due to the way sample buffers are allocated, we don't need to worry 562 * about tail cases when output_width is not a multiple of 32. See 563 * "Creation of 2-D sample arrays" in jmemmgr.c for details. 564 */ 565 vst2q_u8(outptr0 + 2 * colctr, output_pixels); 566 vst2q_u8(outptr1 + 2 * colctr, output_pixels); 567 } 568 } 569 }