tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jdsample-neon.c (25451B)


      1 /*
      2 * jdsample-neon.c - upsampling (Arm Neon)
      3 *
      4 * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
      5 * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
      6 *
      7 * This software is provided 'as-is', without any express or implied
      8 * warranty.  In no event will the authors be held liable for any damages
      9 * arising from the use of this software.
     10 *
     11 * Permission is granted to anyone to use this software for any purpose,
     12 * including commercial applications, and to alter it and redistribute it
     13 * freely, subject to the following restrictions:
     14 *
     15 * 1. The origin of this software must not be misrepresented; you must not
     16 *    claim that you wrote the original software. If you use this software
     17 *    in a product, an acknowledgment in the product documentation would be
     18 *    appreciated but is not required.
     19 * 2. Altered source versions must be plainly marked as such, and must not be
     20 *    misrepresented as being the original software.
     21 * 3. This notice may not be removed or altered from any source distribution.
     22 */
     23 
     24 #define JPEG_INTERNALS
     25 #include "../../jinclude.h"
     26 #include "../../jpeglib.h"
     27 #include "../../jsimd.h"
     28 #include "../../jdct.h"
     29 #include "../../jsimddct.h"
     30 #include "../jsimd.h"
     31 
     32 #include <arm_neon.h>
     33 
     34 
     35 /* The diagram below shows a row of samples produced by h2v1 downsampling.
     36 *
     37 *                s0        s1        s2
     38 *            +---------+---------+---------+
     39 *            |         |         |         |
     40 *            | p0   p1 | p2   p3 | p4   p5 |
     41 *            |         |         |         |
     42 *            +---------+---------+---------+
     43 *
     44 * Samples s0-s2 were created by averaging the original pixel component values
     45 * centered at positions p0-p5 above.  To approximate those original pixel
     46 * component values, we proportionally blend the adjacent samples in each row.
     47 *
     48 * An upsampled pixel component value is computed by blending the sample
     49 * containing the pixel center with the nearest neighboring sample, in the
     50 * ratio 3:1.  For example:
     51 *     p1(upsampled) = 3/4 * s0 + 1/4 * s1
     52 *     p2(upsampled) = 3/4 * s1 + 1/4 * s0
     53 * When computing the first and last pixel component values in the row, there
     54 * is no adjacent sample to blend, so:
     55 *     p0(upsampled) = s0
     56 *     p5(upsampled) = s2
     57 */
     58 
/* Fancy (triangle-filter) upsampling of a component row that was downsampled
 * 2:1 horizontally (h2v2 of the algorithm diagrammed above.)
 *
 * max_v_samp_factor   number of rows to process
 * downsampled_width   width, in samples, of each input row
 * input_data          array of input sample rows
 * output_data_ptr     pointer to the array of output rows; each output row is
 *                     2 * downsampled_width samples wide and written in place
 */
void jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
                                    JDIMENSION downsampled_width,
                                    JSAMPARRAY input_data,
                                    JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr, outptr;
  int inrow;
  unsigned colctr;
  /* Set up constants. */
  const uint16x8_t one_u16 = vdupq_n_u16(1);
  const uint8x8_t three_u8 = vdup_n_u8(3);

  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
    inptr = input_data[inrow];
    outptr = output_data[inrow];
    /* First pixel component value in this row of the original image */
    *outptr = (JSAMPLE)GETJSAMPLE(*inptr);

    /*    3/4 * containing sample + 1/4 * nearest neighboring sample
     * For p1: containing sample = s0, nearest neighboring sample = s1
     * For p2: containing sample = s1, nearest neighboring sample = s0
     */
    uint8x16_t s0 = vld1q_u8(inptr);
    uint8x16_t s1 = vld1q_u8(inptr + 1);
    /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
     * denote low half and high half respectively.
     */
    uint16x8_t s1_add_3s0_l =
      vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
    uint16x8_t s1_add_3s0_h =
      vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
    uint16x8_t s0_add_3s1_l =
      vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
    uint16x8_t s0_add_3s1_h =
      vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
    /* Add ordered dithering bias to odd pixel values.  The +1 bias combined
     * with the truncating shift below rounds odd pixels differently from the
     * round-to-nearest used for even pixels, implementing ordered dithering.
     */
    s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
    s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);

    /* The offset is initially 1, because the first pixel component has already
     * been stored.  However, in subsequent iterations of the SIMD loop, this
     * offset is (2 * colctr - 1) to stay within the bounds of the sample
     * buffers without having to resort to a slow scalar tail case for the last
     * (downsampled_width % 16) samples.  See "Creation of 2-D sample arrays"
     * in jmemmgr.c for more details.
     */
    unsigned outptr_offset = 1;
    uint8x16x2_t output_pixels;

    /* We use software pipelining to maximise performance.  The code indented
     * an extra two spaces begins the next iteration of the loop.
     */
    for (colctr = 16; colctr < downsampled_width; colctr += 16) {

        s0 = vld1q_u8(inptr + colctr - 1);
        s1 = vld1q_u8(inptr + colctr);

      /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine.
       * vrshrn rounds to nearest; vshrn truncates (the +1 bias above already
       * accounts for the rounding of the odd pixels.)
       */
      output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
                                         vrshrn_n_u16(s1_add_3s0_h, 2));
      output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
                                         vshrn_n_u16(s0_add_3s1_h, 2));

        /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
         * denote low half and high half respectively.
         */
        s1_add_3s0_l =
          vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
        s1_add_3s0_h =
          vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
        s0_add_3s1_l =
          vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
        s0_add_3s1_h =
          vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
        /* Add ordered dithering bias to odd pixel values. */
        s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
        s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);

      /* Store pixel component values to memory.  vst2q interleaves the two
       * vectors, producing the p1, p2, p1, p2, ... output ordering.
       */
      vst2q_u8(outptr + outptr_offset, output_pixels);
      outptr_offset = 2 * colctr - 1;
    }

    /* Complete the last iteration of the loop. */

    /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
    output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
                                       vrshrn_n_u16(s1_add_3s0_h, 2));
    output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
                                       vshrn_n_u16(s0_add_3s1_h, 2));
    /* Store pixel component values to memory. */
    vst2q_u8(outptr + outptr_offset, output_pixels);

    /* Last pixel component value in this row of the original image */
    outptr[2 * downsampled_width - 1] =
      GETJSAMPLE(inptr[downsampled_width - 1]);
  }
}
    158 
    159 
    160 /* The diagram below shows an array of samples produced by h2v2 downsampling.
    161 *
    162 *                s0        s1        s2
    163 *            +---------+---------+---------+
    164 *            | p0   p1 | p2   p3 | p4   p5 |
    165 *       sA   |         |         |         |
    166 *            | p6   p7 | p8   p9 | p10  p11|
    167 *            +---------+---------+---------+
    168 *            | p12  p13| p14  p15| p16  p17|
    169 *       sB   |         |         |         |
    170 *            | p18  p19| p20  p21| p22  p23|
    171 *            +---------+---------+---------+
    172 *            | p24  p25| p26  p27| p28  p29|
    173 *       sC   |         |         |         |
    174 *            | p30  p31| p32  p33| p34  p35|
    175 *            +---------+---------+---------+
    176 *
    177 * Samples s0A-s2C were created by averaging the original pixel component
    178 * values centered at positions p0-p35 above.  To approximate one of those
    179 * original pixel component values, we proportionally blend the sample
    180 * containing the pixel center with the nearest neighboring samples in each
    181 * row, column, and diagonal.
    182 *
    183 * An upsampled pixel component value is computed by first blending the sample
    184 * containing the pixel center with the nearest neighboring samples in the
    185 * same column, in the ratio 3:1, and then blending each column sum with the
    186 * nearest neighboring column sum, in the ratio 3:1.  For example:
    187 *     p14(upsampled) = 3/4 * (3/4 * s1B + 1/4 * s1A) +
    188 *                      1/4 * (3/4 * s0B + 1/4 * s0A)
    189 *                    = 9/16 * s1B + 3/16 * s1A + 3/16 * s0B + 1/16 * s0A
    190 * When computing the first and last pixel component values in the row, there
    191 * is no horizontally adjacent sample to blend, so:
    192 *     p12(upsampled) = 3/4 * s0B + 1/4 * s0A
    193 *     p23(upsampled) = 3/4 * s2B + 1/4 * s2C
    194 * When computing the first and last pixel component values in the column,
    195 * there is no vertically adjacent sample to blend, so:
    196 *     p2(upsampled) = 3/4 * s1A + 1/4 * s0A
    197 *     p33(upsampled) = 3/4 * s1C + 1/4 * s2C
    198 * When computing the corner pixel component values, there is no adjacent
    199 * sample to blend, so:
    200 *     p0(upsampled) = s0A
    201 *     p35(upsampled) = s2C
    202 */
    203 
/* Fancy (triangle-filter) upsampling of a component that was downsampled 2:1
 * both horizontally and vertically (h2v2), per the algorithm diagrammed
 * above.  Each input row produces two output rows, each twice as wide.
 *
 * max_v_samp_factor   number of OUTPUT rows to produce
 * downsampled_width   width, in samples, of each input row
 * input_data          array of input sample rows
 * output_data_ptr     pointer to the array of output rows (written in place)
 *
 * NOTE(review): input_data[inrow - 1] and input_data[inrow + 1] assume the
 * caller provides access to the rows above and below the current row
 * (context rows) -- confirm against the upsampler's row-group contract in
 * jdsample.c.
 */
void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
                                    JDIMENSION downsampled_width,
                                    JSAMPARRAY input_data,
                                    JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
  int inrow, outrow;
  unsigned colctr;
  /* Set up constants. */
  const uint16x8_t seven_u16 = vdupq_n_u16(7);
  const uint8x8_t three_u8 = vdup_n_u8(3);
  const uint16x8_t three_u16 = vdupq_n_u16(3);

  inrow = outrow = 0;
  while (outrow < max_v_samp_factor) {
    inptr0 = input_data[inrow - 1];
    inptr1 = input_data[inrow];
    inptr2 = input_data[inrow + 1];
    /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
     * respectively.
     */
    outptr0 = output_data[outrow++];
    outptr1 = output_data[outrow++];

    /* First pixel component value in this row of the original image.
     * The scalar computation mirrors the vector path: (3 * containing +
     * neighbor) * 4, +8 for round-to-nearest, then >> 4.
     */
    int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0);
    *outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4);
    int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2);
    *outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4);

    /* Step 1: Blend samples vertically in columns s0 and s1.
     * Leave the divide by 4 until the end, when it can be done for both
     * dimensions at once, right-shifting by 4.
     */

    /* Load and compute s0colsum0 and s0colsum1. */
    uint8x16_t s0A = vld1q_u8(inptr0);
    uint8x16_t s0B = vld1q_u8(inptr1);
    uint8x16_t s0C = vld1q_u8(inptr2);
    /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
     * denote low half and high half respectively.
     */
    uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)),
                                      vget_low_u8(s0B), three_u8);
    uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)),
                                      vget_high_u8(s0B), three_u8);
    uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)),
                                      vget_low_u8(s0B), three_u8);
    uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)),
                                      vget_high_u8(s0B), three_u8);
    /* Load and compute s1colsum0 and s1colsum1. */
    uint8x16_t s1A = vld1q_u8(inptr0 + 1);
    uint8x16_t s1B = vld1q_u8(inptr1 + 1);
    uint8x16_t s1C = vld1q_u8(inptr2 + 1);
    uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)),
                                      vget_low_u8(s1B), three_u8);
    uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)),
                                      vget_high_u8(s1B), three_u8);
    uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)),
                                      vget_low_u8(s1B), three_u8);
    uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)),
                                      vget_high_u8(s1B), three_u8);

    /* Step 2: Blend the already-blended columns. */

    uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
    uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
    uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
    uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
    uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
    uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
    uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
    uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
    /* Add ordered dithering bias to odd pixel values.  The +7 bias combined
     * with the truncating shift below rounds odd pixels differently from the
     * round-to-nearest (vrshrn) used for even pixels.
     */
    output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
    output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
    output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
    output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
    /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
    uint8x16x2_t output_pixels0 = { {
      vcombine_u8(vshrn_n_u16(output0_p1_l, 4), vshrn_n_u16(output0_p1_h, 4)),
      vcombine_u8(vrshrn_n_u16(output0_p2_l, 4), vrshrn_n_u16(output0_p2_h, 4))
    } };
    uint8x16x2_t output_pixels1 = { {
      vcombine_u8(vshrn_n_u16(output1_p1_l, 4), vshrn_n_u16(output1_p1_h, 4)),
      vcombine_u8(vrshrn_n_u16(output1_p2_l, 4), vrshrn_n_u16(output1_p2_h, 4))
    } };

    /* Store pixel component values to memory.
     * The minimum size of the output buffer for each row is 64 bytes => no
     * need to worry about buffer overflow here.  See "Creation of 2-D sample
     * arrays" in jmemmgr.c for more details.
     */
    vst2q_u8(outptr0 + 1, output_pixels0);
    vst2q_u8(outptr1 + 1, output_pixels1);

    /* The first pixel of the image shifted our loads and stores by one byte.
     * We have to re-align on a 32-byte boundary at some point before the end
     * of the row (we do it now on the 32/33 pixel boundary) to stay within the
     * bounds of the sample buffers without having to resort to a slow scalar
     * tail case for the last (downsampled_width % 16) samples.  See "Creation
     * of 2-D sample arrays" in jmemmgr.c for more details.
     */
    for (colctr = 16; colctr < downsampled_width; colctr += 16) {
      /* Step 1: Blend samples vertically in columns s0 and s1. */

      /* Load and compute s0colsum0 and s0colsum1. */
      s0A = vld1q_u8(inptr0 + colctr - 1);
      s0B = vld1q_u8(inptr1 + colctr - 1);
      s0C = vld1q_u8(inptr2 + colctr - 1);
      s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)), vget_low_u8(s0B),
                             three_u8);
      s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)), vget_high_u8(s0B),
                             three_u8);
      s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)), vget_low_u8(s0B),
                             three_u8);
      s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)), vget_high_u8(s0B),
                             three_u8);
      /* Load and compute s1colsum0 and s1colsum1. */
      s1A = vld1q_u8(inptr0 + colctr);
      s1B = vld1q_u8(inptr1 + colctr);
      s1C = vld1q_u8(inptr2 + colctr);
      s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)), vget_low_u8(s1B),
                             three_u8);
      s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)), vget_high_u8(s1B),
                             three_u8);
      s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)), vget_low_u8(s1B),
                             three_u8);
      s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)), vget_high_u8(s1B),
                             three_u8);

      /* Step 2: Blend the already-blended columns. */

      output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
      output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
      output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
      output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
      output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
      output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
      output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
      output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
      /* Add ordered dithering bias to odd pixel values. */
      output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
      output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
      output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
      output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
      /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
      output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
                                          vshrn_n_u16(output0_p1_h, 4));
      output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
                                          vrshrn_n_u16(output0_p2_h, 4));
      output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
                                          vshrn_n_u16(output1_p1_h, 4));
      output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
                                          vrshrn_n_u16(output1_p2_h, 4));
      /* Store pixel component values to memory. */
      vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0);
      vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1);
    }

    /* Last pixel component value in this row of the original image.
     * +7 (not +8) matches the truncating bias used for odd pixels above.
     */
    int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
                    GETJSAMPLE(inptr0[downsampled_width - 1]);
    outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4);
    int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
                    GETJSAMPLE(inptr2[downsampled_width - 1]);
    outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4);
    inrow++;
  }
}
    375 
    376 
    377 /* The diagram below shows a column of samples produced by h1v2 downsampling
    378 * (or by losslessly rotating or transposing an h2v1-downsampled image.)
    379 *
    380 *            +---------+
    381 *            |   p0    |
    382 *     sA     |         |
    383 *            |   p1    |
    384 *            +---------+
    385 *            |   p2    |
    386 *     sB     |         |
    387 *            |   p3    |
    388 *            +---------+
    389 *            |   p4    |
    390 *     sC     |         |
    391 *            |   p5    |
    392 *            +---------+
    393 *
    394 * Samples sA-sC were created by averaging the original pixel component values
    395 * centered at positions p0-p5 above.  To approximate those original pixel
    396 * component values, we proportionally blend the adjacent samples in each
    397 * column.
    398 *
    399 * An upsampled pixel component value is computed by blending the sample
    400 * containing the pixel center with the nearest neighboring sample, in the
    401 * ratio 3:1.  For example:
    402 *     p1(upsampled) = 3/4 * sA + 1/4 * sB
    403 *     p2(upsampled) = 3/4 * sB + 1/4 * sA
    404 * When computing the first and last pixel component values in the column,
    405 * there is no adjacent sample to blend, so:
    406 *     p0(upsampled) = sA
    407 *     p5(upsampled) = sC
    408 */
    409 
    410 void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
    411                                    JDIMENSION downsampled_width,
    412                                    JSAMPARRAY input_data,
    413                                    JSAMPARRAY *output_data_ptr)
    414 {
    415  JSAMPARRAY output_data = *output_data_ptr;
    416  JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
    417  int inrow, outrow;
    418  unsigned colctr;
    419  /* Set up constants. */
    420  const uint16x8_t one_u16 = vdupq_n_u16(1);
    421  const uint8x8_t three_u8 = vdup_n_u8(3);
    422 
    423  inrow = outrow = 0;
    424  while (outrow < max_v_samp_factor) {
    425    inptr0 = input_data[inrow - 1];
    426    inptr1 = input_data[inrow];
    427    inptr2 = input_data[inrow + 1];
    428    /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
    429     * respectively.
    430     */
    431    outptr0 = output_data[outrow++];
    432    outptr1 = output_data[outrow++];
    433    inrow++;
    434 
    435    /* The size of the input and output buffers is always a multiple of 32
    436     * bytes => no need to worry about buffer overflow when reading/writing
    437     * memory.  See "Creation of 2-D sample arrays" in jmemmgr.c for more
    438     * details.
    439     */
    440    for (colctr = 0; colctr < downsampled_width; colctr += 16) {
    441      /* Load samples. */
    442      uint8x16_t sA = vld1q_u8(inptr0 + colctr);
    443      uint8x16_t sB = vld1q_u8(inptr1 + colctr);
    444      uint8x16_t sC = vld1q_u8(inptr2 + colctr);
    445      /* Blend samples vertically. */
    446      uint16x8_t colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(sA)),
    447                                      vget_low_u8(sB), three_u8);
    448      uint16x8_t colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(sA)),
    449                                      vget_high_u8(sB), three_u8);
    450      uint16x8_t colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(sC)),
    451                                      vget_low_u8(sB), three_u8);
    452      uint16x8_t colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(sC)),
    453                                      vget_high_u8(sB), three_u8);
    454      /* Add ordered dithering bias to pixel values in even output rows. */
    455      colsum0_l = vaddq_u16(colsum0_l, one_u16);
    456      colsum0_h = vaddq_u16(colsum0_h, one_u16);
    457      /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
    458      uint8x16_t output_pixels0 = vcombine_u8(vshrn_n_u16(colsum0_l, 2),
    459                                              vshrn_n_u16(colsum0_h, 2));
    460      uint8x16_t output_pixels1 = vcombine_u8(vrshrn_n_u16(colsum1_l, 2),
    461                                              vrshrn_n_u16(colsum1_h, 2));
    462      /* Store pixel component values to memory. */
    463      vst1q_u8(outptr0 + colctr, output_pixels0);
    464      vst1q_u8(outptr1 + colctr, output_pixels1);
    465    }
    466  }
    467 }
    468 
    469 
    470 /* The diagram below shows a row of samples produced by h2v1 downsampling.
    471 *
    472 *                s0        s1
    473 *            +---------+---------+
    474 *            |         |         |
    475 *            | p0   p1 | p2   p3 |
    476 *            |         |         |
    477 *            +---------+---------+
    478 *
    479 * Samples s0 and s1 were created by averaging the original pixel component
    480 * values centered at positions p0-p3 above.  To approximate those original
    481 * pixel component values, we duplicate the samples horizontally:
    482 *     p0(upsampled) = p1(upsampled) = s0
    483 *     p2(upsampled) = p3(upsampled) = s1
    484 */
    485 
    486 void jsimd_h2v1_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
    487                              JSAMPARRAY input_data,
    488                              JSAMPARRAY *output_data_ptr)
    489 {
    490  JSAMPARRAY output_data = *output_data_ptr;
    491  JSAMPROW inptr, outptr;
    492  int inrow;
    493  unsigned colctr;
    494 
    495  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
    496    inptr = input_data[inrow];
    497    outptr = output_data[inrow];
    498    for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
    499      uint8x16_t samples = vld1q_u8(inptr + colctr);
    500      /* Duplicate the samples.  The store operation below interleaves them so
    501       * that adjacent pixel component values take on the same sample value,
    502       * per above.
    503       */
    504      uint8x16x2_t output_pixels = { { samples, samples } };
    505      /* Store pixel component values to memory.
    506       * Due to the way sample buffers are allocated, we don't need to worry
    507       * about tail cases when output_width is not a multiple of 32.  See
    508       * "Creation of 2-D sample arrays" in jmemmgr.c for details.
    509       */
    510      vst2q_u8(outptr + 2 * colctr, output_pixels);
    511    }
    512  }
    513 }
    514 
    515 
    516 /* The diagram below shows an array of samples produced by h2v2 downsampling.
    517 *
    518 *                s0        s1
    519 *            +---------+---------+
    520 *            | p0   p1 | p2   p3 |
    521 *       sA   |         |         |
    522 *            | p4   p5 | p6   p7 |
    523 *            +---------+---------+
    524 *            | p8   p9 | p10  p11|
    525 *       sB   |         |         |
    526 *            | p12  p13| p14  p15|
    527 *            +---------+---------+
    528 *
    529 * Samples s0A-s1B were created by averaging the original pixel component
    530 * values centered at positions p0-p15 above.  To approximate those original
    531 * pixel component values, we duplicate the samples both horizontally and
    532 * vertically:
    533 *     p0(upsampled) = p1(upsampled) = p4(upsampled) = p5(upsampled) = s0A
    534 *     p2(upsampled) = p3(upsampled) = p6(upsampled) = p7(upsampled) = s1A
    535 *     p8(upsampled) = p9(upsampled) = p12(upsampled) = p13(upsampled) = s0B
    536 *     p10(upsampled) = p11(upsampled) = p14(upsampled) = p15(upsampled) = s1B
    537 */
    538 
    539 void jsimd_h2v2_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
    540                              JSAMPARRAY input_data,
    541                              JSAMPARRAY *output_data_ptr)
    542 {
    543  JSAMPARRAY output_data = *output_data_ptr;
    544  JSAMPROW inptr, outptr0, outptr1;
    545  int inrow, outrow;
    546  unsigned colctr;
    547 
    548  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
    549    inptr = input_data[inrow];
    550    outptr0 = output_data[outrow++];
    551    outptr1 = output_data[outrow++];
    552 
    553    for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
    554      uint8x16_t samples = vld1q_u8(inptr + colctr);
    555      /* Duplicate the samples.  The store operation below interleaves them so
    556       * that adjacent pixel component values take on the same sample value,
    557       * per above.
    558       */
    559      uint8x16x2_t output_pixels = { { samples, samples } };
    560      /* Store pixel component values for both output rows to memory.
    561       * Due to the way sample buffers are allocated, we don't need to worry
    562       * about tail cases when output_width is not a multiple of 32.  See
    563       * "Creation of 2-D sample arrays" in jmemmgr.c for details.
    564       */
    565      vst2q_u8(outptr0 + 2 * colctr, output_pixels);
    566      vst2q_u8(outptr1 + 2 * colctr, output_pixels);
    567    }
    568  }
    569 }