tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jdmrgext-neon.c (30912B)


      1 /*
      2 * jdmrgext-neon.c - merged upsampling/color conversion (Arm Neon)
      3 *
      4 * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
      5 * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
      6 *
      7 * This software is provided 'as-is', without any express or implied
      8 * warranty.  In no event will the authors be held liable for any damages
      9 * arising from the use of this software.
     10 *
     11 * Permission is granted to anyone to use this software for any purpose,
     12 * including commercial applications, and to alter it and redistribute it
     13 * freely, subject to the following restrictions:
     14 *
     15 * 1. The origin of this software must not be misrepresented; you must not
     16 *    claim that you wrote the original software. If you use this software
     17 *    in a product, an acknowledgment in the product documentation would be
     18 *    appreciated but is not required.
     19 * 2. Altered source versions must be plainly marked as such, and must not be
     20 *    misrepresented as being the original software.
     21 * 3. This notice may not be removed or altered from any source distribution.
     22 */
     23 
     24 /* This file is included by jdmerge-neon.c. */
     25 
     26 
     27 /* These routines combine simple (non-fancy, i.e. non-smooth) h2v1 or h2v2
     28 * chroma upsampling and YCbCr -> RGB color conversion into a single function.
     29 *
     30 * As with the standalone functions, YCbCr -> RGB conversion is defined by the
     31 * following equations:
     32 *    R = Y                        + 1.40200 * (Cr - 128)
     33 *    G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
     34 *    B = Y + 1.77200 * (Cb - 128)
     35 *
     36 * Scaled integer constants are used to avoid floating-point arithmetic:
     37 *    0.3441467 = 11277 * 2^-15
     38 *    0.7141418 = 23401 * 2^-15
     39 *    1.4020386 = 22971 * 2^-14
     40 *    1.7720337 = 29033 * 2^-14
     41 * These constants are defined in jdmerge-neon.c.
     42 *
     43 * To ensure correct results, rounding is used when descaling.
     44 */
     45 
     46 /* Notes on safe memory access for merged upsampling/YCbCr -> RGB conversion
     47 * routines:
     48 *
     49 * Input memory buffers can be safely overread up to the next multiple of
     50 * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
     51 * jmemmgr.c.
     52 *
     53 * The output buffer cannot safely be written beyond output_width, since
     54 * output_buf points to a possibly unpadded row in the decompressed image
     55 * buffer allocated by the calling program.
     56 */
     57 
     58 /* Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
     59 */
     60 
     61 void jsimd_h2v1_merged_upsample_neon(JDIMENSION output_width,
     62                                     JSAMPIMAGE input_buf,
     63                                     JDIMENSION in_row_group_ctr,
     64                                     JSAMPARRAY output_buf)
     65 {
     66  JSAMPROW outptr;
     67  /* Pointers to Y, Cb, and Cr data */
     68  JSAMPROW inptr0, inptr1, inptr2;
     69 
     70  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
     71  const int16x8_t neg_128 = vdupq_n_s16(-128);
     72 
     73  inptr0 = input_buf[0][in_row_group_ctr];
     74  inptr1 = input_buf[1][in_row_group_ctr];
     75  inptr2 = input_buf[2][in_row_group_ctr];
     76  outptr = output_buf[0];
     77 
     78  int cols_remaining = output_width;
     79  for (; cols_remaining >= 16; cols_remaining -= 16) {
     80    /* De-interleave Y component values into two separate vectors, one
     81     * containing the component values with even-numbered indices and one
     82     * containing the component values with odd-numbered indices.
     83     */
     84    uint8x8x2_t y = vld2_u8(inptr0);
     85    uint8x8_t cb = vld1_u8(inptr1);
     86    uint8x8_t cr = vld1_u8(inptr2);
     87    /* Subtract 128 from Cb and Cr. */
     88    int16x8_t cr_128 =
     89      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
     90    int16x8_t cb_128 =
     91      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
     92    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
     93    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
     94    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
     95    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
     96    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
     97    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
     98    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
     99                                     vrshrn_n_s32(g_sub_y_h, 15));
    100    /* Compute R-Y: 1.40200 * (Cr - 128) */
    101    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
    102    /* Compute B-Y: 1.77200 * (Cb - 128) */
    103    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
    104    /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
    105     * "odd" Y component values.  This effectively upsamples the chroma
    106     * components horizontally.
    107     */
    108    int16x8_t g_even =
    109      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
    110                                     y.val[0]));
    111    int16x8_t r_even =
    112      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
    113                                     y.val[0]));
    114    int16x8_t b_even =
    115      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
    116                                     y.val[0]));
    117    int16x8_t g_odd =
    118      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
    119                                     y.val[1]));
    120    int16x8_t r_odd =
    121      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
    122                                     y.val[1]));
    123    int16x8_t b_odd =
    124      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
    125                                     y.val[1]));
    126    /* Convert each component to unsigned and narrow, clamping to [0-255].
    127     * Re-interleave the "even" and "odd" component values.
    128     */
    129    uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
    130    uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
    131    uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
    132 
    133 #ifdef RGB_ALPHA
    134    uint8x16x4_t rgba;
    135    rgba.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
    136    rgba.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
    137    rgba.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
    138    /* Set alpha channel to opaque (0xFF). */
    139    rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
    140    /* Store RGBA pixel data to memory. */
    141    vst4q_u8(outptr, rgba);
    142 #else
    143    uint8x16x3_t rgb;
    144    rgb.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
    145    rgb.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
    146    rgb.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
    147    /* Store RGB pixel data to memory. */
    148    vst3q_u8(outptr, rgb);
    149 #endif
    150 
    151    /* Increment pointers. */
    152    inptr0 += 16;
    153    inptr1 += 8;
    154    inptr2 += 8;
    155    outptr += (RGB_PIXELSIZE * 16);
    156  }
    157 
    158  if (cols_remaining > 0) {
    159    /* De-interleave Y component values into two separate vectors, one
    160     * containing the component values with even-numbered indices and one
    161     * containing the component values with odd-numbered indices.
    162     */
    163    uint8x8x2_t y = vld2_u8(inptr0);
    164    uint8x8_t cb = vld1_u8(inptr1);
    165    uint8x8_t cr = vld1_u8(inptr2);
    166    /* Subtract 128 from Cb and Cr. */
    167    int16x8_t cr_128 =
    168      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
    169    int16x8_t cb_128 =
    170      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
    171    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
    172    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
    173    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
    174    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
    175    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
    176    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
    177    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
    178                                     vrshrn_n_s32(g_sub_y_h, 15));
    179    /* Compute R-Y: 1.40200 * (Cr - 128) */
    180    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
    181    /* Compute B-Y: 1.77200 * (Cb - 128) */
    182    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
    183    /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
    184     * "odd" Y component values.  This effectively upsamples the chroma
    185     * components horizontally.
    186     */
    187    int16x8_t g_even =
    188      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
    189                                     y.val[0]));
    190    int16x8_t r_even =
    191      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
    192                                     y.val[0]));
    193    int16x8_t b_even =
    194      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
    195                                     y.val[0]));
    196    int16x8_t g_odd =
    197      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
    198                                     y.val[1]));
    199    int16x8_t r_odd =
    200      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
    201                                     y.val[1]));
    202    int16x8_t b_odd =
    203      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
    204                                     y.val[1]));
    205    /* Convert each component to unsigned and narrow, clamping to [0-255].
    206     * Re-interleave the "even" and "odd" component values.
    207     */
    208    uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
    209    uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
    210    uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
    211 
    212 #ifdef RGB_ALPHA
    213    uint8x8x4_t rgba_h;
    214    rgba_h.val[RGB_RED] = r.val[1];
    215    rgba_h.val[RGB_GREEN] = g.val[1];
    216    rgba_h.val[RGB_BLUE] = b.val[1];
    217    /* Set alpha channel to opaque (0xFF). */
    218    rgba_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    219    uint8x8x4_t rgba_l;
    220    rgba_l.val[RGB_RED] = r.val[0];
    221    rgba_l.val[RGB_GREEN] = g.val[0];
    222    rgba_l.val[RGB_BLUE] = b.val[0];
    223    /* Set alpha channel to opaque (0xFF). */
    224    rgba_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    225    /* Store RGBA pixel data to memory. */
    226    switch (cols_remaining) {
    227    case 15:
    228      vst4_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgba_h, 6);
    229      FALLTHROUGH               /*FALLTHROUGH*/
    230    case 14:
    231      vst4_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgba_h, 5);
    232      FALLTHROUGH               /*FALLTHROUGH*/
    233    case 13:
    234      vst4_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgba_h, 4);
    235      FALLTHROUGH               /*FALLTHROUGH*/
    236    case 12:
    237      vst4_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgba_h, 3);
    238      FALLTHROUGH               /*FALLTHROUGH*/
    239    case 11:
    240      vst4_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgba_h, 2);
    241      FALLTHROUGH               /*FALLTHROUGH*/
    242    case 10:
    243      vst4_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgba_h, 1);
    244      FALLTHROUGH               /*FALLTHROUGH*/
    245    case 9:
    246      vst4_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgba_h, 0);
    247      FALLTHROUGH               /*FALLTHROUGH*/
    248    case 8:
    249      vst4_u8(outptr, rgba_l);
    250      break;
    251    case 7:
    252      vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba_l, 6);
    253      FALLTHROUGH               /*FALLTHROUGH*/
    254    case 6:
    255      vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba_l, 5);
    256      FALLTHROUGH               /*FALLTHROUGH*/
    257    case 5:
    258      vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba_l, 4);
    259      FALLTHROUGH               /*FALLTHROUGH*/
    260    case 4:
    261      vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba_l, 3);
    262      FALLTHROUGH               /*FALLTHROUGH*/
    263    case 3:
    264      vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba_l, 2);
    265      FALLTHROUGH               /*FALLTHROUGH*/
    266    case 2:
    267      vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba_l, 1);
    268      FALLTHROUGH               /*FALLTHROUGH*/
    269    case 1:
    270      vst4_lane_u8(outptr, rgba_l, 0);
    271      FALLTHROUGH               /*FALLTHROUGH*/
    272    default:
    273      break;
    274    }
    275 #else
    276    uint8x8x3_t rgb_h;
    277    rgb_h.val[RGB_RED] = r.val[1];
    278    rgb_h.val[RGB_GREEN] = g.val[1];
    279    rgb_h.val[RGB_BLUE] = b.val[1];
    280    uint8x8x3_t rgb_l;
    281    rgb_l.val[RGB_RED] = r.val[0];
    282    rgb_l.val[RGB_GREEN] = g.val[0];
    283    rgb_l.val[RGB_BLUE] = b.val[0];
    284    /* Store RGB pixel data to memory. */
    285    switch (cols_remaining) {
    286    case 15:
    287      vst3_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgb_h, 6);
    288      FALLTHROUGH               /*FALLTHROUGH*/
    289    case 14:
    290      vst3_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgb_h, 5);
    291      FALLTHROUGH               /*FALLTHROUGH*/
    292    case 13:
    293      vst3_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgb_h, 4);
    294      FALLTHROUGH               /*FALLTHROUGH*/
    295    case 12:
    296      vst3_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgb_h, 3);
    297      FALLTHROUGH               /*FALLTHROUGH*/
    298    case 11:
    299      vst3_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgb_h, 2);
    300      FALLTHROUGH               /*FALLTHROUGH*/
    301    case 10:
    302      vst3_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgb_h, 1);
    303      FALLTHROUGH               /*FALLTHROUGH*/
    304    case 9:
    305      vst3_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgb_h, 0);
    306      FALLTHROUGH               /*FALLTHROUGH*/
    307    case 8:
    308      vst3_u8(outptr, rgb_l);
    309      break;
    310    case 7:
    311      vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb_l, 6);
    312      FALLTHROUGH               /*FALLTHROUGH*/
    313    case 6:
    314      vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb_l, 5);
    315      FALLTHROUGH               /*FALLTHROUGH*/
    316    case 5:
    317      vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb_l, 4);
    318      FALLTHROUGH               /*FALLTHROUGH*/
    319    case 4:
    320      vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb_l, 3);
    321      FALLTHROUGH               /*FALLTHROUGH*/
    322    case 3:
    323      vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb_l, 2);
    324      FALLTHROUGH               /*FALLTHROUGH*/
    325    case 2:
    326      vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb_l, 1);
    327      FALLTHROUGH               /*FALLTHROUGH*/
    328    case 1:
    329      vst3_lane_u8(outptr, rgb_l, 0);
    330      FALLTHROUGH               /*FALLTHROUGH*/
    331    default:
    332      break;
    333    }
    334 #endif
    335  }
    336 }
    337 
    338 
    339 /* Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
    340 *
    341 * See comments above for details regarding color conversion and safe memory
    342 * access.
    343 */
    344 
    345 void jsimd_h2v2_merged_upsample_neon(JDIMENSION output_width,
    346                                     JSAMPIMAGE input_buf,
    347                                     JDIMENSION in_row_group_ctr,
    348                                     JSAMPARRAY output_buf)
    349 {
    350  JSAMPROW outptr0, outptr1;
    351  /* Pointers to Y (both rows), Cb, and Cr data */
    352  JSAMPROW inptr0_0, inptr0_1, inptr1, inptr2;
    353 
    354  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
    355  const int16x8_t neg_128 = vdupq_n_s16(-128);
    356 
    357  inptr0_0 = input_buf[0][in_row_group_ctr * 2];
    358  inptr0_1 = input_buf[0][in_row_group_ctr * 2 + 1];
    359  inptr1 = input_buf[1][in_row_group_ctr];
    360  inptr2 = input_buf[2][in_row_group_ctr];
    361  outptr0 = output_buf[0];
    362  outptr1 = output_buf[1];
    363 
    364  int cols_remaining = output_width;
    365  for (; cols_remaining >= 16; cols_remaining -= 16) {
    366    /* For each row, de-interleave Y component values into two separate
    367     * vectors, one containing the component values with even-numbered indices
    368     * and one containing the component values with odd-numbered indices.
    369     */
    370    uint8x8x2_t y0 = vld2_u8(inptr0_0);
    371    uint8x8x2_t y1 = vld2_u8(inptr0_1);
    372    uint8x8_t cb = vld1_u8(inptr1);
    373    uint8x8_t cr = vld1_u8(inptr2);
    374    /* Subtract 128 from Cb and Cr. */
    375    int16x8_t cr_128 =
    376      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
    377    int16x8_t cb_128 =
    378      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
    379    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
    380    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
    381    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
    382    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
    383    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
    384    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
    385    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
    386                                     vrshrn_n_s32(g_sub_y_h, 15));
    387    /* Compute R-Y: 1.40200 * (Cr - 128) */
    388    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
    389    /* Compute B-Y: 1.77200 * (Cb - 128) */
    390    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
    391    /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
    392     * the "even" and "odd" Y component values.  This effectively upsamples the
    393     * chroma components both horizontally and vertically.
    394     */
    395    int16x8_t g0_even =
    396      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
    397                                     y0.val[0]));
    398    int16x8_t r0_even =
    399      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
    400                                     y0.val[0]));
    401    int16x8_t b0_even =
    402      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
    403                                     y0.val[0]));
    404    int16x8_t g0_odd =
    405      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
    406                                     y0.val[1]));
    407    int16x8_t r0_odd =
    408      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
    409                                     y0.val[1]));
    410    int16x8_t b0_odd =
    411      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
    412                                     y0.val[1]));
    413    int16x8_t g1_even =
    414      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
    415                                     y1.val[0]));
    416    int16x8_t r1_even =
    417      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
    418                                     y1.val[0]));
    419    int16x8_t b1_even =
    420      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
    421                                     y1.val[0]));
    422    int16x8_t g1_odd =
    423      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
    424                                     y1.val[1]));
    425    int16x8_t r1_odd =
    426      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
    427                                     y1.val[1]));
    428    int16x8_t b1_odd =
    429      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
    430                                     y1.val[1]));
    431    /* Convert each component to unsigned and narrow, clamping to [0-255].
    432     * Re-interleave the "even" and "odd" component values.
    433     */
    434    uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
    435    uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
    436    uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
    437    uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
    438    uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
    439    uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
    440 
    441 #ifdef RGB_ALPHA
    442    uint8x16x4_t rgba0, rgba1;
    443    rgba0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
    444    rgba1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
    445    rgba0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
    446    rgba1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
    447    rgba0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
    448    rgba1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
    449    /* Set alpha channel to opaque (0xFF). */
    450    rgba0.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
    451    rgba1.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
    452    /* Store RGBA pixel data to memory. */
    453    vst4q_u8(outptr0, rgba0);
    454    vst4q_u8(outptr1, rgba1);
    455 #else
    456    uint8x16x3_t rgb0, rgb1;
    457    rgb0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
    458    rgb1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
    459    rgb0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
    460    rgb1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
    461    rgb0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
    462    rgb1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
    463    /* Store RGB pixel data to memory. */
    464    vst3q_u8(outptr0, rgb0);
    465    vst3q_u8(outptr1, rgb1);
    466 #endif
    467 
    468    /* Increment pointers. */
    469    inptr0_0 += 16;
    470    inptr0_1 += 16;
    471    inptr1 += 8;
    472    inptr2 += 8;
    473    outptr0 += (RGB_PIXELSIZE * 16);
    474    outptr1 += (RGB_PIXELSIZE * 16);
    475  }
    476 
    477  if (cols_remaining > 0) {
    478    /* For each row, de-interleave Y component values into two separate
    479     * vectors, one containing the component values with even-numbered indices
    480     * and one containing the component values with odd-numbered indices.
    481     */
    482    uint8x8x2_t y0 = vld2_u8(inptr0_0);
    483    uint8x8x2_t y1 = vld2_u8(inptr0_1);
    484    uint8x8_t cb = vld1_u8(inptr1);
    485    uint8x8_t cr = vld1_u8(inptr2);
    486    /* Subtract 128 from Cb and Cr. */
    487    int16x8_t cr_128 =
    488      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
    489    int16x8_t cb_128 =
    490      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
    491    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
    492    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
    493    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
    494    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
    495    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
    496    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
    497    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
    498                                     vrshrn_n_s32(g_sub_y_h, 15));
    499    /* Compute R-Y: 1.40200 * (Cr - 128) */
    500    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
    501    /* Compute B-Y: 1.77200 * (Cb - 128) */
    502    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
    503    /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
    504     * the "even" and "odd" Y component values.  This effectively upsamples the
    505     * chroma components both horizontally and vertically.
    506     */
    507    int16x8_t g0_even =
    508      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
    509                                     y0.val[0]));
    510    int16x8_t r0_even =
    511      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
    512                                     y0.val[0]));
    513    int16x8_t b0_even =
    514      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
    515                                     y0.val[0]));
    516    int16x8_t g0_odd =
    517      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
    518                                     y0.val[1]));
    519    int16x8_t r0_odd =
    520      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
    521                                     y0.val[1]));
    522    int16x8_t b0_odd =
    523      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
    524                                     y0.val[1]));
    525    int16x8_t g1_even =
    526      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
    527                                     y1.val[0]));
    528    int16x8_t r1_even =
    529      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
    530                                     y1.val[0]));
    531    int16x8_t b1_even =
    532      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
    533                                     y1.val[0]));
    534    int16x8_t g1_odd =
    535      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
    536                                     y1.val[1]));
    537    int16x8_t r1_odd =
    538      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
    539                                     y1.val[1]));
    540    int16x8_t b1_odd =
    541      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
    542                                     y1.val[1]));
    543    /* Convert each component to unsigned and narrow, clamping to [0-255].
    544     * Re-interleave the "even" and "odd" component values.
    545     */
    546    uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
    547    uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
    548    uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
    549    uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
    550    uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
    551    uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
    552 
    553 #ifdef RGB_ALPHA
    554    uint8x8x4_t rgba0_h, rgba1_h;
    555    rgba0_h.val[RGB_RED] = r0.val[1];
    556    rgba1_h.val[RGB_RED] = r1.val[1];
    557    rgba0_h.val[RGB_GREEN] = g0.val[1];
    558    rgba1_h.val[RGB_GREEN] = g1.val[1];
    559    rgba0_h.val[RGB_BLUE] = b0.val[1];
    560    rgba1_h.val[RGB_BLUE] = b1.val[1];
    561    /* Set alpha channel to opaque (0xFF). */
    562    rgba0_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    563    rgba1_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    564 
    565    uint8x8x4_t rgba0_l, rgba1_l;
    566    rgba0_l.val[RGB_RED] = r0.val[0];
    567    rgba1_l.val[RGB_RED] = r1.val[0];
    568    rgba0_l.val[RGB_GREEN] = g0.val[0];
    569    rgba1_l.val[RGB_GREEN] = g1.val[0];
    570    rgba0_l.val[RGB_BLUE] = b0.val[0];
    571    rgba1_l.val[RGB_BLUE] = b1.val[0];
    572    /* Set alpha channel to opaque (0xFF). */
    573    rgba0_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    574    rgba1_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    575    /* Store RGBA pixel data to memory. */
    576    switch (cols_remaining) {
    577    case 15:
    578      vst4_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgba0_h, 6);
    579      vst4_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgba1_h, 6);
    580      FALLTHROUGH               /*FALLTHROUGH*/
    581    case 14:
    582      vst4_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgba0_h, 5);
    583      vst4_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgba1_h, 5);
    584      FALLTHROUGH               /*FALLTHROUGH*/
    585    case 13:
    586      vst4_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgba0_h, 4);
    587      vst4_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgba1_h, 4);
    588      FALLTHROUGH               /*FALLTHROUGH*/
    589    case 12:
    590      vst4_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgba0_h, 3);
    591      vst4_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgba1_h, 3);
    592      FALLTHROUGH               /*FALLTHROUGH*/
    593    case 11:
    594      vst4_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgba0_h, 2);
    595      vst4_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgba1_h, 2);
    596      FALLTHROUGH               /*FALLTHROUGH*/
    597    case 10:
    598      vst4_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgba0_h, 1);
    599      vst4_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgba1_h, 1);
    600      FALLTHROUGH               /*FALLTHROUGH*/
    601    case 9:
    602      vst4_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgba0_h, 0);
    603      vst4_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgba1_h, 0);
    604      FALLTHROUGH               /*FALLTHROUGH*/
    605    case 8:
    606      vst4_u8(outptr0, rgba0_l);
    607      vst4_u8(outptr1, rgba1_l);
    608      break;
    609    case 7:
    610      vst4_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgba0_l, 6);
    611      vst4_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgba1_l, 6);
    612      FALLTHROUGH               /*FALLTHROUGH*/
    613    case 6:
    614      vst4_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgba0_l, 5);
    615      vst4_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgba1_l, 5);
    616      FALLTHROUGH               /*FALLTHROUGH*/
    617    case 5:
    618      vst4_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgba0_l, 4);
    619      vst4_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgba1_l, 4);
    620      FALLTHROUGH               /*FALLTHROUGH*/
    621    case 4:
    622      vst4_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgba0_l, 3);
    623      vst4_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgba1_l, 3);
    624      FALLTHROUGH               /*FALLTHROUGH*/
    625    case 3:
    626      vst4_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgba0_l, 2);
    627      vst4_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgba1_l, 2);
    628      FALLTHROUGH               /*FALLTHROUGH*/
    629    case 2:
    630      vst4_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgba0_l, 1);
    631      vst4_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgba1_l, 1);
    632      FALLTHROUGH               /*FALLTHROUGH*/
    633    case 1:
    634      vst4_lane_u8(outptr0, rgba0_l, 0);
    635      vst4_lane_u8(outptr1, rgba1_l, 0);
    636      FALLTHROUGH               /*FALLTHROUGH*/
    637    default:
    638      break;
    639    }
    640 #else
    641    uint8x8x3_t rgb0_h, rgb1_h;
    642    rgb0_h.val[RGB_RED] = r0.val[1];
    643    rgb1_h.val[RGB_RED] = r1.val[1];
    644    rgb0_h.val[RGB_GREEN] = g0.val[1];
    645    rgb1_h.val[RGB_GREEN] = g1.val[1];
    646    rgb0_h.val[RGB_BLUE] = b0.val[1];
    647    rgb1_h.val[RGB_BLUE] = b1.val[1];
    648 
    649    uint8x8x3_t rgb0_l, rgb1_l;
    650    rgb0_l.val[RGB_RED] = r0.val[0];
    651    rgb1_l.val[RGB_RED] = r1.val[0];
    652    rgb0_l.val[RGB_GREEN] = g0.val[0];
    653    rgb1_l.val[RGB_GREEN] = g1.val[0];
    654    rgb0_l.val[RGB_BLUE] = b0.val[0];
    655    rgb1_l.val[RGB_BLUE] = b1.val[0];
    656    /* Store RGB pixel data to memory. */
    657    switch (cols_remaining) {
    658    case 15:
    659      vst3_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgb0_h, 6);
    660      vst3_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgb1_h, 6);
    661      FALLTHROUGH               /*FALLTHROUGH*/
    662    case 14:
    663      vst3_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgb0_h, 5);
    664      vst3_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgb1_h, 5);
    665      FALLTHROUGH               /*FALLTHROUGH*/
    666    case 13:
    667      vst3_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgb0_h, 4);
    668      vst3_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgb1_h, 4);
    669      FALLTHROUGH               /*FALLTHROUGH*/
    670    case 12:
    671      vst3_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgb0_h, 3);
    672      vst3_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgb1_h, 3);
    673      FALLTHROUGH               /*FALLTHROUGH*/
    674    case 11:
    675      vst3_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgb0_h, 2);
    676      vst3_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgb1_h, 2);
    677      FALLTHROUGH               /*FALLTHROUGH*/
    678    case 10:
    679      vst3_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgb0_h, 1);
    680      vst3_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgb1_h, 1);
    681      FALLTHROUGH               /*FALLTHROUGH*/
    682    case 9:
    683      vst3_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgb0_h, 0);
    684      vst3_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgb1_h, 0);
    685      FALLTHROUGH               /*FALLTHROUGH*/
    686    case 8:
    687      vst3_u8(outptr0, rgb0_l);
    688      vst3_u8(outptr1, rgb1_l);
    689      break;
    690    case 7:
    691      vst3_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgb0_l, 6);
    692      vst3_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgb1_l, 6);
    693      FALLTHROUGH               /*FALLTHROUGH*/
    694    case 6:
    695      vst3_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgb0_l, 5);
    696      vst3_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgb1_l, 5);
    697      FALLTHROUGH               /*FALLTHROUGH*/
    698    case 5:
    699      vst3_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgb0_l, 4);
    700      vst3_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgb1_l, 4);
    701      FALLTHROUGH               /*FALLTHROUGH*/
    702    case 4:
    703      vst3_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgb0_l, 3);
    704      vst3_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgb1_l, 3);
    705      FALLTHROUGH               /*FALLTHROUGH*/
    706    case 3:
    707      vst3_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgb0_l, 2);
    708      vst3_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgb1_l, 2);
    709      FALLTHROUGH               /*FALLTHROUGH*/
    710    case 2:
    711      vst3_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgb0_l, 1);
    712      vst3_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgb1_l, 1);
    713      FALLTHROUGH               /*FALLTHROUGH*/
    714    case 1:
    715      vst3_lane_u8(outptr0, rgb0_l, 0);
    716      vst3_lane_u8(outptr1, rgb1_l, 0);
    717      FALLTHROUGH               /*FALLTHROUGH*/
    718    default:
    719      break;
    720    }
    721 #endif
    722  }
    723 }