tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jdcolext-neon.c (16425B)


      1 /*
      2 * jdcolext-neon.c - colorspace conversion (Arm Neon)
      3 *
      4 * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
      5 * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
      6 *
      7 * This software is provided 'as-is', without any express or implied
      8 * warranty.  In no event will the authors be held liable for any damages
      9 * arising from the use of this software.
     10 *
     11 * Permission is granted to anyone to use this software for any purpose,
     12 * including commercial applications, and to alter it and redistribute it
     13 * freely, subject to the following restrictions:
     14 *
     15 * 1. The origin of this software must not be misrepresented; you must not
     16 *    claim that you wrote the original software. If you use this software
     17 *    in a product, an acknowledgment in the product documentation would be
     18 *    appreciated but is not required.
     19 * 2. Altered source versions must be plainly marked as such, and must not be
     20 *    misrepresented as being the original software.
     21 * 3. This notice may not be removed or altered from any source distribution.
     22 */
     23 
     24 /* This file is included by jdcolor-neon.c. */
     25 
     26 
     27 /* YCbCr -> RGB conversion is defined by the following equations:
     28 *    R = Y                        + 1.40200 * (Cr - 128)
     29 *    G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
     30 *    B = Y + 1.77200 * (Cb - 128)
     31 *
     32 * Scaled integer constants are used to avoid floating-point arithmetic:
     33 *    0.3441467 = 11277 * 2^-15
     34 *    0.7141418 = 23401 * 2^-15
     35 *    1.4020386 = 22971 * 2^-14
     36 *    1.7720337 = 29033 * 2^-14
     37 * These constants are defined in jdcolor-neon.c.
     38 *
     39 * To ensure correct results, rounding is used when descaling.
     40 */
     41 
     42 /* Notes on safe memory access for YCbCr -> RGB conversion routines:
     43 *
     44 * Input memory buffers can be safely overread up to the next multiple of
     45 * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
     46 * jmemmgr.c.
     47 *
     48 * The output buffer cannot safely be written beyond output_width, since
     49 * output_buf points to a possibly unpadded row in the decompressed image
     50 * buffer allocated by the calling program.
     51 */
     52 
     53 void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width, JSAMPIMAGE input_buf,
     54                                JDIMENSION input_row, JSAMPARRAY output_buf,
     55                                int num_rows)
     56 {
     57  JSAMPROW outptr;
     58  /* Pointers to Y, Cb, and Cr data */
     59  JSAMPROW inptr0, inptr1, inptr2;
     60 
     61  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
     62  const int16x8_t neg_128 = vdupq_n_s16(-128);
     63 
     64  while (--num_rows >= 0) {
     65    inptr0 = input_buf[0][input_row];
     66    inptr1 = input_buf[1][input_row];
     67    inptr2 = input_buf[2][input_row];
     68    input_row++;
     69    outptr = *output_buf++;
     70    int cols_remaining = output_width;
     71    for (; cols_remaining >= 16; cols_remaining -= 16) {
     72      uint8x16_t y  = vld1q_u8(inptr0);
     73      uint8x16_t cb = vld1q_u8(inptr1);
     74      uint8x16_t cr = vld1q_u8(inptr2);
     75      /* Subtract 128 from Cb and Cr. */
     76      int16x8_t cr_128_l =
     77        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
     78                                       vget_low_u8(cr)));
     79      int16x8_t cr_128_h =
     80        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
     81                                       vget_high_u8(cr)));
     82      int16x8_t cb_128_l =
     83        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
     84                                       vget_low_u8(cb)));
     85      int16x8_t cb_128_h =
     86        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
     87                                       vget_high_u8(cb)));
     88      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
     89      int32x4_t g_sub_y_ll = vmull_lane_s16(vget_low_s16(cb_128_l), consts, 0);
     90      int32x4_t g_sub_y_lh = vmull_lane_s16(vget_high_s16(cb_128_l),
     91                                            consts, 0);
     92      int32x4_t g_sub_y_hl = vmull_lane_s16(vget_low_s16(cb_128_h), consts, 0);
     93      int32x4_t g_sub_y_hh = vmull_lane_s16(vget_high_s16(cb_128_h),
     94                                            consts, 0);
     95      g_sub_y_ll = vmlsl_lane_s16(g_sub_y_ll, vget_low_s16(cr_128_l),
     96                                  consts, 1);
     97      g_sub_y_lh = vmlsl_lane_s16(g_sub_y_lh, vget_high_s16(cr_128_l),
     98                                  consts, 1);
     99      g_sub_y_hl = vmlsl_lane_s16(g_sub_y_hl, vget_low_s16(cr_128_h),
    100                                  consts, 1);
    101      g_sub_y_hh = vmlsl_lane_s16(g_sub_y_hh, vget_high_s16(cr_128_h),
    102                                  consts, 1);
    103      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
    104      int16x8_t g_sub_y_l = vcombine_s16(vrshrn_n_s32(g_sub_y_ll, 15),
    105                                         vrshrn_n_s32(g_sub_y_lh, 15));
    106      int16x8_t g_sub_y_h = vcombine_s16(vrshrn_n_s32(g_sub_y_hl, 15),
    107                                         vrshrn_n_s32(g_sub_y_hh, 15));
    108      /* Compute R-Y: 1.40200 * (Cr - 128) */
    109      int16x8_t r_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_l, 1),
    110                                               consts, 2);
    111      int16x8_t r_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_h, 1),
    112                                               consts, 2);
    113      /* Compute B-Y: 1.77200 * (Cb - 128) */
    114      int16x8_t b_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_l, 1),
    115                                               consts, 3);
    116      int16x8_t b_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_h, 1),
    117                                               consts, 3);
    118      /* Add Y. */
    119      int16x8_t r_l =
    120        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_l),
    121                                       vget_low_u8(y)));
    122      int16x8_t r_h =
    123        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_h),
    124                                       vget_high_u8(y)));
    125      int16x8_t b_l =
    126        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_l),
    127                                       vget_low_u8(y)));
    128      int16x8_t b_h =
    129        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_h),
    130                                       vget_high_u8(y)));
    131      int16x8_t g_l =
    132        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_l),
    133                                       vget_low_u8(y)));
    134      int16x8_t g_h =
    135        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_h),
    136                                       vget_high_u8(y)));
    137 
    138 #if RGB_PIXELSIZE == 4
    139      uint8x16x4_t rgba;
    140      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
    141      rgba.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
    142      rgba.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
    143      rgba.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
    144      /* Set alpha channel to opaque (0xFF). */
    145      rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
    146      /* Store RGBA pixel data to memory. */
    147      vst4q_u8(outptr, rgba);
    148 #elif RGB_PIXELSIZE == 3
    149      uint8x16x3_t rgb;
    150      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
    151      rgb.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
    152      rgb.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
    153      rgb.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
    154      /* Store RGB pixel data to memory. */
    155      vst3q_u8(outptr, rgb);
    156 #else
    157      /* Pack R, G, and B values in ratio 5:6:5. */
    158      uint16x8_t rgb565_l = vqshluq_n_s16(r_l, 8);
    159      rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(g_l, 8), 5);
    160      rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(b_l, 8), 11);
    161      uint16x8_t rgb565_h = vqshluq_n_s16(r_h, 8);
    162      rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(g_h, 8), 5);
    163      rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(b_h, 8), 11);
    164      /* Store RGB pixel data to memory. */
    165      vst1q_u16((uint16_t *)outptr, rgb565_l);
    166      vst1q_u16(((uint16_t *)outptr) + 8, rgb565_h);
    167 #endif
    168 
    169      /* Increment pointers. */
    170      inptr0 += 16;
    171      inptr1 += 16;
    172      inptr2 += 16;
    173      outptr += (RGB_PIXELSIZE * 16);
    174    }
    175 
    176    if (cols_remaining >= 8) {
    177      uint8x8_t y  = vld1_u8(inptr0);
    178      uint8x8_t cb = vld1_u8(inptr1);
    179      uint8x8_t cr = vld1_u8(inptr2);
    180      /* Subtract 128 from Cb and Cr. */
    181      int16x8_t cr_128 =
    182        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
    183      int16x8_t cb_128 =
    184        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
    185      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
    186      int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
    187      int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
    188      g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
    189      g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
    190      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
    191      int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
    192                                       vrshrn_n_s32(g_sub_y_h, 15));
    193      /* Compute R-Y: 1.40200 * (Cr - 128) */
    194      int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
    195                                             consts, 2);
    196      /* Compute B-Y: 1.77200 * (Cb - 128) */
    197      int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
    198                                             consts, 3);
    199      /* Add Y. */
    200      int16x8_t r =
    201        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
    202      int16x8_t b =
    203        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
    204      int16x8_t g =
    205        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
    206 
    207 #if RGB_PIXELSIZE == 4
    208      uint8x8x4_t rgba;
    209      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
    210      rgba.val[RGB_RED] = vqmovun_s16(r);
    211      rgba.val[RGB_GREEN] = vqmovun_s16(g);
    212      rgba.val[RGB_BLUE] = vqmovun_s16(b);
    213      /* Set alpha channel to opaque (0xFF). */
    214      rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    215      /* Store RGBA pixel data to memory. */
    216      vst4_u8(outptr, rgba);
    217 #elif RGB_PIXELSIZE == 3
    218      uint8x8x3_t rgb;
    219      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
    220      rgb.val[RGB_RED] = vqmovun_s16(r);
    221      rgb.val[RGB_GREEN] = vqmovun_s16(g);
    222      rgb.val[RGB_BLUE] = vqmovun_s16(b);
    223      /* Store RGB pixel data to memory. */
    224      vst3_u8(outptr, rgb);
    225 #else
    226      /* Pack R, G, and B values in ratio 5:6:5. */
    227      uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
    228      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
    229      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
    230      /* Store RGB pixel data to memory. */
    231      vst1q_u16((uint16_t *)outptr, rgb565);
    232 #endif
    233 
    234      /* Increment pointers. */
    235      inptr0 += 8;
    236      inptr1 += 8;
    237      inptr2 += 8;
    238      outptr += (RGB_PIXELSIZE * 8);
    239      cols_remaining -= 8;
    240    }
    241 
    242    /* Handle the tail elements. */
    243    if (cols_remaining > 0) {
    244      uint8x8_t y  = vld1_u8(inptr0);
    245      uint8x8_t cb = vld1_u8(inptr1);
    246      uint8x8_t cr = vld1_u8(inptr2);
    247      /* Subtract 128 from Cb and Cr. */
    248      int16x8_t cr_128 =
    249        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
    250      int16x8_t cb_128 =
    251        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
    252      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
    253      int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
    254      int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
    255      g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
    256      g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
    257      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
    258      int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
    259                                       vrshrn_n_s32(g_sub_y_h, 15));
    260      /* Compute R-Y: 1.40200 * (Cr - 128) */
    261      int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
    262                                             consts, 2);
    263      /* Compute B-Y: 1.77200 * (Cb - 128) */
    264      int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
    265                                             consts, 3);
    266      /* Add Y. */
    267      int16x8_t r =
    268        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
    269      int16x8_t b =
    270        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
    271      int16x8_t g =
    272        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
    273 
    274 #if RGB_PIXELSIZE == 4
    275      uint8x8x4_t rgba;
    276      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
    277      rgba.val[RGB_RED] = vqmovun_s16(r);
    278      rgba.val[RGB_GREEN] = vqmovun_s16(g);
    279      rgba.val[RGB_BLUE] = vqmovun_s16(b);
    280      /* Set alpha channel to opaque (0xFF). */
    281      rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    282      /* Store RGBA pixel data to memory. */
    283      switch (cols_remaining) {
    284      case 7:
    285        vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba, 6);
    286        FALLTHROUGH             /*FALLTHROUGH*/
    287      case 6:
    288        vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba, 5);
    289        FALLTHROUGH             /*FALLTHROUGH*/
    290      case 5:
    291        vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba, 4);
    292        FALLTHROUGH             /*FALLTHROUGH*/
    293      case 4:
    294        vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba, 3);
    295        FALLTHROUGH             /*FALLTHROUGH*/
    296      case 3:
    297        vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba, 2);
    298        FALLTHROUGH             /*FALLTHROUGH*/
    299      case 2:
    300        vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba, 1);
    301        FALLTHROUGH             /*FALLTHROUGH*/
    302      case 1:
    303        vst4_lane_u8(outptr, rgba, 0);
    304        FALLTHROUGH             /*FALLTHROUGH*/
    305      default:
    306        break;
    307      }
    308 #elif RGB_PIXELSIZE == 3
    309      uint8x8x3_t rgb;
    310      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
    311      rgb.val[RGB_RED] = vqmovun_s16(r);
    312      rgb.val[RGB_GREEN] = vqmovun_s16(g);
    313      rgb.val[RGB_BLUE] = vqmovun_s16(b);
    314      /* Store RGB pixel data to memory. */
    315      switch (cols_remaining) {
    316      case 7:
    317        vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb, 6);
    318        FALLTHROUGH             /*FALLTHROUGH*/
    319      case 6:
    320        vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb, 5);
    321        FALLTHROUGH             /*FALLTHROUGH*/
    322      case 5:
    323        vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb, 4);
    324        FALLTHROUGH             /*FALLTHROUGH*/
    325      case 4:
    326        vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb, 3);
    327        FALLTHROUGH             /*FALLTHROUGH*/
    328      case 3:
    329        vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb, 2);
    330        FALLTHROUGH             /*FALLTHROUGH*/
    331      case 2:
    332        vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb, 1);
    333        FALLTHROUGH             /*FALLTHROUGH*/
    334      case 1:
    335        vst3_lane_u8(outptr, rgb, 0);
    336        FALLTHROUGH             /*FALLTHROUGH*/
    337      default:
    338        break;
    339      }
    340 #else
    341      /* Pack R, G, and B values in ratio 5:6:5. */
    342      uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
    343      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
    344      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
    345      /* Store RGB565 pixel data to memory. */
    346      switch (cols_remaining) {
    347      case 7:
    348        vst1q_lane_u16((uint16_t *)(outptr + 6 * RGB_PIXELSIZE), rgb565, 6);
    349        FALLTHROUGH             /*FALLTHROUGH*/
    350      case 6:
    351        vst1q_lane_u16((uint16_t *)(outptr + 5 * RGB_PIXELSIZE), rgb565, 5);
    352        FALLTHROUGH             /*FALLTHROUGH*/
    353      case 5:
    354        vst1q_lane_u16((uint16_t *)(outptr + 4 * RGB_PIXELSIZE), rgb565, 4);
    355        FALLTHROUGH             /*FALLTHROUGH*/
    356      case 4:
    357        vst1q_lane_u16((uint16_t *)(outptr + 3 * RGB_PIXELSIZE), rgb565, 3);
    358        FALLTHROUGH             /*FALLTHROUGH*/
    359      case 3:
    360        vst1q_lane_u16((uint16_t *)(outptr + 2 * RGB_PIXELSIZE), rgb565, 2);
    361        FALLTHROUGH             /*FALLTHROUGH*/
    362      case 2:
    363        vst1q_lane_u16((uint16_t *)(outptr + RGB_PIXELSIZE), rgb565, 1);
    364        FALLTHROUGH             /*FALLTHROUGH*/
    365      case 1:
    366        vst1q_lane_u16((uint16_t *)outptr, rgb565, 0);
    367        FALLTHROUGH             /*FALLTHROUGH*/
    368      default:
    369        break;
    370      }
    371 #endif
    372    }
    373  }
    374 }