tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jcsample-neon.c (8626B)


      1 /*
      2 * jcsample-neon.c - downsampling (Arm Neon)
      3 *
      4 * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
      5 *
      6 * This software is provided 'as-is', without any express or implied
      7 * warranty.  In no event will the authors be held liable for any damages
      8 * arising from the use of this software.
      9 *
     10 * Permission is granted to anyone to use this software for any purpose,
     11 * including commercial applications, and to alter it and redistribute it
     12 * freely, subject to the following restrictions:
     13 *
     14 * 1. The origin of this software must not be misrepresented; you must not
     15 *    claim that you wrote the original software. If you use this software
     16 *    in a product, an acknowledgment in the product documentation would be
     17 *    appreciated but is not required.
     18 * 2. Altered source versions must be plainly marked as such, and must not be
     19 *    misrepresented as being the original software.
     20 * 3. This notice may not be removed or altered from any source distribution.
     21 */
     22 
     23 #define JPEG_INTERNALS
     24 #include "../../jinclude.h"
     25 #include "../../jpeglib.h"
     26 #include "../../jsimd.h"
     27 #include "../../jdct.h"
     28 #include "../../jsimddct.h"
     29 #include "../jsimd.h"
     30 #include "align.h"
     31 
     32 #include <arm_neon.h>
     33 
     34 
     35 ALIGN(16) static const uint8_t jsimd_h2_downsample_consts[] = {
     36  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 0 */
     37  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
     38  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 1 */
     39  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E,
     40  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 2 */
     41  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D,
     42  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 3 */
     43  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C,
     44  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 4 */
     45  0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B,
     46  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 5 */
     47  0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A,
     48  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 6 */
     49  0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
     50  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 7 */
     51  0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
     52  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 8 */
     53  0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
     54  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06,   /* Pad 9 */
     55  0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
     56  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05,   /* Pad 10 */
     57  0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
     58  0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04,   /* Pad 11 */
     59  0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
     60  0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03,   /* Pad 12 */
     61  0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
     62  0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,   /* Pad 13 */
     63  0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
     64  0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,   /* Pad 14 */
     65  0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
     66  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,   /* Pad 15 */
     67  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
     68 };
     69 
     70 
     71 /* Downsample pixel values of a single component.
     72 * This version handles the common case of 2:1 horizontal and 1:1 vertical,
     73 * without smoothing.
     74 */
     75 
     76 void jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
     77                                JDIMENSION v_samp_factor,
     78                                JDIMENSION width_in_blocks,
     79                                JSAMPARRAY input_data, JSAMPARRAY output_data)
     80 {
     81  JSAMPROW inptr, outptr;
     82  /* Load expansion mask to pad remaining elements of last DCT block. */
     83  const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
     84  const uint8x16_t expand_mask =
     85    vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
     86  /* Load bias pattern (alternating every pixel.) */
     87  /* { 0, 1, 0, 1, 0, 1, 0, 1 } */
     88  const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00010000));
     89  unsigned i, outrow;
     90 
     91  for (outrow = 0; outrow < v_samp_factor; outrow++) {
     92    outptr = output_data[outrow];
     93    inptr = input_data[outrow];
     94 
     95    /* Downsample all but the last DCT block of pixels. */
     96    for (i = 0; i < width_in_blocks - 1; i++) {
     97      uint8x16_t pixels = vld1q_u8(inptr + i * 2 * DCTSIZE);
     98      /* Add adjacent pixel values, widen to 16-bit, and add bias. */
     99      uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
    100      /* Divide total by 2 and narrow to 8-bit. */
    101      uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
    102      /* Store samples to memory. */
    103      vst1_u8(outptr + i * DCTSIZE, samples_u8);
    104    }
    105 
    106    /* Load pixels in last DCT block into a table. */
    107    uint8x16_t pixels = vld1q_u8(inptr + (width_in_blocks - 1) * 2 * DCTSIZE);
    108 #if defined(__aarch64__) || defined(_M_ARM64)
    109    /* Pad the empty elements with the value of the last pixel. */
    110    pixels = vqtbl1q_u8(pixels, expand_mask);
    111 #else
    112    uint8x8x2_t table = { { vget_low_u8(pixels), vget_high_u8(pixels) } };
    113    pixels = vcombine_u8(vtbl2_u8(table, vget_low_u8(expand_mask)),
    114                         vtbl2_u8(table, vget_high_u8(expand_mask)));
    115 #endif
    116    /* Add adjacent pixel values, widen to 16-bit, and add bias. */
    117    uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
    118    /* Divide total by 2, narrow to 8-bit, and store. */
    119    uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
    120    vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
    121  }
    122 }
    123 
    124 
    125 /* Downsample pixel values of a single component.
    126 * This version handles the standard case of 2:1 horizontal and 2:1 vertical,
    127 * without smoothing.
    128 */
    129 
    130 void jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
    131                                JDIMENSION v_samp_factor,
    132                                JDIMENSION width_in_blocks,
    133                                JSAMPARRAY input_data, JSAMPARRAY output_data)
    134 {
    135  JSAMPROW inptr0, inptr1, outptr;
    136  /* Load expansion mask to pad remaining elements of last DCT block. */
    137  const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
    138  const uint8x16_t expand_mask =
    139    vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
    140  /* Load bias pattern (alternating every pixel.) */
    141  /* { 1, 2, 1, 2, 1, 2, 1, 2 } */
    142  const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00020001));
    143  unsigned i, outrow;
    144 
    145  for (outrow = 0; outrow < v_samp_factor; outrow++) {
    146    outptr = output_data[outrow];
    147    inptr0 = input_data[outrow];
    148    inptr1 = input_data[outrow + 1];
    149 
    150    /* Downsample all but the last DCT block of pixels. */
    151    for (i = 0; i < width_in_blocks - 1; i++) {
    152      uint8x16_t pixels_r0 = vld1q_u8(inptr0 + i * 2 * DCTSIZE);
    153      uint8x16_t pixels_r1 = vld1q_u8(inptr1 + i * 2 * DCTSIZE);
    154      /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
    155      uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
    156      /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate.
    157       */
    158      samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
    159      /* Divide total by 4 and narrow to 8-bit. */
    160      uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
    161      /* Store samples to memory and increment pointers. */
    162      vst1_u8(outptr + i * DCTSIZE, samples_u8);
    163    }
    164 
    165    /* Load pixels in last DCT block into a table. */
    166    uint8x16_t pixels_r0 =
    167      vld1q_u8(inptr0 + (width_in_blocks - 1) * 2 * DCTSIZE);
    168    uint8x16_t pixels_r1 =
    169      vld1q_u8(inptr1 + (width_in_blocks - 1) * 2 * DCTSIZE);
    170 #if defined(__aarch64__) || defined(_M_ARM64)
    171    /* Pad the empty elements with the value of the last pixel. */
    172    pixels_r0 = vqtbl1q_u8(pixels_r0, expand_mask);
    173    pixels_r1 = vqtbl1q_u8(pixels_r1, expand_mask);
    174 #else
    175    uint8x8x2_t table_r0 =
    176      { { vget_low_u8(pixels_r0), vget_high_u8(pixels_r0) } };
    177    uint8x8x2_t table_r1 =
    178      { { vget_low_u8(pixels_r1), vget_high_u8(pixels_r1) } };
    179    pixels_r0 = vcombine_u8(vtbl2_u8(table_r0, vget_low_u8(expand_mask)),
    180                            vtbl2_u8(table_r0, vget_high_u8(expand_mask)));
    181    pixels_r1 = vcombine_u8(vtbl2_u8(table_r1, vget_low_u8(expand_mask)),
    182                            vtbl2_u8(table_r1, vget_high_u8(expand_mask)));
    183 #endif
    184    /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
    185    uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
    186    /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate. */
    187    samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
    188    /* Divide total by 4, narrow to 8-bit, and store. */
    189    uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
    190    vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
    191  }
    192 }