[ tor-browser ].git.dasho

jquanti-neon.c (8989B)
      1 /*
      2 * jquanti-neon.c - sample data conversion and quantization (Arm Neon)
      3 *
      4 * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
      5 *
      6 * This software is provided 'as-is', without any express or implied
      7 * warranty.  In no event will the authors be held liable for any damages
      8 * arising from the use of this software.
      9 *
     10 * Permission is granted to anyone to use this software for any purpose,
     11 * including commercial applications, and to alter it and redistribute it
     12 * freely, subject to the following restrictions:
     13 *
     14 * 1. The origin of this software must not be misrepresented; you must not
     15 *    claim that you wrote the original software. If you use this software
     16 *    in a product, an acknowledgment in the product documentation would be
     17 *    appreciated but is not required.
     18 * 2. Altered source versions must be plainly marked as such, and must not be
     19 *    misrepresented as being the original software.
     20 * 3. This notice may not be removed or altered from any source distribution.
     21 */
     22 
     23 #define JPEG_INTERNALS
     24 #include "../../jinclude.h"
     25 #include "../../jpeglib.h"
     26 #include "../../jsimd.h"
     27 #include "../../jdct.h"
     28 #include "../../jsimddct.h"
     29 #include "../jsimd.h"
     30 
     31 #include <arm_neon.h>
     32 
     33 
     34 /* After downsampling, the resulting sample values are in the range [0, 255],
     35 * but the Discrete Cosine Transform (DCT) operates on values centered around
     36 * 0.
     37 *
     38 * To prepare sample values for the DCT, load samples into a DCT workspace,
     39 * subtracting CENTERJSAMPLE (128).  The samples, now in the range [-128, 127],
     40 * are also widened from 8- to 16-bit.
     41 *
     42 * The equivalent scalar C function convsamp() can be found in jcdctmgr.c.
     43 */
     44 
     45 void jsimd_convsamp_neon(JSAMPARRAY sample_data, JDIMENSION start_col,
     46                         DCTELEM *workspace)
     47 {
     48  uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col);
     49  uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col);
     50  uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col);
     51  uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col);
     52  uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col);
     53  uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col);
     54  uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col);
     55  uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col);
     56 
     57  int16x8_t row0 =
     58    vreinterpretq_s16_u16(vsubl_u8(samp_row0, vdup_n_u8(CENTERJSAMPLE)));
     59  int16x8_t row1 =
     60    vreinterpretq_s16_u16(vsubl_u8(samp_row1, vdup_n_u8(CENTERJSAMPLE)));
     61  int16x8_t row2 =
     62    vreinterpretq_s16_u16(vsubl_u8(samp_row2, vdup_n_u8(CENTERJSAMPLE)));
     63  int16x8_t row3 =
     64    vreinterpretq_s16_u16(vsubl_u8(samp_row3, vdup_n_u8(CENTERJSAMPLE)));
     65  int16x8_t row4 =
     66    vreinterpretq_s16_u16(vsubl_u8(samp_row4, vdup_n_u8(CENTERJSAMPLE)));
     67  int16x8_t row5 =
     68    vreinterpretq_s16_u16(vsubl_u8(samp_row5, vdup_n_u8(CENTERJSAMPLE)));
     69  int16x8_t row6 =
     70    vreinterpretq_s16_u16(vsubl_u8(samp_row6, vdup_n_u8(CENTERJSAMPLE)));
     71  int16x8_t row7 =
     72    vreinterpretq_s16_u16(vsubl_u8(samp_row7, vdup_n_u8(CENTERJSAMPLE)));
     73 
     74  vst1q_s16(workspace + 0 * DCTSIZE, row0);
     75  vst1q_s16(workspace + 1 * DCTSIZE, row1);
     76  vst1q_s16(workspace + 2 * DCTSIZE, row2);
     77  vst1q_s16(workspace + 3 * DCTSIZE, row3);
     78  vst1q_s16(workspace + 4 * DCTSIZE, row4);
     79  vst1q_s16(workspace + 5 * DCTSIZE, row5);
     80  vst1q_s16(workspace + 6 * DCTSIZE, row6);
     81  vst1q_s16(workspace + 7 * DCTSIZE, row7);
     82 }
     83 
     84 
     85 /* After the DCT, the resulting array of coefficient values needs to be divided
     86 * by an array of quantization values.
     87 *
     88 * To avoid a slow division operation, the DCT coefficients are multiplied by
     89 * the (scaled) reciprocals of the quantization values and then right-shifted.
     90 *
     91 * The equivalent scalar C function quantize() can be found in jcdctmgr.c.
     92 */
     93 
     94 void jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
     95                         DCTELEM *workspace)
     96 {
     97  JCOEFPTR out_ptr = coef_block;
     98  UDCTELEM *recip_ptr = (UDCTELEM *)divisors;
     99  UDCTELEM *corr_ptr = (UDCTELEM *)divisors + DCTSIZE2;
    100  DCTELEM *shift_ptr = divisors + 3 * DCTSIZE2;
    101  int i;
    102 
    103 #if defined(__clang__) && (defined(__aarch64__) || defined(_M_ARM64))
    104 #pragma unroll
    105 #endif
    106  for (i = 0; i < DCTSIZE; i += DCTSIZE / 2) {
    107    /* Load DCT coefficients. */
    108    int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE);
    109    int16x8_t row1 = vld1q_s16(workspace + (i + 1) * DCTSIZE);
    110    int16x8_t row2 = vld1q_s16(workspace + (i + 2) * DCTSIZE);
    111    int16x8_t row3 = vld1q_s16(workspace + (i + 3) * DCTSIZE);
    112    /* Load reciprocals of quantization values. */
    113    uint16x8_t recip0 = vld1q_u16(recip_ptr + (i + 0) * DCTSIZE);
    114    uint16x8_t recip1 = vld1q_u16(recip_ptr + (i + 1) * DCTSIZE);
    115    uint16x8_t recip2 = vld1q_u16(recip_ptr + (i + 2) * DCTSIZE);
    116    uint16x8_t recip3 = vld1q_u16(recip_ptr + (i + 3) * DCTSIZE);
    117    uint16x8_t corr0 = vld1q_u16(corr_ptr + (i + 0) * DCTSIZE);
    118    uint16x8_t corr1 = vld1q_u16(corr_ptr + (i + 1) * DCTSIZE);
    119    uint16x8_t corr2 = vld1q_u16(corr_ptr + (i + 2) * DCTSIZE);
    120    uint16x8_t corr3 = vld1q_u16(corr_ptr + (i + 3) * DCTSIZE);
    121    int16x8_t shift0 = vld1q_s16(shift_ptr + (i + 0) * DCTSIZE);
    122    int16x8_t shift1 = vld1q_s16(shift_ptr + (i + 1) * DCTSIZE);
    123    int16x8_t shift2 = vld1q_s16(shift_ptr + (i + 2) * DCTSIZE);
    124    int16x8_t shift3 = vld1q_s16(shift_ptr + (i + 3) * DCTSIZE);
    125 
    126    /* Extract sign from coefficients. */
    127    int16x8_t sign_row0 = vshrq_n_s16(row0, 15);
    128    int16x8_t sign_row1 = vshrq_n_s16(row1, 15);
    129    int16x8_t sign_row2 = vshrq_n_s16(row2, 15);
    130    int16x8_t sign_row3 = vshrq_n_s16(row3, 15);
    131    /* Get absolute value of DCT coefficients. */
    132    uint16x8_t abs_row0 = vreinterpretq_u16_s16(vabsq_s16(row0));
    133    uint16x8_t abs_row1 = vreinterpretq_u16_s16(vabsq_s16(row1));
    134    uint16x8_t abs_row2 = vreinterpretq_u16_s16(vabsq_s16(row2));
    135    uint16x8_t abs_row3 = vreinterpretq_u16_s16(vabsq_s16(row3));
    136    /* Add correction. */
    137    abs_row0 = vaddq_u16(abs_row0, corr0);
    138    abs_row1 = vaddq_u16(abs_row1, corr1);
    139    abs_row2 = vaddq_u16(abs_row2, corr2);
    140    abs_row3 = vaddq_u16(abs_row3, corr3);
    141 
    142    /* Multiply DCT coefficients by quantization reciprocals. */
    143    int32x4_t row0_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row0),
    144                                                       vget_low_u16(recip0)));
    145    int32x4_t row0_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row0),
    146                                                       vget_high_u16(recip0)));
    147    int32x4_t row1_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row1),
    148                                                       vget_low_u16(recip1)));
    149    int32x4_t row1_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row1),
    150                                                       vget_high_u16(recip1)));
    151    int32x4_t row2_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row2),
    152                                                       vget_low_u16(recip2)));
    153    int32x4_t row2_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row2),
    154                                                       vget_high_u16(recip2)));
    155    int32x4_t row3_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row3),
    156                                                       vget_low_u16(recip3)));
    157    int32x4_t row3_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row3),
    158                                                       vget_high_u16(recip3)));
    159    /* Narrow back to 16-bit. */
    160    row0 = vcombine_s16(vshrn_n_s32(row0_l, 16), vshrn_n_s32(row0_h, 16));
    161    row1 = vcombine_s16(vshrn_n_s32(row1_l, 16), vshrn_n_s32(row1_h, 16));
    162    row2 = vcombine_s16(vshrn_n_s32(row2_l, 16), vshrn_n_s32(row2_h, 16));
    163    row3 = vcombine_s16(vshrn_n_s32(row3_l, 16), vshrn_n_s32(row3_h, 16));
    164 
    165    /* Since VSHR only supports an immediate as its second argument, negate the
    166     * shift value and shift left.
    167     */
    168    row0 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row0),
    169                                           vnegq_s16(shift0)));
    170    row1 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row1),
    171                                           vnegq_s16(shift1)));
    172    row2 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row2),
    173                                           vnegq_s16(shift2)));
    174    row3 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row3),
    175                                           vnegq_s16(shift3)));
    176 
    177    /* Restore sign to original product. */
    178    row0 = veorq_s16(row0, sign_row0);
    179    row0 = vsubq_s16(row0, sign_row0);
    180    row1 = veorq_s16(row1, sign_row1);
    181    row1 = vsubq_s16(row1, sign_row1);
    182    row2 = veorq_s16(row2, sign_row2);
    183    row2 = vsubq_s16(row2, sign_row2);
    184    row3 = veorq_s16(row3, sign_row3);
    185    row3 = vsubq_s16(row3, sign_row3);
    186 
    187    /* Store quantized coefficients to memory. */
    188    vst1q_s16(out_ptr + (i + 0) * DCTSIZE, row0);
    189    vst1q_s16(out_ptr + (i + 1) * DCTSIZE, row1);
    190    vst1q_s16(out_ptr + (i + 2) * DCTSIZE, row2);
    191    vst1q_s16(out_ptr + (i + 3) * DCTSIZE, row3);
    192  }
    193 }
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE