tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

rescaler_neon.c (7453B)


      1 // Copyright 2015 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 // NEON version of rescaling functions
     11 //
     12 // Author: Skal (pascal.massimino@gmail.com)
     13 
     14 #include "src/dsp/dsp.h"
     15 
     16 #if defined(WEBP_USE_NEON) && !defined(WEBP_REDUCE_SIZE)
     17 
     18 #include <arm_neon.h>
     19 #include <assert.h>
     20 #include "src/dsp/neon.h"
     21 #include "src/utils/rescaler_utils.h"
     22 
     // Half of one fixed-point unit: added before the shift to round to nearest.
     #define ROUNDER (WEBP_RESCALER_ONE >> 1)
     // Scalar fixed-point product, truncated: (v * scale) >> WEBP_RESCALER_RFIX.
     // The product is formed in 64 bits.
     #define MULT_FIX_FLOOR_C(v, scale) \
       (((uint64_t)(v) * (scale)) >> WEBP_RESCALER_RFIX)
     // Scalar fixed-point product, rounded: same as above with ROUNDER added first.
     #define MULT_FIX_C(v, scale) \
       (((uint64_t)(v) * (scale) + ROUNDER) >> WEBP_RESCALER_RFIX)
     26 
     // Declares a const vector named DST holding the four uint32_t values loaded
     // from SRC.
     #define LOAD_32x4(SRC, DST) const uint32x4_t DST = vld1q_u32((SRC))
     // Declares DST0 and DST1, loaded from SRC + 0 and SRC + 4 respectively.
     // SRC is parenthesized before the offset is applied so that an argument
     // containing a lower-precedence operator (e.g. a conditional expression)
     // still gets the offset added to the whole expression.
     #define LOAD_32x8(SRC, DST0, DST1)                                    \
       LOAD_32x4((SRC) + 0, DST0);                                         \
       LOAD_32x4((SRC) + 4, DST1)

     // Stores vector SRC0 at DST + 0 and vector SRC1 at DST + 4.
     #define STORE_32x8(SRC0, SRC1, DST) do {                              \
       vst1q_u32((DST) + 0, (SRC0));                                       \
       vst1q_u32((DST) + 4, (SRC1));                                       \
     } while (0)
     36 
     37 #if (WEBP_RESCALER_RFIX == 32)
     38 #define MAKE_HALF_CST(C) vdupq_n_s32((int32_t)((C) >> 1))
     39 // note: B is actualy scale>>1. See MAKE_HALF_CST
     40 #define MULT_FIX(A, B) \
     41    vreinterpretq_u32_s32(vqrdmulhq_s32(vreinterpretq_s32_u32((A)), (B)))
     42 #define MULT_FIX_FLOOR(A, B) \
     43    vreinterpretq_u32_s32(vqdmulhq_s32(vreinterpretq_s32_u32((A)), (B)))
     44 #else
     45 #error "MULT_FIX/WEBP_RESCALER_RFIX need some more work"
     46 #endif
     47 
     48 static uint32x4_t Interpolate_NEON(const rescaler_t* WEBP_RESTRICT const frow,
     49                                   const rescaler_t* WEBP_RESTRICT const irow,
     50                                   uint32_t A, uint32_t B) {
     51  LOAD_32x4(frow, A0);
     52  LOAD_32x4(irow, B0);
     53  const uint64x2_t C0 = vmull_n_u32(vget_low_u32(A0), A);
     54  const uint64x2_t C1 = vmull_n_u32(vget_high_u32(A0), A);
     55  const uint64x2_t D0 = vmlal_n_u32(C0, vget_low_u32(B0), B);
     56  const uint64x2_t D1 = vmlal_n_u32(C1, vget_high_u32(B0), B);
     57  const uint32x4_t E = vcombine_u32(
     58      vrshrn_n_u64(D0, WEBP_RESCALER_RFIX),
     59      vrshrn_n_u64(D1, WEBP_RESCALER_RFIX));
     60  return E;
     61 }
     62 
     63 static void RescalerExportRowExpand_NEON(WebPRescaler* const wrk) {
     64  int x_out;
     65  uint8_t* const dst = wrk->dst;
     66  rescaler_t* const irow = wrk->irow;
     67  const int x_out_max = wrk->dst_width * wrk->num_channels;
     68  const int max_span = x_out_max & ~7;
     69  const rescaler_t* const frow = wrk->frow;
     70  const uint32_t fy_scale = wrk->fy_scale;
     71  const int32x4_t fy_scale_half = MAKE_HALF_CST(fy_scale);
     72  assert(!WebPRescalerOutputDone(wrk));
     73  assert(wrk->y_accum <= 0);
     74  assert(wrk->y_expand);
     75  assert(wrk->y_sub != 0);
     76  if (wrk->y_accum == 0) {
     77    for (x_out = 0; x_out < max_span; x_out += 8) {
     78      LOAD_32x4(frow + x_out + 0, A0);
     79      LOAD_32x4(frow + x_out + 4, A1);
     80      const uint32x4_t B0 = MULT_FIX(A0, fy_scale_half);
     81      const uint32x4_t B1 = MULT_FIX(A1, fy_scale_half);
     82      const uint16x4_t C0 = vmovn_u32(B0);
     83      const uint16x4_t C1 = vmovn_u32(B1);
     84      const uint8x8_t D = vqmovn_u16(vcombine_u16(C0, C1));
     85      vst1_u8(dst + x_out, D);
     86    }
     87    for (; x_out < x_out_max; ++x_out) {
     88      const uint32_t J = frow[x_out];
     89      const int v = (int)MULT_FIX_C(J, fy_scale);
     90      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
     91    }
     92  } else {
     93    const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
     94    const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
     95    for (x_out = 0; x_out < max_span; x_out += 8) {
     96      const uint32x4_t C0 =
     97          Interpolate_NEON(frow + x_out + 0, irow + x_out + 0, A, B);
     98      const uint32x4_t C1 =
     99          Interpolate_NEON(frow + x_out + 4, irow + x_out + 4, A, B);
    100      const uint32x4_t D0 = MULT_FIX(C0, fy_scale_half);
    101      const uint32x4_t D1 = MULT_FIX(C1, fy_scale_half);
    102      const uint16x4_t E0 = vmovn_u32(D0);
    103      const uint16x4_t E1 = vmovn_u32(D1);
    104      const uint8x8_t F = vqmovn_u16(vcombine_u16(E0, E1));
    105      vst1_u8(dst + x_out, F);
    106    }
    107    for (; x_out < x_out_max; ++x_out) {
    108      const uint64_t I = (uint64_t)A * frow[x_out]
    109                       + (uint64_t)B * irow[x_out];
    110      const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
    111      const int v = (int)MULT_FIX_C(J, fy_scale);
    112      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
    113    }
    114  }
    115 }
    116 
    117 static void RescalerExportRowShrink_NEON(WebPRescaler* const wrk) {
    118  int x_out;
    119  uint8_t* const dst = wrk->dst;
    120  rescaler_t* const irow = wrk->irow;
    121  const int x_out_max = wrk->dst_width * wrk->num_channels;
    122  const int max_span = x_out_max & ~7;
    123  const rescaler_t* const frow = wrk->frow;
    124  const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
    125  const uint32_t fxy_scale = wrk->fxy_scale;
    126  const uint32x4_t zero = vdupq_n_u32(0);
    127  const int32x4_t yscale_half = MAKE_HALF_CST(yscale);
    128  const int32x4_t fxy_scale_half = MAKE_HALF_CST(fxy_scale);
    129  assert(!WebPRescalerOutputDone(wrk));
    130  assert(wrk->y_accum <= 0);
    131  assert(!wrk->y_expand);
    132  if (yscale) {
    133    for (x_out = 0; x_out < max_span; x_out += 8) {
    134      LOAD_32x8(frow + x_out, in0, in1);
    135      LOAD_32x8(irow + x_out, in2, in3);
    136      const uint32x4_t A0 = MULT_FIX_FLOOR(in0, yscale_half);
    137      const uint32x4_t A1 = MULT_FIX_FLOOR(in1, yscale_half);
    138      const uint32x4_t B0 = vqsubq_u32(in2, A0);
    139      const uint32x4_t B1 = vqsubq_u32(in3, A1);
    140      const uint32x4_t C0 = MULT_FIX(B0, fxy_scale_half);
    141      const uint32x4_t C1 = MULT_FIX(B1, fxy_scale_half);
    142      const uint16x4_t D0 = vmovn_u32(C0);
    143      const uint16x4_t D1 = vmovn_u32(C1);
    144      const uint8x8_t E = vqmovn_u16(vcombine_u16(D0, D1));
    145      vst1_u8(dst + x_out, E);
    146      STORE_32x8(A0, A1, irow + x_out);
    147    }
    148    for (; x_out < x_out_max; ++x_out) {
    149      const uint32_t frac = (uint32_t)MULT_FIX_FLOOR_C(frow[x_out], yscale);
    150      const int v = (int)MULT_FIX_C(irow[x_out] - frac, fxy_scale);
    151      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
    152      irow[x_out] = frac;   // new fractional start
    153    }
    154  } else {
    155    for (x_out = 0; x_out < max_span; x_out += 8) {
    156      LOAD_32x8(irow + x_out, in0, in1);
    157      const uint32x4_t A0 = MULT_FIX(in0, fxy_scale_half);
    158      const uint32x4_t A1 = MULT_FIX(in1, fxy_scale_half);
    159      const uint16x4_t B0 = vmovn_u32(A0);
    160      const uint16x4_t B1 = vmovn_u32(A1);
    161      const uint8x8_t C = vqmovn_u16(vcombine_u16(B0, B1));
    162      vst1_u8(dst + x_out, C);
    163      STORE_32x8(zero, zero, irow + x_out);
    164    }
    165    for (; x_out < x_out_max; ++x_out) {
    166      const int v = (int)MULT_FIX_C(irow[x_out], fxy_scale);
    167      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
    168      irow[x_out] = 0;
    169    }
    170  }
    171 }
    172 
    // Drop every helper macro this file defined for the two export functions,
    // including the load/store and constant helpers, so none of them stays
    // defined for the rest of the translation unit.
    #undef MULT_FIX_FLOOR_C
    #undef MULT_FIX_C
    #undef MULT_FIX_FLOOR
    #undef MULT_FIX
    #undef MAKE_HALF_CST
    #undef STORE_32x8
    #undef LOAD_32x8
    #undef LOAD_32x4
    #undef ROUNDER
    178 
    179 //------------------------------------------------------------------------------
    180 
    181 extern void WebPRescalerDspInitNEON(void);
    182 
    183 WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitNEON(void) {
    184  WebPRescalerExportRowExpand = RescalerExportRowExpand_NEON;
    185  WebPRescalerExportRowShrink = RescalerExportRowShrink_NEON;
    186 }
    187 
    188 #else     // !WEBP_USE_NEON
    189 
    190 WEBP_DSP_INIT_STUB(WebPRescalerDspInitNEON)
    191 
    192 #endif    // WEBP_USE_NEON