tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

hadamard_neon.c (12060B)


      1 /*
      2 * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <arm_neon.h>
     13 
     14 #include "config/aom_dsp_rtcd.h"
     15 #include "aom/aom_integer.h"
     16 #include "aom_dsp/arm/mem_neon.h"
     17 #include "aom_dsp/arm/transpose_neon.h"
     18 
     19 static inline void hadamard_4x4_one_pass(int16x4_t *a0, int16x4_t *a1,
     20                                         int16x4_t *a2, int16x4_t *a3) {
     21  const int16x4_t b0 = vhadd_s16(*a0, *a1);
     22  const int16x4_t b1 = vhsub_s16(*a0, *a1);
     23  const int16x4_t b2 = vhadd_s16(*a2, *a3);
     24  const int16x4_t b3 = vhsub_s16(*a2, *a3);
     25 
     26  *a0 = vadd_s16(b0, b2);
     27  *a1 = vadd_s16(b1, b3);
     28  *a2 = vsub_s16(b0, b2);
     29  *a3 = vsub_s16(b1, b3);
     30 }
     31 
     32 void aom_hadamard_4x4_neon(const int16_t *src_diff, ptrdiff_t src_stride,
     33                           tran_low_t *coeff) {
     34  int16x4_t a0 = vld1_s16(src_diff);
     35  int16x4_t a1 = vld1_s16(src_diff + src_stride);
     36  int16x4_t a2 = vld1_s16(src_diff + 2 * src_stride);
     37  int16x4_t a3 = vld1_s16(src_diff + 3 * src_stride);
     38 
     39  hadamard_4x4_one_pass(&a0, &a1, &a2, &a3);
     40 
     41  transpose_elems_inplace_s16_4x4(&a0, &a1, &a2, &a3);
     42 
     43  hadamard_4x4_one_pass(&a0, &a1, &a2, &a3);
     44 
     45  store_s16_to_tran_low(coeff, a0);
     46  store_s16_to_tran_low(coeff + 4, a1);
     47  store_s16_to_tran_low(coeff + 8, a2);
     48  store_s16_to_tran_low(coeff + 12, a3);
     49 }
     50 
     51 static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
     52                                 int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
     53                                 int16x8_t *a6, int16x8_t *a7) {
     54  const int16x8_t b0 = vaddq_s16(*a0, *a1);
     55  const int16x8_t b1 = vsubq_s16(*a0, *a1);
     56  const int16x8_t b2 = vaddq_s16(*a2, *a3);
     57  const int16x8_t b3 = vsubq_s16(*a2, *a3);
     58  const int16x8_t b4 = vaddq_s16(*a4, *a5);
     59  const int16x8_t b5 = vsubq_s16(*a4, *a5);
     60  const int16x8_t b6 = vaddq_s16(*a6, *a7);
     61  const int16x8_t b7 = vsubq_s16(*a6, *a7);
     62 
     63  const int16x8_t c0 = vaddq_s16(b0, b2);
     64  const int16x8_t c1 = vaddq_s16(b1, b3);
     65  const int16x8_t c2 = vsubq_s16(b0, b2);
     66  const int16x8_t c3 = vsubq_s16(b1, b3);
     67  const int16x8_t c4 = vaddq_s16(b4, b6);
     68  const int16x8_t c5 = vaddq_s16(b5, b7);
     69  const int16x8_t c6 = vsubq_s16(b4, b6);
     70  const int16x8_t c7 = vsubq_s16(b5, b7);
     71 
     72  *a0 = vaddq_s16(c0, c4);
     73  *a1 = vsubq_s16(c2, c6);
     74  *a2 = vsubq_s16(c0, c4);
     75  *a3 = vaddq_s16(c2, c6);
     76  *a4 = vaddq_s16(c3, c7);
     77  *a5 = vsubq_s16(c3, c7);
     78  *a6 = vsubq_s16(c1, c5);
     79  *a7 = vaddq_s16(c1, c5);
     80 }
     81 
     82 void aom_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride,
     83                           tran_low_t *coeff) {
     84  int16x8_t a0 = vld1q_s16(src_diff);
     85  int16x8_t a1 = vld1q_s16(src_diff + src_stride);
     86  int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride);
     87  int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride);
     88  int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride);
     89  int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride);
     90  int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride);
     91  int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride);
     92 
     93  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
     94 
     95  transpose_elems_inplace_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
     96 
     97  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
     98 
     99  // Skip the second transpose because it is not required.
    100 
    101  store_s16q_to_tran_low(coeff + 0, a0);
    102  store_s16q_to_tran_low(coeff + 8, a1);
    103  store_s16q_to_tran_low(coeff + 16, a2);
    104  store_s16q_to_tran_low(coeff + 24, a3);
    105  store_s16q_to_tran_low(coeff + 32, a4);
    106  store_s16q_to_tran_low(coeff + 40, a5);
    107  store_s16q_to_tran_low(coeff + 48, a6);
    108  store_s16q_to_tran_low(coeff + 56, a7);
    109 }
    110 
    111 void aom_hadamard_lp_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride,
    112                              int16_t *coeff) {
    113  int16x8_t a0 = vld1q_s16(src_diff);
    114  int16x8_t a1 = vld1q_s16(src_diff + src_stride);
    115  int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride);
    116  int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride);
    117  int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride);
    118  int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride);
    119  int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride);
    120  int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride);
    121 
    122  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
    123 
    124  transpose_elems_inplace_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
    125 
    126  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
    127 
    128  // Skip the second transpose because it is not required.
    129 
    130  vst1q_s16(coeff + 0, a0);
    131  vst1q_s16(coeff + 8, a1);
    132  vst1q_s16(coeff + 16, a2);
    133  vst1q_s16(coeff + 24, a3);
    134  vst1q_s16(coeff + 32, a4);
    135  vst1q_s16(coeff + 40, a5);
    136  vst1q_s16(coeff + 48, a6);
    137  vst1q_s16(coeff + 56, a7);
    138 }
    139 
    140 void aom_hadamard_lp_8x8_dual_neon(const int16_t *src_diff,
    141                                   ptrdiff_t src_stride, int16_t *coeff) {
    142  for (int i = 0; i < 2; i++) {
    143    aom_hadamard_lp_8x8_neon(src_diff + (i * 8), src_stride, coeff + (i * 64));
    144  }
    145 }
    146 
    147 void aom_hadamard_lp_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride,
    148                                int16_t *coeff) {
    149  /* Rearrange 16x16 to 8x32 and remove stride.
    150   * Top left first. */
    151  aom_hadamard_lp_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride,
    152                           coeff + 0);
    153  /* Top right. */
    154  aom_hadamard_lp_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride,
    155                           coeff + 64);
    156  /* Bottom left. */
    157  aom_hadamard_lp_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride,
    158                           coeff + 128);
    159  /* Bottom right. */
    160  aom_hadamard_lp_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride,
    161                           coeff + 192);
    162 
    163  for (int i = 0; i < 64; i += 8) {
    164    const int16x8_t a0 = vld1q_s16(coeff + 0);
    165    const int16x8_t a1 = vld1q_s16(coeff + 64);
    166    const int16x8_t a2 = vld1q_s16(coeff + 128);
    167    const int16x8_t a3 = vld1q_s16(coeff + 192);
    168 
    169    const int16x8_t b0 = vhaddq_s16(a0, a1);
    170    const int16x8_t b1 = vhsubq_s16(a0, a1);
    171    const int16x8_t b2 = vhaddq_s16(a2, a3);
    172    const int16x8_t b3 = vhsubq_s16(a2, a3);
    173 
    174    const int16x8_t c0 = vaddq_s16(b0, b2);
    175    const int16x8_t c1 = vaddq_s16(b1, b3);
    176    const int16x8_t c2 = vsubq_s16(b0, b2);
    177    const int16x8_t c3 = vsubq_s16(b1, b3);
    178 
    179    vst1q_s16(coeff + 0, c0);
    180    vst1q_s16(coeff + 64, c1);
    181    vst1q_s16(coeff + 128, c2);
    182    vst1q_s16(coeff + 192, c3);
    183 
    184    coeff += 8;
    185  }
    186 }
    187 
    188 void aom_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride,
    189                             tran_low_t *coeff) {
    190  /* Rearrange 16x16 to 8x32 and remove stride.
    191   * Top left first. */
    192  aom_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
    193  /* Top right. */
    194  aom_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64);
    195  /* Bottom left. */
    196  aom_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128);
    197  /* Bottom right. */
    198  aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);
    199 
    200  // Each iteration of the loop operates on entire rows (16 samples each)
    201  // because we need to swap the second and third quarters of every row in the
    202  // output to match AVX2 output (i.e., aom_hadamard_16x16_avx2). See the for
    203  // loop at the end of aom_hadamard_16x16_c.
    204  for (int i = 0; i < 64; i += 16) {
    205    const int32x4_t a00 = vld1q_s32(coeff + 0);
    206    const int32x4_t a01 = vld1q_s32(coeff + 64);
    207    const int32x4_t a02 = vld1q_s32(coeff + 128);
    208    const int32x4_t a03 = vld1q_s32(coeff + 192);
    209 
    210    const int32x4_t b00 = vhaddq_s32(a00, a01);
    211    const int32x4_t b01 = vhsubq_s32(a00, a01);
    212    const int32x4_t b02 = vhaddq_s32(a02, a03);
    213    const int32x4_t b03 = vhsubq_s32(a02, a03);
    214 
    215    const int32x4_t c00 = vaddq_s32(b00, b02);
    216    const int32x4_t c01 = vaddq_s32(b01, b03);
    217    const int32x4_t c02 = vsubq_s32(b00, b02);
    218    const int32x4_t c03 = vsubq_s32(b01, b03);
    219 
    220    const int32x4_t a10 = vld1q_s32(coeff + 4 + 0);
    221    const int32x4_t a11 = vld1q_s32(coeff + 4 + 64);
    222    const int32x4_t a12 = vld1q_s32(coeff + 4 + 128);
    223    const int32x4_t a13 = vld1q_s32(coeff + 4 + 192);
    224 
    225    const int32x4_t b10 = vhaddq_s32(a10, a11);
    226    const int32x4_t b11 = vhsubq_s32(a10, a11);
    227    const int32x4_t b12 = vhaddq_s32(a12, a13);
    228    const int32x4_t b13 = vhsubq_s32(a12, a13);
    229 
    230    const int32x4_t c10 = vaddq_s32(b10, b12);
    231    const int32x4_t c11 = vaddq_s32(b11, b13);
    232    const int32x4_t c12 = vsubq_s32(b10, b12);
    233    const int32x4_t c13 = vsubq_s32(b11, b13);
    234 
    235    const int32x4_t a20 = vld1q_s32(coeff + 8 + 0);
    236    const int32x4_t a21 = vld1q_s32(coeff + 8 + 64);
    237    const int32x4_t a22 = vld1q_s32(coeff + 8 + 128);
    238    const int32x4_t a23 = vld1q_s32(coeff + 8 + 192);
    239 
    240    const int32x4_t b20 = vhaddq_s32(a20, a21);
    241    const int32x4_t b21 = vhsubq_s32(a20, a21);
    242    const int32x4_t b22 = vhaddq_s32(a22, a23);
    243    const int32x4_t b23 = vhsubq_s32(a22, a23);
    244 
    245    const int32x4_t c20 = vaddq_s32(b20, b22);
    246    const int32x4_t c21 = vaddq_s32(b21, b23);
    247    const int32x4_t c22 = vsubq_s32(b20, b22);
    248    const int32x4_t c23 = vsubq_s32(b21, b23);
    249 
    250    const int32x4_t a30 = vld1q_s32(coeff + 12 + 0);
    251    const int32x4_t a31 = vld1q_s32(coeff + 12 + 64);
    252    const int32x4_t a32 = vld1q_s32(coeff + 12 + 128);
    253    const int32x4_t a33 = vld1q_s32(coeff + 12 + 192);
    254 
    255    const int32x4_t b30 = vhaddq_s32(a30, a31);
    256    const int32x4_t b31 = vhsubq_s32(a30, a31);
    257    const int32x4_t b32 = vhaddq_s32(a32, a33);
    258    const int32x4_t b33 = vhsubq_s32(a32, a33);
    259 
    260    const int32x4_t c30 = vaddq_s32(b30, b32);
    261    const int32x4_t c31 = vaddq_s32(b31, b33);
    262    const int32x4_t c32 = vsubq_s32(b30, b32);
    263    const int32x4_t c33 = vsubq_s32(b31, b33);
    264 
    265    vst1q_s32(coeff + 0 + 0, c00);
    266    vst1q_s32(coeff + 0 + 4, c20);
    267    vst1q_s32(coeff + 0 + 8, c10);
    268    vst1q_s32(coeff + 0 + 12, c30);
    269 
    270    vst1q_s32(coeff + 64 + 0, c01);
    271    vst1q_s32(coeff + 64 + 4, c21);
    272    vst1q_s32(coeff + 64 + 8, c11);
    273    vst1q_s32(coeff + 64 + 12, c31);
    274 
    275    vst1q_s32(coeff + 128 + 0, c02);
    276    vst1q_s32(coeff + 128 + 4, c22);
    277    vst1q_s32(coeff + 128 + 8, c12);
    278    vst1q_s32(coeff + 128 + 12, c32);
    279 
    280    vst1q_s32(coeff + 192 + 0, c03);
    281    vst1q_s32(coeff + 192 + 4, c23);
    282    vst1q_s32(coeff + 192 + 8, c13);
    283    vst1q_s32(coeff + 192 + 12, c33);
    284 
    285    coeff += 16;
    286  }
    287 }
    288 
    289 void aom_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride,
    290                             tran_low_t *coeff) {
    291  /* Top left first. */
    292  aom_hadamard_16x16_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
    293  /* Top right. */
    294  aom_hadamard_16x16_neon(src_diff + 16 + 0 * src_stride, src_stride,
    295                          coeff + 256);
    296  /* Bottom left. */
    297  aom_hadamard_16x16_neon(src_diff + 0 + 16 * src_stride, src_stride,
    298                          coeff + 512);
    299  /* Bottom right. */
    300  aom_hadamard_16x16_neon(src_diff + 16 + 16 * src_stride, src_stride,
    301                          coeff + 768);
    302 
    303  for (int i = 0; i < 256; i += 4) {
    304    const int32x4_t a0 = vld1q_s32(coeff);
    305    const int32x4_t a1 = vld1q_s32(coeff + 256);
    306    const int32x4_t a2 = vld1q_s32(coeff + 512);
    307    const int32x4_t a3 = vld1q_s32(coeff + 768);
    308 
    309    const int32x4_t b0 = vshrq_n_s32(vaddq_s32(a0, a1), 2);
    310    const int32x4_t b1 = vshrq_n_s32(vsubq_s32(a0, a1), 2);
    311    const int32x4_t b2 = vshrq_n_s32(vaddq_s32(a2, a3), 2);
    312    const int32x4_t b3 = vshrq_n_s32(vsubq_s32(a2, a3), 2);
    313 
    314    const int32x4_t c0 = vaddq_s32(b0, b2);
    315    const int32x4_t c1 = vaddq_s32(b1, b3);
    316    const int32x4_t c2 = vsubq_s32(b0, b2);
    317    const int32x4_t c3 = vsubq_s32(b1, b3);
    318 
    319    vst1q_s32(coeff + 0, c0);
    320    vst1q_s32(coeff + 256, c1);
    321    vst1q_s32(coeff + 512, c2);
    322    vst1q_s32(coeff + 768, c3);
    323 
    324    coeff += 4;
    325  }
    326 }