tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jsimd_neon.S (43740B)


      1 /*
      2 * Armv7 Neon optimizations for libjpeg-turbo
      3 *
      4 * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
      5 *                          All Rights Reserved.
      6 * Author:  Siarhei Siamashka <siarhei.siamashka@nokia.com>
      7 * Copyright (C) 2014, Siarhei Siamashka.  All Rights Reserved.
      8 * Copyright (C) 2014, Linaro Limited.  All Rights Reserved.
      9 * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
     10 * Copyright (C) 2015-2016, 2018, Matthieu Darbois.  All Rights Reserved.
     11 *
     12 * This software is provided 'as-is', without any express or implied
     13 * warranty.  In no event will the authors be held liable for any damages
     14 * arising from the use of this software.
     15 *
     16 * Permission is granted to anyone to use this software for any purpose,
     17 * including commercial applications, and to alter it and redistribute it
     18 * freely, subject to the following restrictions:
     19 *
     20 * 1. The origin of this software must not be misrepresented; you must not
     21 *    claim that you wrote the original software. If you use this software
     22 *    in a product, an acknowledgment in the product documentation would be
     23 *    appreciated but is not required.
     24 * 2. Altered source versions must be plainly marked as such, and must not be
     25 *    misrepresented as being the original software.
     26 * 3. This notice may not be removed or altered from any source distribution.
     27 */
     28 
     29 #if defined(__linux__) && defined(__ELF__)
     30 .section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
     31 #endif
     32 
     33 .text
     34 .fpu neon       /* permit Advanced SIMD (Neon) instructions */
     35 .arch armv7a    /* assemble for the Armv7-A architecture */
     36 .object_arch armv4  /* record armv4 as the object's architecture attribute */
     37 .arm            /* Arm (A32) state, not Thumb */
     38 .syntax unified /* unified assembler syntax (UAL) */
     39 
     40 
     41 /*****************************************************************************/
     42 
     43 /* Supplementary macro for setting function attributes */
/* Emits a global function label with platform-appropriate decoration:
 * Mach-O (__APPLE__) prepends an underscore and uses .private_extern;
 * ELF marks the symbol hidden (not exported from a shared object) and
 * gives it %function type. */
     44 .macro asm_function fname
     45 #ifdef __APPLE__
     46    .private_extern _\fname  /* Mach-O symbols carry a leading underscore */
     47    .globl _\fname
     48 _\fname:
     49 #else
     50    .global \fname
     51 #ifdef __ELF__
     52    .hidden \fname           /* visible inside the object, not exported */
     53    .type \fname, %function
     54 #endif
     55 \fname:
     56 #endif
     57 .endm
     58 
     59 
     60 #define CENTERJSAMPLE  128  /* midpoint of the 8-bit sample range */
     61 
     62 /*****************************************************************************/
     63 
     64 /*
     65 * Perform dequantization and inverse DCT on one block of coefficients.
     66 *
     67 * GLOBAL(void)
     68 * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
     69 *                       JSAMPARRAY output_buf, JDIMENSION output_col)
     70 */
     71 
/* Each FIX_x_xxxxxxxxx constant is the real value scaled by 2^13 and
 * rounded, e.g. round(0.298631336 * 8192) = 2446. */
     72 #define FIX_0_298631336  (2446)
     73 #define FIX_0_390180644  (3196)
     74 #define FIX_0_541196100  (4433)
     75 #define FIX_0_765366865  (6270)
     76 #define FIX_0_899976223  (7373)
     77 #define FIX_1_175875602  (9633)
     78 #define FIX_1_501321110  (12299)
     79 #define FIX_1_847759065  (15137)
     80 #define FIX_1_961570560  (16069)
     81 #define FIX_2_053119869  (16819)
     82 #define FIX_2_562915447  (20995)
     83 #define FIX_3_072711026  (25172)
     84 
/* Pre-combined sums/differences of the constants above, so each
 * multiply-accumulate below needs only a single 16-bit multiplier. */
     85 #define FIX_1_175875602_MINUS_1_961570560  (FIX_1_175875602 - FIX_1_961570560)
     86 #define FIX_1_175875602_MINUS_0_390180644  (FIX_1_175875602 - FIX_0_390180644)
     87 #define FIX_0_541196100_MINUS_1_847759065  (FIX_0_541196100 - FIX_1_847759065)
     88 #define FIX_3_072711026_MINUS_2_562915447  (FIX_3_072711026 - FIX_2_562915447)
     89 #define FIX_0_298631336_MINUS_0_899976223  (FIX_0_298631336 - FIX_0_899976223)
     90 #define FIX_1_501321110_MINUS_0_899976223  (FIX_1_501321110 - FIX_0_899976223)
     91 #define FIX_2_053119869_MINUS_2_562915447  (FIX_2_053119869 - FIX_2_562915447)
     92 #define FIX_0_541196100_PLUS_0_765366865   (FIX_0_541196100 + FIX_0_765366865)
     93 
     94 /*
     95 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
     96 * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
     97 */
     98 #define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) { \
     99  DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
    100  JLONG   q1, q2, q3, q4, q5, q6, q7; \
    101  JLONG   tmp11_plus_tmp2, tmp11_minus_tmp2; \
    102  \
    103  /* 1-D iDCT input data */ \
    104  row0 = xrow0; \
    105  row1 = xrow1; \
    106  row2 = xrow2; \
    107  row3 = xrow3; \
    108  row4 = xrow4; \
    109  row5 = xrow5; \
    110  row6 = xrow6; \
    111  row7 = xrow7; \
    112  \
    113  q5 = row7 + row3; \
    114  q4 = row5 + row1; \
    115  q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
    116       MULTIPLY(q4, FIX_1_175875602); \
    117  q7 = MULTIPLY(q5, FIX_1_175875602) + \
    118       MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
    119  q2 = MULTIPLY(row2, FIX_0_541196100) + \
    120       MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
    121  q4 = q6; \
    122  q3 = ((JLONG)row0 - (JLONG)row4) << 13; \
    123  q6 += MULTIPLY(row5, -FIX_2_562915447) + \
    124        MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
    125  /* now we can use q1 (reloadable constants have been used up) */ \
    126  q1 = q3 + q2; \
    127  q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
    128        MULTIPLY(row1, -FIX_0_899976223); \
    129  q5 = q7; \
    130  q1 = q1 + q6; \
    131  q7 += MULTIPLY(row7, -FIX_0_899976223) + \
    132        MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
    133  \
    134  /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
    135  tmp11_plus_tmp2 = q1; \
    136  row1 = 0; \
    137  \
    138  q1 = q1 - q6; \
    139  q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
    140        MULTIPLY(row3, -FIX_2_562915447); \
    141  q1 = q1 - q6; \
    142  q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
    143       MULTIPLY(row6, FIX_0_541196100); \
    144  q3 = q3 - q2; \
    145  \
    146  /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
    147  tmp11_minus_tmp2 = q1; \
    148  \
    149  q1 = ((JLONG)row0 + (JLONG)row4) << 13; \
    150  q2 = q1 + q6; \
    151  q1 = q1 - q6; \
    152  \
    153  /* pick up the results */ \
    154  tmp0  = q4; \
    155  tmp1  = q5; \
    156  tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
    157  tmp3  = q7; \
    158  tmp10 = q2; \
    159  tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
    160  tmp12 = q3; \
    161  tmp13 = q1; \
    162 }
    163 
    164 #define XFIX_0_899976223                    d0[0]
    165 #define XFIX_0_541196100                    d0[1]
    166 #define XFIX_2_562915447                    d0[2]
    167 #define XFIX_0_298631336_MINUS_0_899976223  d0[3]
    168 #define XFIX_1_501321110_MINUS_0_899976223  d1[0]
    169 #define XFIX_2_053119869_MINUS_2_562915447  d1[1]
    170 #define XFIX_0_541196100_PLUS_0_765366865   d1[2]
    171 #define XFIX_1_175875602                    d1[3]
    172 #define XFIX_1_175875602_MINUS_0_390180644  d2[0]
    173 #define XFIX_0_541196100_MINUS_1_847759065  d2[1]
    174 #define XFIX_3_072711026_MINUS_2_562915447  d2[2]
    175 #define XFIX_1_175875602_MINUS_1_961570560  d2[3]
    176 
/* The XFIX_ names above alias scalar lanes of d0-d2; the table below is
 * loaded into those registers (vld1 from ip), so its entry order must stay
 * exactly in sync with the lane assignments. */
    177 .balign 16
    178 jsimd_idct_islow_neon_consts:
    179  .short FIX_0_899976223                    /* d0[0] */
    180  .short FIX_0_541196100                    /* d0[1] */
    181  .short FIX_2_562915447                    /* d0[2] */
    182  .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
    183  .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
    184  .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
    185  .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
    186  .short FIX_1_175875602                    /* d1[3] */
    187  /* reloadable constants */
    188  .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
    189  .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
    190  .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
    191  .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */
    192 
    193 asm_function jsimd_idct_islow_neon
    194 
   /*
    * GLOBAL(void)
    * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
    *                       JSAMPARRAY output_buf, JDIMENSION output_col)
    *
    * AAPCS arguments: r0 = dct_table, r1 = coef_block, r2 = output_buf,
    * r3 = output_col (aliased via .req below; r0-r2 are reused as
    * TMP1-TMP3 once their original values are no longer needed).
    * Callee-saved d8-d15 are preserved with vpush/vpop; r4/r5 are pushed
    * around the zero-coefficient scan of the right 4x8 half.
    * Convention: extra-indented instructions are scalar Arm operations
    * interleaved with the Neon instruction stream.
    */
    195    DCT_TABLE       .req r0
    196    COEF_BLOCK      .req r1
    197    OUTPUT_BUF      .req r2
    198    OUTPUT_COL      .req r3
    199    TMP1            .req r0
    200    TMP2            .req r1
    201    TMP3            .req r2
    202    TMP4            .req ip
    203 
    204    ROW0L           .req d16
    205    ROW0R           .req d17
    206    ROW1L           .req d18
    207    ROW1R           .req d19
    208    ROW2L           .req d20
    209    ROW2R           .req d21
    210    ROW3L           .req d22
    211    ROW3R           .req d23
    212    ROW4L           .req d24
    213    ROW4R           .req d25
    214    ROW5L           .req d26
    215    ROW5R           .req d27
    216    ROW6L           .req d28
    217    ROW6R           .req d29
    218    ROW7L           .req d30
    219    ROW7R           .req d31
    220 
    221    /* Load and dequantize coefficients into Neon registers
    222     * with the following allocation:
    223     *       0 1 2 3 | 4 5 6 7
    224     *      ---------+--------
    225     *   0 | d16     | d17     ( q8  )
    226     *   1 | d18     | d19     ( q9  )
    227     *   2 | d20     | d21     ( q10 )
    228     *   3 | d22     | d23     ( q11 )
    229     *   4 | d24     | d25     ( q12 )
    230     *   5 | d26     | d27     ( q13 )
    231     *   6 | d28     | d29     ( q14 )
    232     *   7 | d30     | d31     ( q15 )
    233     */
    234    adr             ip, jsimd_idct_islow_neon_consts
    235    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    236    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    237    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    238    vmul.s16        q8, q8, q0
    239    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    240    vmul.s16        q9, q9, q1
    241    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    242    vmul.s16        q10, q10, q2
    243    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    244    vmul.s16        q11, q11, q3
    245    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    246    vmul.s16        q12, q12, q0
    247    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    248    vmul.s16        q14, q14, q2
    249    vmul.s16        q13, q13, q1
    250    vld1.16         {d0, d1, d2, d3}, [ip, :128]  /* load constants */
    251    add             ip, ip, #16
    252    vmul.s16        q15, q15, q3
    253    vpush           {d8 - d15}                    /* save Neon registers */
    254    /* 1-D IDCT, pass 1, left 4x8 half */
    255    vadd.s16        d4, ROW7L, ROW3L
    256    vadd.s16        d5, ROW5L, ROW1L
    257    vmull.s16       q6, d4, XFIX_1_175875602_MINUS_1_961570560
    258    vmlal.s16       q6, d5, XFIX_1_175875602
    259    vmull.s16       q7, d4, XFIX_1_175875602
    260      /* Check for the zero coefficients in the right 4x8 half */
    261      push            {r4, r5}
    262    vmlal.s16       q7, d5, XFIX_1_175875602_MINUS_0_390180644
    263    vsubl.s16       q3, ROW0L, ROW4L
     /* NOTE: pre-UAL 'ldrd r4, [...]' loads the register pair r4, r5 */
    264      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
    265    vmull.s16       q2, ROW2L, XFIX_0_541196100
    266    vmlal.s16       q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
    267      orr             r0, r4, r5
    268    vmov            q4, q6
    269    vmlsl.s16       q6, ROW5L, XFIX_2_562915447
    270      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
    271    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    272    vshl.s32        q3, q3, #13
    273      orr             r0, r0, r4
    274    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
    275      orr             r0, r0, r5
    276    vadd.s32        q1, q3, q2
    277      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
    278    vmov            q5, q7
    279    vadd.s32        q1, q1, q6
    280      orr             r0, r0, r4
    281    vmlsl.s16       q7, ROW7L, XFIX_0_899976223
    282      orr             r0, r0, r5
    283    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    284    vrshrn.s32      ROW1L, q1, #11
    285      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
    286    vsub.s32        q1, q1, q6
    287    vmlal.s16       q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
    288      orr             r0, r0, r4
    289    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
    290      orr             r0, r0, r5
    291    vsub.s32        q1, q1, q6
    292    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    293      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
    294    vmlal.s16       q6, ROW6L, XFIX_0_541196100
    295    vsub.s32        q3, q3, q2
    296      orr             r0, r0, r4
    297    vrshrn.s32      ROW6L, q1, #11
    298      orr             r0, r0, r5
    299    vadd.s32        q1, q3, q5
    300      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
    301    vsub.s32        q3, q3, q5
    302    vaddl.s16       q5, ROW0L, ROW4L
    303      orr             r0, r0, r4
    304    vrshrn.s32      ROW2L, q1, #11
    305      orr             r0, r0, r5
    306    vrshrn.s32      ROW5L, q3, #11
    307      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
    308    vshl.s32        q5, q5, #13
    309    vmlal.s16       q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
    310      orr             r0, r0, r4
    311    vadd.s32        q2, q5, q6
    312      orrs            r0, r0, r5  /* Z set iff rows 1-7 of the right half are all zero */
    313    vsub.s32        q1, q5, q6
    314    vadd.s32        q6, q2, q7
    315      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
    316    vsub.s32        q2, q2, q7
    317    vadd.s32        q5, q1, q4
    318      orr             r0, r4, r5  /* r0 = row 0 of the right half, tested at 3: */
    319    vsub.s32        q3, q1, q4
    320      pop             {r4, r5}
    321    vrshrn.s32      ROW7L, q2, #11
    322    vrshrn.s32      ROW3L, q5, #11
    323    vrshrn.s32      ROW0L, q6, #11
    324    vrshrn.s32      ROW4L, q3, #11
    325 
    326      beq             3f  /* Go to do some special handling for the sparse
    327                             right 4x8 half */
    328 
    329    /* 1-D IDCT, pass 1, right 4x8 half */
    330    vld1.s16        {d2}, [ip, :64]  /* reload constants */
    331    vadd.s16        d10, ROW7R, ROW3R
    332    vadd.s16        d8, ROW5R, ROW1R
    333      /* Transpose left 4x8 half */
    334      vtrn.16         ROW6L, ROW7L
    335    vmull.s16       q6, d10, XFIX_1_175875602_MINUS_1_961570560
    336    vmlal.s16       q6, d8, XFIX_1_175875602
    337      vtrn.16         ROW2L, ROW3L
    338    vmull.s16       q7, d10, XFIX_1_175875602
    339    vmlal.s16       q7, d8, XFIX_1_175875602_MINUS_0_390180644
    340      vtrn.16         ROW0L, ROW1L
    341    vsubl.s16       q3, ROW0R, ROW4R
    342    vmull.s16       q2, ROW2R, XFIX_0_541196100
    343    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
    344      vtrn.16         ROW4L, ROW5L
    345    vmov            q4, q6
    346    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
    347    vmlal.s16       q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
    348      vtrn.32         ROW1L, ROW3L
    349    vshl.s32        q3, q3, #13
    350    vmlsl.s16       q4, ROW1R, XFIX_0_899976223
    351      vtrn.32         ROW4L, ROW6L
    352    vadd.s32        q1, q3, q2
    353    vmov            q5, q7
    354    vadd.s32        q1, q1, q6
    355      vtrn.32         ROW0L, ROW2L
    356    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
    357    vmlal.s16       q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
    358    vrshrn.s32      ROW1R, q1, #11
    359      vtrn.32         ROW5L, ROW7L
    360    vsub.s32        q1, q1, q6
    361    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
    362    vmlsl.s16       q5, ROW3R, XFIX_2_562915447
    363    vsub.s32        q1, q1, q6
    364    vmull.s16       q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
    365    vmlal.s16       q6, ROW6R, XFIX_0_541196100
    366    vsub.s32        q3, q3, q2
    367    vrshrn.s32      ROW6R, q1, #11
    368    vadd.s32        q1, q3, q5
    369    vsub.s32        q3, q3, q5
    370    vaddl.s16       q5, ROW0R, ROW4R
    371    vrshrn.s32      ROW2R, q1, #11
    372    vrshrn.s32      ROW5R, q3, #11
    373    vshl.s32        q5, q5, #13
    374    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
    375    vadd.s32        q2, q5, q6
    376    vsub.s32        q1, q5, q6
    377    vadd.s32        q6, q2, q7
    378    vsub.s32        q2, q2, q7
    379    vadd.s32        q5, q1, q4
    380    vsub.s32        q3, q1, q4
    381    vrshrn.s32      ROW7R, q2, #11
    382    vrshrn.s32      ROW3R, q5, #11
    383    vrshrn.s32      ROW0R, q6, #11
    384    vrshrn.s32      ROW4R, q3, #11
    385    /* Transpose right 4x8 half */
    386    vtrn.16         ROW6R, ROW7R
    387    vtrn.16         ROW2R, ROW3R
    388    vtrn.16         ROW0R, ROW1R
    389    vtrn.16         ROW4R, ROW5R
    390    vtrn.32         ROW1R, ROW3R
    391    vtrn.32         ROW4R, ROW6R
    392    vtrn.32         ROW0R, ROW2R
    393    vtrn.32         ROW5R, ROW7R
    394 
    395 1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
    396    vld1.s16        {d2}, [ip, :64]               /* reload constants */
    397    vmull.s16       q6, ROW1R, XFIX_1_175875602   /* ROW5L <-> ROW1R */
    398    vmlal.s16       q6, ROW1L, XFIX_1_175875602
    399    vmlal.s16       q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
    400    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
    401    vmull.s16       q7, ROW3R, XFIX_1_175875602   /* ROW7L <-> ROW3R */
    402    vmlal.s16       q7, ROW3L, XFIX_1_175875602
    403    vmlal.s16       q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
    404    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
    405    vsubl.s16       q3, ROW0L, ROW0R              /* ROW4L <-> ROW0R */
    406    vmull.s16       q2, ROW2L, XFIX_0_541196100
    407    vmlal.s16       q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065  /* ROW6L <-> ROW2R */
    408    vmov            q4, q6
    409    vmlsl.s16       q6, ROW1R, XFIX_2_562915447   /* ROW5L <-> ROW1R */
    410    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    411    vshl.s32        q3, q3, #13
    412    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
    413    vadd.s32        q1, q3, q2
    414    vmov            q5, q7
    415    vadd.s32        q1, q1, q6
    416    vmlsl.s16       q7, ROW3R, XFIX_0_899976223   /* ROW7L <-> ROW3R */
    417    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    418    vshrn.s32       ROW1L, q1, #16
    419    vsub.s32        q1, q1, q6
    420    vmlal.s16       q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447  /* ROW5L <-> ROW1R */
    421    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
    422    vsub.s32        q1, q1, q6
    423    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    424    vmlal.s16       q6, ROW2R, XFIX_0_541196100   /* ROW6L <-> ROW2R */
    425    vsub.s32        q3, q3, q2
    426    vshrn.s32       ROW2R, q1, #16                /* ROW6L <-> ROW2R */
    427    vadd.s32        q1, q3, q5
    428    vsub.s32        q3, q3, q5
    429    vaddl.s16       q5, ROW0L, ROW0R              /* ROW4L <-> ROW0R */
    430    vshrn.s32       ROW2L, q1, #16
    431    vshrn.s32       ROW1R, q3, #16                /* ROW5L <-> ROW1R */
    432    vshl.s32        q5, q5, #13
    433    vmlal.s16       q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223  /* ROW7L <-> ROW3R */
    434    vadd.s32        q2, q5, q6
    435    vsub.s32        q1, q5, q6
    436    vadd.s32        q6, q2, q7
    437    vsub.s32        q2, q2, q7
    438    vadd.s32        q5, q1, q4
    439    vsub.s32        q3, q1, q4
    440    vshrn.s32       ROW3R, q2, #16                /* ROW7L <-> ROW3R */
    441    vshrn.s32       ROW3L, q5, #16
    442    vshrn.s32       ROW0L, q6, #16
    443    vshrn.s32       ROW0R, q3, #16                /* ROW4L <-> ROW0R */
    444    /* 1-D IDCT, pass 2, right 4x8 half */
    445    vld1.s16        {d2}, [ip, :64]               /* reload constants */
    446    vmull.s16       q6, ROW5R, XFIX_1_175875602
    447    vmlal.s16       q6, ROW5L, XFIX_1_175875602   /* ROW5L <-> ROW1R */
    448    vmlal.s16       q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
    449    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
    450    vmull.s16       q7, ROW7R, XFIX_1_175875602
    451    vmlal.s16       q7, ROW7L, XFIX_1_175875602   /* ROW7L <-> ROW3R */
    452    vmlal.s16       q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
    453    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
    454    vsubl.s16       q3, ROW4L, ROW4R              /* ROW4L <-> ROW0R */
    455    vmull.s16       q2, ROW6L, XFIX_0_541196100   /* ROW6L <-> ROW2R */
    456    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
    457    vmov            q4, q6
    458    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
    459    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447  /* ROW7L <-> ROW3R */
    460    vshl.s32        q3, q3, #13
    461    vmlsl.s16       q4, ROW5L, XFIX_0_899976223   /* ROW5L <-> ROW1R */
    462    vadd.s32        q1, q3, q2
    463    vmov            q5, q7
    464    vadd.s32        q1, q1, q6
    465    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
    466    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223  /* ROW5L <-> ROW1R */
    467    vshrn.s32       ROW5L, q1, #16                /* ROW5L <-> ROW1R */
    468    vsub.s32        q1, q1, q6
    469    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
    470    vmlsl.s16       q5, ROW7L, XFIX_2_562915447   /* ROW7L <-> ROW3R */
    471    vsub.s32        q1, q1, q6
    472    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865  /* ROW6L <-> ROW2R */
    473    vmlal.s16       q6, ROW6R, XFIX_0_541196100
    474    vsub.s32        q3, q3, q2
    475    vshrn.s32       ROW6R, q1, #16
    476    vadd.s32        q1, q3, q5
    477    vsub.s32        q3, q3, q5
    478    vaddl.s16       q5, ROW4L, ROW4R              /* ROW4L <-> ROW0R */
    479    vshrn.s32       ROW6L, q1, #16                /* ROW6L <-> ROW2R */
    480    vshrn.s32       ROW5R, q3, #16
    481    vshl.s32        q5, q5, #13
    482    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
    483    vadd.s32        q2, q5, q6
    484    vsub.s32        q1, q5, q6
    485    vadd.s32        q6, q2, q7
    486    vsub.s32        q2, q2, q7
    487    vadd.s32        q5, q1, q4
    488    vsub.s32        q3, q1, q4
    489    vshrn.s32       ROW7R, q2, #16
    490    vshrn.s32       ROW7L, q5, #16                /* ROW7L <-> ROW3R */
    491    vshrn.s32       ROW4L, q6, #16                /* ROW4L <-> ROW0R */
    492    vshrn.s32       ROW4R, q3, #16
    493 
    494 2:  /* Descale to 8-bit and range limit */
    495    vqrshrn.s16     d16, q8, #2
    496    vqrshrn.s16     d17, q9, #2
    497    vqrshrn.s16     d18, q10, #2
    498    vqrshrn.s16     d19, q11, #2
    499    vpop            {d8 - d15}                    /* restore Neon registers */
    500    vqrshrn.s16     d20, q12, #2
    501      /* Transpose the final 8-bit samples and do signed->unsigned conversion */
    502      vtrn.16         q8, q9
    503    vqrshrn.s16     d21, q13, #2
    504    vqrshrn.s16     d22, q14, #2
    505      vmov.u8         q0, #(CENTERJSAMPLE)
    506    vqrshrn.s16     d23, q15, #2
    507      vtrn.8          d16, d17
    508      vtrn.8          d18, d19
    509      vadd.u8         q8, q8, q0
    510      vadd.u8         q9, q9, q0
    511      vtrn.16         q10, q11
    512        /* Store results to the output buffer */
    513        ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    514        add             TMP1, TMP1, OUTPUT_COL
    515        add             TMP2, TMP2, OUTPUT_COL
    516        vst1.8          {d16}, [TMP1]
    517      vtrn.8          d20, d21
    518        vst1.8          {d17}, [TMP2]
    519        ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    520        add             TMP1, TMP1, OUTPUT_COL
    521        add             TMP2, TMP2, OUTPUT_COL
    522        vst1.8          {d18}, [TMP1]
    523      vadd.u8         q10, q10, q0
    524        vst1.8          {d19}, [TMP2]
    525        ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    526        add             TMP1, TMP1, OUTPUT_COL
    527        add             TMP2, TMP2, OUTPUT_COL
    528        add             TMP3, TMP3, OUTPUT_COL
    529        add             TMP4, TMP4, OUTPUT_COL
    530      vtrn.8          d22, d23
    531        vst1.8          {d20}, [TMP1]
    532      vadd.u8         q11, q11, q0
    533        vst1.8          {d21}, [TMP2]
    534        vst1.8          {d22}, [TMP3]
    535        vst1.8          {d23}, [TMP4]
    536    bx              lr
    537 
    538 3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
    539 
    540    /* Transpose left 4x8 half */
    541    vtrn.16         ROW6L, ROW7L
    542    vtrn.16         ROW2L, ROW3L
    543    vtrn.16         ROW0L, ROW1L
    544    vtrn.16         ROW4L, ROW5L
    545    vshl.s16        ROW0R, ROW0R, #2  /* PASS1_BITS */
    546    vtrn.32         ROW1L, ROW3L
    547    vtrn.32         ROW4L, ROW6L
    548    vtrn.32         ROW0L, ROW2L
    549    vtrn.32         ROW5L, ROW7L
    550 
    551    cmp             r0, #0  /* r0 = OR of row 0's right-half coefficients */
    552    beq             4f  /* Right 4x8 half has all zeros, go to 'sparse' second
    553                           pass */
    554 
    555    /* Only row 0 is non-zero for the right 4x8 half  */
    556    vdup.s16        ROW1R, ROW0R[1]
    557    vdup.s16        ROW2R, ROW0R[2]
    558    vdup.s16        ROW3R, ROW0R[3]
    559    vdup.s16        ROW4R, ROW0R[0]
    560    vdup.s16        ROW5R, ROW0R[1]
    561    vdup.s16        ROW6R, ROW0R[2]
    562    vdup.s16        ROW7R, ROW0R[3]
    563    vdup.s16        ROW0R, ROW0R[0]
    564    b               1b  /* Go to 'normal' second pass */
    565 
    566 4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
    567    vld1.s16        {d2}, [ip, :64]               /* reload constants */
    568    vmull.s16       q6, ROW1L, XFIX_1_175875602
    569    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
    570    vmull.s16       q7, ROW3L, XFIX_1_175875602
    571    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
    572    vmull.s16       q2, ROW2L, XFIX_0_541196100
    573    vshll.s16       q3, ROW0L, #13
    574    vmov            q4, q6
    575    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    576    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
    577    vadd.s32        q1, q3, q2
    578    vmov            q5, q7
    579    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    580    vadd.s32        q1, q1, q6
    581    vadd.s32        q6, q6, q6
    582    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
    583    vshrn.s32       ROW1L, q1, #16
    584    vsub.s32        q1, q1, q6
    585    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    586    vsub.s32        q3, q3, q2
    587    vshrn.s32       ROW2R, q1, #16                /* ROW6L <-> ROW2R */
    588    vadd.s32        q1, q3, q5
    589    vsub.s32        q3, q3, q5
    590    vshll.s16       q5, ROW0L, #13
    591    vshrn.s32       ROW2L, q1, #16
    592    vshrn.s32       ROW1R, q3, #16                /* ROW5L <-> ROW1R */
    593    vadd.s32        q2, q5, q6
    594    vsub.s32        q1, q5, q6
    595    vadd.s32        q6, q2, q7
    596    vsub.s32        q2, q2, q7
    597    vadd.s32        q5, q1, q4
    598    vsub.s32        q3, q1, q4
    599    vshrn.s32       ROW3R, q2, #16                /* ROW7L <-> ROW3R */
    600    vshrn.s32       ROW3L, q5, #16
    601    vshrn.s32       ROW0L, q6, #16
    602    vshrn.s32       ROW0R, q3, #16                /* ROW4L <-> ROW0R */
    603    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
    604    vld1.s16        {d2}, [ip, :64]               /* reload constants */
    605    vmull.s16       q6, ROW5L, XFIX_1_175875602
    606    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
    607    vmull.s16       q7, ROW7L, XFIX_1_175875602
    608    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
    609    vmull.s16       q2, ROW6L, XFIX_0_541196100
    610    vshll.s16       q3, ROW4L, #13
    611    vmov            q4, q6
    612    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
    613    vmlsl.s16       q4, ROW5L, XFIX_0_899976223
    614    vadd.s32        q1, q3, q2
    615    vmov            q5, q7
    616    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
    617    vadd.s32        q1, q1, q6
    618    vadd.s32        q6, q6, q6
    619    vmlsl.s16       q5, ROW7L, XFIX_2_562915447
    620    vshrn.s32       ROW5L, q1, #16                /* ROW5L <-> ROW1R */
    621    vsub.s32        q1, q1, q6
    622    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
    623    vsub.s32        q3, q3, q2
    624    vshrn.s32       ROW6R, q1, #16
    625    vadd.s32        q1, q3, q5
    626    vsub.s32        q3, q3, q5
    627    vshll.s16       q5, ROW4L, #13
    628    vshrn.s32       ROW6L, q1, #16                /* ROW6L <-> ROW2R */
    629    vshrn.s32       ROW5R, q3, #16
    630    vadd.s32        q2, q5, q6
    631    vsub.s32        q1, q5, q6
    632    vadd.s32        q6, q2, q7
    633    vsub.s32        q2, q2, q7
    634    vadd.s32        q5, q1, q4
    635    vsub.s32        q3, q1, q4
    636    vshrn.s32       ROW7R, q2, #16
    637    vshrn.s32       ROW7L, q5, #16                /* ROW7L <-> ROW3R */
    638    vshrn.s32       ROW4L, q6, #16                /* ROW4L <-> ROW0R */
    639    vshrn.s32       ROW4R, q3, #16
    640    b               2b                            /* Go to epilogue */
    641 
    642    .unreq          DCT_TABLE
    643    .unreq          COEF_BLOCK
    644    .unreq          OUTPUT_BUF
    645    .unreq          OUTPUT_COL
    646    .unreq          TMP1
    647    .unreq          TMP2
    648    .unreq          TMP3
    649    .unreq          TMP4
    650 
    651    .unreq          ROW0L
    652    .unreq          ROW0R
    653    .unreq          ROW1L
    654    .unreq          ROW1R
    655    .unreq          ROW2L
    656    .unreq          ROW2R
    657    .unreq          ROW3L
    658    .unreq          ROW3R
    659    .unreq          ROW4L
    660    .unreq          ROW4R
    661    .unreq          ROW5L
    662    .unreq          ROW5R
    663    .unreq          ROW6L
    664    .unreq          ROW6R
    665    .unreq          ROW7L
    666    .unreq          ROW7R
    667 
    668 
    669 /*****************************************************************************/
    670 
    671 /*
    672 * jsimd_idct_ifast_neon
    673 *
    674 * This function contains a fast, not so accurate integer implementation of
    675 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
    676 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
    677 * function from jidctfst.c
    678 *
    679 * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
    680 * But in Arm Neon case some extra additions are required because VQDMULH
    681 * instruction can't handle the constants larger than 1. So the expressions
    682 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
    683 * which introduces an extra addition. Overall, there are 6 extra additions
    684 * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
    685 */
    686 
    687 #define XFIX_1_082392200  d0[0]  /* scalar multiplier lanes; d0 is loaded from the table below */
    688 #define XFIX_1_414213562  d0[1]
    689 #define XFIX_1_847759065  d0[2]
    690 #define XFIX_2_613125930  d0[3]
    691 
    692 .balign 16
    693 jsimd_idct_ifast_neon_consts:
    694  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 = frac(277/256) in Q15; integer part is added with an extra VADD (see header) */
    695  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 = frac(362/256) in Q15 */
    696  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 = frac(473/256) in Q15 */
    697  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 = frac(669/256) in Q15 (integer part is 2) */
    698 
    699 asm_function jsimd_idct_ifast_neon  /* (dct_table=r0, coef_block=r1, output_buf=r2, output_col=r3) */
    700 
    701    DCT_TABLE       .req r0
    702    COEF_BLOCK      .req r1
    703    OUTPUT_BUF      .req r2
    704    OUTPUT_COL      .req r3
    705    TMP1            .req r0   /* TMP1-TMP4 reuse argument registers once those are consumed */
    706    TMP2            .req r1
    707    TMP3            .req r2
    708    TMP4            .req ip
    709 
    710    /* Load and dequantize coefficients into Neon registers
    711     * with the following allocation:
    712     *       0 1 2 3 | 4 5 6 7
    713     *      ---------+--------
    714     *   0 | d16     | d17     ( q8  )
    715     *   1 | d18     | d19     ( q9  )
    716     *   2 | d20     | d21     ( q10 )
    717     *   3 | d22     | d23     ( q11 )
    718     *   4 | d24     | d25     ( q12 )
    719     *   5 | d26     | d27     ( q13 )
    720     *   6 | d28     | d29     ( q14 )
    721     *   7 | d30     | d31     ( q15 )
    722     */
    723    adr             ip, jsimd_idct_ifast_neon_consts  /* ip -> Q15 constant table */
    724    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    725    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    726    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    727    vmul.s16        q8, q8, q0       /* dequantize: coef *= quant table (interleaved with the loads) */
    728    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    729    vmul.s16        q9, q9, q1
    730    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    731    vmul.s16        q10, q10, q2
    732    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    733    vmul.s16        q11, q11, q3
    734    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    735    vmul.s16        q12, q12, q0
    736    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    737    vmul.s16        q14, q14, q2
    738    vmul.s16        q13, q13, q1
    739    vld1.16         {d0}, [ip, :64]  /* load constants */
    740    vmul.s16        q15, q15, q3
    741    vpush           {d8 - d13}       /* save Neon registers */
    742    /* 1-D IDCT, pass 1 */
    743    vsub.s16        q2, q10, q14
    744    vadd.s16        q14, q10, q14
    745    vsub.s16        q1, q11, q13
    746    vadd.s16        q13, q11, q13
    747    vsub.s16        q5, q9, q15
    748    vadd.s16        q15, q9, q15
    749    vqdmulh.s16     q4, q2, XFIX_1_414213562  /* VQDMULH = x * frac in Q15; the integer part is folded in via extra VADDs (see header) */
    750    vqdmulh.s16     q6, q1, XFIX_2_613125930
    751    vadd.s16        q3, q1, q1
    752    vsub.s16        q1, q5, q1
    753    vadd.s16        q10, q2, q4
    754    vqdmulh.s16     q4, q1, XFIX_1_847759065
    755    vsub.s16        q2, q15, q13
    756    vadd.s16        q3, q3, q6
    757    vqdmulh.s16     q6, q2, XFIX_1_414213562
    758    vadd.s16        q1, q1, q4
    759    vqdmulh.s16     q4, q5, XFIX_1_082392200
    760    vsub.s16        q10, q10, q14
    761    vadd.s16        q2, q2, q6
    762    vsub.s16        q6, q8, q12
    763    vadd.s16        q12, q8, q12
    764    vadd.s16        q9, q5, q4
    765    vadd.s16        q5, q6, q10
    766    vsub.s16        q10, q6, q10
    767    vadd.s16        q6, q15, q13
    768    vadd.s16        q8, q12, q14
    769    vsub.s16        q3, q6, q3
    770    vsub.s16        q12, q12, q14
    771    vsub.s16        q3, q3, q1
    772    vsub.s16        q1, q9, q1
    773    vadd.s16        q2, q3, q2
    774    vsub.s16        q15, q8, q6
    775    vadd.s16        q1, q1, q2
    776    vadd.s16        q8, q8, q6
    777    vadd.s16        q14, q5, q3
    778    vsub.s16        q9, q5, q3
    779    vsub.s16        q13, q10, q2
    780    vadd.s16        q10, q10, q2
    781      /* Transpose (extra indentation marks transpose/store ops interleaved with the arithmetic) */
    782      vtrn.16         q8, q9
    783    vsub.s16        q11, q12, q1
    784      vtrn.16         q14, q15
    785    vadd.s16        q12, q12, q1
    786      vtrn.16         q10, q11
    787      vtrn.16         q12, q13
    788      vtrn.32         q9, q11
    789      vtrn.32         q12, q14
    790      vtrn.32         q8, q10
    791      vtrn.32         q13, q15
    792      vswp            d28, d21
    793      vswp            d26, d19
    794    /* 1-D IDCT, pass 2 */
    795    vsub.s16        q2, q10, q14
    796      vswp            d30, d23
    797    vadd.s16        q14, q10, q14
    798      vswp            d24, d17
    799    vsub.s16        q1, q11, q13
    800    vadd.s16        q13, q11, q13
    801    vsub.s16        q5, q9, q15
    802    vadd.s16        q15, q9, q15
    803    vqdmulh.s16     q4, q2, XFIX_1_414213562
    804    vqdmulh.s16     q6, q1, XFIX_2_613125930
    805    vadd.s16        q3, q1, q1
    806    vsub.s16        q1, q5, q1
    807    vadd.s16        q10, q2, q4
    808    vqdmulh.s16     q4, q1, XFIX_1_847759065
    809    vsub.s16        q2, q15, q13
    810    vadd.s16        q3, q3, q6
    811    vqdmulh.s16     q6, q2, XFIX_1_414213562
    812    vadd.s16        q1, q1, q4
    813    vqdmulh.s16     q4, q5, XFIX_1_082392200
    814    vsub.s16        q10, q10, q14
    815    vadd.s16        q2, q2, q6
    816    vsub.s16        q6, q8, q12
    817    vadd.s16        q12, q8, q12
    818    vadd.s16        q9, q5, q4
    819    vadd.s16        q5, q6, q10
    820    vsub.s16        q10, q6, q10
    821    vadd.s16        q6, q15, q13
    822    vadd.s16        q8, q12, q14
    823    vsub.s16        q3, q6, q3
    824    vsub.s16        q12, q12, q14
    825    vsub.s16        q3, q3, q1
    826    vsub.s16        q1, q9, q1
    827    vadd.s16        q2, q3, q2
    828    vsub.s16        q15, q8, q6
    829    vadd.s16        q1, q1, q2
    830    vadd.s16        q8, q8, q6
    831    vadd.s16        q14, q5, q3
    832    vsub.s16        q9, q5, q3
    833    vsub.s16        q13, q10, q2
    834    vpop            {d8 - d13}    /* restore Neon registers */
    835    vadd.s16        q10, q10, q2
    836    vsub.s16        q11, q12, q1
    837    vadd.s16        q12, q12, q1
    838    /* Descale to 8-bit and range limit */
    839    vmov.u8         q0, #0x80     /* +0x80 after narrowing recenters samples into the unsigned range */
    840    vqshrn.s16      d16, q8, #5
    841    vqshrn.s16      d17, q9, #5
    842    vqshrn.s16      d18, q10, #5
    843    vqshrn.s16      d19, q11, #5
    844    vqshrn.s16      d20, q12, #5
    845    vqshrn.s16      d21, q13, #5
    846    vqshrn.s16      d22, q14, #5
    847    vqshrn.s16      d23, q15, #5
    848    vadd.u8         q8, q8, q0
    849    vadd.u8         q9, q9, q0
    850    vadd.u8         q10, q10, q0
    851    vadd.u8         q11, q11, q0
    852    /* Transpose the final 8-bit samples */
    853    vtrn.16         q8, q9
    854    vtrn.16         q10, q11
    855    vtrn.32         q8, q10
    856    vtrn.32         q9, q11
    857    vtrn.8          d16, d17
    858    vtrn.8          d18, d19
    859      /* Store results to the output buffer */
    860      ldmia           OUTPUT_BUF!, {TMP1, TMP2}   /* fetch two output row pointers at a time */
    861      add             TMP1, TMP1, OUTPUT_COL
    862      add             TMP2, TMP2, OUTPUT_COL
    863      vst1.8          {d16}, [TMP1]
    864      vst1.8          {d17}, [TMP2]
    865      ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    866      add             TMP1, TMP1, OUTPUT_COL
    867      add             TMP2, TMP2, OUTPUT_COL
    868      vst1.8          {d18}, [TMP1]
    869    vtrn.8          d20, d21
    870      vst1.8          {d19}, [TMP2]
    871      ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    872      add             TMP1, TMP1, OUTPUT_COL
    873      add             TMP2, TMP2, OUTPUT_COL
    874      add             TMP3, TMP3, OUTPUT_COL
    875      add             TMP4, TMP4, OUTPUT_COL
    876      vst1.8          {d20}, [TMP1]
    877    vtrn.8          d22, d23
    878      vst1.8          {d21}, [TMP2]
    879      vst1.8          {d22}, [TMP3]
    880      vst1.8          {d23}, [TMP4]
    881    bx              lr
    882 
    883    .unreq          DCT_TABLE
    884    .unreq          COEF_BLOCK
    885    .unreq          OUTPUT_BUF
    886    .unreq          OUTPUT_COL
    887    .unreq          TMP1
    888    .unreq          TMP2
    889    .unreq          TMP3
    890    .unreq          TMP4
    891 
    892 
    893 /*****************************************************************************/
    894 
    895 /*
    896 * jsimd_extrgb_ycc_convert_neon
    897 * jsimd_extbgr_ycc_convert_neon
    898 * jsimd_extrgbx_ycc_convert_neon
    899 * jsimd_extbgrx_ycc_convert_neon
    900 * jsimd_extxbgr_ycc_convert_neon
    901 * jsimd_extxrgb_ycc_convert_neon
    902 *
    903 * Colorspace conversion RGB -> YCbCr
    904 */
    905 
    906 .macro do_store size  /* store \size pixels of y/u/v from d20/d21/d22 to [Y]/[U]/[V] with post-increment */
    907  .if \size == 8
    908    vst1.8          {d20}, [Y]!
    909    vst1.8          {d21}, [U]!
    910    vst1.8          {d22}, [V]!
    911  .elseif \size == 4
    912    vst1.8          {d20[0]}, [Y]!   /* partial sizes use fixed lanes: 4 -> 0..3, 2 -> 4..5, 1 -> 6, so 4/2/1 chain for any remainder up to 7 */
    913    vst1.8          {d20[1]}, [Y]!
    914    vst1.8          {d20[2]}, [Y]!
    915    vst1.8          {d20[3]}, [Y]!
    916    vst1.8          {d21[0]}, [U]!
    917    vst1.8          {d21[1]}, [U]!
    918    vst1.8          {d21[2]}, [U]!
    919    vst1.8          {d21[3]}, [U]!
    920    vst1.8          {d22[0]}, [V]!
    921    vst1.8          {d22[1]}, [V]!
    922    vst1.8          {d22[2]}, [V]!
    923    vst1.8          {d22[3]}, [V]!
    924  .elseif \size == 2
    925    vst1.8          {d20[4]}, [Y]!
    926    vst1.8          {d20[5]}, [Y]!
    927    vst1.8          {d21[4]}, [U]!
    928    vst1.8          {d21[5]}, [U]!
    929    vst1.8          {d22[4]}, [V]!
    930    vst1.8          {d22[5]}, [V]!
    931  .elseif \size == 1
    932    vst1.8          {d20[6]}, [Y]!
    933    vst1.8          {d21[6]}, [U]!
    934    vst1.8          {d22[6]}, [V]!
    935  .else
    936    .error unsupported macroblock size
    937  .endif
    938 .endm
    939 
    940 .macro do_load bpp, size  /* load \size RGB(X) pixels: 24 bpp -> vld3 deinterleave into d10..d12, 32 bpp -> vld4 into d10..d13 */
    941  .if \bpp == 24
    942    .if \size == 8
    943      vld3.8        {d10, d11, d12}, [RGB]!
    944      pld           [RGB, #128]    /* prefetch upcoming pixel data */
    945    .elseif \size == 4
    946      vld3.8        {d10[0], d11[0], d12[0]}, [RGB]!  /* partial sizes fill the same lanes that do_store reads (4 -> 0..3, 2 -> 4..5, 1 -> 6) */
    947      vld3.8        {d10[1], d11[1], d12[1]}, [RGB]!
    948      vld3.8        {d10[2], d11[2], d12[2]}, [RGB]!
    949      vld3.8        {d10[3], d11[3], d12[3]}, [RGB]!
    950    .elseif \size == 2
    951      vld3.8        {d10[4], d11[4], d12[4]}, [RGB]!
    952      vld3.8        {d10[5], d11[5], d12[5]}, [RGB]!
    953    .elseif \size == 1
    954      vld3.8        {d10[6], d11[6], d12[6]}, [RGB]!
    955    .else
    956      .error unsupported macroblock size
    957    .endif
    958  .elseif \bpp == 32
    959    .if \size == 8
    960      vld4.8        {d10, d11, d12, d13}, [RGB]!
    961      pld           [RGB, #128]    /* prefetch upcoming pixel data */
    962    .elseif \size == 4
    963      vld4.8        {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
    964      vld4.8        {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
    965      vld4.8        {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
    966      vld4.8        {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
    967    .elseif \size == 2
    968      vld4.8        {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
    969      vld4.8        {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
    970    .elseif \size == 1
    971      vld4.8        {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
    972    .else
    973      .error unsupported macroblock size
    974    .endif
    975  .else
    976    .error unsupported bpp
    977  .endif
    978 .endm
    979 
    980 .macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs  /* r/g/b_offs select which of d10..d13 holds each channel after do_load */
    981 
    982 /*
    983 * 2-stage pipelined RGB->YCbCr conversion
    984 */
    985 
    986 .macro do_rgb_to_yuv_stage1  /* stage 1: widen 8 pixels to 16-bit and accumulate Y/Cb/Cr products (Q16) in q7/q8, q9/q13, q14/q15 */
    987    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
    988    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
    989    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
    990    vmull.u16       q7, d4, d0[0]  /* q7/q8 = Y accumulators (low/high pixel halves) */
    991    vmlal.u16       q7, d6, d0[1]
    992    vmlal.u16       q7, d8, d0[2]
    993    vmull.u16       q8, d5, d0[0]
    994    vmlal.u16       q8, d7, d0[1]
    995    vmlal.u16       q8, d9, d0[2]
    996    vrev64.32       q9, q1         /* seed Cb accumulators with the bias/rounding constant held in q1 */
    997    vrev64.32       q13, q1
    998    vmlsl.u16       q9, d4, d0[3]
    999    vmlsl.u16       q9, d6, d1[0]
   1000    vmlal.u16       q9, d8, d1[1]
   1001    vmlsl.u16       q13, d5, d0[3]
   1002    vmlsl.u16       q13, d7, d1[0]
   1003    vmlal.u16       q13, d9, d1[1]
   1004    vrev64.32       q14, q1        /* seed Cr accumulators likewise */
   1005    vrev64.32       q15, q1
   1006    vmlal.u16       q14, d4, d1[1]
   1007    vmlsl.u16       q14, d6, d1[2]
   1008    vmlsl.u16       q14, d8, d1[3]
   1009    vmlal.u16       q15, d5, d1[1]
   1010    vmlsl.u16       q15, d7, d1[2]
   1011    vmlsl.u16       q15, d9, d1[3]
   1012 .endm
   1013 
   1014 .macro do_rgb_to_yuv_stage2  /* stage 2: descale the Q16 accumulators (>> 16) and narrow to 8-bit y/u/v in d20..d22 */
   1015    vrshrn.u32      d20, q7, #16
   1016    vrshrn.u32      d21, q8, #16
   1017    vshrn.u32       d22, q9, #16
   1018    vshrn.u32       d23, q13, #16
   1019    vshrn.u32       d24, q14, #16
   1020    vshrn.u32       d25, q15, #16
   1021    vmovn.u16       d20, q10       /* d20 = y */
   1022    vmovn.u16       d21, q11       /* d21 = u */
   1023    vmovn.u16       d22, q12       /* d22 = v */
   1024 .endm
   1025 
   1026 .macro do_rgb_to_yuv  /* unpipelined conversion of one (possibly partial) pixel group */
   1027    do_rgb_to_yuv_stage1
   1028    do_rgb_to_yuv_stage2
   1029 .endm
   1030 
   1031 .macro do_rgb_to_yuv_stage2_store_load_stage1  /* software pipeline: stage2+store of the previous 8 pixels (extra indent) interleaved with load+stage1 of the next 8 */
   1032      vrshrn.u32      d20, q7, #16
   1033      vrshrn.u32      d21, q8, #16
   1034      vshrn.u32       d22, q9, #16
   1035    vrev64.32       q9, q1
   1036      vshrn.u32       d23, q13, #16
   1037    vrev64.32       q13, q1
   1038      vshrn.u32       d24, q14, #16
   1039      vshrn.u32       d25, q15, #16
   1040    do_load         \bpp, 8
   1041      vmovn.u16       d20, q10     /* d20 = y */
   1042    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
   1043      vmovn.u16       d21, q11     /* d21 = u */
   1044    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
   1045      vmovn.u16       d22, q12     /* d22 = v */
   1046    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
   1047    vmull.u16       q7, d4, d0[0]
   1048    vmlal.u16       q7, d6, d0[1]
   1049    vmlal.u16       q7, d8, d0[2]
   1050      vst1.8          {d20}, [Y]!
   1051    vmull.u16       q8, d5, d0[0]
   1052    vmlal.u16       q8, d7, d0[1]
   1053    vmlal.u16       q8, d9, d0[2]
   1054    vmlsl.u16       q9, d4, d0[3]
   1055    vmlsl.u16       q9, d6, d1[0]
   1056    vmlal.u16       q9, d8, d1[1]
   1057      vst1.8          {d21}, [U]!
   1058    vmlsl.u16       q13, d5, d0[3]
   1059    vmlsl.u16       q13, d7, d1[0]
   1060    vmlal.u16       q13, d9, d1[1]
   1061    vrev64.32       q14, q1
   1062    vrev64.32       q15, q1
   1063    vmlal.u16       q14, d4, d1[1]
   1064    vmlsl.u16       q14, d6, d1[2]
   1065    vmlsl.u16       q14, d8, d1[3]
   1066      vst1.8          {d22}, [V]!
   1067    vmlal.u16       q15, d5, d1[1]
   1068    vmlsl.u16       q15, d7, d1[2]
   1069    vmlsl.u16       q15, d9, d1[3]
   1070 .endm
   1071 
   1072 .balign 16
   1073 jsimd_\colorid\()_ycc_neon_consts:
   1074  .short 19595, 38470, 7471,  11059   /* 0.29900, 0.58700, 0.11400, 0.16874 in Q16 */
   1075  .short 21709, 32768, 27439, 5329    /* 0.33123, 0.50000, 0.41869, 0.08131 in Q16 */
   1076  .short 32767, 128,   32767, 128     /* per-lane accumulator init: (128 << 16) + 32767, i.e. chroma bias plus rounding */
   1077  .short 32767, 128,   32767, 128
   1078 
   1079 asm_function jsimd_\colorid\()_ycc_convert_neon  /* (output_width=r0, input_buf=r1, output_buf=r2, output_row=r3, num_rows on stack) */
   1080    OUTPUT_WIDTH    .req r0
   1081    INPUT_BUF       .req r1
   1082    OUTPUT_BUF      .req r2
   1083    OUTPUT_ROW      .req r3
   1084    NUM_ROWS        .req r4
   1085 
   1086    OUTPUT_BUF0     .req r5
   1087    OUTPUT_BUF1     .req r6
   1088    OUTPUT_BUF2     .req OUTPUT_BUF
   1089 
   1090    RGB             .req r7
   1091    Y               .req r8
   1092    U               .req r9
   1093    V               .req r10
   1094    N               .req ip
   1095 
   1096    /* Load constants to d0, d1, d2, d3 */
   1097    adr             ip, jsimd_\colorid\()_ycc_neon_consts
   1098    vld1.16         {d0, d1, d2, d3}, [ip, :128]
   1099 
   1100    /* Save Arm registers and handle input arguments */
   1101    push            {r4, r5, r6, r7, r8, r9, r10, lr}
   1102    ldr             NUM_ROWS, [sp, #(4 * 8)]    /* fifth argument sits just above the 8 saved registers */
   1103    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
   1104    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #4]
   1105    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #8]
   1106    .unreq          OUTPUT_BUF
   1107 
   1108    /* Save Neon registers */
   1109    vpush           {d8 - d15}
   1110 
   1111    /* Outer loop over scanlines */
   1112    cmp             NUM_ROWS, #1
   1113    blt             9f
   1114 0:  /* per-scanline loop: fetch Y/U/V row pointers and the RGB input row */
   1115    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
   1116    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
   1117    mov             N, OUTPUT_WIDTH
   1118    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
   1119    add             OUTPUT_ROW, OUTPUT_ROW, #1
   1120    ldr             RGB, [INPUT_BUF], #4
   1121 
   1122    /* Inner loop over pixels */
   1123    subs            N, N, #8
   1124    blt             3f                  /* fewer than 8 pixels in this row: go straight to remainder handling */
   1125    do_load         \bpp, 8
   1126    do_rgb_to_yuv_stage1
   1127    subs            N, N, #8
   1128    blt             2f
   1129 1:  /* pipelined steady state: one 8-pixel group per iteration */
   1130    do_rgb_to_yuv_stage2_store_load_stage1
   1131    subs            N, N, #8
   1132    bge             1b
   1133 2:  /* drain the pipeline for the last full group */
   1134    do_rgb_to_yuv_stage2
   1135    do_store        8
   1136    tst             N, #7
   1137    beq             8f
   1138 3:  /* remainder (1..7 pixels): load 4/2/1 into fixed lanes, then convert once */
   1139    tst             N, #4
   1140    beq             3f
   1141    do_load         \bpp, 4
   1142 3:
   1143    tst             N, #2
   1144    beq             4f
   1145    do_load         \bpp, 2
   1146 4:
   1147    tst             N, #1
   1148    beq             5f
   1149    do_load         \bpp, 1
   1150 5:
   1151    do_rgb_to_yuv
   1152    tst             N, #4
   1153    beq             6f
   1154    do_store        4
   1155 6:
   1156    tst             N, #2
   1157    beq             7f
   1158    do_store        2
   1159 7:
   1160    tst             N, #1
   1161    beq             8f
   1162    do_store        1
   1163 8:  /* advance to the next row */
   1164    subs            NUM_ROWS, NUM_ROWS, #1
   1165    bgt             0b
   1166 9:
   1167    /* Restore all registers and return */
   1168    vpop            {d8 - d15}
   1169    pop             {r4, r5, r6, r7, r8, r9, r10, pc}
   1170 
   1171    .unreq          OUTPUT_WIDTH
   1172    .unreq          OUTPUT_ROW
   1173    .unreq          INPUT_BUF
   1174    .unreq          NUM_ROWS
   1175    .unreq          OUTPUT_BUF0
   1176    .unreq          OUTPUT_BUF1
   1177    .unreq          OUTPUT_BUF2
   1178    .unreq          RGB
   1179    .unreq          Y
   1180    .unreq          U
   1181    .unreq          V
   1182    .unreq          N
   1183 
   1184 .purgem do_rgb_to_yuv
   1185 .purgem do_rgb_to_yuv_stage1
   1186 .purgem do_rgb_to_yuv_stage2
   1187 .purgem do_rgb_to_yuv_stage2_store_load_stage1
   1188 
   1189 .endm
   1190 
   1191 /*--------------------------------- id ----- bpp R  G  B   (one converter per pixel layout) */
   1192 generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2
   1193 generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0
   1194 generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
   1195 generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
   1196 generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
   1197 generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
   1198 
   1199 .purgem do_load   /* helper macros are no longer needed once all variants are generated */
   1200 .purgem do_store