tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jsimd_neon.S (98482B)


      1 /*
      2 * Armv8 Neon optimizations for libjpeg-turbo
      3 *
      4 * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
      5 *                          All Rights Reserved.
      6 * Author:  Siarhei Siamashka <siarhei.siamashka@nokia.com>
      7 * Copyright (C) 2013-2014, Linaro Limited.  All Rights Reserved.
      8 * Author:  Ragesh Radhakrishnan <ragesh.r@linaro.org>
      9 * Copyright (C) 2014-2016, 2020, D. R. Commander.  All Rights Reserved.
     10 * Copyright (C) 2015-2016, 2018, Matthieu Darbois.  All Rights Reserved.
     11 * Copyright (C) 2016, Siarhei Siamashka.  All Rights Reserved.
     12 *
     13 * This software is provided 'as-is', without any express or implied
     14 * warranty.  In no event will the authors be held liable for any damages
     15 * arising from the use of this software.
     16 *
     17 * Permission is granted to anyone to use this software for any purpose,
     18 * including commercial applications, and to alter it and redistribute it
     19 * freely, subject to the following restrictions:
     20 *
     21 * 1. The origin of this software must not be misrepresented; you must not
     22 *    claim that you wrote the original software. If you use this software
     23 *    in a product, an acknowledgment in the product documentation would be
     24 *    appreciated but is not required.
     25 * 2. Altered source versions must be plainly marked as such, and must not be
     26 *    misrepresented as being the original software.
     27 * 3. This notice may not be removed or altered from any source distribution.
     28 */
     29 
     30 #if defined(__linux__) && defined(__ELF__)
     31 .section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
     32 #endif
     33 
     34 #if defined(__APPLE__)
     35 .section __DATA, __const
     36 #elif defined(_WIN32)
     37 .section .rdata
     38 #else
     39 .section .rodata, "a", %progbits
     40 #endif
     41 
     42 /* Constants for jsimd_idct_islow_neon() */
     43 
        /* Scaled fixed-point multipliers for the slow-but-accurate IDCT.
         * FIX(x) denotes round(x * 2^13) (CONST_BITS == 13; see the
         * definition further down in this file).
         */
     44 #define F_0_298   2446  /* FIX(0.298631336) */
     45 #define F_0_390   3196  /* FIX(0.390180644) */
     46 #define F_0_541   4433  /* FIX(0.541196100) */
     47 #define F_0_765   6270  /* FIX(0.765366865) */
     48 #define F_0_899   7373  /* FIX(0.899976223) */
     49 #define F_1_175   9633  /* FIX(1.175875602) */
     50 #define F_1_501  12299  /* FIX(1.501321110) */
     51 #define F_1_847  15137  /* FIX(1.847759065) */
     52 #define F_1_961  16069  /* FIX(1.961570560) */
     53 #define F_2_053  16819  /* FIX(2.053119869) */
     54 #define F_2_562  20995  /* FIX(2.562915447) */
     55 #define F_3_072  25172  /* FIX(3.072711026) */
     56 
        /* 16 .short entries = 32 bytes, loaded as {v0.8h, v1.8h} by
         * jsimd_idct_islow_neon and addressed through the XFIX_* lane
         * macros.  Constants that are applied with a negative sign are
         * stored pre-negated (the XFIX_N_* lanes) so the code can use plain
         * smull/smlal instead of a subtracting multiply.  Trailing zeros pad
         * the table to a whole 16-byte-aligned register pair.
         */
     57 .balign 16
     58 Ljsimd_idct_islow_neon_consts:
     59  .short F_0_298
     60  .short -F_0_390
     61  .short F_0_541
     62  .short F_0_765
     63  .short - F_0_899
     64  .short F_1_175
     65  .short F_1_501
     66  .short - F_1_847
     67  .short - F_1_961
     68  .short F_2_053
     69  .short - F_2_562
     70  .short F_3_072
     71  .short 0          /* padding */
     72  .short 0
     73  .short 0
     74  .short 0
     75 
        /* The F_* names are scoped to this table only; release them so the
         * identical names can be redefined for jsimd_fdct_islow_neon below.
         */
     76 #undef F_0_298
     77 #undef F_0_390
     78 #undef F_0_541
     79 #undef F_0_765
     80 #undef F_0_899
     81 #undef F_1_175
     82 #undef F_1_501
     83 #undef F_1_847
     84 #undef F_1_961
     85 #undef F_2_053
     86 #undef F_2_562
     87 #undef F_3_072
     88 
     89 /* Constants for jsimd_ycc_*_neon() */
     90 
        /* 32-byte table for the YCbCr -> RGB (decompression) color
         * conversion routines.
         * NOTE(review): the second row appears to hold the four fixed-point
         * YCbCr->RGB conversion coefficients and the last two rows the -128
         * centering offsets for the chroma channels -- confirm against the
         * jsimd_ycc_*_neon users of this table (not visible in this chunk).
         */
     91 .balign 16
     92 Ljsimd_ycc_rgb_neon_consts:
     93  .short 0,      0,     0,      0
     94  .short 22971, -11277, -23401, 29033
     95  .short -128,  -128,   -128,   -128
     96  .short -128,  -128,   -128,   -128
     97 
     98 /* Constants for jsimd_*_ycc_neon() */
     99 
        /* 32-byte table for the RGB -> YCbCr (compression) color conversion
         * routines.
         * NOTE(review): the first two rows look like fixed-point RGB->YCbCr
         * coefficients (e.g. 19595 ~= 0.29900 * 2^16) and the last two rows
         * like rounding/centering constants -- confirm against the
         * jsimd_*_ycc_neon users of this table (not visible in this chunk).
         */
    100 .balign 16
    101 Ljsimd_rgb_ycc_neon_consts:
    102  .short 19595, 38470, 7471, 11059
    103  .short 21709, 32768, 27439, 5329
    104  .short 32767, 128, 32767, 128
    105  .short 32767, 128, 32767, 128
    106 
    107 /* Constants for jsimd_fdct_islow_neon() */
    108 
        /* Same FIX(x) = round(x * 2^13) multipliers as the islow IDCT table
         * earlier in this file; redefined here because the names are
         * #undef'd after each table is emitted.
         */
    109 #define F_0_298   2446  /* FIX(0.298631336) */
    110 #define F_0_390   3196  /* FIX(0.390180644) */
    111 #define F_0_541   4433  /* FIX(0.541196100) */
    112 #define F_0_765   6270  /* FIX(0.765366865) */
    113 #define F_0_899   7373  /* FIX(0.899976223) */
    114 #define F_1_175   9633  /* FIX(1.175875602) */
    115 #define F_1_501  12299  /* FIX(1.501321110) */
    116 #define F_1_847  15137  /* FIX(1.847759065) */
    117 #define F_1_961  16069  /* FIX(1.961570560) */
    118 #define F_2_053  16819  /* FIX(2.053119869) */
    119 #define F_2_562  20995  /* FIX(2.562915447) */
    120 #define F_3_072  25172  /* FIX(3.072711026) */
    121 
        /* 16 .short entries = 32 bytes; constants that are applied with a
         * negative sign are stored pre-negated so plain smull/smlal can be
         * used.  Trailing zeros pad the table to a whole 16-byte-aligned
         * register pair.
         */
    122 .balign 16
    123 Ljsimd_fdct_islow_neon_consts:
    124  .short F_0_298
    125  .short -F_0_390
    126  .short F_0_541
    127  .short F_0_765
    128  .short - F_0_899
    129  .short F_1_175
    130  .short F_1_501
    131  .short - F_1_847
    132  .short - F_1_961
    133  .short F_2_053
    134  .short - F_2_562
    135  .short F_3_072
    136  .short 0          /* padding */
    137  .short 0
    138  .short 0
    139  .short 0
    140 
        /* Release the helper names again so later code cannot pick them up
         * by accident. */
    141 #undef F_0_298
    142 #undef F_0_390
    143 #undef F_0_541
    144 #undef F_0_765
    145 #undef F_0_899
    146 #undef F_1_175
    147 #undef F_1_501
    148 #undef F_1_847
    149 #undef F_1_961
    150 #undef F_2_053
    151 #undef F_2_562
    152 #undef F_3_072
    153 
    154 /* Constants for jsimd_huff_encode_one_block_neon() */
    155 
    156 .balign 16
    157 Ljsimd_huff_encode_one_block_neon_consts:
        /* One bit mask per output bit position, repeated for both 8-byte
         * halves of a 16-byte vector. */
    158    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
    159          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
        /* NOTE(review): the rows below look like tbl/tbx shuffle-index
         * tables that gather byte pairs of 16-bit DCT coefficients into
         * zigzag order; index 255 is out of range, so tbl produces zero for
         * that byte (and tbx leaves the destination byte unchanged).  The
         * "Lm => Ln" comments record which source register rows each lookup
         * spans -- confirm against jsimd_huff_encode_one_block_neon (not
         * visible in this chunk).
         */
    160    .byte    0,   1,   2,   3,  16,  17,  32,  33, \
    161            18,  19,   4,   5,   6,   7,  20,  21  /* L0 => L3 : 4 lines OK */
    162    .byte   34,  35,  48,  49, 255, 255,  50,  51, \
    163            36,  37,  22,  23,   8,   9,  10,  11  /* L0 => L3 : 4 lines OK */
    164    .byte    8,   9,  22,  23,  36,  37,  50,  51, \
    165           255, 255, 255, 255, 255, 255,  52,  53  /* L1 => L4 : 4 lines OK */
    166    .byte   54,  55,  40,  41,  26,  27,  12,  13, \
    167            14,  15,  28,  29,  42,  43,  56,  57  /* L0 => L3 : 4 lines OK */
    168    .byte    6,   7,  20,  21,  34,  35,  48,  49, \
    169            50,  51,  36,  37,  22,  23,   8,   9  /* L4 => L7 : 4 lines OK */
    170    .byte   42,  43,  28,  29,  14,  15,  30,  31, \
    171            44,  45,  58,  59, 255, 255, 255, 255  /* L1 => L4 : 4 lines OK */
    172    .byte  255, 255, 255, 255,  56,  57,  42,  43, \
    173            28,  29,  14,  15,  30,  31,  44,  45  /* L3 => L6 : 4 lines OK */
    174    .byte   26,  27,  40,  41,  42,  43,  28,  29, \
    175            14,  15,  30,  31,  44,  45,  46,  47  /* L5 => L7 : 3 lines OK */
    176    .byte  255, 255, 255, 255,   0,   1, 255, 255, \
    177           255, 255, 255, 255, 255, 255, 255, 255  /* L4 : 1 line OK */
    178    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
    179             0,   1,  16,  17,   2,   3, 255, 255  /* L5 => L6 : 2 lines OK */
    180    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
    181           255, 255, 255, 255,   8,   9,  22,  23  /* L5 => L6 : 2 lines OK */
    182    .byte    4,   5,   6,   7, 255, 255, 255, 255, \
    183           255, 255, 255, 255, 255, 255, 255, 255  /* L7 : 1 line OK */
    184 
    185 .text
    186 
    187 
    188 /*****************************************************************************/
    189 
    190 /* Supplementary macro for setting function attributes */
        /* Declare and define the entry point for an exported asm function.
         * On Mach-O (Apple) the C-visible symbol carries a leading
         * underscore and is marked .private_extern so it is not re-exported
         * from the final image.  On ELF the symbol is marked .hidden for
         * the same reason and given %function type; other targets simply
         * get a .global label.
         */
    191 .macro asm_function fname
    192 #ifdef __APPLE__
    193    .private_extern _\fname
    194    .globl _\fname
    195 _\fname:
    196 #else
    197    .global \fname
    198 #ifdef __ELF__
    199    .hidden \fname
    200    .type \fname, %function
    201 #endif
    202 \fname:
    203 #endif
    204 .endm
    205 
    206 /* Get symbol location */
        /* Load the address of \symbol into \reg with a PC-relative
         * adrp + add pair (PIC-friendly, +/-4 GB reach).  Mach-O spells the
         * page/page-offset relocations @PAGE/@PAGEOFF; ELF uses the bare
         * symbol for adrp and :lo12: for the low 12 bits of the offset.
         */
    207 .macro get_symbol_loc reg, symbol
    208 #ifdef __APPLE__
    209    adrp            \reg, \symbol@PAGE
    210    add             \reg, \reg, \symbol@PAGEOFF
    211 #else
    212    adrp            \reg, \symbol
    213    add             \reg, \reg, :lo12:\symbol
    214 #endif
    215 .endm
    216 
    217 .macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
        /* Transpose an 8x8 matrix of 16-bit elements held in \l0-\l7
         * (rows in, columns out, landing back in \l0-\l7).  \t0-\t3 are
         * clobbered as scratch.  The transpose proceeds in three stages of
         * trn1/trn2 interleaves at progressively wider element sizes:
         * 16-bit (.8h), then 32-bit (.4s), then 64-bit (.2d).
         */
        /* Stage 1: interleave adjacent row pairs at 16-bit granularity. */
    218    trn1            \t0\().8h, \l0\().8h, \l1\().8h
    219    trn1            \t1\().8h, \l2\().8h, \l3\().8h
    220    trn1            \t2\().8h, \l4\().8h, \l5\().8h
    221    trn1            \t3\().8h, \l6\().8h, \l7\().8h
    222    trn2            \l1\().8h, \l0\().8h, \l1\().8h
    223    trn2            \l3\().8h, \l2\().8h, \l3\().8h
    224    trn2            \l5\().8h, \l4\().8h, \l5\().8h
    225    trn2            \l7\().8h, \l6\().8h, \l7\().8h
    226 
        /* Stage 2: interleave the stage-1 results at 32-bit granularity. */
    227    trn1            \l4\().4s, \t2\().4s, \t3\().4s
    228    trn2            \t3\().4s, \t2\().4s, \t3\().4s
    229    trn1            \t2\().4s, \t0\().4s, \t1\().4s
    230    trn2            \l2\().4s, \t0\().4s, \t1\().4s
    231    trn1            \t0\().4s, \l1\().4s, \l3\().4s
    232    trn2            \l3\().4s, \l1\().4s, \l3\().4s
    233    trn2            \t1\().4s, \l5\().4s, \l7\().4s
    234    trn1            \l5\().4s, \l5\().4s, \l7\().4s
    235 
        /* Stage 3: interleave 64-bit halves, writing each finished column
         * into the register that held the corresponding input row. */
    236    trn2            \l6\().2d, \l2\().2d, \t3\().2d
    237    trn1            \l0\().2d, \t2\().2d, \l4\().2d
    238    trn1            \l1\().2d, \t0\().2d, \l5\().2d
    239    trn2            \l7\().2d, \l3\().2d, \t1\().2d
    240    trn1            \l2\().2d, \l2\().2d, \t3\().2d
    241    trn2            \l4\().2d, \t2\().2d, \l4\().2d
    242    trn1            \l3\().2d, \l3\().2d, \t1\().2d
    243    trn2            \l5\().2d, \t0\().2d, \l5\().2d
    244 .endm
    245 
    246 
    247 #define CENTERJSAMPLE  128
    248 
    249 /*****************************************************************************/
    250 
    251 /*
    252 * Perform dequantization and inverse DCT on one block of coefficients.
    253 *
    254 * GLOBAL(void)
    255 * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
    256 *                       JSAMPARRAY output_buf, JDIMENSION output_col)
    257 */
    258 
    259 #define CONST_BITS  13
    260 #define PASS1_BITS  2
    261 
    262 #define XFIX_P_0_298  v0.h[0]
    263 #define XFIX_N_0_390  v0.h[1]
    264 #define XFIX_P_0_541  v0.h[2]
    265 #define XFIX_P_0_765  v0.h[3]
    266 #define XFIX_N_0_899  v0.h[4]
    267 #define XFIX_P_1_175  v0.h[5]
    268 #define XFIX_P_1_501  v0.h[6]
    269 #define XFIX_N_1_847  v0.h[7]
    270 #define XFIX_N_1_961  v1.h[0]
    271 #define XFIX_P_2_053  v1.h[1]
    272 #define XFIX_N_2_562  v1.h[2]
    273 #define XFIX_P_3_072  v1.h[3]
    274 
    275 asm_function jsimd_idct_islow_neon
    276    DCT_TABLE       .req x0
    277    COEF_BLOCK      .req x1
    278    OUTPUT_BUF      .req x2
    279    OUTPUT_COL      .req x3
    280    TMP1            .req x0
    281    TMP2            .req x1
    282    TMP3            .req x9
    283    TMP4            .req x10
    284    TMP5            .req x11
    285    TMP6            .req x12
    286    TMP7            .req x13
    287    TMP8            .req x14
    288 
    289    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
    290       guarantee that the upper (unused) 32 bits of x3 are valid.  This
    291       instruction ensures that those bits are set to zero. */
    292    uxtw x3, w3
    293 
    294    sub             sp, sp, #64
    295    get_symbol_loc  x15, Ljsimd_idct_islow_neon_consts
    296    mov             x10, sp
    297    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
    298    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
    299    ld1             {v0.8h, v1.8h}, [x15]
    300    ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
    301    ld1             {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
    302    ld1             {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64
    303    ld1             {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64
    304 
    305    cmeq            v16.8h, v3.8h, #0
    306    cmeq            v26.8h, v4.8h, #0
    307    cmeq            v27.8h, v5.8h, #0
    308    cmeq            v28.8h, v6.8h, #0
    309    cmeq            v29.8h, v7.8h, #0
    310    cmeq            v30.8h, v8.8h, #0
    311    cmeq            v31.8h, v9.8h, #0
    312 
    313    and             v10.16b, v16.16b, v26.16b
    314    and             v11.16b, v27.16b, v28.16b
    315    and             v12.16b, v29.16b, v30.16b
    316    and             v13.16b, v31.16b, v10.16b
    317    and             v14.16b, v11.16b, v12.16b
    318    mul             v2.8h, v2.8h, v18.8h
    319    and             v15.16b, v13.16b, v14.16b
    320    shl             v10.8h, v2.8h, #(PASS1_BITS)
    321    sqxtn           v16.8b, v15.8h
    322    mov             TMP1, v16.d[0]
    323    mvn             TMP2, TMP1
    324 
    325    cbnz            TMP2, 2f
    326    /* case all AC coeffs are zeros */
    327    dup             v2.2d, v10.d[0]
    328    dup             v6.2d, v10.d[1]
    329    mov             v3.16b, v2.16b
    330    mov             v7.16b, v6.16b
    331    mov             v4.16b, v2.16b
    332    mov             v8.16b, v6.16b
    333    mov             v5.16b, v2.16b
    334    mov             v9.16b, v6.16b
    335 1:
    336    /* for this transpose, we should organise data like this:
    337     * 00, 01, 02, 03, 40, 41, 42, 43
    338     * 10, 11, 12, 13, 50, 51, 52, 53
    339     * 20, 21, 22, 23, 60, 61, 62, 63
    340     * 30, 31, 32, 33, 70, 71, 72, 73
    341     * 04, 05, 06, 07, 44, 45, 46, 47
    342     * 14, 15, 16, 17, 54, 55, 56, 57
    343     * 24, 25, 26, 27, 64, 65, 66, 67
    344     * 34, 35, 36, 37, 74, 75, 76, 77
    345     */
    346    trn1            v28.8h, v2.8h, v3.8h
    347    trn1            v29.8h, v4.8h, v5.8h
    348    trn1            v30.8h, v6.8h, v7.8h
    349    trn1            v31.8h, v8.8h, v9.8h
    350    trn2            v16.8h, v2.8h, v3.8h
    351    trn2            v17.8h, v4.8h, v5.8h
    352    trn2            v18.8h, v6.8h, v7.8h
    353    trn2            v19.8h, v8.8h, v9.8h
    354    trn1            v2.4s, v28.4s, v29.4s
    355    trn1            v6.4s, v30.4s, v31.4s
    356    trn1            v3.4s, v16.4s, v17.4s
    357    trn1            v7.4s, v18.4s, v19.4s
    358    trn2            v4.4s, v28.4s, v29.4s
    359    trn2            v8.4s, v30.4s, v31.4s
    360    trn2            v5.4s, v16.4s, v17.4s
    361    trn2            v9.4s, v18.4s, v19.4s
    362    /* Even part: reverse the even part of the forward DCT. */
    363    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    364    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    365    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    366    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    367    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    368    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    369    mov             v21.16b, v19.16b               /* tmp3 = z1 */
    370    mov             v20.16b, v18.16b               /* tmp3 = z1 */
    371    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    372    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    373    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    374    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    375    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    376    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    377    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    378    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
    379    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
    380    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
    381    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
    382    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
    383    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
    384    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
    385    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
    386 
    387    /* Odd part per figure 8; the matrix is unitary and hence its
    388     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
    389     */
    390 
    391    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    392    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    393    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    394    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    395    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
    396 
    397    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    398    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    399    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    400    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    401    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    402    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    403    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    404    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    405    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
    406 
    407    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    408    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    409    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    410    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    411    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    412    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    413    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    414    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    415    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
    416 
    417    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
    418    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
    419    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
    420    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
    421 
    422    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
    423    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
    424    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
    425    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
    426    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
    427    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
    428    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
    429    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
    430 
    431    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
    432    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
    433    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
    434    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
    435    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
    436    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
    437    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
    438    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
    439 
    440    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
    441 
    442    add             v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
    443    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
    444    sub             v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
    445    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
    446    add             v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
    447    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
    448    sub             v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
    449    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
    450    add             v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
    451    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
    452    sub             v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
    453    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
    454    add             v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
    455    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
    456    sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
    457    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
    458 
    459    shrn            v2.4h, v18.4s, #16  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
    460    shrn            v9.4h, v20.4s, #16  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
    461    shrn            v3.4h, v22.4s, #16  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
    462    shrn            v8.4h, v24.4s, #16  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
    463    shrn            v4.4h, v26.4s, #16  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
    464    shrn            v7.4h, v28.4s, #16  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
    465    shrn            v5.4h, v14.4s, #16  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
    466    shrn            v6.4h, v16.4s, #16  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
    467    shrn2           v2.8h, v19.4s, #16  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
    468    shrn2           v9.8h, v21.4s, #16  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
    469    shrn2           v3.8h, v23.4s, #16  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
    470    shrn2           v8.8h, v25.4s, #16  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
    471    shrn2           v4.8h, v27.4s, #16  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
    472    shrn2           v7.8h, v29.4s, #16  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
    473    shrn2           v5.8h, v15.4s, #16  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
    474    shrn2           v6.8h, v17.4s, #16  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
    475    movi            v0.16b, #(CENTERJSAMPLE)
    476    /* Prepare pointers (dual-issue with Neon instructions) */
    477      ldp             TMP1, TMP2, [OUTPUT_BUF], 16
    478    sqrshrn         v28.8b, v2.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
    479      ldp             TMP3, TMP4, [OUTPUT_BUF], 16
    480    sqrshrn         v29.8b, v3.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
    481      add             TMP1, TMP1, OUTPUT_COL
    482    sqrshrn         v30.8b, v4.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
    483      add             TMP2, TMP2, OUTPUT_COL
    484    sqrshrn         v31.8b, v5.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
    485      add             TMP3, TMP3, OUTPUT_COL
    486    sqrshrn2        v28.16b, v6.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
    487      add             TMP4, TMP4, OUTPUT_COL
    488    sqrshrn2        v29.16b, v7.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
    489      ldp             TMP5, TMP6, [OUTPUT_BUF], 16
    490    sqrshrn2        v30.16b, v8.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
    491      ldp             TMP7, TMP8, [OUTPUT_BUF], 16
    492    sqrshrn2        v31.16b, v9.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
    493      add             TMP5, TMP5, OUTPUT_COL
    494    add             v16.16b, v28.16b, v0.16b
    495      add             TMP6, TMP6, OUTPUT_COL
    496    add             v18.16b, v29.16b, v0.16b
    497      add             TMP7, TMP7, OUTPUT_COL
    498    add             v20.16b, v30.16b, v0.16b
    499      add             TMP8, TMP8, OUTPUT_COL
    500    add             v22.16b, v31.16b, v0.16b
    501 
    502    /* Transpose the final 8-bit samples */
    503    trn1            v28.16b, v16.16b, v18.16b
    504    trn1            v30.16b, v20.16b, v22.16b
    505    trn2            v29.16b, v16.16b, v18.16b
    506    trn2            v31.16b, v20.16b, v22.16b
    507 
    508    trn1            v16.8h, v28.8h, v30.8h
    509    trn2            v18.8h, v28.8h, v30.8h
    510    trn1            v20.8h, v29.8h, v31.8h
    511    trn2            v22.8h, v29.8h, v31.8h
    512 
    513    uzp1            v28.4s, v16.4s, v18.4s
    514    uzp2            v30.4s, v16.4s, v18.4s
    515    uzp1            v29.4s, v20.4s, v22.4s
    516    uzp2            v31.4s, v20.4s, v22.4s
    517 
    518    /* Store results to the output buffer */
    519    st1             {v28.d}[0], [TMP1]
    520    st1             {v29.d}[0], [TMP2]
    521    st1             {v28.d}[1], [TMP3]
    522    st1             {v29.d}[1], [TMP4]
    523    st1             {v30.d}[0], [TMP5]
    524    st1             {v31.d}[0], [TMP6]
    525    st1             {v30.d}[1], [TMP7]
    526    st1             {v31.d}[1], [TMP8]
    527    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
    528    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
    529    blr             x30
    530 
    531 .balign 16
    532 2:
    533    mul             v3.8h, v3.8h, v19.8h
    534    mul             v4.8h, v4.8h, v20.8h
    535    mul             v5.8h, v5.8h, v21.8h
    536    add             TMP4, xzr, TMP2, LSL #32
    537    mul             v6.8h, v6.8h, v22.8h
    538    mul             v7.8h, v7.8h, v23.8h
    539    adds            TMP3, xzr, TMP2, LSR #32
    540    mul             v8.8h, v8.8h, v24.8h
    541    mul             v9.8h, v9.8h, v25.8h
    542    b.ne            3f
    543    /* Right AC coef is zero */
    544    dup             v15.2d, v10.d[1]
    545    /* Even part: reverse the even part of the forward DCT. */
    546    add             v18.4h, v4.4h, v8.4h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    547    add             v22.4h, v2.4h, v6.4h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    548    sub             v26.4h, v2.4h, v6.4h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    549    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    550    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    551    mov             v20.16b, v18.16b               /* tmp3 = z1 */
    552    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    553    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    554    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    555    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
    556    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
    557    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
    558    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
    559 
    560    /* Odd part per figure 8; the matrix is unitary and hence its
    561     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
    562     */
    563 
    564    add             v22.4h, v9.4h, v5.4h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    565    add             v24.4h, v7.4h, v3.4h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    566    add             v18.4h, v9.4h, v3.4h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    567    add             v20.4h, v7.4h, v5.4h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    568    add             v26.4h, v22.4h, v24.4h  /* z5 = z3 + z4 */
    569 
    570    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    571    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    572    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    573    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    574    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    575    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    576    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    577    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    578    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
    579 
    580    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
    581    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
    582 
    583    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
    584    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
    585    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
    586    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
    587 
    588    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
    589    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
    590    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
    591    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
    592 
    593    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
    594 
    595    add             v18.4s, v2.4s, v16.4s  /* tmp10 + tmp3 */
    596    sub             v20.4s, v2.4s, v16.4s  /* tmp10 - tmp3 */
    597    add             v22.4s, v8.4s, v14.4s  /* tmp11 + tmp2 */
    598    sub             v24.4s, v8.4s, v14.4s  /* tmp11 - tmp2 */
    599    add             v26.4s, v4.4s, v12.4s  /* tmp12 + tmp1 */
    600    sub             v28.4s, v4.4s, v12.4s  /* tmp12 - tmp1 */
    601    add             v14.4s, v6.4s, v10.4s  /* tmp13 + tmp0 */
    602    sub             v16.4s, v6.4s, v10.4s  /* tmp13 - tmp0 */
    603 
    604    rshrn           v2.4h, v18.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
    605    rshrn           v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
    606    rshrn           v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
    607    rshrn           v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
    608    rshrn2          v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
    609    rshrn2          v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
    610    rshrn2          v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
    611    rshrn2          v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
    612    mov             v6.16b, v15.16b
    613    mov             v7.16b, v15.16b
    614    mov             v8.16b, v15.16b
    615    mov             v9.16b, v15.16b
    616    b               1b
    617 
    618 .balign 16
    619 3:
    620    cbnz            TMP4, 4f
    621    /* Left AC coef is zero */
    622    dup             v14.2d, v10.d[0]
    623    /* Even part: reverse the even part of the forward DCT. */
    624    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    625    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    626    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    627    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    628    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    629    mov             v21.16b, v19.16b               /* tmp3 = z1 */
    630    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    631    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    632    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    633    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
    634    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
    635    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
    636    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
    637 
    638    /* Odd part per figure 8; the matrix is unitary and hence its
    639     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
    640     */
    641 
    642    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    643    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    644    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    645    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    646    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
    647 
    648    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    649    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    650    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    651    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    652    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    653    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    654    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    655    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    656    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
    657 
    658    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
    659    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
    660    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
    661    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
    662 
    663    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
    664    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
    665    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
    666    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
    667 
    668    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
    669    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
    670    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
    671    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
    672 
    673    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
    674 
    675    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
    676    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
    677    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
    678    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
    679    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
    680    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
    681    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
    682    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
    683 
    684    mov             v2.16b, v14.16b
    685    mov             v3.16b, v14.16b
    686    mov             v4.16b, v14.16b
    687    mov             v5.16b, v14.16b
    688    rshrn           v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
    689    rshrn           v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
    690    rshrn           v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
    691    rshrn           v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
    692    rshrn2          v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
    693    rshrn2          v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
    694    rshrn2          v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
    695    rshrn2          v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
    696    b               1b
    697 
    698 .balign 16
    699 4:
    700    /* "No" AC coef is zero */
    701    /* Even part: reverse the even part of the forward DCT. */
    702    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    703    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    704    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    705    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    706    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    707    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    708    mov             v21.16b, v19.16b               /* tmp3 = z1 */
    709    mov             v20.16b, v18.16b               /* tmp3 = z1 */
    710    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    711    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    712    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    713    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    714    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    715    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    716    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    717    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
    718    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
    719    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
    720    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
    721    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
    722    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
    723    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
    724    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
    725 
    726    /* Odd part per figure 8; the matrix is unitary and hence its
    727     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
    728     */
    729 
    730    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    731    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    732    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    733    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    734    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
    735 
    736    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    737    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    738    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    739    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    740    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    741    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    742    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    743    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    744    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
    745 
    746    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    747    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    748    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    749    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    750    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    751    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    752    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    753    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    754    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
    755 
    756    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
    757    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
    758    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
    759    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
    760 
    761    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
    762    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
    763    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
    764    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
    765    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
    766    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
    767    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
    768    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
    769 
    770    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
    771    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
    772    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
    773    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
    774    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
    775    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
    776    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
    777    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
    778 
    779    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
    780 
    781    add             v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
    782    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
    783    sub             v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
    784    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
    785    add             v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
    786    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
    787    sub             v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
    788    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
    789    add             v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
    790    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
    791    sub             v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
    792    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
    793    add             v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
    794    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
    795    sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
    796    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
    797 
    798    rshrn           v2.4h, v18.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
    799    rshrn           v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
    800    rshrn           v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
    801    rshrn           v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
    802    rshrn           v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
    803    rshrn           v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
    804    rshrn           v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
    805    rshrn           v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
    806    rshrn2          v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
    807    rshrn2          v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
    808    rshrn2          v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
    809    rshrn2          v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
    810    rshrn2          v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
    811    rshrn2          v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
    812    rshrn2          v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
    813    rshrn2          v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
    814    b               1b
    815 
    816    .unreq          DCT_TABLE
    817    .unreq          COEF_BLOCK
    818    .unreq          OUTPUT_BUF
    819    .unreq          OUTPUT_COL
    820    .unreq          TMP1
    821    .unreq          TMP2
    822    .unreq          TMP3
    823    .unreq          TMP4
    824    .unreq          TMP5
    825    .unreq          TMP6
    826    .unreq          TMP7
    827    .unreq          TMP8
    828 
    829 #undef CENTERJSAMPLE
    830 #undef CONST_BITS
    831 #undef PASS1_BITS
    832 #undef XFIX_P_0_298
    833 #undef XFIX_N_0_390
    834 #undef XFIX_P_0_541
    835 #undef XFIX_P_0_765
    836 #undef XFIX_N_0_899
    837 #undef XFIX_P_1_175
    838 #undef XFIX_P_1_501
    839 #undef XFIX_N_1_847
    840 #undef XFIX_N_1_961
    841 #undef XFIX_P_2_053
    842 #undef XFIX_N_2_562
    843 #undef XFIX_P_3_072
    844 
    845 
    846 /*****************************************************************************/
    847 
    848 /*
    849 * jsimd_ycc_extrgb_convert_neon
    850 * jsimd_ycc_extbgr_convert_neon
    851 * jsimd_ycc_extrgbx_convert_neon
    852 * jsimd_ycc_extbgrx_convert_neon
    853 * jsimd_ycc_extxbgr_convert_neon
    854 * jsimd_ycc_extxrgb_convert_neon
    855 *
    856 * Colorspace conversion YCbCr -> RGB
    857 */
    858 
/* Load \size pixels' worth of Y (luma) into v0 and Cb/Cr (chroma, here
 * named U/V) into v4/v5, post-incrementing the Y, U and V pointers.
 * Partial sizes fill successive byte lanes -- 4 -> lanes 0-3,
 * 2 -> lanes 4-5, 1 -> lane 6 -- so that consecutive do_load 4/2/1
 * invocations accumulate up to 7 trailing pixels into the same registers
 * for a single conversion pass.
 */
.macro do_load size
  .if \size == 8
    /* Full group: one 8-byte load per plane, then prefetch the next
     * cache line of each plane. */
    ld1             {v4.8b}, [U], 8
    ld1             {v5.8b}, [V], 8
    ld1             {v0.8b}, [Y], 8
    prfm            pldl1keep, [U, #64]
    prfm            pldl1keep, [V, #64]
    prfm            pldl1keep, [Y, #64]
  .elseif \size == 4
    ld1             {v4.b}[0], [U], 1
    ld1             {v4.b}[1], [U], 1
    ld1             {v4.b}[2], [U], 1
    ld1             {v4.b}[3], [U], 1
    ld1             {v5.b}[0], [V], 1
    ld1             {v5.b}[1], [V], 1
    ld1             {v5.b}[2], [V], 1
    ld1             {v5.b}[3], [V], 1
    ld1             {v0.b}[0], [Y], 1
    ld1             {v0.b}[1], [Y], 1
    ld1             {v0.b}[2], [Y], 1
    ld1             {v0.b}[3], [Y], 1
  .elseif \size == 2
    ld1             {v4.b}[4], [U], 1
    ld1             {v4.b}[5], [V], 1
    ld1             {v5.b}[4], [V], 1
    ld1             {v5.b}[5], [V], 1
    ld1             {v0.b}[4], [Y], 1
    ld1             {v0.b}[5], [Y], 1
  .elseif \size == 1
    ld1             {v4.b}[6], [U], 1
    ld1             {v5.b}[6], [V], 1
    ld1             {v0.b}[6], [Y], 1
  .else
    .error unsupported macroblock size
  .endif
.endm
    895 
/* Store \size converted pixels at [RGB], post-incrementing the pointer.
 *   bpp == 24: interleave 3 bytes/pixel from v10/v11/v12 -- a single st3
 *              when \fast_st3 == 1, otherwise per-byte st1 stores for
 *              cores on which 3-register st3 is slow.
 *   bpp == 32: interleave 4 bytes/pixel from v10..v13 via st4.
 *   bpp == 16: store packed RGB565 halfwords from v25.
 * Lane indices mirror do_load: size 4 -> lanes 0-3, 2 -> lanes 4-5,
 * 1 -> lane 6.
 */
.macro do_store bpp, size, fast_st3
  .if \bpp == 24
    .if \size == 8
      .if \fast_st3 == 1
        st3         {v10.8b, v11.8b, v12.8b}, [RGB], 24
      .else
        /* Emit each pixel as three single-byte stores to avoid the slow
         * st3 encoding; same memory layout as the st3 above. */
        st1         {v10.b}[0], [RGB], #1
        st1         {v11.b}[0], [RGB], #1
        st1         {v12.b}[0], [RGB], #1

        st1         {v10.b}[1], [RGB], #1
        st1         {v11.b}[1], [RGB], #1
        st1         {v12.b}[1], [RGB], #1

        st1         {v10.b}[2], [RGB], #1
        st1         {v11.b}[2], [RGB], #1
        st1         {v12.b}[2], [RGB], #1

        st1         {v10.b}[3], [RGB], #1
        st1         {v11.b}[3], [RGB], #1
        st1         {v12.b}[3], [RGB], #1

        st1         {v10.b}[4], [RGB], #1
        st1         {v11.b}[4], [RGB], #1
        st1         {v12.b}[4], [RGB], #1

        st1         {v10.b}[5], [RGB], #1
        st1         {v11.b}[5], [RGB], #1
        st1         {v12.b}[5], [RGB], #1

        st1         {v10.b}[6], [RGB], #1
        st1         {v11.b}[6], [RGB], #1
        st1         {v12.b}[6], [RGB], #1

        st1         {v10.b}[7], [RGB], #1
        st1         {v11.b}[7], [RGB], #1
        st1         {v12.b}[7], [RGB], #1
      .endif
    .elseif \size == 4
      st3           {v10.b, v11.b, v12.b}[0], [RGB], 3
      st3           {v10.b, v11.b, v12.b}[1], [RGB], 3
      st3           {v10.b, v11.b, v12.b}[2], [RGB], 3
      st3           {v10.b, v11.b, v12.b}[3], [RGB], 3
    .elseif \size == 2
      st3           {v10.b, v11.b, v12.b}[4], [RGB], 3
      st3           {v10.b, v11.b, v12.b}[5], [RGB], 3
    .elseif \size == 1
      st3           {v10.b, v11.b, v12.b}[6], [RGB], 3
    .else
      .error unsupported macroblock size
    .endif
  .elseif \bpp == 32
    .if \size == 8
      st4           {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
    .elseif \size == 4
      st4           {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
      st4           {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
      st4           {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
      st4           {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
    .elseif \size == 2
      st4           {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
      st4           {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
    .elseif \size == 1
      st4           {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
    .else
      .error unsupported macroblock size
    .endif
  .elseif \bpp == 16
    /* RGB565: two bytes per pixel, already packed into v25 halfword lanes. */
    .if \size == 8
      st1           {v25.8h}, [RGB], 16
    .elseif \size == 4
      st1           {v25.4h}, [RGB], 8
    .elseif \size == 2
      st1           {v25.h}[4], [RGB], 2
      st1           {v25.h}[5], [RGB], 2
    .elseif \size == 1
      st1           {v25.h}[6], [RGB], 2
    .else
      .error unsupported macroblock size
    .endif
  .else
    .error unsupported bpp
  .endif
.endm
    980 
    981 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \
    982                                           g_offs, gsize, b_offs, bsize, \
    983                                           defsize, fast_st3
    984 
    985 /*
    986 * 2-stage pipelined YCbCr->RGB conversion
    987 */
    988 
/* Stage 1: center the chroma samples and form the fixed-point products of
 * the YCbCr->RGB equations.  v2.8h carries the centering bias and v1.4h
 * the coefficients (both loaded from Ljsimd_ycc_rgb_neon_consts by the
 * function prologue); the -11277/-23401 terms are scaled by 2^15 and the
 * 22971/29033 terms by 2^14, matching the #15/#14 descale shifts in
 * stage 2.  Wide 32-bit accumulators: v20/v22 = G, v24/v26 = R,
 * v28/v30 = B (low/high halves).
 */
.macro do_yuv_to_rgb_stage1
    uaddw           v6.8h, v2.8h, v4.8b     /* v6.8h = u - 128 */
    uaddw           v8.8h, v2.8h, v5.8b     /* v8.8h = v - 128 */
    smull           v20.4s, v6.4h, v1.h[1]  /* G: multiply by -11277 */
    smlal           v20.4s, v8.4h, v1.h[2]  /* G: multiply by -23401 */
    smull2          v22.4s, v6.8h, v1.h[1]  /* G: multiply by -11277 */
    smlal2          v22.4s, v8.8h, v1.h[2]  /* G: multiply by -23401 */
    smull           v24.4s, v8.4h, v1.h[0]  /* R: multiply by 22971 */
    smull2          v26.4s, v8.8h, v1.h[0]  /* R: multiply by 22971 */
    smull           v28.4s, v6.4h, v1.h[3]  /* B: multiply by 29033 */
    smull2          v30.4s, v6.8h, v1.h[3]  /* B: multiply by 29033 */
.endm
   1001 
/* Stage 2: descale the stage-1 products (rounding narrow by the fixed-point
 * scale: #15 for the G terms, #14 for R/B), add the luma samples from v0,
 * then saturate/pack.  For 24/32 bpp the channels are narrowed with
 * unsigned saturation (clamp to [0, 255]) into the byte positions selected
 * by the \r_offs/\g_offs/\b_offs macro arguments.  For 16 bpp the three
 * channels are packed into RGB565 halfwords in v25 (R in bits 15-11,
 * G in 10-5, B in 4-0) via saturating shifts and shift-right-inserts.
 */
.macro do_yuv_to_rgb_stage2
    rshrn           v20.4h, v20.4s, #15
    rshrn2          v20.8h, v22.4s, #15
    rshrn           v24.4h, v24.4s, #14
    rshrn2          v24.8h, v26.4s, #14
    rshrn           v28.4h, v28.4s, #14
    rshrn2          v28.8h, v30.4s, #14
    uaddw           v20.8h, v20.8h, v0.8b   /* G += y */
    uaddw           v24.8h, v24.8h, v0.8b   /* R += y */
    uaddw           v28.8h, v28.8h, v0.8b   /* B += y */
  .if \bpp != 16
    sqxtun          v1\g_offs\defsize, v20.8h
    sqxtun          v1\r_offs\defsize, v24.8h
    sqxtun          v1\b_offs\defsize, v28.8h
  .else
    sqshlu          v21.8h, v20.8h, #8      /* saturate G into the top byte */
    sqshlu          v25.8h, v24.8h, #8      /* saturate R into the top byte */
    sqshlu          v29.8h, v28.8h, #8      /* saturate B into the top byte */
    sri             v25.8h, v21.8h, #5      /* insert G below R */
    sri             v25.8h, v29.8h, #11     /* insert B in the low 5 bits */
  .endif
.endm
   1024 
/* Software-pipelined steady-state iteration: finish stage 2 for the current
 * 8 pixels, store them, and start stage 1 for the next 8, with the loads,
 * prefetches and multiplies interleaved to hide latency.  The instruction
 * order is deliberate -- keep it if editing.  Computation matches
 * do_yuv_to_rgb_stage2 followed by do_load 8 and do_yuv_to_rgb_stage1.
 */
.macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3
    rshrn           v20.4h, v20.4s, #15
    rshrn           v24.4h, v24.4s, #14
    rshrn           v28.4h, v28.4s, #14
    ld1             {v4.8b}, [U], 8         /* load next u */
    rshrn2          v20.8h, v22.4s, #15
    rshrn2          v24.8h, v26.4s, #14
    rshrn2          v28.8h, v30.4s, #14
    ld1             {v5.8b}, [V], 8         /* load next v */
    uaddw           v20.8h, v20.8h, v0.8b   /* G += y */
    uaddw           v24.8h, v24.8h, v0.8b   /* R += y */
    uaddw           v28.8h, v28.8h, v0.8b   /* B += y */
  .if \bpp != 16  /**************** rgb24/rgb32 ******************************/
    sqxtun          v1\g_offs\defsize, v20.8h
    ld1             {v0.8b}, [Y], 8         /* load next y */
    sqxtun          v1\r_offs\defsize, v24.8h
    prfm            pldl1keep, [U, #64]
    prfm            pldl1keep, [V, #64]
    prfm            pldl1keep, [Y, #64]
    sqxtun          v1\b_offs\defsize, v28.8h
    uaddw           v6.8h, v2.8h, v4.8b     /* v6.8h = u - 128 */
    uaddw           v8.8h, v2.8h, v5.8b     /* v8.8h = v - 128 */
    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
  .else  /**************************** rgb565 ********************************/
    sqshlu          v21.8h, v20.8h, #8
    sqshlu          v25.8h, v24.8h, #8
    sqshlu          v29.8h, v28.8h, #8
    uaddw           v6.8h, v2.8h, v4.8b     /* v6.8h = u - 128 */
    uaddw           v8.8h, v2.8h, v5.8b     /* v8.8h = v - 128 */
    ld1             {v0.8b}, [Y], 8         /* load next y */
    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    sri             v25.8h, v21.8h, #5      /* pack G into RGB565 */
    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
    prfm            pldl1keep, [U, #64]
    prfm            pldl1keep, [V, #64]
    prfm            pldl1keep, [Y, #64]
    sri             v25.8h, v29.8h, #11     /* pack B into RGB565 */
  .endif
    do_store        \bpp, 8, \fast_st3
    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
.endm
   1076 
/* Unpipelined conversion of one pixel group (used for the <8-pixel tail):
 * compute the fixed-point products, then descale, add luma and pack. */
.macro do_yuv_to_rgb
    do_yuv_to_rgb_stage1
    do_yuv_to_rgb_stage2
.endm
   1081 
/* Emit the YCbCr->RGB entry point for this colorspace.
 * C-equivalent signature (libjpeg-turbo SIMD convention):
 *   void jsimd_ycc_<colorid>_convert_neon(JDIMENSION out_width,     (w0)
 *                                         JSAMPIMAGE input_buf,     (x1)
 *                                         JDIMENSION input_row,     (w2)
 *                                         JSAMPARRAY output_buf,    (x3)
 *                                         int num_rows)             (w4)
 * The "_slowst3" variant avoids the interleaved st3 store
 * (NOTE(review): presumably for cores where st3 is slow — the C wrapper
 * selects between them; confirm against the dispatch code). */
.if \fast_st3 == 1
asm_function jsimd_ycc_\colorid\()_convert_neon
.else
asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
.endif
    OUTPUT_WIDTH    .req w0
    INPUT_BUF       .req x1
    INPUT_ROW       .req w2
    OUTPUT_BUF      .req x3
    NUM_ROWS        .req w4

    /* Per-component plane base pointers; INPUT_BUF2 reuses x1 once the
     * three plane pointers have been fetched from INPUT_BUF. */
    INPUT_BUF0      .req x5
    INPUT_BUF1      .req x6
    INPUT_BUF2      .req x1

    RGB             .req x7
    Y               .req x9
    U               .req x10
    V               .req x11
    N               .req w15   /* pixels remaining in the current row */

    /* Reserve a 64-byte spill area for the callee-saved low halves of
     * v8-v15 (AAPCS64 requires preserving d8-d15). */
    sub             sp, sp, 64
    mov             x9, sp

    /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
    get_symbol_loc  x15, Ljsimd_ycc_rgb_neon_consts

    /* Save Neon registers */
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
    ld1             {v0.4h, v1.4h}, [x15], 16
    ld1             {v2.8h}, [x15]

    /* input_buf is a JSAMPIMAGE: three JSAMPARRAY pointers (Y, Cb, Cr). */
    ldr             INPUT_BUF0, [INPUT_BUF]
    ldr             INPUT_BUF1, [INPUT_BUF, #8]
    ldr             INPUT_BUF2, [INPUT_BUF, #16]
    .unreq          INPUT_BUF

    /* Initially fill v10 and v13 with 0xFF: lanes not overwritten by the
     * conversion supply the opaque filler byte of 4-byte pixel formats
     * (NOTE(review): presumed from do_store's st4 usage — confirm). */
    movi            v10.16b, #255
    movi            v13.16b, #255

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    b.lt            9f
0:
    /* Row pointers: each JSAMPARRAY entry is an 8-byte JSAMPROW. */
    ldr             Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
    ldr             U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
    mov             N, OUTPUT_WIDTH
    ldr             V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
    add             INPUT_ROW, INPUT_ROW, #1
    ldr             RGB, [OUTPUT_BUF], #8

    /* Inner loop over pixels: software-pipelined 8-pixel groups.
     * Stage 1 of group i+1 overlaps stage 2 + store of group i. */
    subs            N, N, #8
    b.lt            3f
    do_load         8
    do_yuv_to_rgb_stage1
    subs            N, N, #8
    b.lt            2f
1:
    do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3
    subs            N, N, #8
    b.ge            1b
2:
    /* Drain the pipeline: finish and store the last full group. */
    do_yuv_to_rgb_stage2
    do_store        \bpp, 8, \fast_st3
    tst             N, #7
    b.eq            8f
3:
    /* Tail: gather the remaining 1..7 pixels in 4/2/1 chunks, convert
     * them all at once, then scatter the results in the same chunks. */
    tst             N, #4
    b.eq            3f
    do_load         4
3:
    tst             N, #2
    b.eq            4f
    do_load         2
4:
    tst             N, #1
    b.eq            5f
    do_load         1
5:
    do_yuv_to_rgb
    tst             N, #4
    b.eq            6f
    do_store        \bpp, 4, \fast_st3
6:
    tst             N, #2
    b.eq            7f
    do_store        \bpp, 2, \fast_st3
7:
    tst             N, #1
    b.eq            8f
    do_store        \bpp, 1, \fast_st3
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    b.gt            0b
9:
    /* Restore all registers and return */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    br              x30
    .unreq          OUTPUT_WIDTH
    .unreq          INPUT_ROW
    .unreq          OUTPUT_BUF
    .unreq          NUM_ROWS
    .unreq          INPUT_BUF0
    .unreq          INPUT_BUF1
    .unreq          INPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N

/* Discard the per-instantiation helper macros so the generator can be
 * invoked again for the next colorspace. */
.purgem do_yuv_to_rgb
.purgem do_yuv_to_rgb_stage1
.purgem do_yuv_to_rgb_stage2
.purgem do_yuv_to_rgb_stage2_store_load_stage1

.endm
   1203 
/* Instantiate one YCbCr->RGB converter per output pixel format.
 * R/G/B columns give the byte offset of each channel within a pixel. */
/*--------------------------------- id ----- bpp R  rsize G  gsize B  bsize defsize fast_st3*/
generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,  2, .4h,  1, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,  2, .4h,  3, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h,  0, .4h,  0, .4h,  .8b,    1

/* "_slowst3" variants (fast_st3 = 0) exist only for the 3-byte formats,
 * which are the ones that would otherwise use an interleaved st3 store. */
generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    0
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    0

.purgem do_load
.purgem do_store
   1218 
   1219 
   1220 /*****************************************************************************/
   1221 
   1222 /*
   1223 * jsimd_extrgb_ycc_convert_neon
   1224 * jsimd_extbgr_ycc_convert_neon
   1225 * jsimd_extrgbx_ycc_convert_neon
   1226 * jsimd_extbgrx_ycc_convert_neon
   1227 * jsimd_extxbgr_ycc_convert_neon
   1228 * jsimd_extxrgb_ycc_convert_neon
   1229 *
   1230 * Colorspace conversion RGB -> YCbCr
   1231 */
   1232 
/* Store \size computed samples to the Y (v20), Cb (v21), and Cr (v22)
 * output rows, advancing the Y/U/V pointers.
 * The partial sizes are designed to be used together after an 8-wide
 * conversion of a row tail: size 4 stores lanes 0-3, size 2 lanes 4-5,
 * and size 1 lane 6, so 4+2+1 covers up to 7 trailing pixels. */
.macro do_store size
  .if \size == 8
    st1             {v20.8b}, [Y], #8
    st1             {v21.8b}, [U], #8
    st1             {v22.8b}, [V], #8
  .elseif \size == 4
    st1             {v20.b}[0], [Y], #1
    st1             {v20.b}[1], [Y], #1
    st1             {v20.b}[2], [Y], #1
    st1             {v20.b}[3], [Y], #1
    st1             {v21.b}[0], [U], #1
    st1             {v21.b}[1], [U], #1
    st1             {v21.b}[2], [U], #1
    st1             {v21.b}[3], [U], #1
    st1             {v22.b}[0], [V], #1
    st1             {v22.b}[1], [V], #1
    st1             {v22.b}[2], [V], #1
    st1             {v22.b}[3], [V], #1
  .elseif \size == 2
    st1             {v20.b}[4], [Y], #1
    st1             {v20.b}[5], [Y], #1
    st1             {v21.b}[4], [U], #1
    st1             {v21.b}[5], [U], #1
    st1             {v22.b}[4], [V], #1
    st1             {v22.b}[5], [V], #1
  .elseif \size == 1
    st1             {v20.b}[6], [Y], #1
    st1             {v21.b}[6], [U], #1
    st1             {v22.b}[6], [V], #1
  .else
    .error unsupported macroblock size
  .endif
.endm
   1266 
/* Load \size pixels from the RGB row pointer into v10/v11/v12 (and v13
 * for 32-bpp), deinterleaved by channel, advancing RGB.
 * For 24-bpp full groups, \fast_ld3 selects an interleaved ld3 versus a
 * lane-by-lane ld1 sequence (NOTE(review): presumably a workaround for
 * cores where ld3 is slow — confirm against the dispatch code).
 * Partial sizes fill lanes 0-3 / 4-5 / 6, matching do_store. */
.macro do_load bpp, size, fast_ld3
  .if \bpp == 24
    .if \size == 8
      .if \fast_ld3 == 1
        ld3         {v10.8b, v11.8b, v12.8b}, [RGB], #24
      .else
        /* Slow path: one byte per channel per pixel, 8 pixels. */
        ld1         {v10.b}[0], [RGB], #1
        ld1         {v11.b}[0], [RGB], #1
        ld1         {v12.b}[0], [RGB], #1

        ld1         {v10.b}[1], [RGB], #1
        ld1         {v11.b}[1], [RGB], #1
        ld1         {v12.b}[1], [RGB], #1

        ld1         {v10.b}[2], [RGB], #1
        ld1         {v11.b}[2], [RGB], #1
        ld1         {v12.b}[2], [RGB], #1

        ld1         {v10.b}[3], [RGB], #1
        ld1         {v11.b}[3], [RGB], #1
        ld1         {v12.b}[3], [RGB], #1

        ld1         {v10.b}[4], [RGB], #1
        ld1         {v11.b}[4], [RGB], #1
        ld1         {v12.b}[4], [RGB], #1

        ld1         {v10.b}[5], [RGB], #1
        ld1         {v11.b}[5], [RGB], #1
        ld1         {v12.b}[5], [RGB], #1

        ld1         {v10.b}[6], [RGB], #1
        ld1         {v11.b}[6], [RGB], #1
        ld1         {v12.b}[6], [RGB], #1

        ld1         {v10.b}[7], [RGB], #1
        ld1         {v11.b}[7], [RGB], #1
        ld1         {v12.b}[7], [RGB], #1
      .endif
      prfm          pldl1keep, [RGB, #128]
    .elseif \size == 4
      ld3           {v10.b, v11.b, v12.b}[0], [RGB], #3
      ld3           {v10.b, v11.b, v12.b}[1], [RGB], #3
      ld3           {v10.b, v11.b, v12.b}[2], [RGB], #3
      ld3           {v10.b, v11.b, v12.b}[3], [RGB], #3
    .elseif \size == 2
      ld3           {v10.b, v11.b, v12.b}[4], [RGB], #3
      ld3           {v10.b, v11.b, v12.b}[5], [RGB], #3
    .elseif \size == 1
      ld3           {v10.b, v11.b, v12.b}[6], [RGB], #3
    .else
      .error unsupported macroblock size
    .endif
  .elseif \bpp == 32
    .if \size == 8
      ld4           {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32
      prfm          pldl1keep, [RGB, #128]
    .elseif \size == 4
      ld4           {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4
      ld4           {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4
      ld4           {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4
      ld4           {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4
    .elseif \size == 2
      ld4           {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4
      ld4           {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4
    .elseif \size == 1
      ld4           {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4
    .else
      .error unsupported macroblock size
    .endif
  .else
    .error unsupported bpp
  .endif
.endm
   1340 
/* Generator macro: emits one RGB->YCbCr conversion function for the
 * given colorspace.  r_offs/g_offs/b_offs select which of the channel
 * registers loaded by do_load (v10..v13) holds R, G, and B. */
.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \
                                           b_offs, fast_ld3

/*
 * 2-stage pipelined RGB->YCbCr conversion
 */

/* Stage 1: widen the channel bytes to 16 bits and accumulate the
 * fixed-point dot products for Y (v14/v16), Cb (v18/v26), Cr (v28/v30).
 * The Cb/Cr accumulators are seeded from v1 (rounding/bias constants
 * loaded from Ljsimd_rgb_ycc_neon_consts; rev64 replicates them into
 * each accumulator). */
.macro do_rgb_to_yuv_stage1
    ushll           v4.8h, v1\r_offs\().8b, #0  /* r = v4 */
    ushll           v6.8h, v1\g_offs\().8b, #0  /* g = v6 */
    ushll           v8.8h, v1\b_offs\().8b, #0  /* b = v8 */
    rev64           v18.4s, v1.4s
    rev64           v26.4s, v1.4s
    rev64           v28.4s, v1.4s
    rev64           v30.4s, v1.4s
    umull           v14.4s, v4.4h, v0.h[0]
    umull2          v16.4s, v4.8h, v0.h[0]
    umlsl           v18.4s, v4.4h, v0.h[3]
    umlsl2          v26.4s, v4.8h, v0.h[3]
    umlal           v28.4s, v4.4h, v0.h[5]
    umlal2          v30.4s, v4.8h, v0.h[5]
    umlal           v14.4s, v6.4h, v0.h[1]
    umlal2          v16.4s, v6.8h, v0.h[1]
    umlsl           v18.4s, v6.4h, v0.h[4]
    umlsl2          v26.4s, v6.8h, v0.h[4]
    umlsl           v28.4s, v6.4h, v0.h[6]
    umlsl2          v30.4s, v6.8h, v0.h[6]
    umlal           v14.4s, v8.4h, v0.h[2]
    umlal2          v16.4s, v8.8h, v0.h[2]
    umlal           v18.4s, v8.4h, v0.h[5]
    umlal2          v26.4s, v8.8h, v0.h[5]
    umlsl           v28.4s, v8.4h, v0.h[7]
    umlsl2          v30.4s, v8.8h, v0.h[7]
.endm

/* Stage 2: descale the 32-bit accumulators (>> 16; Y with rounding) and
 * narrow to the 8-bit output samples in v20 (Y), v21 (Cb), v22 (Cr). */
.macro do_rgb_to_yuv_stage2
    rshrn           v20.4h, v14.4s, #16
    shrn            v22.4h, v18.4s, #16
    shrn            v24.4h, v28.4s, #16
    rshrn2          v20.8h, v16.4s, #16
    shrn2           v22.8h, v26.4s, #16
    shrn2           v24.8h, v30.4s, #16
    xtn             v20.8b, v20.8h       /* v20 = y */
    xtn             v21.8b, v22.8h       /* v21 = u */
    xtn             v22.8b, v24.8h       /* v22 = v */
.endm

/* Non-pipelined conversion for the partial group at the end of a row. */
.macro do_rgb_to_yuv
    do_rgb_to_yuv_stage1
    do_rgb_to_yuv_stage2
.endm

/* TODO: expand macros and interleave instructions if some in-order
 *       AArch64 processor actually can dual-issue LOAD/STORE with ALU */
.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3
    do_rgb_to_yuv_stage2
    do_load         \bpp, 8, \fast_ld3
    st1             {v20.8b}, [Y], #8
    st1             {v21.8b}, [U], #8
    st1             {v22.8b}, [V], #8
    do_rgb_to_yuv_stage1
.endm

/* Entry point:
 *   void jsimd_<colorid>_ycc_convert_neon(JDIMENSION img_width,     (w0)
 *                                         JSAMPARRAY input_buf,     (x1)
 *                                         JSAMPIMAGE output_buf,    (x2)
 *                                         JDIMENSION output_row,    (w3)
 *                                         int num_rows)             (w4) */
.if \fast_ld3 == 1
asm_function jsimd_\colorid\()_ycc_convert_neon
.else
asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
.endif
    OUTPUT_WIDTH    .req w0
    INPUT_BUF       .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_ROW      .req w3
    NUM_ROWS        .req w4

    OUTPUT_BUF0     .req x5
    OUTPUT_BUF1     .req x6
    OUTPUT_BUF2     .req x2  /* OUTPUT_BUF */

    RGB             .req x7
    Y               .req x9
    U               .req x10
    V               .req x11
    N               .req w12   /* pixels remaining in the current row */

    /* Load constants to d0, d1, d2, d3 */
    get_symbol_loc  x13, Ljsimd_rgb_ycc_neon_consts
    ld1             {v0.8h, v1.8h}, [x13]

    /* output_buf is a JSAMPIMAGE: three JSAMPARRAY pointers (Y, Cb, Cr). */
    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #8]
    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #16]
    .unreq          OUTPUT_BUF

    /* Save Neon registers (AAPCS64: d8-d15 are callee-saved) */
    sub             sp, sp, #64
    mov             x9, sp
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    b.lt            9f
0:
    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3]
    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3]
    mov             N, OUTPUT_WIDTH
    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3]
    add             OUTPUT_ROW, OUTPUT_ROW, #1
    ldr             RGB, [INPUT_BUF], #8

    /* Inner loop over pixels: software-pipelined 8-pixel groups. */
    subs            N, N, #8
    b.lt            3f
    do_load         \bpp, 8, \fast_ld3
    do_rgb_to_yuv_stage1
    subs            N, N, #8
    b.lt            2f
1:
    do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3
    subs            N, N, #8
    b.ge            1b
2:
    /* Drain the pipeline: finish and store the last full group. */
    do_rgb_to_yuv_stage2
    do_store        8
    tst             N, #7
    b.eq            8f
3:
    /* Tail: gather remaining 1..7 pixels in 4/2/1 chunks, convert once,
     * then scatter the results in the same chunks. */
    tbz             N, #2, 3f
    do_load         \bpp, 4, \fast_ld3
3:
    tbz             N, #1, 4f
    do_load         \bpp, 2, \fast_ld3
4:
    tbz             N, #0, 5f
    do_load         \bpp, 1, \fast_ld3
5:
    do_rgb_to_yuv
    tbz             N, #2, 6f
    do_store        4
6:
    tbz             N, #1, 7f
    do_store        2
7:
    tbz             N, #0, 8f
    do_store        1
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    b.gt            0b
9:
    /* Restore all registers and return */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    br              x30

    .unreq          OUTPUT_WIDTH
    .unreq          OUTPUT_ROW
    .unreq          INPUT_BUF
    .unreq          NUM_ROWS
    .unreq          OUTPUT_BUF0
    .unreq          OUTPUT_BUF1
    .unreq          OUTPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N

/* Discard the per-instantiation helper macros so the generator can be
 * invoked again for the next colorspace. */
.purgem do_rgb_to_yuv
.purgem do_rgb_to_yuv_stage1
.purgem do_rgb_to_yuv_stage2
.purgem do_rgb_to_yuv_stage2_store_load_stage1

.endm
   1514 
/* Instantiate one RGB->YCbCr converter per input pixel format.
 * R/G/B columns give the channel-register index used by stage 1. */
/*--------------------------------- id ----- bpp R  G  B  Fast LD3 */
generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 1
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 1
generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1
generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1
generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1
generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1

/* "_slowld3" variants (fast_ld3 = 0) exist only for the 3-byte formats,
 * which are the ones that would otherwise use an interleaved ld3 load. */
generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 0
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 0

.purgem do_load
.purgem do_store
   1528 
   1529 
   1530 /*****************************************************************************/
   1531 
   1532 /*
   1533 * jsimd_fdct_islow_neon
   1534 *
   1535 * This file contains a slower but more accurate integer implementation of the
   1536 * forward DCT (Discrete Cosine Transform). The following code is based
* directly on the IJG's original jfdctint.c; see jfdctint.c for
* more details.
   1539 *
   1540 * TODO: can be combined with 'jsimd_convsamp_neon' to get
   1541 *       rid of a bunch of VLD1.16 instructions
   1542 */
   1543 
/* Fixed-point scaling parameters, mirroring jfdctint.c. */
#define CONST_BITS  13
#define PASS1_BITS  2

/* Descale shifts for pass 1 (rows) and pass 2 (columns). */
#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
#define DESCALE_P2  (CONST_BITS + PASS1_BITS)

/* Lane aliases into v0/v1 for the FIX_* multipliers loaded from
 * Ljsimd_fdct_islow_neon_consts (P = positive, N = negated). */
#define XFIX_P_0_298  v0.h[0]
#define XFIX_N_0_390  v0.h[1]
#define XFIX_P_0_541  v0.h[2]
#define XFIX_P_0_765  v0.h[3]
#define XFIX_N_0_899  v0.h[4]
#define XFIX_P_1_175  v0.h[5]
#define XFIX_P_1_501  v0.h[6]
#define XFIX_N_1_847  v0.h[7]
#define XFIX_N_1_961  v1.h[0]
#define XFIX_P_2_053  v1.h[1]
#define XFIX_N_2_562  v1.h[2]
#define XFIX_P_3_072  v1.h[3]
   1562 
   1563 asm_function jsimd_fdct_islow_neon
   1564 
   1565    DATA            .req x0
   1566    TMP             .req x9
   1567 
   1568    /* Load constants */
   1569    get_symbol_loc  TMP, Ljsimd_fdct_islow_neon_consts
   1570    ld1             {v0.8h, v1.8h}, [TMP]
   1571 
   1572    /* Save Neon registers */
   1573    sub             sp, sp, #64
   1574    mov             x10, sp
   1575    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
   1576    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32
   1577 
   1578    /* Load all DATA into Neon registers with the following allocation:
   1579     *       0 1 2 3 | 4 5 6 7
   1580     *      ---------+--------
   1581     *   0 | d16     | d17    | v16.8h
   1582     *   1 | d18     | d19    | v17.8h
   1583     *   2 | d20     | d21    | v18.8h
   1584     *   3 | d22     | d23    | v19.8h
   1585     *   4 | d24     | d25    | v20.8h
   1586     *   5 | d26     | d27    | v21.8h
   1587     *   6 | d28     | d29    | v22.8h
   1588     *   7 | d30     | d31    | v23.8h
   1589     */
   1590 
   1591    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
   1592    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
   1593    sub             DATA, DATA, #64
   1594 
   1595    /* Transpose */
   1596    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
   1597    /* 1-D FDCT */
   1598    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
   1599    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
   1600    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
   1601    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
   1602    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
   1603    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
   1604    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
   1605    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
   1606 
   1607    /* even part */
   1608 
   1609    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
   1610    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
   1611    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
   1612    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */
   1613 
   1614    add             v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
   1615    sub             v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */
   1616 
   1617    add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
   1618 
   1619    shl             v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
   1620    shl             v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */
   1621 
   1622    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
   1623    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
   1624    mov             v22.16b, v18.16b
   1625    mov             v25.16b, v24.16b
   1626 
   1627    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
   1628    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
   1629    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
   1630    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
   1631 
   1632    rshrn           v18.4h, v18.4s, #DESCALE_P1
   1633    rshrn           v22.4h, v22.4s, #DESCALE_P1
   1634    rshrn2          v18.8h, v24.4s, #DESCALE_P1  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
   1635    rshrn2          v22.8h, v25.4s, #DESCALE_P1  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
   1636 
   1637    /* Odd part */
   1638 
   1639    add             v8.8h, v28.8h, v31.8h        /* z1 = tmp4 + tmp7; */
   1640    add             v9.8h, v29.8h, v30.8h        /* z2 = tmp5 + tmp6; */
   1641    add             v10.8h, v28.8h, v30.8h       /* z3 = tmp4 + tmp6; */
   1642    add             v11.8h, v29.8h, v31.8h       /* z4 = tmp5 + tmp7; */
   1643    smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
   1644    smull2          v5.4s, v10.8h, XFIX_P_1_175
   1645    smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
   1646    smlal2          v5.4s, v11.8h, XFIX_P_1_175
   1647 
   1648    smull2          v24.4s, v28.8h, XFIX_P_0_298
   1649    smull2          v25.4s, v29.8h, XFIX_P_2_053
   1650    smull2          v26.4s, v30.8h, XFIX_P_3_072
   1651    smull2          v27.4s, v31.8h, XFIX_P_1_501
   1652    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
   1653    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
   1654    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
   1655    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
   1656 
   1657    smull2          v12.4s, v8.8h, XFIX_N_0_899
   1658    smull2          v13.4s, v9.8h, XFIX_N_2_562
   1659    smull2          v14.4s, v10.8h, XFIX_N_1_961
   1660    smull2          v15.4s, v11.8h, XFIX_N_0_390
   1661    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
   1662    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
   1663    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
   1664    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
   1665 
   1666    add             v10.4s, v10.4s, v4.4s  /* z3 += z5 */
   1667    add             v14.4s, v14.4s, v5.4s
   1668    add             v11.4s, v11.4s, v4.4s  /* z4 += z5 */
   1669    add             v15.4s, v15.4s, v5.4s
   1670 
   1671    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
   1672    add             v24.4s, v24.4s, v12.4s
   1673    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
   1674    add             v25.4s, v25.4s, v13.4s
   1675    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
   1676    add             v26.4s, v26.4s, v14.4s
   1677    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
   1678    add             v27.4s, v27.4s, v15.4s
   1679 
   1680    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
   1681    add             v24.4s, v24.4s, v14.4s
   1682    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
   1683    add             v25.4s, v25.4s, v15.4s
   1684    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
   1685    add             v26.4s, v26.4s, v13.4s
   1686    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
   1687    add             v27.4s, v27.4s, v12.4s
   1688 
   1689    rshrn           v23.4h, v28.4s, #DESCALE_P1
   1690    rshrn           v21.4h, v29.4s, #DESCALE_P1
   1691    rshrn           v19.4h, v30.4s, #DESCALE_P1
   1692    rshrn           v17.4h, v31.4s, #DESCALE_P1
   1693    rshrn2          v23.8h, v24.4s, #DESCALE_P1  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
   1694    rshrn2          v21.8h, v25.4s, #DESCALE_P1  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
   1695    rshrn2          v19.8h, v26.4s, #DESCALE_P1  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
   1696    rshrn2          v17.8h, v27.4s, #DESCALE_P1  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
   1697 
   1698    /* Transpose */
   1699    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
   1700 
   1701    /* 1-D FDCT */
   1702    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
   1703    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
   1704    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
   1705    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
   1706    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
   1707    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
   1708    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
   1709    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
   1710 
   1711    /* even part */
   1712    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
   1713    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
   1714    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
   1715    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */
   1716 
   1717    add             v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
   1718    sub             v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */
   1719 
   1720    add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
   1721 
   1722    srshr           v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS); */
   1723    srshr           v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS); */
   1724 
   1725    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
   1726    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
   1727    mov             v22.16b, v18.16b
   1728    mov             v25.16b, v24.16b
   1729 
   1730    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
   1731    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
   1732    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
   1733    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
   1734 
   1735    rshrn           v18.4h, v18.4s, #DESCALE_P2
   1736    rshrn           v22.4h, v22.4s, #DESCALE_P2
   1737    rshrn2          v18.8h, v24.4s, #DESCALE_P2  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
   1738    rshrn2          v22.8h, v25.4s, #DESCALE_P2  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
   1739 
   1740    /* Odd part */
   1741    add             v8.8h, v28.8h, v31.8h   /* z1 = tmp4 + tmp7; */
   1742    add             v9.8h, v29.8h, v30.8h   /* z2 = tmp5 + tmp6; */
   1743    add             v10.8h, v28.8h, v30.8h  /* z3 = tmp4 + tmp6; */
   1744    add             v11.8h, v29.8h, v31.8h  /* z4 = tmp5 + tmp7; */
   1745 
   1746    smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
   1747    smull2          v5.4s, v10.8h, XFIX_P_1_175
   1748    smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
   1749    smlal2          v5.4s, v11.8h, XFIX_P_1_175
   1750 
   1751    smull2          v24.4s, v28.8h, XFIX_P_0_298
   1752    smull2          v25.4s, v29.8h, XFIX_P_2_053
   1753    smull2          v26.4s, v30.8h, XFIX_P_3_072
   1754    smull2          v27.4s, v31.8h, XFIX_P_1_501
   1755    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
   1756    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
   1757    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
   1758    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
   1759 
   1760    smull2          v12.4s, v8.8h, XFIX_N_0_899
   1761    smull2          v13.4s, v9.8h, XFIX_N_2_562
   1762    smull2          v14.4s, v10.8h, XFIX_N_1_961
   1763    smull2          v15.4s, v11.8h, XFIX_N_0_390
   1764    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
   1765    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
   1766    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
   1767    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
   1768 
   1769    add             v10.4s, v10.4s, v4.4s
   1770    add             v14.4s, v14.4s, v5.4s
   1771    add             v11.4s, v11.4s, v4.4s
   1772    add             v15.4s, v15.4s, v5.4s
   1773 
   1774    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
   1775    add             v24.4s, v24.4s, v12.4s
   1776    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
   1777    add             v25.4s, v25.4s, v13.4s
   1778    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
   1779    add             v26.4s, v26.4s, v14.4s
   1780    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
   1781    add             v27.4s, v27.4s, v15.4s
   1782 
   1783    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
   1784    add             v24.4s, v24.4s, v14.4s
   1785    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
   1786    add             v25.4s, v25.4s, v15.4s
   1787    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
   1788    add             v26.4s, v26.4s, v13.4s
   1789    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
   1790    add             v27.4s, v27.4s, v12.4s
   1791 
   1792    rshrn           v23.4h, v28.4s, #DESCALE_P2
   1793    rshrn           v21.4h, v29.4s, #DESCALE_P2
   1794    rshrn           v19.4h, v30.4s, #DESCALE_P2
   1795    rshrn           v17.4h, v31.4s, #DESCALE_P2
   1796    rshrn2          v23.8h, v24.4s, #DESCALE_P2  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
   1797    rshrn2          v21.8h, v25.4s, #DESCALE_P2  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
   1798    rshrn2          v19.8h, v26.4s, #DESCALE_P2  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
   1799    rshrn2          v17.8h, v27.4s, #DESCALE_P2  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
   1800 
   1801    /* store results */
   1802    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
   1803    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
   1804 
   1805    /* Restore Neon registers */
   1806    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
   1807    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
   1808 
   1809    br              x30
   1810 
   1811    .unreq          DATA
   1812    .unreq          TMP
   1813 
   1814 #undef XFIX_P_0_298
   1815 #undef XFIX_N_0_390
   1816 #undef XFIX_P_0_541
   1817 #undef XFIX_P_0_765
   1818 #undef XFIX_N_0_899
   1819 #undef XFIX_P_1_175
   1820 #undef XFIX_P_1_501
   1821 #undef XFIX_N_1_847
   1822 #undef XFIX_N_1_961
   1823 #undef XFIX_P_2_053
   1824 #undef XFIX_N_2_562
   1825 #undef XFIX_P_3_072
   1826 
   1827 
   1828 /*****************************************************************************/
   1829 
   1830 /*
   1831 * GLOBAL(JOCTET *)
   1832 * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
   1833 *                             JCOEFPTR block, int last_dc_val,
   1834 *                             c_derived_tbl *dctbl, c_derived_tbl *actbl)
   1835 *
   1836 */
   1837 
   /* Register aliases for the Huffman encoder below (see the prototype
    * comment above): x1 arrives as the JOCTET *buffer argument; x6/x7 hold
    * the bit accumulator and its pending-bit count, kept live across the
    * whole function and spilled back to the working_state on exit. */
   BUFFER          .req x1    /* output byte pointer (arg 2) */
   PUT_BUFFER      .req x6    /* 64-bit bit accumulator */
   PUT_BITS        .req x7    /* number of valid bits in PUT_BUFFER */
   PUT_BITSw       .req w7    /* 32-bit view of PUT_BITS */
   1842 
/* Flush the most significant pending byte of PUT_BUFFER to the output.
 * PUT_BITS is reduced by 8 and the byte is stored at ++BUFFER.  If that
 * byte is 0xFF, an extra 0x00 stuff byte is appended (JPEG marker
 * escaping, so encoded data can never be mistaken for a marker).
 * Clobbers: x19/w19, condition flags.  Uses numeric local label 14. */
.macro emit_byte
   sub             PUT_BITS, PUT_BITS, #0x8   /* one byte fewer pending */
   lsr             x19, PUT_BUFFER, PUT_BITS  /* shift that byte down to bits 7:0 */
   uxtb            w19, w19                   /* isolate the byte */
   strb            w19, [BUFFER, #1]!         /* store with pre-increment */
   cmp             w19, #0xff
   b.ne            14f
   strb            wzr, [BUFFER, #1]!         /* 0xFF -> append 0x00 stuff byte */
14:
.endm
/* Append SIZE bits of CODE to the bottom of the PUT_BUFFER accumulator
 * and bump the pending-bit count.  CODE must have no bits set above
 * SIZE (no masking is performed here; callers pre-mask their codes).
 * The caller is responsible for flushing (checkbuf31/checkbuf47) so the
 * 64-bit accumulator cannot overflow. */
.macro put_bits CODE, SIZE
   lsl             PUT_BUFFER, PUT_BUFFER, \SIZE  /* make room at the bottom */
   add             PUT_BITS, PUT_BITS, \SIZE      /* track pending bit count */
   orr             PUT_BUFFER, PUT_BUFFER, \CODE  /* insert the code bits */
.endm
/* If 32 or more bits are pending, flush four bytes so that a following
 * put_bits of up to 31 bits cannot overflow the 64-bit accumulator.
 * Clobbers whatever emit_byte clobbers (x19, flags).  Local label 31. */
.macro checkbuf31
   cmp             PUT_BITS, #0x20
   b.lt            31f
   emit_byte
   emit_byte
   emit_byte
   emit_byte
31:
.endm
/* If 48 or more bits are pending, flush six bytes so that a following
 * put_bits of up to 47 bits (e.g. two back-to-back codes) cannot
 * overflow the 64-bit accumulator.  Clobbers x19 and flags via
 * emit_byte.  Local label 47. */
.macro checkbuf47
   cmp             PUT_BITS, #0x30
   b.lt            47f
   emit_byte
   emit_byte
   emit_byte
   emit_byte
   emit_byte
   emit_byte
47:
.endm
   1878 
   1879 .macro generate_jsimd_huff_encode_one_block fast_tbl
   1880 
   1881 .if \fast_tbl == 1
   1882 asm_function jsimd_huff_encode_one_block_neon
   1883 .else
   1884 asm_function jsimd_huff_encode_one_block_neon_slowtbl
   1885 .endif
   1886    sub             sp, sp, 272
   1887    sub             BUFFER, BUFFER, #0x1    /* BUFFER=buffer-- */
   1888    /* Save Arm registers */
   1889    stp             x19, x20, [sp]
   1890    get_symbol_loc  x15, Ljsimd_huff_encode_one_block_neon_consts
   1891    ldr             PUT_BUFFER, [x0, #0x10]
   1892    ldr             PUT_BITSw, [x0, #0x18]
   1893    ldrsh           w12, [x2]               /* load DC coeff in w12 */
   1894    /* prepare data */
   1895 .if \fast_tbl == 1
   1896    ld1             {v23.16b}, [x15], #16
   1897    ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64
   1898    ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64
   1899    ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64
   1900    ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64
   1901    ld1             {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64
   1902    sub             w12, w12, w3      /* last_dc_val, not used afterwards */
   1903    /* ZigZag 8x8 */
   1904    tbl             v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b
   1905    tbl             v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b
   1906    tbl             v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b
   1907    tbl             v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b
   1908    tbl             v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b
   1909    tbl             v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b
   1910    tbl             v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b
   1911    tbl             v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b
   1912    ins             v0.h[0], w12
   1913    tbx             v1.16b, {v28.16b}, v16.16b
   1914    tbx             v2.16b, {v29.16b, v30.16b}, v17.16b
   1915    tbx             v5.16b, {v29.16b, v30.16b}, v18.16b
   1916    tbx             v6.16b, {v31.16b}, v19.16b
   1917 .else
   1918      add             x13, x2, #0x22
   1919      sub             w12, w12, w3    /* last_dc_val, not used afterwards */
   1920    ld1             {v23.16b}, [x15]
   1921      add             x14, x2, #0x18
   1922      add             x3, x2, #0x36
   1923    ins             v0.h[0], w12
   1924      add             x9, x2, #0x2
   1925    ld1             {v1.h}[0], [x13]
   1926      add             x15, x2, #0x30
   1927    ld1             {v2.h}[0], [x14]
   1928      add             x19, x2, #0x26
   1929    ld1             {v3.h}[0], [x3]
   1930      add             x20, x2, #0x28
   1931    ld1             {v0.h}[1], [x9]
   1932      add             x12, x2, #0x10
   1933    ld1             {v1.h}[1], [x15]
   1934      add             x13, x2, #0x40
   1935    ld1             {v2.h}[1], [x19]
   1936      add             x14, x2, #0x34
   1937    ld1             {v3.h}[1], [x20]
   1938      add             x3, x2, #0x1a
   1939    ld1             {v0.h}[2], [x12]
   1940      add             x9, x2, #0x20
   1941    ld1             {v1.h}[2], [x13]
   1942      add             x15, x2, #0x32
   1943    ld1             {v2.h}[2], [x14]
   1944      add             x19, x2, #0x42
   1945    ld1             {v3.h}[2], [x3]
   1946      add             x20, x2, #0xc
   1947    ld1             {v0.h}[3], [x9]
   1948      add             x12, x2, #0x12
   1949    ld1             {v1.h}[3], [x15]
   1950      add             x13, x2, #0x24
   1951    ld1             {v2.h}[3], [x19]
   1952      add             x14, x2, #0x50
   1953    ld1             {v3.h}[3], [x20]
   1954      add             x3, x2, #0xe
   1955    ld1             {v0.h}[4], [x12]
   1956      add             x9, x2, #0x4
   1957    ld1             {v1.h}[4], [x13]
   1958      add             x15, x2, #0x16
   1959    ld1             {v2.h}[4], [x14]
   1960      add             x19, x2, #0x60
   1961    ld1             {v3.h}[4], [x3]
   1962      add             x20, x2, #0x1c
   1963    ld1             {v0.h}[5], [x9]
   1964      add             x12, x2, #0x6
   1965    ld1             {v1.h}[5], [x15]
   1966      add             x13, x2, #0x8
   1967    ld1             {v2.h}[5], [x19]
   1968      add             x14, x2, #0x52
   1969    ld1             {v3.h}[5], [x20]
   1970      add             x3, x2, #0x2a
   1971    ld1             {v0.h}[6], [x12]
   1972      add             x9, x2, #0x14
   1973    ld1             {v1.h}[6], [x13]
   1974      add             x15, x2, #0xa
   1975    ld1             {v2.h}[6], [x14]
   1976      add             x19, x2, #0x44
   1977    ld1             {v3.h}[6], [x3]
   1978      add             x20, x2, #0x38
   1979    ld1             {v0.h}[7], [x9]
   1980      add             x12, x2, #0x46
   1981    ld1             {v1.h}[7], [x15]
   1982      add             x13, x2, #0x3a
   1983    ld1             {v2.h}[7], [x19]
   1984      add             x14, x2, #0x74
   1985    ld1             {v3.h}[7], [x20]
   1986      add             x3, x2, #0x6a
   1987    ld1             {v4.h}[0], [x12]
   1988      add             x9, x2, #0x54
   1989    ld1             {v5.h}[0], [x13]
   1990      add             x15, x2, #0x2c
   1991    ld1             {v6.h}[0], [x14]
   1992      add             x19, x2, #0x76
   1993    ld1             {v7.h}[0], [x3]
   1994      add             x20, x2, #0x78
   1995    ld1             {v4.h}[1], [x9]
   1996      add             x12, x2, #0x62
   1997    ld1             {v5.h}[1], [x15]
   1998      add             x13, x2, #0x1e
   1999    ld1             {v6.h}[1], [x19]
   2000      add             x14, x2, #0x68
   2001    ld1             {v7.h}[1], [x20]
   2002      add             x3, x2, #0x7a
   2003    ld1             {v4.h}[2], [x12]
   2004      add             x9, x2, #0x70
   2005    ld1             {v5.h}[2], [x13]
   2006      add             x15, x2, #0x2e
   2007    ld1             {v6.h}[2], [x14]
   2008      add             x19, x2, #0x5a
   2009    ld1             {v7.h}[2], [x3]
   2010      add             x20, x2, #0x6c
   2011    ld1             {v4.h}[3], [x9]
   2012      add             x12, x2, #0x72
   2013    ld1             {v5.h}[3], [x15]
   2014      add             x13, x2, #0x3c
   2015    ld1             {v6.h}[3], [x19]
   2016      add             x14, x2, #0x4c
   2017    ld1             {v7.h}[3], [x20]
   2018      add             x3, x2, #0x5e
   2019    ld1             {v4.h}[4], [x12]
   2020      add             x9, x2, #0x64
   2021    ld1             {v5.h}[4], [x13]
   2022      add             x15, x2, #0x4a
   2023    ld1             {v6.h}[4], [x14]
   2024      add             x19, x2, #0x3e
   2025    ld1             {v7.h}[4], [x3]
   2026      add             x20, x2, #0x6e
   2027    ld1             {v4.h}[5], [x9]
   2028      add             x12, x2, #0x56
   2029    ld1             {v5.h}[5], [x15]
   2030      add             x13, x2, #0x58
   2031    ld1             {v6.h}[5], [x19]
   2032      add             x14, x2, #0x4e
   2033    ld1             {v7.h}[5], [x20]
   2034      add             x3, x2, #0x7c
   2035    ld1             {v4.h}[6], [x12]
   2036      add             x9, x2, #0x48
   2037    ld1             {v5.h}[6], [x13]
   2038      add             x15, x2, #0x66
   2039    ld1             {v6.h}[6], [x14]
   2040      add             x19, x2, #0x5c
   2041    ld1             {v7.h}[6], [x3]
   2042      add             x20, x2, #0x7e
   2043    ld1             {v4.h}[7], [x9]
   2044    ld1             {v5.h}[7], [x15]
   2045    ld1             {v6.h}[7], [x19]
   2046    ld1             {v7.h}[7], [x20]
   2047 .endif
   2048    cmlt            v24.8h, v0.8h, #0
   2049    cmlt            v25.8h, v1.8h, #0
   2050    cmlt            v26.8h, v2.8h, #0
   2051    cmlt            v27.8h, v3.8h, #0
   2052    cmlt            v28.8h, v4.8h, #0
   2053    cmlt            v29.8h, v5.8h, #0
   2054    cmlt            v30.8h, v6.8h, #0
   2055    cmlt            v31.8h, v7.8h, #0
   2056    abs             v0.8h, v0.8h
   2057    abs             v1.8h, v1.8h
   2058    abs             v2.8h, v2.8h
   2059    abs             v3.8h, v3.8h
   2060    abs             v4.8h, v4.8h
   2061    abs             v5.8h, v5.8h
   2062    abs             v6.8h, v6.8h
   2063    abs             v7.8h, v7.8h
   2064    eor             v24.16b, v24.16b, v0.16b
   2065    eor             v25.16b, v25.16b, v1.16b
   2066    eor             v26.16b, v26.16b, v2.16b
   2067    eor             v27.16b, v27.16b, v3.16b
   2068    eor             v28.16b, v28.16b, v4.16b
   2069    eor             v29.16b, v29.16b, v5.16b
   2070    eor             v30.16b, v30.16b, v6.16b
   2071    eor             v31.16b, v31.16b, v7.16b
   2072    cmeq            v16.8h, v0.8h, #0
   2073    cmeq            v17.8h, v1.8h, #0
   2074    cmeq            v18.8h, v2.8h, #0
   2075    cmeq            v19.8h, v3.8h, #0
   2076    cmeq            v20.8h, v4.8h, #0
   2077    cmeq            v21.8h, v5.8h, #0
   2078    cmeq            v22.8h, v6.8h, #0
   2079    xtn             v16.8b, v16.8h
   2080    xtn             v18.8b, v18.8h
   2081    xtn             v20.8b, v20.8h
   2082    xtn             v22.8b, v22.8h
   2083      umov            w14, v0.h[0]
   2084    xtn2            v16.16b, v17.8h
   2085      umov            w13, v24.h[0]
   2086    xtn2            v18.16b, v19.8h
   2087      clz             w14, w14
   2088    xtn2            v20.16b, v21.8h
   2089      lsl             w13, w13, w14
   2090    cmeq            v17.8h, v7.8h, #0
   2091      sub             w12, w14, #32
   2092    xtn2            v22.16b, v17.8h
   2093      lsr             w13, w13, w14
   2094    and             v16.16b, v16.16b, v23.16b
   2095      neg             w12, w12
   2096    and             v18.16b, v18.16b, v23.16b
   2097      add             x3, x4, #0x400           /* r1 = dctbl->ehufsi */
   2098    and             v20.16b, v20.16b, v23.16b
   2099      add             x15, sp, #0x90           /* x15 = t2 */
   2100    and             v22.16b, v22.16b, v23.16b
   2101      ldr             w10, [x4, x12, lsl #2]
   2102    addp            v16.16b, v16.16b, v18.16b
   2103      ldrb            w11, [x3, x12]
   2104    addp            v20.16b, v20.16b, v22.16b
   2105      checkbuf47
   2106    addp            v16.16b, v16.16b, v20.16b
   2107      put_bits        x10, x11
   2108    addp            v16.16b, v16.16b, v18.16b
   2109      checkbuf47
   2110    umov            x9, v16.D[0]
   2111      put_bits        x13, x12
   2112    cnt             v17.8b, v16.8b
   2113      mvn             x9, x9
   2114    addv            B18, v17.8b
   2115      add             x4, x5, #0x400   /* x4 = actbl->ehufsi */
   2116    umov            w12, v18.b[0]
   2117      lsr             x9, x9, #0x1     /* clear AC coeff */
   2118    ldr             w13, [x5, #0x3c0]  /* x13 = actbl->ehufco[0xf0] */
   2119    rbit            x9, x9             /* x9 = index0 */
   2120    ldrb            w14, [x4, #0xf0]   /* x14 = actbl->ehufsi[0xf0] */
   2121    cmp             w12, #(64-8)
   2122    add             x11, sp, #16
   2123    b.lt            4f
   2124    cbz             x9, 6f
   2125    st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
   2126    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
   2127    st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
   2128    st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
   2129 1:
   2130    clz             x2, x9
   2131    add             x15, x15, x2, lsl #1
   2132    lsl             x9, x9, x2
   2133    ldrh            w20, [x15, #-126]
   2134 2:
   2135    cmp             x2, #0x10
   2136    b.lt            3f
   2137    sub             x2, x2, #0x10
   2138    checkbuf47
   2139    put_bits        x13, x14
   2140    b               2b
   2141 3:
   2142    clz             w20, w20
   2143    ldrh            w3, [x15, #2]!
   2144    sub             w11, w20, #32
   2145    lsl             w3, w3, w20
   2146    neg             w11, w11
   2147    lsr             w3, w3, w20
   2148    add             x2, x11, x2, lsl #4
   2149    lsl             x9, x9, #0x1
   2150    ldr             w12, [x5, x2, lsl #2]
   2151    ldrb            w10, [x4, x2]
   2152    checkbuf31
   2153    put_bits        x12, x10
   2154    put_bits        x3, x11
   2155    cbnz            x9, 1b
   2156    b               6f
   2157 4:
   2158    movi            v21.8h, #0x0010
   2159    clz             v0.8h, v0.8h
   2160    clz             v1.8h, v1.8h
   2161    clz             v2.8h, v2.8h
   2162    clz             v3.8h, v3.8h
   2163    clz             v4.8h, v4.8h
   2164    clz             v5.8h, v5.8h
   2165    clz             v6.8h, v6.8h
   2166    clz             v7.8h, v7.8h
   2167    ushl            v24.8h, v24.8h, v0.8h
   2168    ushl            v25.8h, v25.8h, v1.8h
   2169    ushl            v26.8h, v26.8h, v2.8h
   2170    ushl            v27.8h, v27.8h, v3.8h
   2171    ushl            v28.8h, v28.8h, v4.8h
   2172    ushl            v29.8h, v29.8h, v5.8h
   2173    ushl            v30.8h, v30.8h, v6.8h
   2174    ushl            v31.8h, v31.8h, v7.8h
   2175    neg             v0.8h, v0.8h
   2176    neg             v1.8h, v1.8h
   2177    neg             v2.8h, v2.8h
   2178    neg             v3.8h, v3.8h
   2179    neg             v4.8h, v4.8h
   2180    neg             v5.8h, v5.8h
   2181    neg             v6.8h, v6.8h
   2182    neg             v7.8h, v7.8h
   2183    ushl            v24.8h, v24.8h, v0.8h
   2184    ushl            v25.8h, v25.8h, v1.8h
   2185    ushl            v26.8h, v26.8h, v2.8h
   2186    ushl            v27.8h, v27.8h, v3.8h
   2187    ushl            v28.8h, v28.8h, v4.8h
   2188    ushl            v29.8h, v29.8h, v5.8h
   2189    ushl            v30.8h, v30.8h, v6.8h
   2190    ushl            v31.8h, v31.8h, v7.8h
   2191    add             v0.8h, v21.8h, v0.8h
   2192    add             v1.8h, v21.8h, v1.8h
   2193    add             v2.8h, v21.8h, v2.8h
   2194    add             v3.8h, v21.8h, v3.8h
   2195    add             v4.8h, v21.8h, v4.8h
   2196    add             v5.8h, v21.8h, v5.8h
   2197    add             v6.8h, v21.8h, v6.8h
   2198    add             v7.8h, v21.8h, v7.8h
   2199    st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
   2200    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
   2201    st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
   2202    st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
   2203 1:
   2204    clz             x2, x9
   2205    add             x15, x15, x2, lsl #1
   2206    lsl             x9, x9, x2
   2207    ldrh            w11, [x15, #-126]
   2208 2:
   2209    cmp             x2, #0x10
   2210    b.lt            3f
   2211    sub             x2, x2, #0x10
   2212    checkbuf47
   2213    put_bits        x13, x14
   2214    b               2b
   2215 3:
   2216    ldrh            w3, [x15, #2]!
   2217    add             x2, x11, x2, lsl #4
   2218    lsl             x9, x9, #0x1
   2219    ldr             w12, [x5, x2, lsl #2]
   2220    ldrb            w10, [x4, x2]
   2221    checkbuf31
   2222    put_bits        x12, x10
   2223    put_bits        x3, x11
   2224    cbnz            x9, 1b
   2225 6:
   2226    add             x13, sp, #0x10e
   2227    cmp             x15, x13
   2228    b.hs            1f
   2229    ldr             w12, [x5]
   2230    ldrb            w14, [x4]
   2231    checkbuf47
   2232    put_bits        x12, x14
   2233 1:
   2234    str             PUT_BUFFER, [x0, #0x10]
   2235    str             PUT_BITSw, [x0, #0x18]
   2236    ldp             x19, x20, [sp], 16
   2237    add             x0, BUFFER, #0x1
   2238    add             sp, sp, 256
   2239    br              x30
   2240 
   2241 .endm
   2242 
/* Instantiate both variants: fast (Neon tbl-based zigzag) and slow
 * (scalar per-lane zigzag) Huffman block encoders. */
generate_jsimd_huff_encode_one_block 1
generate_jsimd_huff_encode_one_block 0

   /* Release the register aliases so later code can reuse the names. */
   .unreq          BUFFER
   .unreq          PUT_BUFFER
   .unreq          PUT_BITS
   .unreq          PUT_BITSw

/* Discard the helper macros; they are only meaningful with the aliases
 * above in scope. */
.purgem emit_byte
.purgem put_bits
.purgem checkbuf31
.purgem checkbuf47