tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

fdctdsp_neon.S (17761B)


      1 /*
      2 * Armv8 Neon optimizations for libjpeg-turbo
      3 *
      4 * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
      5 *                          All Rights Reserved.
      6 * Author:  Siarhei Siamashka <siarhei.siamashka@nokia.com>
      7 * Copyright (C) 2013-2014, Linaro Limited.  All Rights Reserved.
      8 * Author:  Ragesh Radhakrishnan <ragesh.r@linaro.org>
      9 * Copyright (C) 2014-2016, 2020, D. R. Commander.  All Rights Reserved.
     10 * Copyright (C) 2015-2016, 2018, Matthieu Darbois.  All Rights Reserved.
     11 * Copyright (C) 2016, Siarhei Siamashka.  All Rights Reserved.
     12 *
     13 * This software is provided 'as-is', without any express or implied
     14 * warranty.  In no event will the authors be held liable for any damages
     15 * arising from the use of this software.
     16 *
     17 * Permission is granted to anyone to use this software for any purpose,
     18 * including commercial applications, and to alter it and redistribute it
     19 * freely, subject to the following restrictions:
     20 *
     21 * 1. The origin of this software must not be misrepresented; you must not
     22 *    claim that you wrote the original software. If you use this software
     23 *    in a product, an acknowledgment in the product documentation would be
     24 *    appreciated but is not required.
     25 * 2. Altered source versions must be plainly marked as such, and must not be
     26 *    misrepresented as being the original software.
     27 * 3. This notice may not be removed or altered from any source distribution.
     28 */
     29 
     30 #include "libavutil/aarch64/asm.S"
     31 #include "neon.S"
     32 
     33 // #define EIGHT_BIT_SAMPLES
     34 
     35 /* Constants for ff_fdct_neon(): FIX(x) is x * 2^13 rounded to the nearest integer (13 = CONST_BITS) */
     36 
     37 #define F_0_298   2446  /* FIX(0.298631336) */
     38 #define F_0_390   3196  /* FIX(0.390180644) */
     39 #define F_0_541   4433  /* FIX(0.541196100) */
     40 #define F_0_765   6270  /* FIX(0.765366865) */
     41 #define F_0_899   7373  /* FIX(0.899976223) */
     42 #define F_1_175   9633  /* FIX(1.175875602) */
     43 #define F_1_501  12299  /* FIX(1.501321110) */
     44 #define F_1_847  15137  /* FIX(1.847759065) */
     45 #define F_1_961  16069  /* FIX(1.961570560) */
     46 #define F_2_053  16819  /* FIX(2.053119869) */
     47 #define F_2_562  20995  /* FIX(2.562915447) */
     48 #define F_3_072  25172  /* FIX(3.072711026) */
     49 /* Table of 16 halfwords (32 bytes); ff_fdct_neon loads the first 8 into v0.8h and the last 8 into v1.8h. */
     50 const jsimd_fdct_islow_neon_consts, align=4
     51        .short F_0_298      /* v0.h[0] */
     52        .short -F_0_390     /* v0.h[1], stored negated */
     53        .short F_0_541      /* v0.h[2] */
     54        .short F_0_765      /* v0.h[3] */
     55        .short - F_0_899    /* v0.h[4], stored negated */
     56        .short F_1_175      /* v0.h[5] */
     57        .short F_1_501      /* v0.h[6] */
     58        .short - F_1_847    /* v0.h[7], stored negated */
     59        .short - F_1_961    /* v1.h[0], stored negated */
     60        .short F_2_053      /* v1.h[1] */
     61        .short - F_2_562    /* v1.h[2], stored negated */
     62        .short F_3_072      /* v1.h[3] */
     63        .short 0          /* padding: v1.h[4..7], so the table is two full 128-bit vectors */
     64        .short 0
     65        .short 0
     66        .short 0
     67 endconst
     68 
     69 #undef F_0_298   /* the raw F_* values are only needed to emit the table above */
     70 #undef F_0_390
     71 #undef F_0_541
     72 #undef F_0_765
     73 #undef F_0_899
     74 #undef F_1_175
     75 #undef F_1_501
     76 #undef F_1_847
     77 #undef F_1_961
     78 #undef F_2_053
     79 #undef F_2_562
     80 #undef F_3_072
     81 
     82 /*****************************************************************************/
     83 
     84 /*
     85 * jsimd_fdct_islow_neon
     86 *
     87 * This file contains a slower but more accurate integer implementation of the
     88 * forward DCT (Discrete Cosine Transform). The following code is based
     89 * directly on the IJG's original jfdctint.c; see jfdctint.c for
     90 * more details.
     91 */
     92 
     93 #define CONST_BITS  13   /* fractional bits in the fixed-point multipliers (they are scaled by 2^13) */
     94 #ifdef EIGHT_BIT_SAMPLES   /* only a commented-out #define of this symbol is visible in this file */
     95 #define PASS1_BITS  2   /* extra fractional bits kept in the pass 1 results and removed again in pass 2 */
     96 #else
     97 #define PASS1_BITS  1   /* lose a little precision to avoid overflow */
     98 #endif
     99 /* Rounding right-shift amounts used for the multiplied terms in pass 1 (P1) and pass 2 (P2). */
    100 #define DESCALE_P1  (CONST_BITS - PASS1_BITS)
    101 #define DESCALE_P2  (CONST_BITS + PASS1_BITS)
    102 
    103 #define XFIX_P_0_298  v0.h[0]   /* +FIX(0.298631336); P_/N_ = constant stored positive/negated in the table */
    104 #define XFIX_N_0_390  v0.h[1]   /* -FIX(0.390180644) */
    105 #define XFIX_P_0_541  v0.h[2]   /* +FIX(0.541196100) */
    106 #define XFIX_P_0_765  v0.h[3]   /* +FIX(0.765366865) */
    107 #define XFIX_N_0_899  v0.h[4]   /* -FIX(0.899976223) */
    108 #define XFIX_P_1_175  v0.h[5]   /* +FIX(1.175875602) */
    109 #define XFIX_P_1_501  v0.h[6]   /* +FIX(1.501321110) */
    110 #define XFIX_N_1_847  v0.h[7]   /* -FIX(1.847759065) */
    111 #define XFIX_N_1_961  v1.h[0]   /* -FIX(1.961570560) */
    112 #define XFIX_P_2_053  v1.h[1]   /* +FIX(2.053119869) */
    113 #define XFIX_N_2_562  v1.h[2]   /* -FIX(2.562915447) */
    114 #define XFIX_P_3_072  v1.h[3]   /* +FIX(3.072711026) */
    115 
    116 function ff_fdct_neon, export=1
    117        /* In-place 8x8 forward DCT: loads 64 halfwords from DATA (x0), runs two 1-D passes, stores them back. */
    118        DATA            .req x0
    119        TMP             .req x9
    120        /* TMP is scratch for the table address. The visible code uses only x0, x9, v0-v7 and v16-v31, so nothing is saved. */
    121        /* Load constants: v0.h[0..7] and v1.h[0..3] hold the multipliers named by the XFIX_* macros */
    122        movrel          TMP, jsimd_fdct_islow_neon_consts
    123        ld1             {v0.8h, v1.8h}, [TMP]
    124 
    125        /* Load all DATA into Neon registers with the following allocation
    126         * (each row is eight 16-bit lanes of one 128-bit register):
    127         *       0 1 2 3    | 4 5 6 7
    128         *   0 | v16.h[0-3] | v16.h[4-7] | v16.8h
    129         *   1 | v17.h[0-3] | v17.h[4-7] | v17.8h
    130         *   2 | v18.h[0-3] | v18.h[4-7] | v18.8h
    131         *   3 | v19.h[0-3] | v19.h[4-7] | v19.8h
    132         *   4 | v20.h[0-3] | v20.h[4-7] | v20.8h
    133         *   5 | v21.h[0-3] | v21.h[4-7] | v21.8h
    134         *   6 | v22.h[0-3] | v22.h[4-7] | v22.8h
    135         *   7 | v23.h[0-3] | v23.h[4-7] | v23.8h
    136         */
    137 
    138        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64   /* rows 0-3; DATA += 64 */
    139        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]       /* rows 4-7 */
    140        sub             DATA, DATA, #64   /* rewind DATA to the block start for the final store */
    141 
    142        /* Transpose (macro from an included file; v31 and v2 are presumably its scratch registers) */
    143        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v31, v2
    144 
    145        /* Pass 1: 1-D FDCT; v16..v23 act as dataptr[0..7], outputs keep PASS1_BITS extra fractional bits */
    146        add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
    147        sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
    148        add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
    149        sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
    150        add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
    151        sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
    152        add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
    153        sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
    154 
    155        /* Even part */
    156        add             v4.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
    157        sub             v5.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
    158        add             v6.8h, v25.8h, v26.8h   /* tmp11 = tmp1 + tmp2; */
    159        sub             v7.8h, v25.8h, v26.8h   /* tmp12 = tmp1 - tmp2; */
    160 
    161        add             v16.8h, v4.8h, v6.8h   /* tmp10 + tmp11 */
    162        sub             v20.8h, v4.8h, v6.8h   /* tmp10 - tmp11 */
    163 
    164        add             v18.8h, v7.8h, v5.8h   /* tmp12 + tmp13 */
    165 
    166        shl             v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
    167        shl             v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */
    168        /* 16x16->32-bit products are kept as two registers: "lo" = lanes 0-3 (smull), "hi" = lanes 4-7 (smull2) */
    169        smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    170        smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    171        mov             v22.16b, v18.16b              /* copy z1 lo: second accumulator, becomes dataptr[6] */
    172        mov             v25.16b, v24.16b              /* copy z1 hi */
    173 
    174        smlal           v18.4s, v5.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    175        smlal2          v24.4s, v5.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    176        smlal           v22.4s, v7.4h, XFIX_N_1_847   /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
    177        smlal2          v25.4s, v7.8h, XFIX_N_1_847   /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
    178        /* rshrn/rshrn2: rounding shift right, narrow to 16 bits, fill the low/high half of the destination */
    179        rshrn           v18.4h, v18.4s, #DESCALE_P1  /* dataptr[2], lanes 0-3 */
    180        rshrn           v22.4h, v22.4s, #DESCALE_P1  /* dataptr[6], lanes 0-3 */
    181        rshrn2          v18.8h, v24.4s, #DESCALE_P1  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
    182        rshrn2          v22.8h, v25.4s, #DESCALE_P1  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
    183 
    184        /* Odd part */
    185        add             v2.8h, v28.8h, v31.8h        /* z1 = tmp4 + tmp7; */
    186        add             v3.8h, v29.8h, v30.8h        /* z2 = tmp5 + tmp6; */
    187        add             v6.8h, v28.8h, v30.8h        /* z3 = tmp4 + tmp6; */
    188        add             v7.8h, v29.8h, v31.8h        /* z4 = tmp5 + tmp7; */
    189        smull           v4.4s, v6.4h, XFIX_P_1_175   /* z5 lo = z3 lo * XFIX_P_1_175 */
    190        smull2          v5.4s, v6.8h, XFIX_P_1_175   /* z5 hi = z3 hi * XFIX_P_1_175 */
    191        smlal           v4.4s, v7.4h, XFIX_P_1_175   /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
    192        smlal2          v5.4s, v7.8h, XFIX_P_1_175   /* z5 hi += z4 hi * XFIX_P_1_175 */
    193 
    194        smull2          v24.4s, v28.8h, XFIX_P_0_298  /* tmp4 hi */
    195        smull2          v25.4s, v29.8h, XFIX_P_2_053  /* tmp5 hi */
    196        smull2          v26.4s, v30.8h, XFIX_P_3_072  /* tmp6 hi */
    197        smull2          v27.4s, v31.8h, XFIX_P_1_501  /* tmp7 hi */
    198        smull           v23.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
    199        smull           v21.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
    200        smull           v19.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
    201        smull           v17.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
    202 
    203        smull2          v28.4s, v2.8h, XFIX_N_0_899   /* z1 hi */
    204        smull2          v29.4s, v3.8h, XFIX_N_2_562   /* z2 hi */
    205        smull2          v30.4s, v6.8h, XFIX_N_1_961   /* z3 hi */
    206        smull2          v31.4s, v7.8h, XFIX_N_0_390   /* z4 hi */
    207        smull           v2.4s, v2.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
    208        smull           v3.4s, v3.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
    209        smull           v6.4s, v6.4h, XFIX_N_1_961    /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
    210        smull           v7.4s, v7.4h, XFIX_N_0_390    /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
    211 
    212        add             v6.4s, v6.4s, v4.4s    /* z3 += z5 */
    213        add             v30.4s, v30.4s, v5.4s  /* z3 hi += z5 hi */
    214        add             v7.4s, v7.4s, v4.4s    /* z4 += z5 */
    215        add             v31.4s, v31.4s, v5.4s  /* z4 hi += z5 hi */
    216 
    217        add             v23.4s, v23.4s, v2.4s   /* tmp4 += z1 */
    218        add             v24.4s, v24.4s, v28.4s  /* (hi half) */
    219        add             v21.4s, v21.4s, v3.4s   /* tmp5 += z2 */
    220        add             v25.4s, v25.4s, v29.4s  /* (hi half) */
    221        add             v19.4s, v19.4s, v6.4s   /* tmp6 += z3 */
    222        add             v26.4s, v26.4s, v30.4s  /* (hi half) */
    223        add             v17.4s, v17.4s, v7.4s   /* tmp7 += z4 */
    224        add             v27.4s, v27.4s, v31.4s  /* (hi half) */
    225 
    226        add             v23.4s, v23.4s, v6.4s   /* tmp4 += z3 */
    227        add             v24.4s, v24.4s, v30.4s  /* (hi half) */
    228        add             v21.4s, v21.4s, v7.4s   /* tmp5 += z4 */
    229        add             v25.4s, v25.4s, v31.4s  /* (hi half) */
    230        add             v19.4s, v19.4s, v3.4s   /* tmp6 += z2 */
    231        add             v26.4s, v26.4s, v29.4s  /* (hi half) */
    232        add             v17.4s, v17.4s, v2.4s   /* tmp7 += z1 */
    233        add             v27.4s, v27.4s, v28.4s  /* (hi half) */
    234 
    235        rshrn           v23.4h, v23.4s, #DESCALE_P1  /* dataptr[7], lanes 0-3 */
    236        rshrn           v21.4h, v21.4s, #DESCALE_P1  /* dataptr[5], lanes 0-3 */
    237        rshrn           v19.4h, v19.4s, #DESCALE_P1  /* dataptr[3], lanes 0-3 */
    238        rshrn           v17.4h, v17.4s, #DESCALE_P1  /* dataptr[1], lanes 0-3 */
    239        rshrn2          v23.8h, v24.4s, #DESCALE_P1  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
    240        rshrn2          v21.8h, v25.4s, #DESCALE_P1  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
    241        rshrn2          v19.8h, v26.4s, #DESCALE_P1  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
    242        rshrn2          v17.8h, v27.4s, #DESCALE_P1  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
    243 
    244        /* Transpose again so that pass 2 works along the other dimension of the block */
    245        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v31, v2
    246 
    247        /* Pass 2: 1-D FDCT; same arithmetic as pass 1, but the descaling removes the PASS1_BITS scaling */
    248        add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
    249        sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
    250        add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
    251        sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
    252        add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
    253        sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
    254        add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
    255        sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
    256 
    257        /* Even part */
    258        add             v4.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
    259        sub             v5.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
    260        add             v6.8h, v25.8h, v26.8h   /* tmp11 = tmp1 + tmp2; */
    261        sub             v7.8h, v25.8h, v26.8h   /* tmp12 = tmp1 - tmp2; */
    262 
    263        add             v16.8h, v4.8h, v6.8h   /* tmp10 + tmp11 */
    264        sub             v20.8h, v4.8h, v6.8h   /* tmp10 - tmp11 */
    265 
    266        add             v18.8h, v7.8h, v5.8h   /* tmp12 + tmp13 */
    267        /* srshr: signed rounding shift right (pass 1 used a left shift at this point) */
    268        srshr           v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS); */
    269        srshr           v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS); */
    270 
    271        smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    272        smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    273        mov             v22.16b, v18.16b              /* copy z1 lo: second accumulator, becomes dataptr[6] */
    274        mov             v25.16b, v24.16b              /* copy z1 hi */
    275 
    276        smlal           v18.4s, v5.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    277        smlal2          v24.4s, v5.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    278        smlal           v22.4s, v7.4h, XFIX_N_1_847   /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
    279        smlal2          v25.4s, v7.8h, XFIX_N_1_847   /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
    280 
    281        rshrn           v18.4h, v18.4s, #DESCALE_P2  /* dataptr[2], lanes 0-3 */
    282        rshrn           v22.4h, v22.4s, #DESCALE_P2  /* dataptr[6], lanes 0-3 */
    283        rshrn2          v18.8h, v24.4s, #DESCALE_P2  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS+PASS1_BITS); */
    284        rshrn2          v22.8h, v25.4s, #DESCALE_P2  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS+PASS1_BITS); */
    285 
    286        /* Odd part */
    287        add             v2.8h, v28.8h, v31.8h   /* z1 = tmp4 + tmp7; */
    288        add             v3.8h, v29.8h, v30.8h   /* z2 = tmp5 + tmp6; */
    289        add             v6.8h, v28.8h, v30.8h   /* z3 = tmp4 + tmp6; */
    290        add             v7.8h, v29.8h, v31.8h   /* z4 = tmp5 + tmp7; */
    291 
    292        smull           v4.4s, v6.4h, XFIX_P_1_175   /* z5 lo = z3 lo * XFIX_P_1_175 */
    293        smull2          v5.4s, v6.8h, XFIX_P_1_175   /* z5 hi = z3 hi * XFIX_P_1_175 */
    294        smlal           v4.4s, v7.4h, XFIX_P_1_175   /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
    295        smlal2          v5.4s, v7.8h, XFIX_P_1_175   /* z5 hi += z4 hi * XFIX_P_1_175 */
    296 
    297        smull2          v24.4s, v28.8h, XFIX_P_0_298  /* tmp4 hi */
    298        smull2          v25.4s, v29.8h, XFIX_P_2_053  /* tmp5 hi */
    299        smull2          v26.4s, v30.8h, XFIX_P_3_072  /* tmp6 hi */
    300        smull2          v27.4s, v31.8h, XFIX_P_1_501  /* tmp7 hi */
    301        smull           v23.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
    302        smull           v21.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
    303        smull           v19.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
    304        smull           v17.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
    305 
    306        smull2          v28.4s, v2.8h, XFIX_N_0_899   /* z1 hi */
    307        smull2          v29.4s, v3.8h, XFIX_N_2_562   /* z2 hi */
    308        smull2          v30.4s, v6.8h, XFIX_N_1_961   /* z3 hi */
    309        smull2          v31.4s, v7.8h, XFIX_N_0_390   /* z4 hi */
    310        smull           v2.4s, v2.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
    311        smull           v3.4s, v3.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
    312        smull           v6.4s, v6.4h, XFIX_N_1_961    /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
    313        smull           v7.4s, v7.4h, XFIX_N_0_390    /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
    314 
    315        add             v6.4s, v6.4s, v4.4s    /* z3 += z5 */
    316        add             v30.4s, v30.4s, v5.4s  /* z3 hi += z5 hi */
    317        add             v7.4s, v7.4s, v4.4s    /* z4 += z5 */
    318        add             v31.4s, v31.4s, v5.4s  /* z4 hi += z5 hi */
    319 
    320        add             v23.4s, v23.4s, v2.4s   /* tmp4 += z1 */
    321        add             v24.4s, v24.4s, v28.4s  /* (hi half) */
    322        add             v21.4s, v21.4s, v3.4s   /* tmp5 += z2 */
    323        add             v25.4s, v25.4s, v29.4s  /* (hi half) */
    324        add             v19.4s, v19.4s, v6.4s   /* tmp6 += z3 */
    325        add             v26.4s, v26.4s, v30.4s  /* (hi half) */
    326        add             v17.4s, v17.4s, v7.4s   /* tmp7 += z4 */
    327        add             v27.4s, v27.4s, v31.4s  /* (hi half) */
    328 
    329        add             v23.4s, v23.4s, v6.4s   /* tmp4 += z3 */
    330        add             v24.4s, v24.4s, v30.4s  /* (hi half) */
    331        add             v21.4s, v21.4s, v7.4s   /* tmp5 += z4 */
    332        add             v25.4s, v25.4s, v31.4s  /* (hi half) */
    333        add             v19.4s, v19.4s, v3.4s   /* tmp6 += z2 */
    334        add             v26.4s, v26.4s, v29.4s  /* (hi half) */
    335        add             v17.4s, v17.4s, v2.4s   /* tmp7 += z1 */
    336        add             v27.4s, v27.4s, v28.4s  /* (hi half) */
    337 
    338        rshrn           v23.4h, v23.4s, #DESCALE_P2  /* dataptr[7], lanes 0-3 */
    339        rshrn           v21.4h, v21.4s, #DESCALE_P2  /* dataptr[5], lanes 0-3 */
    340        rshrn           v19.4h, v19.4s, #DESCALE_P2  /* dataptr[3], lanes 0-3 */
    341        rshrn           v17.4h, v17.4s, #DESCALE_P2  /* dataptr[1], lanes 0-3 */
    342        rshrn2          v23.8h, v24.4s, #DESCALE_P2  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS+PASS1_BITS); */
    343        rshrn2          v21.8h, v25.4s, #DESCALE_P2  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS+PASS1_BITS); */
    344        rshrn2          v19.8h, v26.4s, #DESCALE_P2  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS+PASS1_BITS); */
    345        rshrn2          v17.8h, v27.4s, #DESCALE_P2  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS+PASS1_BITS); */
    346 
    347        /* Store results back over the input block (DATA was rewound to its start after the loads) */
    348        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    349        st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
    350        /* x0 is left pointing 64 bytes past the start of the block */
    351        ret
    352 
    353        .unreq          DATA
    354        .unreq          TMP
    355 endfunc
    356 
    357 #undef XFIX_P_0_298   /* drop the v0/v1 lane aliases now that ff_fdct_neon has been assembled */
    358 #undef XFIX_N_0_390
    359 #undef XFIX_P_0_541
    360 #undef XFIX_P_0_765
    361 #undef XFIX_N_0_899
    362 #undef XFIX_P_1_175
    363 #undef XFIX_P_1_501
    364 #undef XFIX_N_1_847
    365 #undef XFIX_N_1_961
    366 #undef XFIX_P_2_053
    367 #undef XFIX_N_2_562
    368 #undef XFIX_P_3_072