tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

idctdsp_neon.S (5088B)


      1 /*
      2 * IDCT AArch64 NEON optimisations
      3 *
      4 * Copyright (c) 2022 Ben Avison <bavison@riscosopen.org>
      5 *
      6 * This file is part of FFmpeg.
      7 *
      8 * FFmpeg is free software; you can redistribute it and/or
      9 * modify it under the terms of the GNU Lesser General Public
     10 * License as published by the Free Software Foundation; either
     11 * version 2.1 of the License, or (at your option) any later version.
     12 *
     13 * FFmpeg is distributed in the hope that it will be useful,
     14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     16 * Lesser General Public License for more details.
     17 *
     18 * You should have received a copy of the GNU Lesser General Public
     19 * License along with FFmpeg; if not, write to the Free Software
     20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     21 */
     22 
     23 #include "libavutil/aarch64/asm.S"
     24 
     25 // Clamp 16-bit signed block coefficients to unsigned 8-bit (0..255)
     26 // On entry:
     27 //   x0 -> array of 64x 16-bit coefficients (read as 8 rows of 8)
     28 //   x1 -> 8-bit results (8 bytes written per row, 8 rows)
     29 //   x2 = row stride for results, bytes
     30 function ff_put_pixels_clamped_neon, export=1
     31        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64  // coefficient rows 0-3; x0 += 64
     32        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0]  // coefficient rows 4-7
     33        sqxtun          v0.8b, v0.8h           // row 0: saturate signed 16-bit -> unsigned 8-bit
     34        sqxtun          v1.8b, v1.8h           // row 1
     35        sqxtun          v2.8b, v2.8h           // row 2
     36        sqxtun          v3.8b, v3.8h           // row 3
     37        sqxtun          v4.8b, v4.8h           // row 4
     38        st1             {v0.8b}, [x1], x2      // store row 0; x1 += stride
     39        sqxtun          v0.8b, v5.8h           // row 5, reusing v0 now that row 0 is stored
     40        st1             {v1.8b}, [x1], x2      // store row 1
     41        sqxtun          v1.8b, v6.8h           // row 6, reusing v1
     42        st1             {v2.8b}, [x1], x2      // store row 2
     43        sqxtun          v2.8b, v7.8h           // row 7, reusing v2
     44        st1             {v3.8b}, [x1], x2      // store row 3
     45        st1             {v4.8b}, [x1], x2      // store row 4
     46        st1             {v0.8b}, [x1], x2      // store row 5
     47        st1             {v1.8b}, [x1], x2      // store row 6
     48        st1             {v2.8b}, [x1]          // store row 7 (no pointer update needed)
     49        ret                                    // modified: x0, x1, v0-v7 (all caller-saved in AAPCS64)
     50 endfunc
     51 
     52 // Clamp 16-bit signed block coefficients to signed 8-bit, then add 128 (result 0..255)
     53 // On entry:
     54 //   x0 -> array of 64x 16-bit coefficients (read as 8 rows of 8)
     55 //   x1 -> 8-bit results (8 bytes written per row, 8 rows)
     56 //   x2 = row stride for results, bytes
     57 function ff_put_signed_pixels_clamped_neon, export=1
     58        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64  // coefficient rows 0-3; x0 += 64
     59        movi            v4.8b, #128            // v4 = 0x80 in every byte: the bias
     60        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0]  // coefficient rows 4-7 (v4 holds the bias, so use v16-v19)
     61        sqxtn           v0.8b, v0.8h           // row 0: saturate signed 16-bit -> signed 8-bit
     62        sqxtn           v1.8b, v1.8h           // row 1
     63        sqxtn           v2.8b, v2.8h           // row 2
     64        sqxtn           v3.8b, v3.8h           // row 3
     65        sqxtn           v5.8b, v16.8h          // row 4
     66        add             v0.8b, v0.8b, v4.8b    // row 0 += 128 (byte add wraps, mapping -128..127 to 0..255)
     67        sqxtn           v6.8b, v17.8h          // row 5
     68        add             v1.8b, v1.8b, v4.8b    // row 1 += 128
     69        sqxtn           v7.8b, v18.8h          // row 6
     70        add             v2.8b, v2.8b, v4.8b    // row 2 += 128
     71        sqxtn           v16.8b, v19.8h         // row 7
     72        add             v3.8b, v3.8b, v4.8b    // row 3 += 128
     73        st1             {v0.8b}, [x1], x2      // store row 0; x1 += stride
     74        add             v0.8b, v5.8b, v4.8b    // row 4 + 128, reusing v0 now that row 0 is stored
     75        st1             {v1.8b}, [x1], x2      // store row 1
     76        add             v1.8b, v6.8b, v4.8b    // row 5 + 128, reusing v1
     77        st1             {v2.8b}, [x1], x2      // store row 2
     78        add             v2.8b, v7.8b, v4.8b    // row 6 + 128, reusing v2
     79        st1             {v3.8b}, [x1], x2      // store row 3
     80        add             v3.8b, v16.8b, v4.8b   // row 7 + 128, reusing v3
     81        st1             {v0.8b}, [x1], x2      // store row 4
     82        st1             {v1.8b}, [x1], x2      // store row 5
     83        st1             {v2.8b}, [x1], x2      // store row 6
     84        st1             {v3.8b}, [x1]          // store row 7 (no pointer update needed)
     85        ret                                    // modified: x0, x1, v0-v7, v16-v19 (all caller-saved in AAPCS64)
     86 endfunc
     87 
     88 // Add 16-bit signed block coefficients to unsigned 8-bit pixels, clamping to 0..255
     89 // On entry:
     90 //   x0 -> array of 64x 16-bit coefficients (read as 8 rows of 8)
     91 //   x1 -> 8-bit input and results (8 bytes per row, 8 rows, updated in place)
     92 //   x2 = row stride for 8-bit input and results, bytes
     93 function ff_add_pixels_clamped_neon, export=1
     94        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64  // coefficient rows 0-3; x0 += 64
     95        mov             x3, x1                 // x3 = write pointer; x1 is consumed as the read pointer
     96        ld1             {v4.8b}, [x1], x2      // pixel row 0; x1 += stride
     97        ld1             {v5.8b}, [x1], x2      // pixel row 1
     98        ld1             {v6.8b}, [x1], x2      // pixel row 2
     99        ld1             {v7.8b}, [x1], x2      // pixel row 3
    100        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0]  // coefficient rows 4-7
    101        uaddw           v0.8h, v0.8h, v4.8b    // row 0: coeff + zero-extended pixel (16-bit lanes, modular add)
    102        uaddw           v1.8h, v1.8h, v5.8b    // row 1 sum
    103        uaddw           v2.8h, v2.8h, v6.8b    // row 2 sum
    104        ld1             {v4.8b}, [x1], x2      // pixel row 4, reusing v4 (row 0 pixels already consumed)
    105        uaddw           v3.8h, v3.8h, v7.8b    // row 3 sum
    106        ld1             {v5.8b}, [x1], x2      // pixel row 5, reusing v5
    107        sqxtun          v0.8b, v0.8h           // row 0: saturate signed 16-bit sum -> unsigned 8-bit
    108        ld1             {v6.8b}, [x1], x2      // pixel row 6, reusing v6
    109        sqxtun          v1.8b, v1.8h           // row 1 narrowed
    110        ld1             {v7.8b}, [x1]          // pixel row 7, reusing v7 (last read, no pointer update)
    111        sqxtun          v2.8b, v2.8h           // row 2 narrowed
    112        sqxtun          v3.8b, v3.8h           // row 3 narrowed
    113        uaddw           v4.8h, v16.8h, v4.8b   // row 4 sum, written over the row 4 pixels in v4
    114        st1             {v0.8b}, [x3], x2      // store row 0; x3 += stride
    115        uaddw           v0.8h, v17.8h, v5.8b   // row 5 sum, reusing v0 now that row 0 is stored
    116        st1             {v1.8b}, [x3], x2      // store row 1
    117        uaddw           v1.8h, v18.8h, v6.8b   // row 6 sum, reusing v1
    118        st1             {v2.8b}, [x3], x2      // store row 2
    119        uaddw           v2.8h, v19.8h, v7.8b   // row 7 sum, reusing v2
    120        sqxtun          v4.8b, v4.8h           // row 4 narrowed
    121        sqxtun          v0.8b, v0.8h           // row 5 narrowed
    122        st1             {v3.8b}, [x3], x2      // store row 3
    123        sqxtun          v1.8b, v1.8h           // row 6 narrowed
    124        sqxtun          v2.8b, v2.8h           // row 7 narrowed
    125        st1             {v4.8b}, [x3], x2      // store row 4
    126        st1             {v0.8b}, [x3], x2      // store row 5
    127        st1             {v1.8b}, [x3], x2      // store row 6
    128        st1             {v2.8b}, [x3]          // store row 7 (no pointer update needed)
    129        ret                                    // modified: x0, x1, x3, v0-v7, v16-v19 (all caller-saved in AAPCS64)
    130 endfunc