/******************************************************************************
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2024, Nathan Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#include "src/riscv/asm.S"

// blend: dst[x] = (tmp[x]*mask[x] + dst[x]*(64 - mask[x]) + 32) >> 6
// a0 = dst, a1 = dst stride in bytes, a2 = tmp (w 16-bit pixels per row),
// a3 = w, a4 = h, a5 = mask (w bytes per row)
//
// Both entry points build a vtype word in t0 so that a single vsetvl covers
// one row of w elements: vma|vta set (bits 7:6), vsew = e16 (the ori below)
// and vlmul derived from log2(w). The _vl256 variant selects an LMUL one
// step lower, for cores with wider vector registers (VLEN >= 256, hence
// the name).
function blend_vl256_16bpc_rvv, export=1, ext=zbb
  ctz t0, a3        // t0 = log2(w)
  addi t0, t0, 0xc4 // vlmul = mf4 (w == 4) .. m2 (w == 32)
  j L(blend_epilog)
endfunc

function blend_16bpc_rvv, export=1, ext="v,zbb"
  ctz t0, a3        // t0 = log2(w)
  addi t0, t0, 0xc5 // vlmul = mf2 (w == 4) .. m4 (w == 32)
L(blend_epilog):
  csrw vxrm, zero   // round-to-nearest-up for vnclipu
  andi t0, t0, 0xc7 // keep vma|vta and the three vlmul bits
  li t1, 64
  ori t0, t0, 8     // vsew = e16
  add a6, a3, a3    // tmp row stride in bytes
  vsetvl zero, a3, t0
1:
  addi a4, a4, -2   // two rows per iteration
  vle8.v v24, (a5)  // mask row 0 (u8)
  add a5, a5, a3
  vle8.v v28, (a5)  // mask row 1 (u8)
  add a5, a5, a3
  vle16.v v8, (a2)  // tmp row 0
  add a2, a2, a6
  vle16.v v12, (a2) // tmp row 1
  add a2, a2, a6
  vzext.vf2 v16, v24 // widen masks to u16
  vzext.vf2 v20, v28
  vle16.v v0, (a0)  // dst row 0
  add t0, a0, a1
  vle16.v v4, (t0)  // dst row 1
  vwmulu.vv v24, v8, v16  // tmp * mask
  vwmulu.vv v8, v12, v20
  vrsub.vx v16, v16, t1   // 64 - mask
  vrsub.vx v20, v20, t1
  vwmaccu.vv v24, v0, v16 // += dst * (64 - mask)
  vwmaccu.vv v8, v4, v20
  vnclipu.wi v0, v24, 6   // (x + 32) >> 6, saturating narrow
  vnclipu.wi v4, v8, 6
  vse16.v v0, (a0)
  vse16.v v4, (t0)
  add a0, t0, a1
  bnez a4, 1b
  ret
endfunc

// blend_v: same blend formula, but the mask is one constant row taken from
// dav1d_obmc_masks at offset w, and only the leftmost w - w/4 pixels of
// each row are blended.
// a0 = dst, a1 = dst stride in bytes, a2 = tmp, a3 = w, a4 = h
function blend_v_vl256_16bpc_rvv, export=1, ext=zbb
  srai t0, a3, 2
  ctz t0, t0        // t0 = log2(w/4)
  addi t0, t0, 0xc6
  j L(blend_v_epilog)
endfunc

function blend_v_16bpc_rvv, export=1, ext="v,zbb"
  ctz t0, a3        // t0 = log2(w)
  addi t0, t0, 0xc5
L(blend_v_epilog):
  andi t0, t0, 0xc7 // keep vma|vta and the three vlmul bits
  ori t0, t0, 8     // vsew = e16
  srai t1, a3, 2
  sub t1, a3, t1    // vl = w - w/4
  vsetvl zero, t1, t0
  csrw vxrm, zero   // round-to-nearest-up for vnclipu
  la t1, dav1d_obmc_masks
  add t1, t1, a3
  vle8.v v20, (t1)  // obmc mask row for this width (u8)
  li t0, 64
  vzext.vf2 v16, v20 // widen mask to u16
  add a3, a3, a3     // tmp row stride in bytes
  vrsub.vx v20, v16, t0 // 64 - mask
1:
  addi a4, a4, -2   // two rows per iteration
  vle16.v v8, (a2)  // tmp row 0
  add a2, a2, a3
  vle16.v v12, (a2) // tmp row 1
  add a2, a2, a3
  vle16.v v0, (a0)  // dst row 0
  add t0, a0, a1
  vle16.v v4, (t0)  // dst row 1
  vwmulu.vv v24, v8, v16  // tmp * mask
  vwmulu.vv v8, v12, v16
  vwmaccu.vv v24, v0, v20 // += dst * (64 - mask)
  vwmaccu.vv v8, v4, v20
  vnclipu.wi v0, v24, 6   // (x + 32) >> 6, saturating narrow
  vnclipu.wi v4, v8, 6
  vse16.v v0, (a0)
  vse16.v v4, (t0)
  add a0, t0, a1
  bnez a4, 1b
  ret
endfunc
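
// Usage sketch (an assumption for illustration, not part of the upstream
// file): assuming the function macro in asm.S applies the usual dav1d_
// prefix to exported symbols, a C caller would see prototypes roughly like
// the ones below, with the arguments landing in a0..a5 per the RISC-V
// calling convention.
//
//   void dav1d_blend_16bpc_rvv(uint16_t *dst, ptrdiff_t dst_stride,
//                              const uint16_t *tmp, int w, int h,
//                              const uint8_t *mask);
//   void dav1d_blend_v_16bpc_rvv(uint16_t *dst, ptrdiff_t dst_stride,
//                                const uint16_t *tmp, int w, int h);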