mem_rvv.h (10357B)
/*
 * Copyright (c) 2025, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_RISCV_MEM_RVV_H_
#define AOM_AOM_DSP_RISCV_MEM_RVV_H_

#include <riscv_vector.h>

// RISC-V Vector (RVV) memory helpers for 16-bit pixel/coefficient data.
//
// load_s16_<W>x<H> reads H consecutive rows of int16_t into H vector
// registers; store_u16_<W>x<H> writes H vector registers back as uint16_t
// rows.  The "4x" variants use fractional LMUL=1/2 types (vint16mf2_t /
// vuint16mf2_t) and the "8x" variants use LMUL=1 types (vint16m1_t /
// vuint16m1_t).  `vl` is the active vector length in 16-bit elements,
// forwarded unchanged to every intrinsic.
//
// NOTE(review): `p` is a row pitch in *elements* (it is added directly to an
// int16_t*/uint16_t* pointer), not in bytes — callers with a byte stride must
// divide by sizeof(int16_t).  Confirm against call sites.

// Load 5 rows, `p` elements apart, into *s0..*s4 (LMUL=1/2).
static inline void load_s16_4x5(const int16_t *s, int p, vint16mf2_t *const s0,
                                vint16mf2_t *const s1, vint16mf2_t *const s2,
                                vint16mf2_t *const s3, vint16mf2_t *const s4,
                                size_t vl) {
  *s0 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s1 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s2 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s3 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s4 = __riscv_vle16_v_i16mf2(s, vl);
}

// Load 4 rows, `p` elements apart, into *s0..*s3 (LMUL=1/2).
static inline void load_s16_4x4(const int16_t *s, int p, vint16mf2_t *const s0,
                                vint16mf2_t *const s1, vint16mf2_t *const s2,
                                vint16mf2_t *const s3, size_t vl) {
  *s0 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s1 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s2 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s3 = __riscv_vle16_v_i16mf2(s, vl);
}

// Load 5 rows, `p` elements apart, into *s0..*s4 (LMUL=1).
static inline void load_s16_8x5(const int16_t *s, int p, vint16m1_t *const s0,
                                vint16m1_t *const s1, vint16m1_t *const s2,
                                vint16m1_t *const s3, vint16m1_t *const s4,
                                size_t vl) {
  *s0 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s1 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s2 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s3 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s4 = __riscv_vle16_v_i16m1(s, vl);
}

// Load 4 rows, `p` elements apart, into *s0..*s3 (LMUL=1).
static inline void load_s16_8x4(const int16_t *s, int p, vint16m1_t *const s0,
                                vint16m1_t *const s1, vint16m1_t *const s2,
                                vint16m1_t *const s3, size_t vl) {
  *s0 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s1 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s2 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s3 = __riscv_vle16_v_i16m1(s, vl);
}

// Store 4 rows, `p` elements apart, from s0..s3 (LMUL=1).
static inline void store_u16_8x4(uint16_t *s, int p, const vuint16m1_t s0,
                                 const vuint16m1_t s1, const vuint16m1_t s2,
                                 const vuint16m1_t s3, size_t vl) {
  __riscv_vse16_v_u16m1(s, s0, vl);
  s += p;
  __riscv_vse16_v_u16m1(s, s1, vl);
  s += p;
  __riscv_vse16_v_u16m1(s, s2, vl);
  s += p;
  __riscv_vse16_v_u16m1(s, s3, vl);
}

// Load 7 rows, `p` elements apart, into *s0..*s6 (LMUL=1/2).
static inline void load_s16_4x7(const int16_t *s, int p, vint16mf2_t *const s0,
                                vint16mf2_t *const s1, vint16mf2_t *const s2,
                                vint16mf2_t *const s3, vint16mf2_t *const s4,
                                vint16mf2_t *const s5, vint16mf2_t *const s6,
                                size_t vl) {
  *s0 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s1 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s2 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s3 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s4 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s5 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s6 = __riscv_vle16_v_i16mf2(s, vl);
}

// Load 7 rows, `p` elements apart, into *s0..*s6 (LMUL=1).
static inline void load_s16_8x7(const int16_t *s, int p, vint16m1_t *const s0,
                                vint16m1_t *const s1, vint16m1_t *const s2,
                                vint16m1_t *const s3, vint16m1_t *const s4,
                                vint16m1_t *const s5, vint16m1_t *const s6,
                                size_t vl) {
  *s0 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s1 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s2 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s3 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s4 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s5 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s6 = __riscv_vle16_v_i16m1(s, vl);
}

// Load 11 rows, `p` elements apart, into *s0..*s10 (LMUL=1/2).
static inline void load_s16_4x11(const int16_t *s, int p, vint16mf2_t *const s0,
                                 vint16mf2_t *const s1, vint16mf2_t *const s2,
                                 vint16mf2_t *const s3, vint16mf2_t *const s4,
                                 vint16mf2_t *const s5, vint16mf2_t *const s6,
                                 vint16mf2_t *const s7, vint16mf2_t *const s8,
                                 vint16mf2_t *const s9, vint16mf2_t *const s10,
                                 size_t vl) {
  *s0 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s1 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s2 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s3 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s4 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s5 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s6 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s7 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s8 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s9 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s10 = __riscv_vle16_v_i16mf2(s, vl);
}

// Load 11 rows, `p` elements apart, into *s0..*s10 (LMUL=1).
static inline void load_s16_8x11(const int16_t *s, int p, vint16m1_t *const s0,
                                 vint16m1_t *const s1, vint16m1_t *const s2,
                                 vint16m1_t *const s3, vint16m1_t *const s4,
                                 vint16m1_t *const s5, vint16m1_t *const s6,
                                 vint16m1_t *const s7, vint16m1_t *const s8,
                                 vint16m1_t *const s9, vint16m1_t *const s10,
                                 size_t vl) {
  *s0 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s1 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s2 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s3 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s4 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s5 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s6 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s7 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s8 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s9 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s10 = __riscv_vle16_v_i16m1(s, vl);
}

// Load 6 rows, `p` elements apart, into *s0..*s5 (LMUL=1).
static inline void load_s16_8x6(const int16_t *s, int p, vint16m1_t *const s0,
                                vint16m1_t *const s1, vint16m1_t *const s2,
                                vint16m1_t *const s3, vint16m1_t *const s4,
                                vint16m1_t *const s5, size_t vl) {
  *s0 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s1 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s2 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s3 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s4 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s5 = __riscv_vle16_v_i16m1(s, vl);
}

// Load 8 rows, `p` elements apart, into *s0..*s7 (LMUL=1).
static inline void load_s16_8x8(const int16_t *s, int p, vint16m1_t *const s0,
                                vint16m1_t *const s1, vint16m1_t *const s2,
                                vint16m1_t *const s3, vint16m1_t *const s4,
                                vint16m1_t *const s5, vint16m1_t *const s6,
                                vint16m1_t *const s7, size_t vl) {
  *s0 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s1 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s2 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s3 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s4 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s5 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s6 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s7 = __riscv_vle16_v_i16m1(s, vl);
}

// Load 12 rows, `p` elements apart, into *s0..*s11 (LMUL=1).
static inline void load_s16_8x12(const int16_t *s, int p, vint16m1_t *const s0,
                                 vint16m1_t *const s1, vint16m1_t *const s2,
                                 vint16m1_t *const s3, vint16m1_t *const s4,
                                 vint16m1_t *const s5, vint16m1_t *const s6,
                                 vint16m1_t *const s7, vint16m1_t *const s8,
                                 vint16m1_t *const s9, vint16m1_t *const s10,
                                 vint16m1_t *const s11, size_t vl) {
  *s0 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s1 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s2 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s3 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s4 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s5 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s6 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s7 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s8 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s9 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s10 = __riscv_vle16_v_i16m1(s, vl);
  s += p;
  *s11 = __riscv_vle16_v_i16m1(s, vl);
}

// Load 12 rows, `p` elements apart, into *s0..*s11 (LMUL=1/2).
static inline void load_s16_4x12(const int16_t *s, int p, vint16mf2_t *const s0,
                                 vint16mf2_t *const s1, vint16mf2_t *const s2,
                                 vint16mf2_t *const s3, vint16mf2_t *const s4,
                                 vint16mf2_t *const s5, vint16mf2_t *const s6,
                                 vint16mf2_t *const s7, vint16mf2_t *const s8,
                                 vint16mf2_t *const s9, vint16mf2_t *const s10,
                                 vint16mf2_t *const s11, size_t vl) {
  *s0 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s1 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s2 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s3 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s4 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s5 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s6 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s7 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s8 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s9 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s10 = __riscv_vle16_v_i16mf2(s, vl);
  s += p;
  *s11 = __riscv_vle16_v_i16mf2(s, vl);
}

// Store 4 rows, `p` elements apart, from s0..s3 (LMUL=1/2).
static inline void store_u16_4x4(uint16_t *s, int p, const vuint16mf2_t s0,
                                 const vuint16mf2_t s1, const vuint16mf2_t s2,
                                 const vuint16mf2_t s3, size_t vl) {
  __riscv_vse16_v_u16mf2(s, s0, vl);
  s += p;
  __riscv_vse16_v_u16mf2(s, s1, vl);
  s += p;
  __riscv_vse16_v_u16mf2(s, s2, vl);
  s += p;
  __riscv_vse16_v_u16mf2(s, s3, vl);
}

#endif  // AOM_AOM_DSP_RISCV_MEM_RVV_H_