/* transpose_neon.h */
/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
#define AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_

#include <arm_neon.h>

#include "aom_dsp/aom_dsp_common.h"  // For AOM_FORCE_INLINE.
#include "config/aom_config.h"

static inline void transpose_concat_elems_u8_4x4(uint8x8_t a0, uint8x8_t a1,
                                                 uint8x8_t a2, uint8x8_t a3,
                                                 uint8x16_t *b) {
  // Transpose 8-bit elements and concatenate result rows as follows:
  // a0: 00, 01, 02, 03, XX, XX, XX, XX
  // a1: 10, 11, 12, 13, XX, XX, XX, XX
  // a2: 20, 21, 22, 23, XX, XX, XX, XX
  // a3: 30, 31, 32, 33, XX, XX, XX, XX
  //
  // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33

  // Widen each 4-element (low-half) input to a q-register; the zeroed upper
  // halves are discarded by taking only .val[0] of the zips below.
  uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0));
  uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0));
  uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0));
  uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0));

  uint8x16_t a02 = vzipq_u8(a0q, a2q).val[0];
  uint8x16_t a13 = vzipq_u8(a1q, a3q).val[0];

  *b = vzipq_u8(a02, a13).val[0];
}

static inline void transpose_concat_elems_u8_8x4(uint8x8_t a0, uint8x8_t a1,
                                                 uint8x8_t a2, uint8x8_t a3,
                                                 uint8x16_t *b0,
                                                 uint8x16_t *b1) {
  // Transpose 8-bit elements and concatenate result rows as follows:
  // a0: 00, 01, 02, 03, 04, 05, 06, 07
  // a1: 10, 11, 12, 13, 14, 15, 16, 17
  // a2: 20, 21, 22, 23, 24, 25, 26, 27
  // a3: 30, 31, 32, 33, 34, 35, 36, 37
  //
  // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
  // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37

  uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0));
  uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0));
  uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0));
  uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0));

  uint8x16_t a02 = vzipq_u8(a0q, a2q).val[0];
  uint8x16_t a13 = vzipq_u8(a1q, a3q).val[0];

  uint8x16x2_t a0123 = vzipq_u8(a02, a13);

  *b0 = a0123.val[0];
  *b1 = a0123.val[1];
}

// Signed-element variant of transpose_concat_elems_u8_4x4.
static inline void transpose_concat_elems_s8_4x4(int8x8_t a0, int8x8_t a1,
                                                 int8x8_t a2, int8x8_t a3,
                                                 int8x16_t *b) {
  // Transpose 8-bit elements and concatenate result rows as follows:
  // a0: 00, 01, 02, 03, XX, XX, XX, XX
  // a1: 10, 11, 12, 13, XX, XX, XX, XX
  // a2: 20, 21, 22, 23, XX, XX, XX, XX
  // a3: 30, 31, 32, 33, XX, XX, XX, XX
  //
  // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33

  int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0));
  int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0));
  int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0));
  int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0));

  int8x16_t a02 = vzipq_s8(a0q, a2q).val[0];
  int8x16_t a13 = vzipq_s8(a1q, a3q).val[0];

  *b = vzipq_s8(a02, a13).val[0];
}

// Signed-element variant of transpose_concat_elems_u8_8x4.
static inline void transpose_concat_elems_s8_8x4(int8x8_t a0, int8x8_t a1,
                                                 int8x8_t a2, int8x8_t a3,
                                                 int8x16_t *b0, int8x16_t *b1) {
  // Transpose 8-bit elements and concatenate result rows as follows:
  // a0: 00, 01, 02, 03, 04, 05, 06, 07
  // a1: 10, 11, 12, 13, 14, 15, 16, 17
  // a2: 20, 21, 22, 23, 24, 25, 26, 27
  // a3: 30, 31, 32, 33, 34, 35, 36, 37
  //
  // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
  // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37

  int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0));
  int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0));
  int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0));
  int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0));

  int8x16_t a02 = vzipq_s8(a0q, a2q).val[0];
  int8x16_t a13 = vzipq_s8(a1q, a3q).val[0];

  int8x16x2_t a0123 = vzipq_s8(a02, a13);

  *b0 = a0123.val[0];
  *b1 = a0123.val[1];
}

static inline void transpose_concat_elems_s16_4x4(int16x4_t s0, int16x4_t s1,
                                                  int16x4_t s2, int16x4_t s3,
                                                  int16x8_t res[2]) {
  // Transpose 16-bit elements and concatenate result rows as follows:
  // s0: 00, 01, 02, 03
  // s1: 10, 11, 12, 13
  // s2: 20, 21, 22, 23
  // s3: 30, 31, 32, 33
  //
  // res[0]: 00 10 20 30 01 11 21 31
  // res[1]: 02 12 22 32 03 13 23 33

  int16x8_t s0q = vcombine_s16(s0, vdup_n_s16(0));
  int16x8_t s1q = vcombine_s16(s1, vdup_n_s16(0));
  int16x8_t s2q = vcombine_s16(s2, vdup_n_s16(0));
  int16x8_t s3q = vcombine_s16(s3, vdup_n_s16(0));

  int16x8_t s02 = vzipq_s16(s0q, s2q).val[0];
  int16x8_t s13 = vzipq_s16(s1q, s3q).val[0];

  int16x8x2_t s0123 = vzipq_s16(s02, s13);

  res[0] = s0123.val[0];
  res[1] = s0123.val[1];
}

static inline void transpose_concat_elems_s16_8x4(int16x8_t s0, int16x8_t s1,
                                                  int16x8_t s2, int16x8_t s3,
                                                  int16x8_t res[4]) {
  // Transpose 16-bit elements and concatenate result rows as follows:
  // s0: 00, 01, 02, 03, 04, 05, 06, 07
  // s1: 10, 11, 12, 13, 14, 15, 16, 17
  // s2: 20, 21, 22, 23, 24, 25, 26, 27
  // s3: 30, 31, 32, 33, 34, 35, 36, 37
  //
  // res[0]: 00 10 20 30 01 11 21 31
  // res[1]: 02 12 22 32 03 13 23 33
  // res[2]: 04 14 24 34 05 15 25 35
  // res[3]: 06 16 26 36 07 17 27 37

  int16x8x2_t s02 = vzipq_s16(s0, s2);
  int16x8x2_t s13 = vzipq_s16(s1, s3);

  int16x8x2_t s0123_lo = vzipq_s16(s02.val[0], s13.val[0]);
  int16x8x2_t s0123_hi = vzipq_s16(s02.val[1], s13.val[1]);

  res[0] = s0123_lo.val[0];
  res[1] = s0123_lo.val[1];
  res[2] = s0123_hi.val[0];
  res[3] = s0123_hi.val[1];
}

// Full 8x8 transpose of 8-bit elements: output row i holds column i of the
// input rows a0..a7.
static inline void transpose_elems_u8_8x8(
    uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, uint8x8_t a4,
    uint8x8_t a5, uint8x8_t a6, uint8x8_t a7, uint8x8_t *o0, uint8x8_t *o1,
    uint8x8_t *o2, uint8x8_t *o3, uint8x8_t *o4, uint8x8_t *o5, uint8x8_t *o6,
    uint8x8_t *o7) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56
  // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57
  // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76
  // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77

  const uint8x16x2_t b0 = vtrnq_u8(vcombine_u8(a0, a4), vcombine_u8(a1, a5));
  const uint8x16x2_t b1 = vtrnq_u8(vcombine_u8(a2, a6), vcombine_u8(a3, a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74
  // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76
  // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75
  // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77

  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));

  // Unzip 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
  // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
  // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
  // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
  const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c1.val[0]));
  const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c1.val[1]));

  *o0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
  *o1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
  *o2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
  *o3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
  *o4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
  *o5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
  *o6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
  *o7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
}

// In-place wrapper around transpose_elems_u8_8x8.
static inline void transpose_elems_inplace_u8_8x8(uint8x8_t *a0, uint8x8_t *a1,
                                                  uint8x8_t *a2, uint8x8_t *a3,
                                                  uint8x8_t *a4, uint8x8_t *a5,
                                                  uint8x8_t *a6,
                                                  uint8x8_t *a7) {
  transpose_elems_u8_8x8(*a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7, a0, a1, a2, a3,
                         a4, a5, a6, a7);
}

// Array-style wrapper around transpose_elems_u8_8x8: out[i] receives column i
// of the 8 input rows in[0..7].
static inline void transpose_arrays_u8_8x8(const uint8x8_t *in,
                                           uint8x8_t *out) {
  transpose_elems_u8_8x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
                         &out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
                         &out[6], &out[7]);
}

// Transpose a 16x8 block held as 16 rows x of 8 bytes into 8 rows d of 16
// bytes (d[i] holds column i of the inputs).
static AOM_FORCE_INLINE void transpose_arrays_u8_8x16(const uint8x8_t *x,
                                                      uint8x16_t *d) {
  uint8x8x2_t w0 = vzip_u8(x[0], x[1]);
  uint8x8x2_t w1 = vzip_u8(x[2], x[3]);
  uint8x8x2_t w2 = vzip_u8(x[4], x[5]);
  uint8x8x2_t w3 = vzip_u8(x[6], x[7]);

  uint8x8x2_t w8 = vzip_u8(x[8], x[9]);
  uint8x8x2_t w9 = vzip_u8(x[10], x[11]);
  uint8x8x2_t w10 = vzip_u8(x[12], x[13]);
  uint8x8x2_t w11 = vzip_u8(x[14], x[15]);

  uint16x4x2_t w4 =
      vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
  uint16x4x2_t w5 =
      vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0]));
  uint16x4x2_t w12 =
      vzip_u16(vreinterpret_u16_u8(w8.val[0]), vreinterpret_u16_u8(w9.val[0]));
  uint16x4x2_t w13 = vzip_u16(vreinterpret_u16_u8(w10.val[0]),
                              vreinterpret_u16_u8(w11.val[0]));

  uint32x2x2_t w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
                             vreinterpret_u32_u16(w5.val[0]));
  uint32x2x2_t w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
                             vreinterpret_u32_u16(w5.val[1]));
  uint32x2x2_t w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
                              vreinterpret_u32_u16(w13.val[0]));
  uint32x2x2_t w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
                              vreinterpret_u32_u16(w13.val[1]));

  // Store first 4-line result
  d[0] = vreinterpretq_u8_u32(vcombine_u32(w6.val[0], w14.val[0]));
  d[1] = vreinterpretq_u8_u32(vcombine_u32(w6.val[1], w14.val[1]));
  d[2] = vreinterpretq_u8_u32(vcombine_u32(w7.val[0], w15.val[0]));
  d[3] = vreinterpretq_u8_u32(vcombine_u32(w7.val[1], w15.val[1]));

  // Repeat the 16/32-bit zip stages on the odd (.val[1]) byte-zip outputs to
  // produce the remaining four output rows.
  w4 = vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1]));
  w5 = vzip_u16(vreinterpret_u16_u8(w2.val[1]), vreinterpret_u16_u8(w3.val[1]));
  w12 =
      vzip_u16(vreinterpret_u16_u8(w8.val[1]), vreinterpret_u16_u8(w9.val[1]));
  w13 = vzip_u16(vreinterpret_u16_u8(w10.val[1]),
                 vreinterpret_u16_u8(w11.val[1]));

  w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
                vreinterpret_u32_u16(w5.val[0]));
  w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
                vreinterpret_u32_u16(w5.val[1]));
  w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
                 vreinterpret_u32_u16(w13.val[0]));
  w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
                 vreinterpret_u32_u16(w13.val[1]));

  // Store second 4-line result
  d[4] = vreinterpretq_u8_u32(vcombine_u32(w6.val[0], w14.val[0]));
  d[5] = vreinterpretq_u8_u32(vcombine_u32(w6.val[1], w14.val[1]));
  d[6] = vreinterpretq_u8_u32(vcombine_u32(w7.val[0], w15.val[0]));
  d[7] = vreinterpretq_u8_u32(vcombine_u32(w7.val[1], w15.val[1]));
}

// Transpose an 8x16 block held as 8 rows x of 16 bytes into 16 rows d of 8
// bytes (d[i] holds column i of the inputs).
static AOM_FORCE_INLINE void transpose_arrays_u8_16x8(const uint8x16_t *x,
                                                      uint8x8_t *d) {
  uint8x16x2_t w0 = vzipq_u8(x[0], x[1]);
  uint8x16x2_t w1 = vzipq_u8(x[2], x[3]);
  uint8x16x2_t w2 = vzipq_u8(x[4], x[5]);
  uint8x16x2_t w3 = vzipq_u8(x[6], x[7]);

  uint16x8x2_t w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
                              vreinterpretq_u16_u8(w1.val[0]));
  uint16x8x2_t w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
                              vreinterpretq_u16_u8(w3.val[0]));
  uint16x8x2_t w6 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
                              vreinterpretq_u16_u8(w1.val[1]));
  uint16x8x2_t w7 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
                              vreinterpretq_u16_u8(w3.val[1]));

  uint32x4x2_t w8 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
                              vreinterpretq_u32_u16(w5.val[0]));
  uint32x4x2_t w9 = vzipq_u32(vreinterpretq_u32_u16(w6.val[0]),
                              vreinterpretq_u32_u16(w7.val[0]));
  uint32x4x2_t w10 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
                               vreinterpretq_u32_u16(w5.val[1]));
  uint32x4x2_t w11 = vzipq_u32(vreinterpretq_u32_u16(w6.val[1]),
                               vreinterpretq_u32_u16(w7.val[1]));

  d[0] = vreinterpret_u8_u32(vget_low_u32(w8.val[0]));
  d[1] = vreinterpret_u8_u32(vget_high_u32(w8.val[0]));
  d[2] = vreinterpret_u8_u32(vget_low_u32(w8.val[1]));
  d[3] = vreinterpret_u8_u32(vget_high_u32(w8.val[1]));
  d[4] = vreinterpret_u8_u32(vget_low_u32(w10.val[0]));
  d[5] = vreinterpret_u8_u32(vget_high_u32(w10.val[0]));
  d[6] = vreinterpret_u8_u32(vget_low_u32(w10.val[1]));
  d[7] = vreinterpret_u8_u32(vget_high_u32(w10.val[1]));
  d[8] = vreinterpret_u8_u32(vget_low_u32(w9.val[0]));
  d[9] = vreinterpret_u8_u32(vget_high_u32(w9.val[0]));
  d[10] = vreinterpret_u8_u32(vget_low_u32(w9.val[1]));
  d[11] = vreinterpret_u8_u32(vget_high_u32(w9.val[1]));
  d[12] = vreinterpret_u8_u32(vget_low_u32(w11.val[0]));
  d[13] = vreinterpret_u8_u32(vget_high_u32(w11.val[0]));
  d[14] = vreinterpret_u8_u32(vget_low_u32(w11.val[1]));
  d[15] = vreinterpret_u8_u32(vget_high_u32(w11.val[1]));
}

// Transpose the two 64-bit halves of a0/a1, viewed as 16-bit vectors:
// val[0] = { low64(a0), low64(a1) }, val[1] = { high64(a0), high64(a1) }.
// Uses vtrn1q/vtrn2q_u64 on AArch64; Armv7 has no 64-bit trn, so it is
// emulated with vget_low/high + vcombine.
static inline uint16x8x2_t aom_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
  uint16x8x2_t b0;
#if AOM_ARCH_AARCH64
  b0.val[0] = vreinterpretq_u16_u64(
      vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
  b0.val[1] = vreinterpretq_u16_u64(
      vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
#else
  b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
                           vreinterpret_u16_u32(vget_low_u32(a1)));
  b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
                           vreinterpret_u16_u32(vget_high_u32(a1)));
#endif
  return b0;
}

// Full 16x16 transpose of 8-bit elements: d[i] holds column i of the 16 input
// rows x[0..15].
static inline void transpose_arrays_u8_16x16(const uint8x16_t *x,
                                             uint8x16_t *d) {
  uint8x16x2_t w0 = vzipq_u8(x[0], x[1]);
  uint8x16x2_t w1 = vzipq_u8(x[2], x[3]);
  uint8x16x2_t w2 = vzipq_u8(x[4], x[5]);
  uint8x16x2_t w3 = vzipq_u8(x[6], x[7]);

  uint8x16x2_t w4 = vzipq_u8(x[8], x[9]);
  uint8x16x2_t w5 = vzipq_u8(x[10], x[11]);
  uint8x16x2_t w6 = vzipq_u8(x[12], x[13]);
  uint8x16x2_t w7 = vzipq_u8(x[14], x[15]);

  uint16x8x2_t w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
                              vreinterpretq_u16_u8(w1.val[0]));
  uint16x8x2_t w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
                              vreinterpretq_u16_u8(w3.val[0]));
  uint16x8x2_t w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[0]),
                               vreinterpretq_u16_u8(w5.val[0]));
  uint16x8x2_t w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[0]),
                               vreinterpretq_u16_u8(w7.val[0]));

  uint32x4x2_t w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]),
                               vreinterpretq_u32_u16(w9.val[0]));
  uint32x4x2_t w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]),
                               vreinterpretq_u32_u16(w11.val[0]));
  uint32x4x2_t w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]),
                               vreinterpretq_u32_u16(w9.val[1]));
  uint32x4x2_t w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
                               vreinterpretq_u32_u16(w11.val[1]));

  uint16x8x2_t d01 = aom_vtrnq_u64_to_u16(w12.val[0], w13.val[0]);
  d[0] = vreinterpretq_u8_u16(d01.val[0]);
  d[1] = vreinterpretq_u8_u16(d01.val[1]);
  uint16x8x2_t d23 = aom_vtrnq_u64_to_u16(w12.val[1], w13.val[1]);
  d[2] = vreinterpretq_u8_u16(d23.val[0]);
  d[3] = vreinterpretq_u8_u16(d23.val[1]);
  uint16x8x2_t d45 = aom_vtrnq_u64_to_u16(w14.val[0], w15.val[0]);
  d[4] = vreinterpretq_u8_u16(d45.val[0]);
  d[5] = vreinterpretq_u8_u16(d45.val[1]);
  uint16x8x2_t d67 = aom_vtrnq_u64_to_u16(w14.val[1], w15.val[1]);
  d[6] = vreinterpretq_u8_u16(d67.val[0]);
  d[7] = vreinterpretq_u8_u16(d67.val[1]);

  // upper half
  w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
                 vreinterpretq_u16_u8(w1.val[1]));
  w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
                 vreinterpretq_u16_u8(w3.val[1]));
  w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[1]),
                  vreinterpretq_u16_u8(w5.val[1]));
  w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[1]),
                  vreinterpretq_u16_u8(w7.val[1]));

  w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]),
                  vreinterpretq_u32_u16(w9.val[0]));
  w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]),
                  vreinterpretq_u32_u16(w11.val[0]));
  w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]),
                  vreinterpretq_u32_u16(w9.val[1]));
  w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
                  vreinterpretq_u32_u16(w11.val[1]));

  d01 = aom_vtrnq_u64_to_u16(w12.val[0], w13.val[0]);
  d[8] = vreinterpretq_u8_u16(d01.val[0]);
  d[9] = vreinterpretq_u8_u16(d01.val[1]);
  d23 = aom_vtrnq_u64_to_u16(w12.val[1], w13.val[1]);
  d[10] = vreinterpretq_u8_u16(d23.val[0]);
  d[11] = vreinterpretq_u8_u16(d23.val[1]);
  d45 = aom_vtrnq_u64_to_u16(w14.val[0], w15.val[0]);
  d[12] = vreinterpretq_u8_u16(d45.val[0]);
  d[13] = vreinterpretq_u8_u16(d45.val[1]);
  d67 = aom_vtrnq_u64_to_u16(w14.val[1], w15.val[1]);
  d[14] = vreinterpretq_u8_u16(d67.val[0]);
  d[15] = vreinterpretq_u8_u16(d67.val[1]);
}

// Transpose 16 rows of 32 bytes (each row split across val[0]/val[1]) by
// deinterleaving into two 16x16 blocks and transposing each.
static AOM_FORCE_INLINE void transpose_arrays_u8_32x16(const uint8x16x2_t *x,
                                                       uint8x16_t *d) {
  uint8x16_t x2[32];
  for (int i = 0; i < 16; ++i) {
    x2[i] = x[i].val[0];
    x2[i + 16] = x[i].val[1];
  }
  transpose_arrays_u8_16x16(x2, d);
  transpose_arrays_u8_16x16(x2 + 16, d + 16);
}

// In-place transpose of an 8x4 block: on return a0..a3 hold the four
// transposed output rows (each of 8 elements).
static inline void transpose_elems_inplace_u8_8x4(uint8x8_t *a0, uint8x8_t *a1,
                                                  uint8x8_t *a2,
                                                  uint8x8_t *a3) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37

  const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
  const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37

  const uint16x4x2_t c0 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
  const uint16x4x2_t c1 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));

  *a0 = vreinterpret_u8_u16(c0.val[0]);
  *a1 = vreinterpret_u8_u16(c1.val[0]);
  *a2 = vreinterpret_u8_u16(c0.val[1]);
  *a3 = vreinterpret_u8_u16(c1.val[1]);
}

// In-place transpose of a 16x4 block treated as 4 side-by-side 4x4 blocks.
static inline void transpose_elems_inplace_u8_16x4(uint8x16_t *a0,
                                                   uint8x16_t *a1,
                                                   uint8x16_t *a2,
                                                   uint8x16_t *a3) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07 08 09 010 011 012 013 014 015
  // a1: 10 11 12 13 14 15 16 17 18 19 110 111 112 113 114 115
  // a2: 20 21 22 23 24 25 26 27 28 29 210 211 212 213 214 215
  // a3: 30 31 32 33 34 35 36 37 38 39 310 311 312 313 314 315
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16 08 18 010 110 012 112 014 114
  // b0.val[1]: 01 11 03 13 05 15 07 17 09 19 011 111 013 113 015 115
  // b1.val[0]: 20 30 22 32 24 34 26 36 28 38 210 310 212 312 214 314
  // b1.val[1]: 21 31 23 33 25 35 27 37 29 39 211 311 213 313 215 315

  const uint8x16x2_t b0 = vtrnq_u8(*a0, *a1);
  const uint8x16x2_t b1 = vtrnq_u8(*a2, *a3);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34 08 18 28 38 012 112 212 312
  // c0.val[1]: 02 12 22 32 06 16 26 36 09 19 29 39 013 113 213 313
  // c1.val[0]: 01 11 21 31 05 15 25 35 010 110 210 310 014 114 214 314
  // c1.val[1]: 03 13 23 33 07 17 27 37 011 111 211 311 015 115 215 315

  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));

  *a0 = vreinterpretq_u8_u16(c0.val[0]);
  *a1 = vreinterpretq_u8_u16(c1.val[0]);
  *a2 = vreinterpretq_u8_u16(c0.val[1]);
  *a3 = vreinterpretq_u8_u16(c1.val[1]);
}

// In-place 4x4 transpose where rows 0/1 are packed in a0 and rows 2/3 in a1.
static inline void transpose_elems_inplace_u8_4x4(uint8x8_t *a0,
                                                  uint8x8_t *a1) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03  10 11 12 13
  // a1: 20 21 22 23  30 31 32 33
  // to:
  // b0.val[0]: 00 01 20 21  10 11 30 31
  // b0.val[1]: 02 03 22 23  12 13 32 33

  const uint16x4x2_t b0 =
      vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1));

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 01 20 21  02 03 22 23
  // c0.val[1]: 10 11 30 31  12 13 32 33

  const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
                                   vreinterpret_u32_u16(b0.val[1]));

  // Swap 8 bit elements resulting in:
  // d0.val[0]: 00 10 20 30  02 12 22 32
  // d0.val[1]: 01 11 21 31  03 13 23 33

  const uint8x8x2_t d0 =
      vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1]));

  *a0 = d0.val[0];
  *a1 = d0.val[1];
}

// Transpose an 8x4 block (8 rows, 4 valid elements each) into 4 rows of 8.
static inline void transpose_elems_u8_4x8(uint8x8_t a0, uint8x8_t a1,
                                          uint8x8_t a2, uint8x8_t a3,
                                          uint8x8_t a4, uint8x8_t a5,
                                          uint8x8_t a6, uint8x8_t a7,
                                          uint8x8_t *o0, uint8x8_t *o1,
                                          uint8x8_t *o2, uint8x8_t *o3) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03 XX XX XX XX
  // a1: 10 11 12 13 XX XX XX XX
  // a2: 20 21 22 23 XX XX XX XX
  // a3: 30 31 32 33 XX XX XX XX
  // a4: 40 41 42 43 XX XX XX XX
  // a5: 50 51 52 53 XX XX XX XX
  // a6: 60 61 62 63 XX XX XX XX
  // a7: 70 71 72 73 XX XX XX XX
  // to:
  // b0.val[0]: 00 01 02 03 40 41 42 43
  // b1.val[0]: 10 11 12 13 50 51 52 53
  // b2.val[0]: 20 21 22 23 60 61 62 63
  // b3.val[0]: 30 31 32 33 70 71 72 73

  const uint32x2x2_t b0 =
      vtrn_u32(vreinterpret_u32_u8(a0), vreinterpret_u32_u8(a4));
  const uint32x2x2_t b1 =
      vtrn_u32(vreinterpret_u32_u8(a1), vreinterpret_u32_u8(a5));
  const uint32x2x2_t b2 =
      vtrn_u32(vreinterpret_u32_u8(a2), vreinterpret_u32_u8(a6));
  const uint32x2x2_t b3 =
      vtrn_u32(vreinterpret_u32_u8(a3), vreinterpret_u32_u8(a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 01 20 21 40 41 60 61
  // c0.val[1]: 02 03 22 23 42 43 62 63
  // c1.val[0]: 10 11 30 31 50 51 70 71
  // c1.val[1]: 12 13 32 33 52 53 72 73

  const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]),
                                   vreinterpret_u16_u32(b2.val[0]));
  const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]),
                                   vreinterpret_u16_u32(b3.val[0]));

  // Swap 8 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 01 11 21 31 41 51 61 71
  // d1.val[0]: 02 12 22 32 42 52 62 72
  // d1.val[1]: 03 13 23 33 43 53 63 73

  const uint8x8x2_t d0 =
      vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0]));
  const uint8x8x2_t d1 =
      vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1]));

  *o0 = d0.val[0];
  *o1 = d0.val[1];
  *o2 = d1.val[0];
  *o3 = d1.val[1];
}

// In-place 4x4 transpose of 16-bit elements.
static inline void transpose_array_inplace_u16_4x4(uint16x4_t a[4]) {
  // Input:
  // 00 01 02 03
  // 10 11 12 13
  // 20 21 22 23
  // 30 31 32 33

  // b:
  // 00 10 02 12
  // 01 11 03 13
  const uint16x4x2_t b = vtrn_u16(a[0], a[1]);
  // c:
  // 20 30 22 32
  // 21 31 23 33
  const uint16x4x2_t c = vtrn_u16(a[2], a[3]);
  // d:
  // 00 10 20 30
  // 02 12 22 32
  const uint32x2x2_t d =
      vtrn_u32(vreinterpret_u32_u16(b.val[0]), vreinterpret_u32_u16(c.val[0]));
  // e:
  // 01 11 21 31
  // 03 13 23 33
  const uint32x2x2_t e =
      vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1]));

  // Output:
  // 00 10 20 30
  // 01 11 21 31
  // 02 12 22 32
  // 03 13 23 33
  a[0] = vreinterpret_u16_u32(d.val[0]);
  a[1] = vreinterpret_u16_u32(e.val[0]);
  a[2] = vreinterpret_u16_u32(d.val[1]);
  a[3] = vreinterpret_u16_u32(e.val[1]);
}

// In-place transpose of a 4x8 block of 16-bit elements into 8x4 form (two
// transposed 4x4 blocks side by side in each output row).
static inline void transpose_array_inplace_u16_4x8(uint16x8_t a[4]) {
  // 4x8 Input:
  // a[0]: 00 01 02 03 04 05 06 07
  // a[1]: 10 11 12 13 14 15 16 17
  // a[2]: 20 21 22 23 24 25 26 27
  // a[3]: 30 31 32 33 34 35 36 37

  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);

  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));

  // 8x4 Output:
  // a[0]: 00 10 20 30 04 14 24 34
  // a[1]: 01 11 21 31 05 15 25 35
  // a[2]: 02 12 22 32 06 16 26 36
  // a[3]: 03 13 23 33 07 17 27 37
  a[0] = vreinterpretq_u16_u32(c0.val[0]);
  a[1] = vreinterpretq_u16_u32(c1.val[0]);
  a[2] = vreinterpretq_u16_u32(c0.val[1]);
  a[3] = vreinterpretq_u16_u32(c1.val[1]);
}

// Special transpose for loop filter.
// 4x8 Input:
// p_q:  p3 p2 p1 p0 q0 q1 q2 q3
// a[0]: 00 01 02 03 04 05 06 07
// a[1]: 10 11 12 13 14 15 16 17
// a[2]: 20 21 22 23 24 25 26 27
// a[3]: 30 31 32 33 34 35 36 37
// 8x4 Output:
// a[0]: 03 13 23 33 04 14 24 34  p0q0
// a[1]: 02 12 22 32 05 15 25 35  p1q1
// a[2]: 01 11 21 31 06 16 26 36  p2q2
// a[3]: 00 10 20 30 07 17 27 37  p3q3
// Direct reapplication of the function will reset the high halves, but
// reverse the low halves:
// p_q:  p0 p1 p2 p3 q0 q1 q2 q3
// a[0]: 33 32 31 30 04 05 06 07
// a[1]: 23 22 21 20 14 15 16 17
// a[2]: 13 12 11 10 24 25 26 27
// a[3]: 03 02 01 00 34 35 36 37
// Simply reordering the inputs (3, 2, 1, 0) will reset the low halves, but
// reverse the high halves.
// The standard transpose_u16_4x8q will produce the same reversals, but with the
// order of the low halves also restored relative to the high halves. This is
// preferable because it puts all values from the same source row back together,
// but some post-processing is inevitable.
static inline void loop_filter_transpose_u16_4x8q(uint16x8_t a[4]) {
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);

  // Reverse odd vectors to bring the appropriate items to the front of zips.
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // r0       : 03 13 01 11 07 17 05 15
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // r1       : 23 33 21 31 27 37 25 35
  const uint32x4_t r0 = vrev64q_u32(vreinterpretq_u32_u16(b0.val[1]));
  const uint32x4_t r1 = vrev64q_u32(vreinterpretq_u32_u16(b1.val[1]));

  // Zip to complete the halves.
  // c0.val[0]: 00 10 20 30 02 12 22 32  p3p1
  // c0.val[1]: 04 14 24 34 06 16 26 36  q0q2
  // c1.val[0]: 03 13 23 33 01 11 21 31  p0p2
  // c1.val[1]: 07 17 27 37 05 15 25 35  q3q1
  const uint32x4x2_t c0 = vzipq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vzipq_u32(r0, r1);

  // d0.val[0]: 00 10 20 30 07 17 27 37  p3q3
  // d0.val[1]: 02 12 22 32 05 15 25 35  p1q1
  // d1.val[0]: 03 13 23 33 04 14 24 34  p0q0
  // d1.val[1]: 01 11 21 31 06 16 26 36  p2q2
  const uint16x8x2_t d0 = aom_vtrnq_u64_to_u16(c0.val[0], c1.val[1]);
  // The third row of c comes first here to swap p2 with q0.
  const uint16x8x2_t d1 = aom_vtrnq_u64_to_u16(c1.val[0], c0.val[1]);

  // 8x4 Output:
  // a[0]: 03 13 23 33 04 14 24 34  p0q0
  // a[1]: 02 12 22 32 05 15 25 35  p1q1
  // a[2]: 01 11 21 31 06 16 26 36  p2q2
  // a[3]: 00 10 20 30 07 17 27 37  p3q3
  a[0] = d1.val[0];  // p0q0
  a[1] = d0.val[1];  // p1q1
  a[2] = d1.val[1];  // p2q2
  a[3] = d0.val[0];  // p3q3
}

// Transpose an 8x4 block of 16-bit elements (8 rows of 4) into 4 rows of 8.
static inline void transpose_elems_u16_4x8(
    const uint16x4_t a0, const uint16x4_t a1, const uint16x4_t a2,
    const uint16x4_t a3, const uint16x4_t a4, const uint16x4_t a5,
    const uint16x4_t a6, const uint16x4_t a7, uint16x8_t *o0, uint16x8_t *o1,
    uint16x8_t *o2, uint16x8_t *o3) {
  // Combine rows. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // to:
  // b0: 00 01 02 03 40 41 42 43
  // b1: 10 11 12 13 50 51 52 53
  // b2: 20 21 22 23 60 61 62 63
  // b3: 30 31 32 33 70 71 72 73

  const uint16x8_t b0 = vcombine_u16(a0, a4);
  const uint16x8_t b1 = vcombine_u16(a1, a5);
  const uint16x8_t b2 = vcombine_u16(a2, a6);
  const uint16x8_t b3 = vcombine_u16(a3, a7);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 02 12 40 50 42 52
  // c0.val[1]: 01 11 03 13 41 51 43 53
  // c1.val[0]: 20 30 22 32 60 70 62 72
  // c1.val[1]: 21 31 23 33 61 71 63 73

  const uint16x8x2_t c0 = vtrnq_u16(b0, b1);
  const uint16x8x2_t c1 = vtrnq_u16(b2, b3);

  // Swap 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 02 12 22 32 42 52 62 72
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 03 13 23 33 43 53 63 73

  const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c1.val[0]));
  const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c1.val[1]));

  *o0 = vreinterpretq_u16_u32(d0.val[0]);
  *o1 = vreinterpretq_u16_u32(d1.val[0]);
  *o2 = vreinterpretq_u16_u32(d0.val[1]);
  *o3 = vreinterpretq_u16_u32(d1.val[1]);
}

// Signed-element variant of transpose_elems_u16_4x8.
static inline void transpose_elems_s16_4x8(
    const int16x4_t a0, const int16x4_t a1, const int16x4_t a2,
    const int16x4_t a3, const int16x4_t a4, const int16x4_t a5,
    const int16x4_t a6, const int16x4_t a7, int16x8_t *o0, int16x8_t *o1,
    int16x8_t *o2, int16x8_t *o3) {
  // Combine rows. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // to:
  // b0: 00 01 02 03 40 41 42 43
  // b1: 10 11 12 13 50 51 52 53
  // b2: 20 21 22 23 60 61 62 63
  // b3: 30 31 32 33 70 71 72 73

  const int16x8_t b0 = vcombine_s16(a0, a4);
  const int16x8_t b1 = vcombine_s16(a1, a5);
  const int16x8_t b2 = vcombine_s16(a2, a6);
  const int16x8_t b3 = vcombine_s16(a3, a7);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 02 12 40 50 42 52
  // c0.val[1]: 01 11 03 13 41 51 43 53
  // c1.val[0]: 20 30 22 32 60 70 62 72
  // c1.val[1]: 21 31 23 33 61 71 63 73

  const int16x8x2_t c0 = vtrnq_s16(b0, b1);
  const int16x8x2_t c1 = vtrnq_s16(b2, b3);

  // Swap 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 02 12 22 32 42 52 62 72
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 03 13 23 33 43 53 63 73

  const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
                                   vreinterpretq_s32_s16(c1.val[0]));
  const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
                                   vreinterpretq_s32_s16(c1.val[1]));

  *o0 = vreinterpretq_s16_s32(d0.val[0]);
  *o1 = vreinterpretq_s16_s32(d1.val[0]);
  *o2 = vreinterpretq_s16_s32(d0.val[1]);
  *o3 = vreinterpretq_s16_s32(d1.val[1]);
}

static inline void transpose_elems_inplace_u16_8x8(
    uint16x8_t *a0, uint16x8_t *a1, uint16x8_t *a2, uint16x8_t *a3,
    uint16x8_t *a4, uint16x8_t *a5, uint16x8_t *a6, uint16x8_t *a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
  const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
  const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5);
  const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));
  const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]),
                                    vreinterpretq_u32_u16(b3.val[0]));
  const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]),
                                    vreinterpretq_u32_u16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43
53 63 73 897 // d3.val[1]: 07 17 27 37 47 57 67 77 898 899 const uint16x8x2_t d0 = aom_vtrnq_u64_to_u16(c0.val[0], c2.val[0]); 900 const uint16x8x2_t d1 = aom_vtrnq_u64_to_u16(c1.val[0], c3.val[0]); 901 const uint16x8x2_t d2 = aom_vtrnq_u64_to_u16(c0.val[1], c2.val[1]); 902 const uint16x8x2_t d3 = aom_vtrnq_u64_to_u16(c1.val[1], c3.val[1]); 903 904 *a0 = d0.val[0]; 905 *a1 = d1.val[0]; 906 *a2 = d2.val[0]; 907 *a3 = d3.val[0]; 908 *a4 = d0.val[1]; 909 *a5 = d1.val[1]; 910 *a6 = d2.val[1]; 911 *a7 = d3.val[1]; 912 } 913 914 static inline int16x8x2_t aom_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) { 915 int16x8x2_t b0; 916 #if AOM_ARCH_AARCH64 917 b0.val[0] = vreinterpretq_s16_s64( 918 vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); 919 b0.val[1] = vreinterpretq_s16_s64( 920 vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); 921 #else 922 b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)), 923 vreinterpret_s16_s32(vget_low_s32(a1))); 924 b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)), 925 vreinterpret_s16_s32(vget_high_s32(a1))); 926 #endif 927 return b0; 928 } 929 930 static inline void transpose_elems_inplace_s16_8x8(int16x8_t *a0, int16x8_t *a1, 931 int16x8_t *a2, int16x8_t *a3, 932 int16x8_t *a4, int16x8_t *a5, 933 int16x8_t *a6, 934 int16x8_t *a7) { 935 // Swap 16 bit elements. 
Goes from: 936 // a0: 00 01 02 03 04 05 06 07 937 // a1: 10 11 12 13 14 15 16 17 938 // a2: 20 21 22 23 24 25 26 27 939 // a3: 30 31 32 33 34 35 36 37 940 // a4: 40 41 42 43 44 45 46 47 941 // a5: 50 51 52 53 54 55 56 57 942 // a6: 60 61 62 63 64 65 66 67 943 // a7: 70 71 72 73 74 75 76 77 944 // to: 945 // b0.val[0]: 00 10 02 12 04 14 06 16 946 // b0.val[1]: 01 11 03 13 05 15 07 17 947 // b1.val[0]: 20 30 22 32 24 34 26 36 948 // b1.val[1]: 21 31 23 33 25 35 27 37 949 // b2.val[0]: 40 50 42 52 44 54 46 56 950 // b2.val[1]: 41 51 43 53 45 55 47 57 951 // b3.val[0]: 60 70 62 72 64 74 66 76 952 // b3.val[1]: 61 71 63 73 65 75 67 77 953 954 const int16x8x2_t b0 = vtrnq_s16(*a0, *a1); 955 const int16x8x2_t b1 = vtrnq_s16(*a2, *a3); 956 const int16x8x2_t b2 = vtrnq_s16(*a4, *a5); 957 const int16x8x2_t b3 = vtrnq_s16(*a6, *a7); 958 959 // Swap 32 bit elements resulting in: 960 // c0.val[0]: 00 10 20 30 04 14 24 34 961 // c0.val[1]: 02 12 22 32 06 16 26 36 962 // c1.val[0]: 01 11 21 31 05 15 25 35 963 // c1.val[1]: 03 13 23 33 07 17 27 37 964 // c2.val[0]: 40 50 60 70 44 54 64 74 965 // c2.val[1]: 42 52 62 72 46 56 66 76 966 // c3.val[0]: 41 51 61 71 45 55 65 75 967 // c3.val[1]: 43 53 63 73 47 57 67 77 968 969 const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), 970 vreinterpretq_s32_s16(b1.val[0])); 971 const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]), 972 vreinterpretq_s32_s16(b1.val[1])); 973 const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]), 974 vreinterpretq_s32_s16(b3.val[0])); 975 const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]), 976 vreinterpretq_s32_s16(b3.val[1])); 977 978 // Swap 64 bit elements resulting in: 979 // d0.val[0]: 00 10 20 30 40 50 60 70 980 // d0.val[1]: 04 14 24 34 44 54 64 74 981 // d1.val[0]: 01 11 21 31 41 51 61 71 982 // d1.val[1]: 05 15 25 35 45 55 65 75 983 // d2.val[0]: 02 12 22 32 42 52 62 72 984 // d2.val[1]: 06 16 26 36 46 56 66 76 985 // d3.val[0]: 03 13 23 33 43 53 63 73 
986 // d3.val[1]: 07 17 27 37 47 57 67 77 987 988 const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); 989 const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); 990 const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); 991 const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]); 992 993 *a0 = d0.val[0]; 994 *a1 = d1.val[0]; 995 *a2 = d2.val[0]; 996 *a3 = d3.val[0]; 997 *a4 = d0.val[1]; 998 *a5 = d1.val[1]; 999 *a6 = d2.val[1]; 1000 *a7 = d3.val[1]; 1001 } 1002 1003 static inline void transpose_arrays_s16_8x8(const int16x8_t *a, 1004 int16x8_t *out) { 1005 // Swap 16 bit elements. Goes from: 1006 // a0: 00 01 02 03 04 05 06 07 1007 // a1: 10 11 12 13 14 15 16 17 1008 // a2: 20 21 22 23 24 25 26 27 1009 // a3: 30 31 32 33 34 35 36 37 1010 // a4: 40 41 42 43 44 45 46 47 1011 // a5: 50 51 52 53 54 55 56 57 1012 // a6: 60 61 62 63 64 65 66 67 1013 // a7: 70 71 72 73 74 75 76 77 1014 // to: 1015 // b0.val[0]: 00 10 02 12 04 14 06 16 1016 // b0.val[1]: 01 11 03 13 05 15 07 17 1017 // b1.val[0]: 20 30 22 32 24 34 26 36 1018 // b1.val[1]: 21 31 23 33 25 35 27 37 1019 // b2.val[0]: 40 50 42 52 44 54 46 56 1020 // b2.val[1]: 41 51 43 53 45 55 47 57 1021 // b3.val[0]: 60 70 62 72 64 74 66 76 1022 // b3.val[1]: 61 71 63 73 65 75 67 77 1023 1024 const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]); 1025 const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]); 1026 const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]); 1027 const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]); 1028 1029 // Swap 32 bit elements resulting in: 1030 // c0.val[0]: 00 10 20 30 04 14 24 34 1031 // c0.val[1]: 02 12 22 32 06 16 26 36 1032 // c1.val[0]: 01 11 21 31 05 15 25 35 1033 // c1.val[1]: 03 13 23 33 07 17 27 37 1034 // c2.val[0]: 40 50 60 70 44 54 64 74 1035 // c2.val[1]: 42 52 62 72 46 56 66 76 1036 // c3.val[0]: 41 51 61 71 45 55 65 75 1037 // c3.val[1]: 43 53 63 73 47 57 67 77 1038 1039 const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), 1040 
vreinterpretq_s32_s16(b1.val[0])); 1041 const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]), 1042 vreinterpretq_s32_s16(b1.val[1])); 1043 const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]), 1044 vreinterpretq_s32_s16(b3.val[0])); 1045 const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]), 1046 vreinterpretq_s32_s16(b3.val[1])); 1047 1048 // Swap 64 bit elements resulting in: 1049 // d0.val[0]: 00 10 20 30 40 50 60 70 1050 // d0.val[1]: 04 14 24 34 44 54 64 74 1051 // d1.val[0]: 01 11 21 31 41 51 61 71 1052 // d1.val[1]: 05 15 25 35 45 55 65 75 1053 // d2.val[0]: 02 12 22 32 42 52 62 72 1054 // d2.val[1]: 06 16 26 36 46 56 66 76 1055 // d3.val[0]: 03 13 23 33 43 53 63 73 1056 // d3.val[1]: 07 17 27 37 47 57 67 77 1057 1058 const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); 1059 const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); 1060 const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); 1061 const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]); 1062 1063 out[0] = d0.val[0]; 1064 out[1] = d1.val[0]; 1065 out[2] = d2.val[0]; 1066 out[3] = d3.val[0]; 1067 out[4] = d0.val[1]; 1068 out[5] = d1.val[1]; 1069 out[6] = d2.val[1]; 1070 out[7] = d3.val[1]; 1071 } 1072 1073 static inline void transpose_elems_inplace_s16_8x4(int16x8_t *a0, int16x8_t *a1, 1074 int16x8_t *a2, 1075 int16x8_t *a3) { 1076 // Swap 16 bit elements. 
Goes from: 1077 // a0: 00 01 02 03 04 05 06 07 1078 // a1: 10 11 12 13 14 15 16 17 1079 // a2: 20 21 22 23 24 25 26 27 1080 // a3: 30 31 32 33 34 35 36 37 1081 // to: 1082 // b0.val[0]: 00 10 02 12 04 14 06 16 1083 // b0.val[1]: 01 11 03 13 05 15 07 17 1084 // b1.val[0]: 20 30 22 32 24 34 26 36 1085 // b1.val[1]: 21 31 23 33 25 35 27 37 1086 1087 const int16x8x2_t b0 = vtrnq_s16(*a0, *a1); 1088 const int16x8x2_t b1 = vtrnq_s16(*a2, *a3); 1089 1090 // Swap 32 bit elements resulting in: 1091 // c0.val[0]: 00 10 20 30 04 14 24 34 1092 // c0.val[1]: 01 11 21 31 05 15 25 35 1093 // c1.val[0]: 02 12 22 32 06 16 26 36 1094 // c1.val[1]: 03 13 23 33 07 17 27 37 1095 1096 const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), 1097 vreinterpretq_s32_s16(b1.val[0])); 1098 const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]), 1099 vreinterpretq_s32_s16(b1.val[1])); 1100 1101 *a0 = vreinterpretq_s16_s32(c0.val[0]); 1102 *a1 = vreinterpretq_s16_s32(c1.val[0]); 1103 *a2 = vreinterpretq_s16_s32(c0.val[1]); 1104 *a3 = vreinterpretq_s16_s32(c1.val[1]); 1105 } 1106 1107 static inline void transpose_elems_inplace_u16_4x4(uint16x4_t *a0, 1108 uint16x4_t *a1, 1109 uint16x4_t *a2, 1110 uint16x4_t *a3) { 1111 // Swap 16 bit elements. 
Goes from: 1112 // a0: 00 01 02 03 1113 // a1: 10 11 12 13 1114 // a2: 20 21 22 23 1115 // a3: 30 31 32 33 1116 // to: 1117 // b0.val[0]: 00 10 02 12 1118 // b0.val[1]: 01 11 03 13 1119 // b1.val[0]: 20 30 22 32 1120 // b1.val[1]: 21 31 23 33 1121 1122 const uint16x4x2_t b0 = vtrn_u16(*a0, *a1); 1123 const uint16x4x2_t b1 = vtrn_u16(*a2, *a3); 1124 1125 // Swap 32 bit elements resulting in: 1126 // c0.val[0]: 00 10 20 30 1127 // c0.val[1]: 02 12 22 32 1128 // c1.val[0]: 01 11 21 31 1129 // c1.val[1]: 03 13 23 33 1130 1131 const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]), 1132 vreinterpret_u32_u16(b1.val[0])); 1133 const uint32x2x2_t c1 = vtrn_u32(vreinterpret_u32_u16(b0.val[1]), 1134 vreinterpret_u32_u16(b1.val[1])); 1135 1136 *a0 = vreinterpret_u16_u32(c0.val[0]); 1137 *a1 = vreinterpret_u16_u32(c1.val[0]); 1138 *a2 = vreinterpret_u16_u32(c0.val[1]); 1139 *a3 = vreinterpret_u16_u32(c1.val[1]); 1140 } 1141 1142 static inline void transpose_elems_inplace_s16_4x4(int16x4_t *a0, int16x4_t *a1, 1143 int16x4_t *a2, 1144 int16x4_t *a3) { 1145 // Swap 16 bit elements. 
Goes from: 1146 // a0: 00 01 02 03 1147 // a1: 10 11 12 13 1148 // a2: 20 21 22 23 1149 // a3: 30 31 32 33 1150 // to: 1151 // b0.val[0]: 00 10 02 12 1152 // b0.val[1]: 01 11 03 13 1153 // b1.val[0]: 20 30 22 32 1154 // b1.val[1]: 21 31 23 33 1155 1156 const int16x4x2_t b0 = vtrn_s16(*a0, *a1); 1157 const int16x4x2_t b1 = vtrn_s16(*a2, *a3); 1158 1159 // Swap 32 bit elements resulting in: 1160 // c0.val[0]: 00 10 20 30 1161 // c0.val[1]: 02 12 22 32 1162 // c1.val[0]: 01 11 21 31 1163 // c1.val[1]: 03 13 23 33 1164 1165 const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]), 1166 vreinterpret_s32_s16(b1.val[0])); 1167 const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]), 1168 vreinterpret_s32_s16(b1.val[1])); 1169 1170 *a0 = vreinterpret_s16_s32(c0.val[0]); 1171 *a1 = vreinterpret_s16_s32(c1.val[0]); 1172 *a2 = vreinterpret_s16_s32(c0.val[1]); 1173 *a3 = vreinterpret_s16_s32(c1.val[1]); 1174 } 1175 1176 static inline int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) { 1177 int32x4x2_t b0; 1178 #if AOM_ARCH_AARCH64 1179 b0.val[0] = vreinterpretq_s32_s64( 1180 vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); 1181 b0.val[1] = vreinterpretq_s32_s64( 1182 vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); 1183 #else 1184 b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1)); 1185 b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1)); 1186 #endif 1187 return b0; 1188 } 1189 1190 static inline void transpose_elems_s32_4x4(const int32x4_t a0, 1191 const int32x4_t a1, 1192 const int32x4_t a2, 1193 const int32x4_t a3, int32x4_t *o0, 1194 int32x4_t *o1, int32x4_t *o2, 1195 int32x4_t *o3) { 1196 // Swap 32 bit elements. 
Goes from: 1197 // a0: 00 01 02 03 1198 // a1: 10 11 12 13 1199 // a2: 20 21 22 23 1200 // a3: 30 31 32 33 1201 // to: 1202 // b0.val[0]: 00 10 02 12 1203 // b0.val[1]: 01 11 03 13 1204 // b1.val[0]: 20 30 22 32 1205 // b1.val[1]: 21 31 23 33 1206 1207 const int32x4x2_t b0 = vtrnq_s32(a0, a1); 1208 const int32x4x2_t b1 = vtrnq_s32(a2, a3); 1209 1210 // Swap 64 bit elements resulting in: 1211 // c0.val[0]: 00 10 20 30 1212 // c0.val[1]: 02 12 22 32 1213 // c1.val[0]: 01 11 21 31 1214 // c1.val[1]: 03 13 23 33 1215 1216 const int32x4x2_t c0 = aom_vtrnq_s64_to_s32(b0.val[0], b1.val[0]); 1217 const int32x4x2_t c1 = aom_vtrnq_s64_to_s32(b0.val[1], b1.val[1]); 1218 1219 *o0 = c0.val[0]; 1220 *o1 = c1.val[0]; 1221 *o2 = c0.val[1]; 1222 *o3 = c1.val[1]; 1223 } 1224 1225 static inline void transpose_elems_inplace_s32_4x4(int32x4_t *a0, int32x4_t *a1, 1226 int32x4_t *a2, 1227 int32x4_t *a3) { 1228 transpose_elems_s32_4x4(*a0, *a1, *a2, *a3, a0, a1, a2, a3); 1229 } 1230 1231 static inline void transpose_arrays_s32_4x4(const int32x4_t *in, 1232 int32x4_t *out) { 1233 transpose_elems_s32_4x4(in[0], in[1], in[2], in[3], &out[0], &out[1], &out[2], 1234 &out[3]); 1235 } 1236 1237 static AOM_FORCE_INLINE void transpose_arrays_s32_4nx4n(const int32x4_t *in, 1238 int32x4_t *out, 1239 const int width, 1240 const int height) { 1241 const int h = height >> 2; 1242 const int w = width >> 2; 1243 for (int j = 0; j < w; j++) { 1244 for (int i = 0; i < h; i++) { 1245 transpose_arrays_s32_4x4(in + j * height + i * 4, 1246 out + i * width + j * 4); 1247 } 1248 } 1249 } 1250 1251 #define TRANSPOSE_ARRAYS_S32_WXH_NEON(w, h) \ 1252 static AOM_FORCE_INLINE void transpose_arrays_s32_##w##x##h( \ 1253 const int32x4_t *in, int32x4_t *out) { \ 1254 transpose_arrays_s32_4nx4n(in, out, w, h); \ 1255 } 1256 1257 TRANSPOSE_ARRAYS_S32_WXH_NEON(4, 8) 1258 TRANSPOSE_ARRAYS_S32_WXH_NEON(4, 16) 1259 TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 4) 1260 TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 8) 1261 
TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 16)
TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 32)
TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 8)
TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 16)
TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 32)
TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 64)
TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 8)
TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 16)
TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 32)
TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 64)
TRANSPOSE_ARRAYS_S32_WXH_NEON(64, 16)
TRANSPOSE_ARRAYS_S32_WXH_NEON(64, 32)

#undef TRANSPOSE_ARRAYS_S32_WXH_NEON

// TRN1 on 64-bit lanes: { a[0], b[0] }. Native instruction on AArch64,
// emulated with vcombine on 32-bit Arm.
static inline int64x2_t aom_vtrn1q_s64(int64x2_t a, int64x2_t b) {
#if AOM_ARCH_AARCH64
  return vtrn1q_s64(a, b);
#else
  return vcombine_s64(vget_low_s64(a), vget_low_s64(b));
#endif
}

// TRN2 on 64-bit lanes: { a[1], b[1] }. Native instruction on AArch64,
// emulated with vcombine on 32-bit Arm.
static inline int64x2_t aom_vtrn2q_s64(int64x2_t a, int64x2_t b) {
#if AOM_ARCH_AARCH64
  return vtrn2q_s64(a, b);
#else
  return vcombine_s64(vget_high_s64(a), vget_high_s64(b));
#endif
}

// Transpose a 4x8 matrix of signed 32-bit elements. Rows a0..a7 come in as
// one vector each; each output oN is a pair of vectors holding the 8-element
// column N (val[0] = elements from rows 0-3, val[1] = elements from rows 4-7).
static inline void transpose_elems_s32_4x8(int32x4_t a0, int32x4_t a1,
                                           int32x4_t a2, int32x4_t a3,
                                           int32x4_t a4, int32x4_t a5,
                                           int32x4_t a6, int32x4_t a7,
                                           int32x4x2_t *o0, int32x4x2_t *o1,
                                           int32x4x2_t *o2, int32x4x2_t *o3) {
  // Perform a 4 x 8 matrix transpose by building on top of the existing 4 x 4
  // matrix transpose implementation:
  // [ A ]^T => [ A^T B^T ]
  // [ B ]

  transpose_elems_inplace_s32_4x4(&a0, &a1, &a2, &a3);  // A^T
  transpose_elems_inplace_s32_4x4(&a4, &a5, &a6, &a7);  // B^T

  o0->val[0] = a0;
  o1->val[0] = a1;
  o2->val[0] = a2;
  o3->val[0] = a3;

  o0->val[1] = a4;
  o1->val[1] = a5;
  o2->val[1] = a6;
  o3->val[1] = a7;
}

// Transpose an 8x8 matrix of signed 32-bit elements in place, where each row
// is a pair of vectors (val[0] = columns 0-3, val[1] = columns 4-7).
static inline void transpose_elems_inplace_s32_8x8(
    int32x4x2_t *a0, int32x4x2_t *a1, int32x4x2_t *a2, int32x4x2_t *a3,
    int32x4x2_t *a4, int32x4x2_t *a5, int32x4x2_t *a6, int32x4x2_t *a7) {
  // Perform an 8 x 8 matrix transpose by building on top of the existing 4 x 4
  // matrix transpose implementation:
  // [ A B ]^T => [ A^T C^T ]
  // [ C D ]      [ B^T D^T ]

  int32x4_t q0_v1 = a0->val[0];
  int32x4_t q0_v2 = a1->val[0];
  int32x4_t q0_v3 = a2->val[0];
  int32x4_t q0_v4 = a3->val[0];

  int32x4_t q1_v1 = a0->val[1];
  int32x4_t q1_v2 = a1->val[1];
  int32x4_t q1_v3 = a2->val[1];
  int32x4_t q1_v4 = a3->val[1];

  int32x4_t q2_v1 = a4->val[0];
  int32x4_t q2_v2 = a5->val[0];
  int32x4_t q2_v3 = a6->val[0];
  int32x4_t q2_v4 = a7->val[0];

  int32x4_t q3_v1 = a4->val[1];
  int32x4_t q3_v2 = a5->val[1];
  int32x4_t q3_v3 = a6->val[1];
  int32x4_t q3_v4 = a7->val[1];

  transpose_elems_inplace_s32_4x4(&q0_v1, &q0_v2, &q0_v3, &q0_v4);  // A^T
  transpose_elems_inplace_s32_4x4(&q1_v1, &q1_v2, &q1_v3, &q1_v4);  // B^T
  transpose_elems_inplace_s32_4x4(&q2_v1, &q2_v2, &q2_v3, &q2_v4);  // C^T
  transpose_elems_inplace_s32_4x4(&q3_v1, &q3_v2, &q3_v3, &q3_v4);  // D^T

  a0->val[0] = q0_v1;
  a1->val[0] = q0_v2;
  a2->val[0] = q0_v3;
  a3->val[0] = q0_v4;

  // C^T lands in the top-right quadrant and B^T in the bottom-left, per the
  // block decomposition above.
  a0->val[1] = q2_v1;
  a1->val[1] = q2_v2;
  a2->val[1] = q2_v3;
  a3->val[1] = q2_v4;

  a4->val[0] = q1_v1;
  a5->val[0] = q1_v2;
  a6->val[0] = q1_v3;
  a7->val[0] = q1_v4;

  a4->val[1] = q3_v1;
  a5->val[1] = q3_v2;
  a6->val[1] = q3_v3;
  a7->val[1] = q3_v4;
}

// Transpose a 4x4 matrix of signed 16-bit elements from in[0..3] (rows) to
// out[0..3] (columns).
static inline void transpose_arrays_s16_4x4(const int16x4_t *const in,
                                            int16x4_t *const out) {
  int16x4_t a0 = in[0];
  int16x4_t a1 = in[1];
  int16x4_t a2 = in[2];
  int16x4_t a3 = in[3];

  transpose_elems_inplace_s16_4x4(&a0, &a1, &a2, &a3);

  out[0] = a0;
  out[1] = a1;
  out[2] = a2;
  out[3] = a3;
}

// Transpose a 4x8 matrix of signed 16-bit elements: in[0..7] hold one
// 4-element row each, out[0..3] receive one 8-element column each.
static inline void transpose_arrays_s16_4x8(const int16x4_t *const in,
                                            int16x8_t *const out) {
#if AOM_ARCH_AARCH64
  // Interleave pairs of rows with ZIP1; the upper halves (zeros) are unused.
  const int16x8_t a0 = vzip1q_s16(vcombine_s16(in[0], vdup_n_s16(0)),
                                  vcombine_s16(in[1], vdup_n_s16(0)));
  const int16x8_t a1 = vzip1q_s16(vcombine_s16(in[2], vdup_n_s16(0)),
                                  vcombine_s16(in[3], vdup_n_s16(0)));
  const int16x8_t a2 = vzip1q_s16(vcombine_s16(in[4], vdup_n_s16(0)),
                                  vcombine_s16(in[5], vdup_n_s16(0)));
  const int16x8_t a3 = vzip1q_s16(vcombine_s16(in[6], vdup_n_s16(0)),
                                  vcombine_s16(in[7], vdup_n_s16(0)));
#else
  int16x4x2_t temp;
  temp = vzip_s16(in[0], in[1]);
  const int16x8_t a0 = vcombine_s16(temp.val[0], temp.val[1]);
  temp = vzip_s16(in[2], in[3]);
  const int16x8_t a1 = vcombine_s16(temp.val[0], temp.val[1]);
  temp = vzip_s16(in[4], in[5]);
  const int16x8_t a2 = vcombine_s16(temp.val[0], temp.val[1]);
  temp = vzip_s16(in[6], in[7]);
  const int16x8_t a3 = vcombine_s16(temp.val[0], temp.val[1]);
#endif

  // Interleave 32-bit pairs across row groups.
  const int32x4x2_t b02 =
      vzipq_s32(vreinterpretq_s32_s16(a0), vreinterpretq_s32_s16(a1));
  const int32x4x2_t b13 =
      vzipq_s32(vreinterpretq_s32_s16(a2), vreinterpretq_s32_s16(a3));

#if AOM_ARCH_AARCH64
  out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[0]),
                                            vreinterpretq_s64_s32(b13.val[0])));
  out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[0]),
                                            vreinterpretq_s64_s32(b13.val[0])));
  out[2] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[1]),
                                            vreinterpretq_s64_s32(b13.val[1])));
  out[3] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[1]),
                                            vreinterpretq_s64_s32(b13.val[1])));
#else
  // No 64-bit ZIP on 32-bit Arm: build each output with a pair of VEXTs.
  out[0] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(b02.val[0], b02.val[0], 2), b13.val[0], 2));
  out[2] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(b02.val[1], b02.val[1], 2), b13.val[1], 2));
  out[1] = vreinterpretq_s16_s32(
      vextq_s32(b02.val[0], vextq_s32(b13.val[0], b13.val[0], 2), 2));
  out[3] = vreinterpretq_s16_s32(
      vextq_s32(b02.val[1], vextq_s32(b13.val[1], b13.val[1], 2), 2));
#endif
}

// Transpose an 8x4 matrix of signed 16-bit elements: in[0..3] hold one
// 8-element row each, out[0..7] receive one 4-element column each.
static inline void transpose_arrays_s16_8x4(const int16x8_t *const in,
                                            int16x4_t *const out) {
  // Swap 16 bit elements. Goes from:
  // in[0]: 00 01 02 03 04 05 06 07
  // in[1]: 10 11 12 13 14 15 16 17
  // in[2]: 20 21 22 23 24 25 26 27
  // in[3]: 30 31 32 33 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37

  const int16x8x2_t b0 = vtrnq_s16(in[0], in[1]);
  const int16x8x2_t b1 = vtrnq_s16(in[2], in[3]);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37

  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_s16(b0.val[0]),
                                    vreinterpretq_u32_s16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_s16(b0.val[1]),
                                    vreinterpretq_u32_s16(b1.val[1]));

  // Unpack 64 bit elements resulting in:
  // out[0]: 00 10 20 30
  // out[1]: 01 11 21 31
  // out[2]: 02 12 22 32
  // out[3]: 03 13 23 33
  // out[4]: 04 14 24 34
  // out[5]: 05 15 25 35
  // out[6]: 06 16 26 36
  // out[7]: 07 17 27 37

  out[0] = vget_low_s16(vreinterpretq_s16_u32(c0.val[0]));
  out[1] = vget_low_s16(vreinterpretq_s16_u32(c1.val[0]));
  out[2] = vget_low_s16(vreinterpretq_s16_u32(c0.val[1]));
  out[3] = vget_low_s16(vreinterpretq_s16_u32(c1.val[1]));
  out[4] = vget_high_s16(vreinterpretq_s16_u32(c0.val[0]));
  out[5] = vget_high_s16(vreinterpretq_s16_u32(c1.val[0]));
  out[6] = vget_high_s16(vreinterpretq_s16_u32(c0.val[1]));
  out[7] = vget_high_s16(vreinterpretq_s16_u32(c1.val[1]));
}

// Transpose a 4x4 matrix of signed 64-bit elements. Each row occupies two
// int64x2_t vectors, so in[] and out[] each hold 8 vectors.
static inline void transpose_arrays_s64_4x4(const int64x2_t *in,
                                            int64x2_t *out) {
  // Perform a 4x4 matrix transpose going from:
  // in[0] = 00 01
  // in[1] = 02 03
  // in[2] = 10 11
  // in[3] = 12 13
  // in[4] = 20 21
  // in[5] = 22 23
  // in[6] = 30 31
  // in[7] = 32 33
  //
  // to:
  // out[0] = 00 10
  // out[1] = 20 30
  // out[2] = 01 11
  // out[3] = 21 31
  // out[4] = 02 12
  // out[5] = 22 32
  // out[6] = 03 13
  // out[7] = 23 33

  out[0] = aom_vtrn1q_s64(in[0], in[2]);
  out[1] = aom_vtrn1q_s64(in[4], in[6]);
  out[2] = aom_vtrn2q_s64(in[0], in[2]);
  out[3] = aom_vtrn2q_s64(in[4], in[6]);
  out[4] = aom_vtrn1q_s64(in[1], in[3]);
  out[5] = aom_vtrn1q_s64(in[5], in[7]);
  out[6] = aom_vtrn2q_s64(in[1], in[3]);
  out[7] = aom_vtrn2q_s64(in[5], in[7]);
}

#endif  // AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_