neon.h (4031B)
1 // Copyright 2014 Google Inc. All Rights Reserved. 2 // 3 // Use of this source code is governed by a BSD-style license 4 // that can be found in the COPYING file in the root of the source 5 // tree. An additional intellectual property rights grant can be found 6 // in the file PATENTS. All contributing project authors may 7 // be found in the AUTHORS file in the root of the source tree. 8 // ----------------------------------------------------------------------------- 9 // 10 // NEON common code. 11 12 #ifndef WEBP_DSP_NEON_H_ 13 #define WEBP_DSP_NEON_H_ 14 15 #include "src/dsp/dsp.h" 16 17 #if defined(WEBP_USE_NEON) 18 19 #include <arm_neon.h> 20 21 // Right now, some intrinsics functions seem slower, so we disable them 22 // everywhere except newer clang/gcc or aarch64 where the inline assembly is 23 // incompatible. 24 #if LOCAL_CLANG_PREREQ(3, 8) || LOCAL_GCC_PREREQ(4, 9) || WEBP_AARCH64 25 #define WEBP_USE_INTRINSICS // use intrinsics when possible 26 #endif 27 28 #define INIT_VECTOR2(v, a, b) do { \ 29 v.val[0] = a; \ 30 v.val[1] = b; \ 31 } while (0) 32 33 #define INIT_VECTOR3(v, a, b, c) do { \ 34 v.val[0] = a; \ 35 v.val[1] = b; \ 36 v.val[2] = c; \ 37 } while (0) 38 39 #define INIT_VECTOR4(v, a, b, c, d) do { \ 40 v.val[0] = a; \ 41 v.val[1] = b; \ 42 v.val[2] = c; \ 43 v.val[3] = d; \ 44 } while (0) 45 46 // if using intrinsics, this flag avoids some functions that make gcc-4.6.3 47 // crash ("internal compiler error: in immed_double_const, at emit-rtl."). 48 // (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183) 49 #if !(LOCAL_CLANG_PREREQ(3, 8) || LOCAL_GCC_PREREQ(4, 8) || WEBP_AARCH64) 50 #define WORK_AROUND_GCC 51 #endif 52 53 static WEBP_INLINE int32x4x4_t Transpose4x4_NEON(const int32x4x4_t rows) { 54 uint64x2x2_t row01, row23; 55 56 row01.val[0] = vreinterpretq_u64_s32(rows.val[0]); 57 row01.val[1] = vreinterpretq_u64_s32(rows.val[1]); 58 row23.val[0] = vreinterpretq_u64_s32(rows.val[2]); 59 row23.val[1] = vreinterpretq_u64_s32(rows.val[3]); 60 // Transpose 64-bit values (there's no vswp equivalent) 61 { 62 const uint64x1_t row0h = vget_high_u64(row01.val[0]); 63 const uint64x1_t row2l = vget_low_u64(row23.val[0]); 64 const uint64x1_t row1h = vget_high_u64(row01.val[1]); 65 const uint64x1_t row3l = vget_low_u64(row23.val[1]); 66 row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l); 67 row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0])); 68 row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l); 69 row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1])); 70 } 71 { 72 const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]), 73 vreinterpretq_s32_u64(row01.val[1])); 74 const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]), 75 vreinterpretq_s32_u64(row23.val[1])); 76 int32x4x4_t out; 77 out.val[0] = out01.val[0]; 78 out.val[1] = out01.val[1]; 79 out.val[2] = out23.val[0]; 80 out.val[3] = out23.val[1]; 81 return out; 82 } 83 } 84 85 #if 0 // Useful debug macro. 86 #include <stdio.h> 87 #define PRINT_REG(REG, SIZE) do { \ 88 int i; \ 89 printf("%s \t[%d]: 0x", #REG, SIZE); \ 90 if (SIZE == 8) { \ 91 uint8_t _tmp[8]; \ 92 vst1_u8(_tmp, (REG)); \ 93 for (i = 0; i < 8; ++i) printf("%.2x ", _tmp[i]); \ 94 } else if (SIZE == 16) { \ 95 uint16_t _tmp[4]; \ 96 vst1_u16(_tmp, (REG)); \ 97 for (i = 0; i < 4; ++i) printf("%.4x ", _tmp[i]); \ 98 } \ 99 printf("\n"); \ 100 } while (0) 101 #endif 102 103 #endif // WEBP_USE_NEON 104 #endif // WEBP_DSP_NEON_H_