mbyte.h (3892B)
1 #pragma once 2 3 #include <stdbool.h> 4 #include <stdint.h> 5 #include <sys/types.h> // IWYU pragma: keep 6 #include <utf8proc.h> 7 #include <uv.h> // IWYU pragma: keep 8 9 #include "nvim/cmdexpand_defs.h" // IWYU pragma: keep 10 #include "nvim/eval/typval_defs.h" // IWYU pragma: keep 11 #include "nvim/macros_defs.h" 12 #include "nvim/mbyte_defs.h" // IWYU pragma: keep 13 #include "nvim/types_defs.h" // IWYU pragma: keep 14 15 #define GRAPHEME_STATE_INIT 0 16 17 #include "mbyte.h.generated.h" 18 #include "mbyte.h.inline.generated.h" 19 20 enum { 21 kInvalidByteCells = 4, 22 }; 23 24 // Return byte length of character that starts with byte "b". 25 // Returns 1 for a single-byte character. 26 // MB_BYTE2LEN_CHECK() can be used to count a special key as one byte. 27 // Don't call MB_BYTE2LEN(b) with b < 0 or b > 255! 28 #define MB_BYTE2LEN(b) utf8len_tab[b] 29 #define MB_BYTE2LEN_CHECK(b) (((b) < 0 || (b) > 255) ? 1 : utf8len_tab[b]) 30 31 extern const uint8_t utf8len_tab_zero[256]; 32 33 extern const uint8_t utf8len_tab[256]; 34 35 // Use our own character-case definitions, because the current locale may 36 // differ from what the .spl file uses. 37 // These must not be called with negative number! 38 // Multi-byte implementation. For Unicode we can call utf_*(), but don't do 39 // that for ASCII, because we don't want to use 'casemap' here. Otherwise use 40 // the "w" library function for characters above 255. 41 #define SPELL_TOFOLD(c) ((c) >= 128 ? utf_fold(c) : (int)spelltab.st_fold[c]) 42 43 #define SPELL_TOUPPER(c) ((c) >= 128 ? mb_toupper(c) : (int)spelltab.st_upper[c]) 44 45 #define SPELL_ISUPPER(c) ((c) >= 128 ? mb_isupper(c) : spelltab.st_isu[c]) 46 47 // MB_PTR_ADV(): advance a pointer to the next character, taking care of 48 // multi-byte characters if needed. Skip over composing chars. 49 #define MB_PTR_ADV(p) (p += utfc_ptr2len((char *)p)) 50 51 // MB_PTR_BACK(): backup a pointer to the previous character, taking care of 52 // multi-byte characters if needed. Only use with "p" > "s" ! 53 #define MB_PTR_BACK(s, p) \ 54 (p -= utf_head_off((char *)(s), (char *)(p) - 1) + 1) 55 56 /// Check whether a given UTF-8 byte is a trailing byte (10xx.xxxx). 57 58 static inline bool utf_is_trail_byte(uint8_t const byte) 59 FUNC_ATTR_CONST FUNC_ATTR_ALWAYS_INLINE 60 { 61 // uint8_t is for clang to use smaller cmp 62 return (uint8_t)(byte & 0xC0U) == 0x80U; 63 } 64 65 /// Convert a UTF-8 byte sequence to a Unicode code point. 66 /// Handles ascii, multibyte sequiences and illegal sequences. 67 /// 68 /// @param[in] p_in String to convert. 69 /// 70 /// @return information abouth the character. When the sequence is illegal, 71 /// "value" is negative, "len" is 1. 72 static inline CharInfo utf_ptr2CharInfo(char const *const p_in) 73 FUNC_ATTR_NONNULL_ALL FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_ALWAYS_INLINE 74 { 75 uint8_t const *const p = (uint8_t const *)p_in; 76 uint8_t const first = *p; 77 if (first < 0x80) { 78 return (CharInfo){ .value = first, .len = 1 }; 79 } else { 80 int len = utf8len_tab[first]; 81 int32_t const code_point = utf_ptr2CharInfo_impl(p, (uintptr_t)len); 82 if (code_point < 0) { 83 len = 1; 84 } 85 return (CharInfo){ .value = code_point, .len = len }; 86 } 87 } 88 89 /// Return information about the next character. 90 /// Composing and combining characters are considered a part of the current character. 91 /// 92 /// @param[in] cur Information about the current character in the string. 93 static inline StrCharInfo utfc_next(StrCharInfo cur) 94 FUNC_ATTR_NONNULL_ALL FUNC_ATTR_ALWAYS_INLINE FUNC_ATTR_PURE 95 { 96 // handle ASCII case inline 97 uint8_t *next = (uint8_t *)(cur.ptr + cur.chr.len); 98 if (EXPECT(*next < 0x80U, true)) { 99 return (StrCharInfo){ 100 .ptr = (char *)next, 101 .chr = (CharInfo){ .value = *next, .len = 1 }, 102 }; 103 } 104 105 return utfc_next_impl(cur); 106 } 107 108 static inline StrCharInfo utf_ptr2StrCharInfo(char *ptr) 109 FUNC_ATTR_NONNULL_ALL FUNC_ATTR_ALWAYS_INLINE FUNC_ATTR_PURE 110 { 111 return (StrCharInfo){ .ptr = ptr, .chr = utf_ptr2CharInfo(ptr) }; 112 }