neovim

Neovim text editor
git clone https://git.dasho.dev/neovim.git
Log | Files | Refs | README

mbyte.h (3892B)


      1 #pragma once
      2 
      3 #include <stdbool.h>
      4 #include <stdint.h>
      5 #include <sys/types.h>  // IWYU pragma: keep
      6 #include <utf8proc.h>
      7 #include <uv.h>  // IWYU pragma: keep
      8 
      9 #include "nvim/cmdexpand_defs.h"  // IWYU pragma: keep
     10 #include "nvim/eval/typval_defs.h"  // IWYU pragma: keep
     11 #include "nvim/macros_defs.h"
     12 #include "nvim/mbyte_defs.h"  // IWYU pragma: keep
     13 #include "nvim/types_defs.h"  // IWYU pragma: keep
     14 
     15 #define GRAPHEME_STATE_INIT 0
     16 
     17 #include "mbyte.h.generated.h"
     18 #include "mbyte.h.inline.generated.h"
     19 
     20 enum {
     21  kInvalidByteCells = 4,
     22 };
     23 
     24 // Return byte length of character that starts with byte "b".
     25 // Returns 1 for a single-byte character.
     26 // MB_BYTE2LEN_CHECK() can be used to count a special key as one byte.
     27 // Don't call MB_BYTE2LEN(b) with b < 0 or b > 255!
     28 #define MB_BYTE2LEN(b)         utf8len_tab[b]
     29 #define MB_BYTE2LEN_CHECK(b)   (((b) < 0 || (b) > 255) ? 1 : utf8len_tab[b])
     30 
     31 extern const uint8_t utf8len_tab_zero[256];
     32 
     33 extern const uint8_t utf8len_tab[256];
     34 
     35 // Use our own character-case definitions, because the current locale may
     36 // differ from what the .spl file uses.
     37 // These must not be called with negative number!
     38 // Multi-byte implementation.  For Unicode we can call utf_*(), but don't do
     39 // that for ASCII, because we don't want to use 'casemap' here.  Otherwise use
     40 // the "w" library function for characters above 255.
     41 #define SPELL_TOFOLD(c) ((c) >= 128 ? utf_fold(c) : (int)spelltab.st_fold[c])
     42 
     43 #define SPELL_TOUPPER(c) ((c) >= 128 ? mb_toupper(c) : (int)spelltab.st_upper[c])
     44 
     45 #define SPELL_ISUPPER(c) ((c) >= 128 ? mb_isupper(c) : spelltab.st_isu[c])
     46 
     47 // MB_PTR_ADV(): advance a pointer to the next character, taking care of
     48 // multi-byte characters if needed. Skip over composing chars.
     49 #define MB_PTR_ADV(p)      (p += utfc_ptr2len((char *)p))
     50 
     51 // MB_PTR_BACK(): backup a pointer to the previous character, taking care of
     52 // multi-byte characters if needed. Only use with "p" > "s" !
     53 #define MB_PTR_BACK(s, p) \
     54  (p -= utf_head_off((char *)(s), (char *)(p) - 1) + 1)
     55 
     56 /// Check whether a given UTF-8 byte is a trailing byte (10xx.xxxx).
     57 
     58 static inline bool utf_is_trail_byte(uint8_t const byte)
     59  FUNC_ATTR_CONST FUNC_ATTR_ALWAYS_INLINE
     60 {
     61  // uint8_t is for clang to use smaller cmp
     62  return (uint8_t)(byte & 0xC0U) == 0x80U;
     63 }
     64 
     65 /// Convert a UTF-8 byte sequence to a Unicode code point.
     66 /// Handles ascii, multibyte sequiences and illegal sequences.
     67 ///
     68 /// @param[in]  p_in  String to convert.
     69 ///
     70 /// @return information abouth the character. When the sequence is illegal,
     71 /// "value" is negative, "len" is 1.
     72 static inline CharInfo utf_ptr2CharInfo(char const *const p_in)
     73  FUNC_ATTR_NONNULL_ALL FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_ALWAYS_INLINE
     74 {
     75  uint8_t const *const p = (uint8_t const *)p_in;
     76  uint8_t const first = *p;
     77  if (first < 0x80) {
     78    return (CharInfo){ .value = first, .len = 1 };
     79  } else {
     80    int len = utf8len_tab[first];
     81    int32_t const code_point = utf_ptr2CharInfo_impl(p, (uintptr_t)len);
     82    if (code_point < 0) {
     83      len = 1;
     84    }
     85    return (CharInfo){ .value = code_point, .len = len };
     86  }
     87 }
     88 
     89 /// Return information about the next character.
     90 /// Composing and combining characters are considered a part of the current character.
     91 ///
     92 /// @param[in] cur  Information about the current character in the string.
     93 static inline StrCharInfo utfc_next(StrCharInfo cur)
     94  FUNC_ATTR_NONNULL_ALL FUNC_ATTR_ALWAYS_INLINE FUNC_ATTR_PURE
     95 {
     96  // handle ASCII case inline
     97  uint8_t *next = (uint8_t *)(cur.ptr + cur.chr.len);
     98  if (EXPECT(*next < 0x80U, true)) {
     99    return (StrCharInfo){
    100      .ptr = (char *)next,
    101      .chr = (CharInfo){ .value = *next, .len = 1 },
    102    };
    103  }
    104 
    105  return utfc_next_impl(cur);
    106 }
    107 
    108 static inline StrCharInfo utf_ptr2StrCharInfo(char *ptr)
    109  FUNC_ATTR_NONNULL_ALL FUNC_ATTR_ALWAYS_INLINE FUNC_ATTR_PURE
    110 {
    111  return (StrCharInfo){ .ptr = ptr, .chr = utf_ptr2CharInfo(ptr) };
    112 }