neovim

Neovim text editor
git clone https://git.dasho.dev/neovim.git
Log | Files | Refs | README

spell.c (111068B)


      1 // spell.c: code for spell checking
      2 //
      3 // See spellfile.c for the Vim spell file format.
      4 //
      5 // The spell checking mechanism uses a tree (aka trie).  Each node in the tree
      6 // has a list of bytes that can appear (siblings).  For each byte there is a
      7 // pointer to the node with the byte that follows in the word (child).
      8 //
      9 // A NUL byte is used where the word may end.  The bytes are sorted, so that
     10 // binary searching can be used and the NUL bytes are at the start.  The
     11 // number of possible bytes is stored before the list of bytes.
     12 //
     13 // The tree uses two arrays: "byts" stores the characters, "idxs" stores
     14 // either the next index or flags.  The tree starts at index 0.  For example,
     15 // to lookup "vi" this sequence is followed:
     16 //      i = 0
     17 //      len = byts[i]
     18 //      n = where "v" appears in byts[i + 1] to byts[i + len]
     19 //      i = idxs[n]
     20 //      len = byts[i]
     21 //      n = where "i" appears in byts[i + 1] to byts[i + len]
     22 //      i = idxs[n]
     23 //      len = byts[i]
     24 //      find that byts[i + 1] is 0, idxs[i + 1] has flags for "vi".
     25 //
     26 // There are two word trees: one with case-folded words and one with words in
     27 // original case.  The second one is only used for keep-case words and is
     28 // usually small.
     29 //
     30 // There is one additional tree for when not all prefixes are applied when
     31 // generating the .spl file.  This tree stores all the possible prefixes, as
     32 // if they were words.  At each word (prefix) end the prefix nr is stored, the
     33 // following word must support this prefix nr.  And the condition nr is
     34 // stored, used to lookup the condition that the word must match with.
     35 //
     36 // Thanks to Olaf Seibert for providing an example implementation of this tree
     37 // and the compression mechanism.
     38 // LZ trie ideas, original link (now dead)
     39 //      irb.hr/hr/home/ristov/papers/RistovLZtrieRevision1.pdf
     40 // More papers: http://www-igm.univ-mlv.fr/~laporte/publi_en.html
     41 //
     42 // Matching involves checking the caps type: Onecap ALLCAP KeepCap.
     43 //
     44 // Why doesn't Vim use aspell/ispell/myspell/etc.?
     45 // See ":help develop-spell".
     46 
     47 // Use SPELL_PRINTTREE for debugging: dump the word tree after adding a word.
     48 // Only use it for small word lists!
     49 
     50 // Use SPELL_COMPRESS_ALWAYS for debugging: compress the word tree after
     51 // adding a word.  Only use it for small word lists!
     52 
     53 // Use DEBUG_TRIEWALK to print the changes made in suggest_trie_walk() for a
     54 // specific word.
     55 
     56 #include <assert.h>
     57 #include <inttypes.h>
     58 #include <limits.h>
     59 #include <stdbool.h>
     60 #include <stddef.h>
     61 #include <stdio.h>
     62 #include <string.h>
     63 
     64 #include "nvim/ascii_defs.h"
     65 #include "nvim/autocmd.h"
     66 #include "nvim/autocmd_defs.h"
     67 #include "nvim/buffer.h"
     68 #include "nvim/buffer_defs.h"
     69 #include "nvim/change.h"
     70 #include "nvim/charset.h"
     71 #include "nvim/cursor.h"
     72 #include "nvim/decoration.h"
     73 #include "nvim/decoration_provider.h"
     74 #include "nvim/drawscreen.h"
     75 #include "nvim/errors.h"
     76 #include "nvim/ex_cmds.h"
     77 #include "nvim/ex_cmds_defs.h"
     78 #include "nvim/ex_docmd.h"
     79 #include "nvim/garray.h"
     80 #include "nvim/garray_defs.h"
     81 #include "nvim/gettext_defs.h"
     82 #include "nvim/globals.h"
     83 #include "nvim/hashtab.h"
     84 #include "nvim/hashtab_defs.h"
     85 #include "nvim/highlight_defs.h"
     86 #include "nvim/insexpand.h"
     87 #include "nvim/log.h"
     88 #include "nvim/macros_defs.h"
     89 #include "nvim/mark_defs.h"
     90 #include "nvim/mbyte.h"
     91 #include "nvim/mbyte_defs.h"
     92 #include "nvim/memline.h"
     93 #include "nvim/memory.h"
     94 #include "nvim/message.h"
     95 #include "nvim/option.h"
     96 #include "nvim/option_defs.h"
     97 #include "nvim/option_vars.h"
     98 #include "nvim/os/fs.h"
     99 #include "nvim/os/input.h"
    100 #include "nvim/os/os_defs.h"
    101 #include "nvim/path.h"
    102 #include "nvim/pos_defs.h"
    103 #include "nvim/regexp.h"
    104 #include "nvim/regexp_defs.h"
    105 #include "nvim/runtime.h"
    106 #include "nvim/search.h"
    107 #include "nvim/spell.h"
    108 #include "nvim/spell_defs.h"
    109 #include "nvim/spellfile.h"
    110 #include "nvim/spellsuggest.h"
    111 #include "nvim/strings.h"
    112 #include "nvim/syntax.h"
    113 #include "nvim/types_defs.h"
    114 #include "nvim/undo.h"
    115 #include "nvim/vim_defs.h"
    116 #include "nvim/window.h"
    117 
    118 // Result values.  Lower number is accepted over higher one.
    119 enum {
    120  SP_BANNED = -1,
    121  SP_RARE = 0,
    122  SP_OK = 1,
    123  SP_LOCAL = 2,
    124  SP_BAD = 3,
    125 };
    126 
    127 // First language that is loaded, start of the linked list of loaded
    128 // languages.
    129 slang_T *first_lang = NULL;
    130 
    131 // file used for "zG" and "zW"
    132 char *int_wordlist = NULL;
    133 
    134 // Structure to store info for word matching.
    135 typedef struct {
    136  langp_T *mi_lp;                   // info for language and region
    137 
    138  // pointers to original text to be checked
    139  char *mi_word;                   // start of word being checked
    140  char *mi_end;                    // end of matching word so far
    141  char *mi_fend;                   // next char to be added to mi_fword
    142  char *mi_cend;                   // char after what was used for
    143                                   // mi_capflags
    144 
    145  // case-folded text
    146  char mi_fword[MAXWLEN + 1];           // mi_word case-folded
    147  int mi_fwordlen;                      // nr of valid bytes in mi_fword
    148 
    149  // for when checking word after a prefix
    150  int mi_prefarridx;                    // index in sl_pidxs with list of
    151                                        // affixID/condition
    152  int mi_prefcnt;                       // number of entries at mi_prefarridx
    153  int mi_prefixlen;                     // byte length of prefix
    154  int mi_cprefixlen;                    // byte length of prefix in original
    155                                        // case
    156 
    157  // for when checking a compound word
    158  int mi_compoff;                       // start of following word offset
    159  uint8_t mi_compflags[MAXWLEN];        // flags for compound words used
    160  int mi_complen;                       // nr of compound words used
    161  int mi_compextra;                     // nr of COMPOUNDROOT words
    162 
    163  // others
    164  int mi_result;                        // result so far: SP_BAD, SP_OK, etc.
    165  int mi_capflags;                      // WF_ONECAP WF_ALLCAP WF_KEEPCAP
    166  win_T *mi_win;                  // buffer being checked
    167 
    168  // for NOBREAK
    169  int mi_result2;                       // "mi_result" without following word
    170  char *mi_end2;                        // "mi_end" without following word
    171 } matchinf_T;
    172 
    173 // Structure used for the cookie argument of do_in_runtimepath().
    174 typedef struct {
    175  char sl_lang[MAXWLEN + 1];            // language name
    176  slang_T *sl_slang;                    // resulting slang_T struct
    177  int sl_nobreak;                       // NOBREAK language found
    178 } spelload_T;
    179 
    180 #define SY_MAXLEN   30
    181 typedef struct {
    182  char sy_chars[SY_MAXLEN];               // the sequence of chars
    183  int sy_len;
    184 } syl_item_T;
    185 
    186 spelltab_T spelltab;
    187 bool did_set_spelltab;
    188 
    189 #include "spell.c.generated.h"
    190 
    191 /// mode values for find_word
    192 enum {
    193  FIND_FOLDWORD     = 0,  ///< find word case-folded
    194  FIND_KEEPWORD     = 1,  ///< find keep-case word
    195  FIND_PREFIX       = 2,  ///< find word after prefix
    196  FIND_COMPOUND     = 3,  ///< find case-folded compound word
    197  FIND_KEEPCOMPOUND = 4,  ///< find keep-case compound word
    198 };
    199 
    200 /// type values for get_char_type
    201 enum {
    202  CHAR_OTHER = 0,
    203  CHAR_UPPER = 1,
    204  CHAR_DIGIT = 2,
    205 };
    206 
    207 char *e_format = N_("E759: Format error in spell file");
    208 
    209 // Remember what "z?" replaced.
    210 char *repl_from = NULL;
    211 char *repl_to = NULL;
    212 
    213 /// Main spell-checking function.
    214 /// "ptr" points to a character that could be the start of a word.
    215 /// "*attrp" is set to the highlight index for a badly spelled word.  For a
    216 /// non-word or when it's OK it remains unchanged.
    217 /// This must only be called when 'spelllang' is not empty.
    218 ///
    219 /// "capcol" is used to check for a Capitalised word after the end of a
    220 /// sentence.  If it's zero then perform the check.  Return the column where to
    221 /// check next, or -1 when no sentence end was found.  If it's NULL then don't
    222 /// worry.
    223 ///
    224 /// @param wp  current window
    225 /// @param capcol  column to check for Capital
    226 /// @param docount  count good words
    227 ///
    228 /// @return  the length of the word in bytes, also when it's OK, so that the
    229 /// caller can skip over the word.
    230 size_t spell_check(win_T *wp, char *ptr, hlf_T *attrp, int *capcol, bool docount)
    231 {
    232  // A word never starts at a space or a control character. Return quickly
    233  // then, skipping over the character.
    234  if ((uint8_t)(*ptr) <= ' ') {
    235    return 1;
    236  }
    237 
    238  // Return here when loading language files failed.
    239  if (GA_EMPTY(&wp->w_s->b_langp)) {
    240    return 1;
    241  }
    242 
    243  size_t nrlen = 0;              // found a number first
    244  size_t wrongcaplen = 0;
    245  bool count_word = docount;
    246  bool use_camel_case = (wp->w_s->b_p_spo_flags & kOptSpoFlagCamel) != 0;
    247  bool is_camel_case = false;
    248 
    249  matchinf_T mi;  // Most things are put in "mi" so that it can be passed to functions quickly.
    250  CLEAR_FIELD(mi);
    251 
    252  // A number is always OK.  Also skip hexadecimal numbers 0xFF99 and
    253  // 0X99FF.  But always do check spelling to find "3GPP" and "11
    254  // julifeest".
    255  if (*ptr >= '0' && *ptr <= '9') {
    256    if (*ptr == '0' && (ptr[1] == 'b' || ptr[1] == 'B')) {
    257      mi.mi_end = (char *)skipbin(ptr + 2);
    258    } else if (*ptr == '0' && (ptr[1] == 'x' || ptr[1] == 'X')) {
    259      mi.mi_end = skiphex(ptr + 2);
    260    } else {
    261      mi.mi_end = skipdigits(ptr);
    262    }
    263    nrlen = (size_t)(mi.mi_end - ptr);
    264  }
    265 
    266  // Find the normal end of the word (until the next non-word character).
    267  mi.mi_word = ptr;
    268  mi.mi_fend = ptr;
    269  if (spell_iswordp(mi.mi_fend, wp)) {
    270    if (use_camel_case) {
    271      mi.mi_fend = advance_camelcase_word(ptr, wp, &is_camel_case);
    272    } else {
    273      do {
    274        MB_PTR_ADV(mi.mi_fend);
    275      } while (*mi.mi_fend != NUL && spell_iswordp(mi.mi_fend, wp));
    276    }
    277 
    278    if (capcol != NULL && *capcol == 0 && wp->w_s->b_cap_prog != NULL) {
    279      // Check word starting with capital letter.
    280      int c = utf_ptr2char(ptr);
    281      if (!SPELL_ISUPPER(c)) {
    282        wrongcaplen = (size_t)(mi.mi_fend - ptr);
    283      }
    284    }
    285  }
    286  if (capcol != NULL) {
    287    *capcol = -1;
    288  }
    289 
    290  // We always use the characters up to the next non-word character,
    291  // also for bad words.
    292  mi.mi_end = mi.mi_fend;
    293 
    294  // Check caps type later.
    295  mi.mi_capflags = 0;
    296  mi.mi_cend = NULL;
    297  mi.mi_win = wp;
    298 
    299  // case-fold the word with one non-word character, so that we can check
    300  // for the word end.
    301  if (*mi.mi_fend != NUL) {
    302    MB_PTR_ADV(mi.mi_fend);
    303  }
    304 
    305  spell_casefold(wp, ptr, (int)(mi.mi_fend - ptr), mi.mi_fword,
    306                 MAXWLEN + 1);
    307  mi.mi_fwordlen = (int)strlen(mi.mi_fword);
    308 
    309  if (is_camel_case && mi.mi_fwordlen > 0) {
    310    // introduce a fake word end space into the folded word.
    311    mi.mi_fword[mi.mi_fwordlen - 1] = ' ';
    312  }
    313 
    314  // The word is bad unless we recognize it.
    315  mi.mi_result = SP_BAD;
    316  mi.mi_result2 = SP_BAD;
    317 
    318  // Loop over the languages specified in 'spelllang'.
    319  // We check them all, because a word may be matched longer in another
    320  // language.
    321  for (int lpi = 0; lpi < wp->w_s->b_langp.ga_len; lpi++) {
    322    mi.mi_lp = LANGP_ENTRY(wp->w_s->b_langp, lpi);
    323 
    324    // If reloading fails the language is still in the list but everything
    325    // has been cleared.
    326    if (mi.mi_lp->lp_slang->sl_fidxs == NULL) {
    327      continue;
    328    }
    329 
    330    // Check for a matching word in case-folded words.
    331    find_word(&mi, FIND_FOLDWORD);
    332 
    333    // Check for a matching word in keep-case words.
    334    find_word(&mi, FIND_KEEPWORD);
    335 
    336    // Check for matching prefixes.
    337    find_prefix(&mi, FIND_FOLDWORD);
    338 
    339    // For a NOBREAK language, may want to use a word without a following
    340    // word as a backup.
    341    if (mi.mi_lp->lp_slang->sl_nobreak && mi.mi_result == SP_BAD
    342        && mi.mi_result2 != SP_BAD) {
    343      mi.mi_result = mi.mi_result2;
    344      mi.mi_end = mi.mi_end2;
    345    }
    346 
    347    // Count the word in the first language where it's found to be OK.
    348    if (count_word && mi.mi_result == SP_OK) {
    349      count_common_word(mi.mi_lp->lp_slang, ptr,
    350                        (int)(mi.mi_end - ptr), 1);
    351      count_word = false;
    352    }
    353  }
    354 
    355  if (mi.mi_result != SP_OK) {
    356    // If we found a number skip over it.  Allows for "42nd".  Do flag
    357    // rare and local words, e.g., "3GPP".
    358    if (nrlen > 0) {
    359      if (mi.mi_result == SP_BAD || mi.mi_result == SP_BANNED) {
    360        return nrlen;
    361      }
    362    } else if (!spell_iswordp_nmw(ptr, wp)) {
    363      // When we are at a non-word character there is no error, just
    364      // skip over the character (try looking for a word after it).
    365      if (capcol != NULL && wp->w_s->b_cap_prog != NULL) {
    366        regmatch_T regmatch;
    367 
    368        // Check for end of sentence.
    369        regmatch.regprog = wp->w_s->b_cap_prog;
    370        regmatch.rm_ic = false;
    371        bool r = vim_regexec(&regmatch, ptr, 0);
    372        wp->w_s->b_cap_prog = regmatch.regprog;
    373        if (r) {
    374          *capcol = (int)(regmatch.endp[0] - ptr);
    375        }
    376      }
    377 
    378      return (size_t)(utfc_ptr2len(ptr));
    379    } else if (mi.mi_end == ptr) {
    380      // Always include at least one character.  Required for when there
    381      // is a mixup in "midword".
    382      MB_PTR_ADV(mi.mi_end);
    383    } else if (mi.mi_result == SP_BAD
    384               && LANGP_ENTRY(wp->w_s->b_langp, 0)->lp_slang->sl_nobreak) {
    385      char *p;
    386      int save_result = mi.mi_result;
    387 
    388      // First language in 'spelllang' is NOBREAK.  Find first position
    389      // at which any word would be valid.
    390      mi.mi_lp = LANGP_ENTRY(wp->w_s->b_langp, 0);
    391      if (mi.mi_lp->lp_slang->sl_fidxs != NULL) {
    392        p = mi.mi_word;
    393        char *fp = mi.mi_fword;
    394        while (true) {
    395          MB_PTR_ADV(p);
    396          MB_PTR_ADV(fp);
    397          if (p >= mi.mi_end) {
    398            break;
    399          }
    400          mi.mi_compoff = (int)(fp - mi.mi_fword);
    401          find_word(&mi, FIND_COMPOUND);
    402          if (mi.mi_result != SP_BAD) {
    403            mi.mi_end = p;
    404            break;
    405          }
    406        }
    407        mi.mi_result = save_result;
    408      }
    409    }
    410 
    411    if (mi.mi_result == SP_BAD || mi.mi_result == SP_BANNED) {
    412      *attrp = HLF_SPB;
    413    } else if (mi.mi_result == SP_RARE) {
    414      *attrp = HLF_SPR;
    415    } else {
    416      *attrp = HLF_SPL;
    417    }
    418  }
    419 
    420  if (wrongcaplen > 0 && (mi.mi_result == SP_OK || mi.mi_result == SP_RARE)) {
    421    // Report SpellCap only when the word isn't badly spelled.
    422    *attrp = HLF_SPC;
    423    return wrongcaplen;
    424  }
    425 
    426  return (size_t)(mi.mi_end - ptr);
    427 }
    428 
    429 /// Determine the type of character "c".
    430 static int get_char_type(int c)
    431 {
    432  if (ascii_isdigit(c)) {
    433    return CHAR_DIGIT;
    434  }
    435  if (SPELL_ISUPPER(c)) {
    436    return CHAR_UPPER;
    437  }
    438  return CHAR_OTHER;
    439 }
    440 
    441 /// Returns a pointer to the end of the word starting at "str".
    442 /// Supports camelCase words.
    443 static char *advance_camelcase_word(char *str, win_T *wp, bool *is_camel_case)
    444 {
    445  char *end = str;
    446 
    447  *is_camel_case = false;
    448 
    449  if (*str == NUL) {
    450    return str;
    451  }
    452 
    453  int c = utf_ptr2char(end);
    454  MB_PTR_ADV(end);
    455  // We need at most the types of the type of the last two chars.
    456  int last_last_type = -1;
    457  int last_type = get_char_type(c);
    458 
    459  while (*end != NUL && spell_iswordp(end, wp)) {
    460    c = utf_ptr2char(end);
    461    int this_type = get_char_type(c);
    462 
    463    if (last_last_type == CHAR_UPPER && last_type == CHAR_UPPER
    464        && this_type == CHAR_OTHER) {
    465      // Handle the following cases:
    466      // UpperUpperLower
    467      *is_camel_case = true;
    468      // Back up by one char.
    469      MB_PTR_BACK(str, end);
    470      break;
    471    } else if ((this_type == CHAR_UPPER && last_type == CHAR_OTHER)
    472               || (this_type != last_type
    473                   && (this_type == CHAR_DIGIT || last_type == CHAR_DIGIT))) {
    474      // Handle the following cases:
    475      // LowerUpper LowerDigit UpperDigit DigitUpper DigitLower
    476      *is_camel_case = true;
    477      break;
    478    }
    479 
    480    last_last_type = last_type;
    481    last_type = this_type;
    482 
    483    MB_PTR_ADV(end);
    484  }
    485 
    486  return end;
    487 }
    488 
    489 // Check if the word at "mip->mi_word" is in the tree.
    490 // When "mode" is FIND_FOLDWORD check in fold-case word tree.
    491 // When "mode" is FIND_KEEPWORD check in keep-case word tree.
    492 // When "mode" is FIND_PREFIX check for word after prefix in fold-case word
    493 // tree.
    494 //
    495 // For a match mip->mi_result is updated.
    496 static void find_word(matchinf_T *mip, int mode)
    497 {
    498  int wlen = 0;
    499  int flen;
    500  char *ptr;
    501  slang_T *slang = mip->mi_lp->lp_slang;
    502  uint8_t *byts;
    503  idx_T *idxs;
    504 
    505  if (mode == FIND_KEEPWORD || mode == FIND_KEEPCOMPOUND) {
    506    // Check for word with matching case in keep-case tree.
    507    ptr = mip->mi_word;
    508    flen = 9999;                    // no case folding, always enough bytes
    509    byts = slang->sl_kbyts;
    510    idxs = slang->sl_kidxs;
    511 
    512    if (mode == FIND_KEEPCOMPOUND) {
    513      // Skip over the previously found word(s).
    514      wlen += mip->mi_compoff;
    515    }
    516  } else {
    517    // Check for case-folded in case-folded tree.
    518    ptr = mip->mi_fword;
    519    flen = mip->mi_fwordlen;        // available case-folded bytes
    520    byts = slang->sl_fbyts;
    521    idxs = slang->sl_fidxs;
    522 
    523    if (mode == FIND_PREFIX) {
    524      // Skip over the prefix.
    525      wlen = mip->mi_prefixlen;
    526      flen -= mip->mi_prefixlen;
    527    } else if (mode == FIND_COMPOUND) {
    528      // Skip over the previously found word(s).
    529      wlen = mip->mi_compoff;
    530      flen -= mip->mi_compoff;
    531    }
    532  }
    533 
    534  if (byts == NULL) {
    535    return;                     // array is empty
    536  }
    537  idx_T arridx = 0;
    538  int endlen[MAXWLEN];              // length at possible word endings
    539  idx_T endidx[MAXWLEN];            // possible word endings
    540  int endidxcnt = 0;
    541 
    542  // Repeat advancing in the tree until:
    543  // - there is a byte that doesn't match,
    544  // - we reach the end of the tree,
    545  // - or we reach the end of the line.
    546  while (true) {
    547    if (flen <= 0 && *mip->mi_fend != NUL) {
    548      flen = fold_more(mip);
    549    }
    550 
    551    int len = byts[arridx++];
    552 
    553    // If the first possible byte is a zero the word could end here.
    554    // Remember this index, we first check for the longest word.
    555    if (byts[arridx] == 0) {
    556      if (endidxcnt == MAXWLEN) {
    557        // Must be a corrupted spell file.
    558        emsg(_(e_format));
    559        return;
    560      }
    561      endlen[endidxcnt] = wlen;
    562      endidx[endidxcnt++] = arridx++;
    563      len--;
    564 
    565      // Skip over the zeros, there can be several flag/region
    566      // combinations.
    567      while (len > 0 && byts[arridx] == 0) {
    568        arridx++;
    569        len--;
    570      }
    571      if (len == 0) {
    572        break;              // no children, word must end here
    573      }
    574    }
    575 
    576    // Stop looking at end of the line.
    577    if (ptr[wlen] == NUL) {
    578      break;
    579    }
    580 
    581    // Perform a binary search in the list of accepted bytes.
    582    int c = (uint8_t)ptr[wlen];
    583    if (c == TAB) {         // <Tab> is handled like <Space>
    584      c = ' ';
    585    }
    586    idx_T lo = arridx;
    587    idx_T hi = arridx + len - 1;
    588    while (lo < hi) {
    589      idx_T m = (lo + hi) / 2;
    590      if (byts[m] > c) {
    591        hi = m - 1;
    592      } else if (byts[m] < c) {
    593        lo = m + 1;
    594      } else {
    595        lo = hi = m;
    596        break;
    597      }
    598    }
    599 
    600    // Stop if there is no matching byte.
    601    if (hi < lo || byts[lo] != c) {
    602      break;
    603    }
    604 
    605    // Continue at the child (if there is one).
    606    arridx = idxs[lo];
    607    wlen++;
    608    flen--;
    609 
    610    // One space in the good word may stand for several spaces in the
    611    // checked word.
    612    if (c == ' ') {
    613      while (true) {
    614        if (flen <= 0 && *mip->mi_fend != NUL) {
    615          flen = fold_more(mip);
    616        }
    617        if (ptr[wlen] != ' ' && ptr[wlen] != TAB) {
    618          break;
    619        }
    620        wlen++;
    621        flen--;
    622      }
    623    }
    624  }
    625 
    626  // Verify that one of the possible endings is valid.  Try the longest
    627  // first.
    628  while (endidxcnt > 0) {
    629    endidxcnt--;
    630    arridx = endidx[endidxcnt];
    631    wlen = endlen[endidxcnt];
    632 
    633    if (utf_head_off(ptr, ptr + wlen) > 0) {
    634      continue;             // not at first byte of character
    635    }
    636    bool word_ends;
    637    if (spell_iswordp(ptr + wlen, mip->mi_win)) {
    638      if (slang->sl_compprog == NULL && !slang->sl_nobreak) {
    639        continue;                   // next char is a word character
    640      }
    641      word_ends = false;
    642    } else {
    643      word_ends = true;
    644    }
    645    // The prefix flag is before compound flags.  Once a valid prefix flag
    646    // has been found we try compound flags.
    647    bool prefix_found = false;
    648 
    649    if (mode != FIND_KEEPWORD) {
    650      // Compute byte length in original word, length may change
    651      // when folding case.  This can be slow, take a shortcut when the
    652      // case-folded word is equal to the keep-case word.
    653      char *p = mip->mi_word;
    654      if (strncmp(ptr, p, (size_t)wlen) != 0) {
    655        for (char *s = ptr; s < ptr + wlen; MB_PTR_ADV(s)) {
    656          MB_PTR_ADV(p);
    657        }
    658        wlen = (int)(p - mip->mi_word);
    659      }
    660    }
    661 
    662    // Check flags and region.  For FIND_PREFIX check the condition and
    663    // prefix ID.
    664    // Repeat this if there are more flags/region alternatives until there
    665    // is a match.
    666    for (int len = byts[arridx - 1]; len > 0 && byts[arridx] == 0; len--, arridx++) {
    667      uint32_t flags = (uint32_t)idxs[arridx];
    668 
    669      // For the fold-case tree check that the case of the checked word
    670      // matches with what the word in the tree requires.
    671      // For keep-case tree the case is always right.  For prefixes we
    672      // don't bother to check.
    673      if (mode == FIND_FOLDWORD) {
    674        if (mip->mi_cend != mip->mi_word + wlen) {
    675          // mi_capflags was set for a different word length, need
    676          // to do it again.
    677          mip->mi_cend = mip->mi_word + wlen;
    678          mip->mi_capflags = captype(mip->mi_word, mip->mi_cend);
    679        }
    680 
    681        if (mip->mi_capflags == WF_KEEPCAP
    682            || !spell_valid_case(mip->mi_capflags, (int)flags)) {
    683          continue;
    684        }
    685      } else if (mode == FIND_PREFIX && !prefix_found) {
    686        // When mode is FIND_PREFIX the word must support the prefix:
    687        // check the prefix ID and the condition.  Do that for the list at
    688        // mip->mi_prefarridx that find_prefix() filled.
    689        int c = valid_word_prefix(mip->mi_prefcnt, mip->mi_prefarridx,
    690                                  (int)flags,
    691                                  mip->mi_word + mip->mi_cprefixlen, slang,
    692                                  false);
    693        if (c == 0) {
    694          continue;
    695        }
    696 
    697        // Use the WF_RARE flag for a rare prefix.
    698        if (c & WF_RAREPFX) {
    699          flags |= WF_RARE;
    700        }
    701        prefix_found = true;
    702      }
    703 
    704      if (slang->sl_nobreak) {
    705        if ((mode == FIND_COMPOUND || mode == FIND_KEEPCOMPOUND)
    706            && (flags & WF_BANNED) == 0) {
    707          // NOBREAK: found a valid following word.  That's all we
    708          // need to know, so return.
    709          mip->mi_result = SP_OK;
    710          break;
    711        }
    712      } else if ((mode == FIND_COMPOUND || mode == FIND_KEEPCOMPOUND
    713                  || !word_ends)) {
    714        // If there is no compound flag or the word is shorter than
    715        // COMPOUNDMIN reject it quickly.
    716        // Makes you wonder why someone puts a compound flag on a word
    717        // that's too short...  Myspell compatibility requires this
    718        // anyway.
    719        if (((unsigned)flags >> 24) == 0
    720            || wlen - mip->mi_compoff < slang->sl_compminlen) {
    721          continue;
    722        }
    723        // For multi-byte chars check character length against
    724        // COMPOUNDMIN.
    725        if (slang->sl_compminlen > 0
    726            && mb_charlen_len(mip->mi_word + mip->mi_compoff,
    727                              wlen - mip->mi_compoff) < slang->sl_compminlen) {
    728          continue;
    729        }
    730 
    731        // Limit the number of compound words to COMPOUNDWORDMAX if no
    732        // maximum for syllables is specified.
    733        if (!word_ends && mip->mi_complen + mip->mi_compextra + 2
    734            > slang->sl_compmax
    735            && slang->sl_compsylmax == MAXWLEN) {
    736          continue;
    737        }
    738 
    739        // Don't allow compounding on a side where an affix was added,
    740        // unless COMPOUNDPERMITFLAG was used.
    741        if (mip->mi_complen > 0 && (flags & WF_NOCOMPBEF)) {
    742          continue;
    743        }
    744        if (!word_ends && (flags & WF_NOCOMPAFT)) {
    745          continue;
    746        }
    747 
    748        // Quickly check if compounding is possible with this flag.
    749        if (!byte_in_str(mip->mi_complen ==
    750                         0 ? slang->sl_compstartflags : slang->sl_compallflags,
    751                         (int)((unsigned)flags >> 24))) {
    752          continue;
    753        }
    754 
    755        // If there is a match with a CHECKCOMPOUNDPATTERN rule
    756        // discard the compound word.
    757        if (match_checkcompoundpattern(ptr, wlen, &slang->sl_comppat)) {
    758          continue;
    759        }
    760 
    761        if (mode == FIND_COMPOUND) {
    762          int capflags;
    763          char *p;
    764 
    765          // Need to check the caps type of the appended compound
    766          // word.
    767          if (strncmp(ptr, mip->mi_word, (size_t)mip->mi_compoff) != 0) {
    768            // case folding may have changed the length
    769            p = mip->mi_word;
    770            for (char *s = ptr; s < ptr + mip->mi_compoff; MB_PTR_ADV(s)) {
    771              MB_PTR_ADV(p);
    772            }
    773          } else {
    774            p = mip->mi_word + mip->mi_compoff;
    775          }
    776          capflags = captype(p, mip->mi_word + wlen);
    777          if (capflags == WF_KEEPCAP || (capflags == WF_ALLCAP
    778                                         && (flags & WF_FIXCAP) != 0)) {
    779            continue;
    780          }
    781 
    782          if (capflags != WF_ALLCAP) {
    783            // When the character before the word is a word
    784            // character we do not accept a Onecap word.  We do
    785            // accept a no-caps word, even when the dictionary
    786            // word specifies ONECAP.
    787            MB_PTR_BACK(mip->mi_word, p);
    788            if (spell_iswordp_nmw(p, mip->mi_win)
    789                ? capflags == WF_ONECAP
    790                : (flags & WF_ONECAP) != 0
    791                && capflags != WF_ONECAP) {
    792              continue;
    793            }
    794          }
    795        }
    796 
    797        // If the word ends the sequence of compound flags of the
    798        // words must match with one of the COMPOUNDRULE items and
    799        // the number of syllables must not be too large.
    800        mip->mi_compflags[mip->mi_complen] = (uint8_t)((unsigned)flags >> 24);
    801        mip->mi_compflags[mip->mi_complen + 1] = NUL;
    802        if (word_ends) {
    803          char fword[MAXWLEN] = { 0 };
    804 
    805          if (slang->sl_compsylmax < MAXWLEN) {
    806            // "fword" is only needed for checking syllables.
    807            if (ptr == mip->mi_word) {
    808              spell_casefold(mip->mi_win, ptr, wlen, fword, MAXWLEN);
    809            } else {
    810              xmemcpyz(fword, ptr, (size_t)endlen[endidxcnt]);
    811            }
    812          }
    813          if (!can_compound(slang, fword, mip->mi_compflags)) {
    814            continue;
    815          }
    816        } else if (slang->sl_comprules != NULL
    817                   && !match_compoundrule(slang, mip->mi_compflags)) {
    818          // The compound flags collected so far do not match any
    819          // COMPOUNDRULE, discard the compounded word.
    820          continue;
    821        }
    822      } else if (flags & WF_NEEDCOMP) {
    823        // skip if word is only valid in a compound
    824        continue;
    825      }
    826 
    827      int nobreak_result = SP_OK;
    828 
    829      if (!word_ends) {
    830        int save_result = mip->mi_result;
    831        char *save_end = mip->mi_end;
    832        langp_T *save_lp = mip->mi_lp;
    833 
    834        // Check that a valid word follows.  If there is one and we
    835        // are compounding, it will set "mi_result", thus we are
    836        // always finished here.  For NOBREAK we only check that a
    837        // valid word follows.
    838        // Recursive!
    839        if (slang->sl_nobreak) {
    840          mip->mi_result = SP_BAD;
    841        }
    842 
    843        // Find following word in case-folded tree.
    844        mip->mi_compoff = endlen[endidxcnt];
    845        if (mode == FIND_KEEPWORD) {
    846          // Compute byte length in case-folded word from "wlen":
    847          // byte length in keep-case word.  Length may change when
    848          // folding case.  This can be slow, take a shortcut when
    849          // the case-folded word is equal to the keep-case word.
    850          char *p = mip->mi_fword;
    851          if (strncmp(ptr, p, (size_t)wlen) != 0) {
    852            for (char *s = ptr; s < ptr + wlen; MB_PTR_ADV(s)) {
    853              MB_PTR_ADV(p);
    854            }
    855            mip->mi_compoff = (int)(p - mip->mi_fword);
    856          }
    857        }
    858        mip->mi_complen++;
    859        if (flags & WF_COMPROOT) {
    860          mip->mi_compextra++;
    861        }
    862 
    863        // For NOBREAK we need to try all NOBREAK languages, at least
    864        // to find the ".add" file(s).
    865        for (int lpi = 0; lpi < mip->mi_win->w_s->b_langp.ga_len; lpi++) {
    866          if (slang->sl_nobreak) {
    867            mip->mi_lp = LANGP_ENTRY(mip->mi_win->w_s->b_langp, lpi);
    868            if (mip->mi_lp->lp_slang->sl_fidxs == NULL
    869                || !mip->mi_lp->lp_slang->sl_nobreak) {
    870              continue;
    871            }
    872          }
    873 
    874          find_word(mip, FIND_COMPOUND);
    875 
    876          // When NOBREAK any word that matches is OK.  Otherwise we
    877          // need to find the longest match, thus try with keep-case
    878          // and prefix too.
    879          if (!slang->sl_nobreak || mip->mi_result == SP_BAD) {
    880            // Find following word in keep-case tree.
    881            mip->mi_compoff = wlen;
    882            find_word(mip, FIND_KEEPCOMPOUND);
    883          }
    884 
    885          if (!slang->sl_nobreak) {
    886            break;
    887          }
    888        }
    889        mip->mi_complen--;
    890        if (flags & WF_COMPROOT) {
    891          mip->mi_compextra--;
    892        }
    893        mip->mi_lp = save_lp;
    894 
    895        if (slang->sl_nobreak) {
    896          nobreak_result = mip->mi_result;
    897          mip->mi_result = save_result;
    898          mip->mi_end = save_end;
    899        } else {
    900          if (mip->mi_result == SP_OK) {
    901            break;
    902          }
    903          continue;
    904        }
    905      }
    906 
    907      int res = SP_BAD;
    908      if (flags & WF_BANNED) {
    909        res = SP_BANNED;
    910      } else if (flags & WF_REGION) {
    911        // Check region.
    912        if (((unsigned)mip->mi_lp->lp_region & (flags >> 16)) != 0) {
    913          res = SP_OK;
    914        } else {
    915          res = SP_LOCAL;
    916        }
    917      } else if (flags & WF_RARE) {
    918        res = SP_RARE;
    919      } else {
    920        res = SP_OK;
    921      }
    922 
    923      // Always use the longest match and the best result.  For NOBREAK
    924      // we separately keep the longest match without a following good
    925      // word as a fall-back.
    926      if (nobreak_result == SP_BAD) {
    927        if (mip->mi_result2 > res) {
    928          mip->mi_result2 = res;
    929          mip->mi_end2 = mip->mi_word + wlen;
    930        } else if (mip->mi_result2 == res
    931                   && mip->mi_end2 < mip->mi_word + wlen) {
    932          mip->mi_end2 = mip->mi_word + wlen;
    933        }
    934      } else if (mip->mi_result > res) {
    935        mip->mi_result = res;
    936        mip->mi_end = mip->mi_word + wlen;
    937      } else if (mip->mi_result == res && mip->mi_end < mip->mi_word + wlen) {
    938        mip->mi_end = mip->mi_word + wlen;
    939      }
    940 
    941      if (mip->mi_result == SP_OK) {
    942        break;
    943      }
    944    }
    945 
    946    if (mip->mi_result == SP_OK) {
    947      break;
    948    }
    949  }
    950 }
    951 
    952 /// Returns true if there is a match between the word ptr[wlen] and
    953 /// CHECKCOMPOUNDPATTERN rules, assuming that we will concatenate with another
    954 /// word.
    955 /// A match means that the first part of CHECKCOMPOUNDPATTERN matches at the
    956 /// end of ptr[wlen] and the second part matches after it.
    957 ///
    958 /// @param gap  &sl_comppat
    959 bool match_checkcompoundpattern(char *ptr, int wlen, garray_T *gap)
    960 {
    961  for (int i = 0; i + 1 < gap->ga_len; i += 2) {
    962    char *p = ((char **)gap->ga_data)[i + 1];
    963    if (strncmp(ptr + wlen, p, strlen(p)) == 0) {
    964      // Second part matches at start of following compound word, now
    965      // check if first part matches at end of previous word.
    966      p = ((char **)gap->ga_data)[i];
    967      int len = (int)strlen(p);
    968      if (len <= wlen && strncmp(ptr + wlen - len, p, (size_t)len) == 0) {
    969        return true;
    970      }
    971    }
    972  }
    973  return false;
    974 }
    975 
    976 /// @return  true if "flags" is a valid sequence of compound flags and "word"
    977 ///          does not have too many syllables.
    978 bool can_compound(slang_T *slang, const char *word, const uint8_t *flags)
    979  FUNC_ATTR_NONNULL_ALL
    980 {
    981  char uflags[MAXWLEN * 2] = { 0 };
    982 
    983  if (slang->sl_compprog == NULL) {
    984    return false;
    985  }
    986  // Need to convert the single byte flags to utf8 characters.
    987  char *p = uflags;
    988  for (int i = 0; flags[i] != NUL; i++) {
    989    p += utf_char2bytes(flags[i], p);
    990  }
    991  *p = NUL;
    992  p = uflags;
    993  if (!vim_regexec_prog(&slang->sl_compprog, false, p, 0)) {
    994    return false;
    995  }
    996 
    997  // Count the number of syllables.  This may be slow, do it last.  If there
    998  // are too many syllables AND the number of compound words is above
    999  // COMPOUNDWORDMAX then compounding is not allowed.
   1000  if (slang->sl_compsylmax < MAXWLEN
   1001      && count_syllables(slang, word) > slang->sl_compsylmax) {
   1002    return (int)strlen((char *)flags) < slang->sl_compmax;
   1003  }
   1004  return true;
   1005 }
   1006 
   1007 // Returns true if the compound flags in compflags[] match the start of any
   1008 // compound rule.  This is used to stop trying a compound if the flags
   1009 // collected so far can't possibly match any compound rule.
   1010 // Caller must check that slang->sl_comprules is not NULL.
   1011 bool match_compoundrule(slang_T *slang, const uint8_t *compflags)
   1012 {
   1013  // loop over all the COMPOUNDRULE entries
   1014  for (char *p = (char *)slang->sl_comprules; *p != NUL; p++) {
   1015    // loop over the flags in the compound word we have made, match
   1016    // them against the current rule entry
   1017    for (int i = 0;; i++) {
   1018      int c = compflags[i];
   1019      if (c == NUL) {
   1020        // found a rule that matches for the flags we have so far
   1021        return true;
   1022      }
   1023      if (*p == '/' || *p == NUL) {
   1024        break;          // end of rule, it's too short
   1025      }
   1026      if (*p == '[') {
   1027        bool match = false;
   1028 
   1029        // compare against all the flags in []
   1030        p++;
   1031        while (*p != ']' && *p != NUL) {
   1032          if ((uint8_t)(*p++) == c) {
   1033            match = true;
   1034          }
   1035        }
   1036        if (!match) {
   1037          break;            // none matches
   1038        }
   1039      } else if ((uint8_t)(*p) != c) {
   1040        break;          // flag of word doesn't match flag in pattern
   1041      }
   1042      p++;
   1043    }
   1044 
   1045    // Skip to the next "/", where the next pattern starts.
   1046    p = vim_strchr(p, '/');
   1047    if (p == NULL) {
   1048      break;
   1049    }
   1050  }
   1051 
   1052  // Checked all the rules and none of them match the flags, so there
   1053  // can't possibly be a compound starting with these flags.
   1054  return false;
   1055 }
   1056 
   1057 /// Return non-zero if the prefix indicated by "arridx" matches with the prefix
   1058 /// ID in "flags" for the word "word".
   1059 /// The WF_RAREPFX flag is included in the return value for a rare prefix.
   1060 ///
   1061 /// @param totprefcnt  nr of prefix IDs
   1062 /// @param arridx  idx in sl_pidxs[]
   1063 /// @param cond_req  only use prefixes with a condition
   1064 int valid_word_prefix(int totprefcnt, int arridx, int flags, char *word, slang_T *slang,
   1065                      bool cond_req)
   1066 {
   1067  int prefid = (int)((unsigned)flags >> 24);
   1068  for (int prefcnt = totprefcnt - 1; prefcnt >= 0; prefcnt--) {
   1069    int pidx = slang->sl_pidxs[arridx + prefcnt];
   1070 
   1071    // Check the prefix ID.
   1072    if (prefid != (pidx & 0xff)) {
   1073      continue;
   1074    }
   1075 
   1076    // Check if the prefix doesn't combine and the word already has a
   1077    // suffix.
   1078    if ((flags & WF_HAS_AFF) && (pidx & WF_PFX_NC)) {
   1079      continue;
   1080    }
   1081 
   1082    // Check the condition, if there is one.  The condition index is
   1083    // stored in the two bytes above the prefix ID byte.
   1084    regprog_T **rp = &slang->sl_prefprog[((unsigned)pidx >> 8) & 0xffff];
   1085    if (*rp != NULL) {
   1086      if (!vim_regexec_prog(rp, false, word, 0)) {
   1087        continue;
   1088      }
   1089    } else if (cond_req) {
   1090      continue;
   1091    }
   1092 
   1093    // It's a match!  Return the WF_ flags.
   1094    return pidx;
   1095  }
   1096  return 0;
   1097 }
   1098 
   1099 // Check if the word at "mip->mi_word" has a matching prefix.
   1100 // If it does, then check the following word.
   1101 //
   1102 // If "mode" is "FIND_COMPOUND" then do the same after another word, find a
   1103 // prefix in a compound word.
   1104 //
   1105 // For a match mip->mi_result is updated.
   1106 static void find_prefix(matchinf_T *mip, int mode)
   1107 {
   1108  idx_T arridx = 0;
   1109  int wlen = 0;
   1110  slang_T *slang = mip->mi_lp->lp_slang;
   1111 
   1112  uint8_t *byts = slang->sl_pbyts;
   1113  if (byts == NULL) {
   1114    return;                     // array is empty
   1115  }
   1116  // We use the case-folded word here, since prefixes are always
   1117  // case-folded.
   1118  char *ptr = mip->mi_fword;
   1119  int flen = mip->mi_fwordlen;      // available case-folded bytes
   1120  if (mode == FIND_COMPOUND) {
   1121    // Skip over the previously found word(s).
   1122    ptr += mip->mi_compoff;
   1123    flen -= mip->mi_compoff;
   1124  }
   1125  idx_T *idxs = slang->sl_pidxs;
   1126 
   1127  // Repeat advancing in the tree until:
   1128  // - there is a byte that doesn't match,
   1129  // - we reach the end of the tree,
   1130  // - or we reach the end of the line.
   1131  while (true) {
   1132    if (flen == 0 && *mip->mi_fend != NUL) {
   1133      flen = fold_more(mip);
   1134    }
   1135 
   1136    int len = byts[arridx++];
   1137 
   1138    // If the first possible byte is a zero the prefix could end here.
   1139    // Check if the following word matches and supports the prefix.
   1140    if (byts[arridx] == 0) {
   1141      // There can be several prefixes with different conditions.  We
   1142      // try them all, since we don't know which one will give the
   1143      // longest match.  The word is the same each time, pass the list
   1144      // of possible prefixes to find_word().
   1145      mip->mi_prefarridx = arridx;
   1146      mip->mi_prefcnt = len;
   1147      while (len > 0 && byts[arridx] == 0) {
   1148        arridx++;
   1149        len--;
   1150      }
   1151      mip->mi_prefcnt -= len;
   1152 
   1153      // Find the word that comes after the prefix.
   1154      mip->mi_prefixlen = wlen;
   1155      if (mode == FIND_COMPOUND) {
   1156        // Skip over the previously found word(s).
   1157        mip->mi_prefixlen += mip->mi_compoff;
   1158      }
   1159 
   1160      // Case-folded length may differ from original length.
   1161      mip->mi_cprefixlen = nofold_len(mip->mi_fword, mip->mi_prefixlen,
   1162                                      mip->mi_word);
   1163      find_word(mip, FIND_PREFIX);
   1164 
   1165      if (len == 0) {
   1166        break;              // no children, word must end here
   1167      }
   1168    }
   1169 
   1170    // Stop looking at end of the line.
   1171    if (ptr[wlen] == NUL) {
   1172      break;
   1173    }
   1174 
   1175    // Perform a binary search in the list of accepted bytes.
   1176    int c = (uint8_t)ptr[wlen];
   1177    idx_T lo = arridx;
   1178    idx_T hi = arridx + len - 1;
   1179    while (lo < hi) {
   1180      idx_T m = (lo + hi) / 2;
   1181      if (byts[m] > c) {
   1182        hi = m - 1;
   1183      } else if (byts[m] < c) {
   1184        lo = m + 1;
   1185      } else {
   1186        lo = hi = m;
   1187        break;
   1188      }
   1189    }
   1190 
   1191    // Stop if there is no matching byte.
   1192    if (hi < lo || byts[lo] != c) {
   1193      break;
   1194    }
   1195 
   1196    // Continue at the child (if there is one).
   1197    arridx = idxs[lo];
   1198    wlen++;
   1199    flen--;
   1200  }
   1201 }
   1202 
   1203 // Need to fold at least one more character.  Do until next non-word character
   1204 // for efficiency.  Include the non-word character too.
   1205 // Return the length of the folded chars in bytes.
   1206 static int fold_more(matchinf_T *mip)
   1207 {
   1208  char *p = mip->mi_fend;
   1209  do {
   1210    MB_PTR_ADV(mip->mi_fend);
   1211  } while (*mip->mi_fend != NUL && spell_iswordp(mip->mi_fend, mip->mi_win));
   1212 
   1213  // Include the non-word character so that we can check for the word end.
   1214  if (*mip->mi_fend != NUL) {
   1215    MB_PTR_ADV(mip->mi_fend);
   1216  }
   1217 
   1218  spell_casefold(mip->mi_win, p, (int)(mip->mi_fend - p),
   1219                 mip->mi_fword + mip->mi_fwordlen,
   1220                 MAXWLEN - mip->mi_fwordlen);
   1221  int flen = (int)strlen(mip->mi_fword + mip->mi_fwordlen);
   1222  mip->mi_fwordlen += flen;
   1223  return flen;
   1224 }
   1225 
   1226 /// Checks case flags for a word. Returns true, if the word has the requested
   1227 /// case.
   1228 ///
   1229 /// @param wordflags Flags for the checked word.
   1230 /// @param treeflags Flags for the word in the spell tree.
   1231 bool spell_valid_case(int wordflags, int treeflags)
   1232 {
   1233  return (wordflags == WF_ALLCAP && (treeflags & WF_FIXCAP) == 0)
   1234         || ((treeflags & (WF_ALLCAP | WF_KEEPCAP)) == 0
   1235             && ((treeflags & WF_ONECAP) == 0
   1236                 || (wordflags & WF_ONECAP) != 0));
   1237 }
   1238 
   1239 /// Return true if spell checking is enabled for "wp".
   1240 bool spell_check_window(win_T *wp)
   1241 {
   1242  return wp->w_p_spell
   1243         && *wp->w_s->b_p_spl != NUL
   1244         && wp->w_s->b_langp.ga_len > 0
   1245         && *(char **)(wp->w_s->b_langp.ga_data) != NULL;
   1246 }
   1247 
   1248 /// Return true and give an error if spell checking is not enabled.
   1249 bool no_spell_checking(win_T *wp)
   1250 {
   1251  if (!wp->w_p_spell || *wp->w_s->b_p_spl == NUL || GA_EMPTY(&wp->w_s->b_langp)) {
   1252    emsg(_(e_no_spell));
   1253    return true;
   1254  }
   1255  return false;
   1256 }
   1257 
   1258 static void decor_spell_nav_start(win_T *wp)
   1259 {
   1260  decor_state = (DecorState){ 0 };
   1261  decor_redraw_reset(wp, &decor_state);
   1262 }
   1263 
   1264 static TriState decor_spell_nav_col(win_T *wp, linenr_T lnum, linenr_T *decor_lnum, int col)
   1265 {
   1266  if (*decor_lnum != lnum) {
   1267    decor_providers_invoke_spell(wp, lnum - 1, col, lnum - 1, -1);
   1268    decor_redraw_line(wp, lnum - 1, &decor_state);
   1269    *decor_lnum = lnum;
   1270  }
   1271  decor_redraw_col(wp, col, 0, false, &decor_state);
   1272  return decor_state.spell;
   1273 }
   1274 
   1275 static inline bool can_syn_spell(win_T *wp, linenr_T lnum, int col)
   1276 {
   1277  bool can_spell;
   1278  syn_get_id(wp, lnum, col, false, &can_spell, false);
   1279  return can_spell;
   1280 }
   1281 
   1282 /// Moves to the next spell error.
   1283 /// "curline" is false for "[s", "]s", "[S" and "]S".
   1284 /// "curline" is true to find word under/after cursor in the same line.
   1285 /// For Insert mode completion "dir" is BACKWARD and "curline" is true: move
   1286 /// to after badly spelled word before the cursor.
   1287 ///
   1288 /// @param dir  FORWARD or BACKWARD
   1289 /// @param behaviour  Behaviour of the function
   1290 /// @param attrp  return: attributes of bad word or NULL (only when "dir" is FORWARD)
   1291 ///
   1292 /// @return  0 if not found, length of the badly spelled word otherwise.
   1293 size_t spell_move_to(win_T *wp, int dir, smt_T behaviour, bool curline, hlf_T *attrp)
   1294 {
   1295  if (no_spell_checking(wp)) {
   1296    return 0;
   1297  }
   1298 
   1299  pos_T found_pos;
   1300  size_t found_len = 0;
   1301  hlf_T attr = HLF_COUNT;
   1302  bool has_syntax = syntax_present(wp);
   1303  char *buf = NULL;
   1304  size_t buflen = 0;
   1305  int skip = 0;
   1306  colnr_T capcol = -1;
   1307  bool found_one = false;
   1308  bool wrapped = false;
   1309 
   1310  size_t ret = 0;
   1311 
   1312  // Start looking for bad word at the start of the line, because we can't
   1313  // start halfway through a word, we don't know where it starts or ends.
   1314  //
   1315  // When searching backwards, we continue in the line to find the last
   1316  // bad word (in the cursor line: before the cursor).
   1317  //
   1318  // We concatenate the start of the next line, so that wrapped words work
   1319  // (e.g. "et<line-break>cetera").  Doesn't work when searching backwards
   1320  // though...
   1321  linenr_T lnum = wp->w_cursor.lnum;
   1322  clearpos(&found_pos);
   1323 
   1324  // Ephemeral extmarks are currently stored in the global decor_state.
   1325  // When looking for spell errors, we need to:
   1326  //  - temporarily reset decor_state
   1327  //  - run the _on_spell_nav decor callback for each line we look at
   1328  //  - detect if any spell marks are present
   1329  //  - restore decor_state to the value saved here.
   1330  // TODO(lewis6991): un-globalize decor_state and allow ephemeral marks to be stored into a
   1331  // temporary DecorState.
   1332  DecorState saved_decor_start = decor_state;
   1333  linenr_T decor_lnum = -1;
   1334  decor_spell_nav_start(wp);
   1335 
   1336  while (!got_int) {
   1337    char *line = ml_get_buf(wp->w_buffer, lnum);
   1338 
   1339    size_t len = (size_t)ml_get_buf_len(wp->w_buffer, lnum);
   1340    if (buflen < len + MAXWLEN + 2) {
   1341      xfree(buf);
   1342      buflen = len + MAXWLEN + 2;
   1343      buf = xmalloc(buflen);
   1344    }
   1345    assert(buf && buflen >= len + MAXWLEN + 2);
   1346 
   1347    // In first line check first word for Capital.
   1348    if (lnum == 1) {
   1349      capcol = 0;
   1350    }
   1351 
   1352    // For checking first word with a capital skip white space.
   1353    if (capcol == 0) {
   1354      capcol = (colnr_T)getwhitecols(line);
   1355    } else if (curline && wp == curwin) {
   1356      // For spellbadword(): check if first word needs a capital.
   1357      colnr_T col = (colnr_T)getwhitecols(line);
   1358      if (check_need_cap(curwin, lnum, col)) {
   1359        capcol = col;
   1360      }
   1361 
   1362      // Need to get the line again, may have looked at the previous
   1363      // one.
   1364      line = ml_get_buf(wp->w_buffer, lnum);
   1365    }
   1366 
   1367    // Copy the line into "buf" and append the start of the next line if
   1368    // possible.  Note: this ml_get_buf() may make "line" invalid, check
   1369    // for empty line first.
   1370    bool empty_line = *skipwhite(line) == NUL;
   1371    STRCPY(buf, line);
   1372    if (lnum < wp->w_buffer->b_ml.ml_line_count) {
   1373      spell_cat_line(buf + strlen(buf),
   1374                     ml_get_buf(wp->w_buffer, lnum + 1),
   1375                     MAXWLEN);
   1376    }
   1377    char *p = buf + skip;
   1378    char *endp = buf + len;
   1379    while (p < endp) {
   1380      // When searching backward don't search after the cursor.  Unless
   1381      // we wrapped around the end of the buffer.
   1382      if (dir == BACKWARD
   1383          && lnum == wp->w_cursor.lnum
   1384          && !wrapped
   1385          && (colnr_T)(p - buf) >= wp->w_cursor.col) {
   1386        break;
   1387      }
   1388 
   1389      // start of word
   1390      attr = HLF_COUNT;
   1391      len = spell_check(wp, p, &attr, &capcol, false);
   1392 
   1393      if (attr != HLF_COUNT) {
   1394        // We found a bad word.  Check the attribute.
   1395        if (behaviour == SMT_ALL
   1396            || (behaviour == SMT_BAD && attr == HLF_SPB)
   1397            || (behaviour == SMT_RARE && attr == HLF_SPR)) {
   1398          // When searching forward only accept a bad word after
   1399          // the cursor.
   1400          if (dir == BACKWARD
   1401              || lnum != wp->w_cursor.lnum
   1402              || wrapped
   1403              || ((colnr_T)(curline
   1404                            ? p - buf + (ptrdiff_t)len
   1405                            : p - buf) > wp->w_cursor.col)) {
   1406            colnr_T col = (colnr_T)(p - buf);
   1407 
   1408            bool no_plain_buffer = (wp->w_s->b_p_spo_flags & kOptSpoFlagNoplainbuffer) != 0;
   1409            bool can_spell = !no_plain_buffer;
   1410            switch (decor_spell_nav_col(wp, lnum, &decor_lnum, col)) {
   1411            case kTrue:
   1412              can_spell = true; break;
   1413            case kFalse:
   1414              can_spell = false; break;
   1415            case kNone:
   1416              if (has_syntax) {
   1417                can_spell = can_syn_spell(wp, lnum, col);
   1418              }
   1419            }
   1420 
   1421            if (!can_spell) {
   1422              attr = HLF_COUNT;
   1423            }
   1424 
   1425            if (can_spell) {
   1426              found_one = true;
   1427              found_pos = (pos_T) {
   1428                .lnum = lnum,
   1429                .col = col,
   1430                .coladd = 0
   1431              };
   1432              if (dir == FORWARD) {
   1433                // No need to search further.
   1434                wp->w_cursor = found_pos;
   1435                if (attrp != NULL) {
   1436                  *attrp = attr;
   1437                }
   1438                ret = len;
   1439                goto theend;
   1440              } else if (curline) {
   1441                // Insert mode completion: put cursor after
   1442                // the bad word.
   1443                assert(len <= INT_MAX);
   1444                found_pos.col += (int)len;
   1445              }
   1446              found_len = len;
   1447            }
   1448          } else {
   1449            found_one = true;
   1450          }
   1451        }
   1452      }
   1453 
   1454      // advance to character after the word
   1455      p += len;
   1456      assert(len <= INT_MAX);
   1457      capcol -= (int)len;
   1458    }
   1459 
   1460    if (dir == BACKWARD && found_pos.lnum != 0) {
   1461      // Use the last match in the line (before the cursor).
   1462      wp->w_cursor = found_pos;
   1463      ret = found_len;
   1464      goto theend;
   1465    }
   1466 
   1467    if (curline) {
   1468      break;            // only check cursor line
   1469    }
   1470 
   1471    // If we are back at the starting line and searched it again there
   1472    // is no match, give up.
   1473    if (lnum == wp->w_cursor.lnum && wrapped) {
   1474      break;
   1475    }
   1476 
   1477    // Advance to next line.
   1478    if (dir == BACKWARD) {
   1479      if (lnum > 1) {
   1480        lnum--;
   1481      } else if (!p_ws) {
   1482        break;              // at first line and 'nowrapscan'
   1483      } else {
   1484        // Wrap around to the end of the buffer.  May search the
   1485        // starting line again and accept the last match.
   1486        lnum = wp->w_buffer->b_ml.ml_line_count;
   1487        wrapped = true;
   1488        if (!shortmess(SHM_SEARCH)) {
   1489          give_warning(_(top_bot_msg), true, false);
   1490        }
   1491      }
   1492      capcol = -1;
   1493    } else {
   1494      if (lnum < wp->w_buffer->b_ml.ml_line_count) {
   1495        lnum++;
   1496      } else if (!p_ws) {
   1497        break;              // at first line and 'nowrapscan'
   1498      } else {
   1499        // Wrap around to the start of the buffer.  May search the
   1500        // starting line again and accept the first match.
   1501        lnum = 1;
   1502        wrapped = true;
   1503        if (!shortmess(SHM_SEARCH)) {
   1504          give_warning(_(bot_top_msg), true, false);
   1505        }
   1506      }
   1507 
   1508      // If we are back at the starting line and there is no match then
   1509      // give up.
   1510      if (lnum == wp->w_cursor.lnum && !found_one) {
   1511        break;
   1512      }
   1513 
   1514      // Skip the characters at the start of the next line that were
   1515      // included in a match crossing line boundaries.
   1516      if (attr == HLF_COUNT) {
   1517        skip = (int)(p - endp);
   1518      } else {
   1519        skip = 0;
   1520      }
   1521 
   1522      // Capcol skips over the inserted space.
   1523      capcol--;
   1524 
   1525      // But after empty line check first word in next line
   1526      if (empty_line) {
   1527        capcol = 0;
   1528      }
   1529    }
   1530 
   1531    line_breakcheck();
   1532  }
   1533 
   1534 theend:
   1535  decor_state_free(&decor_state);
   1536  decor_state = saved_decor_start;
   1537  xfree(buf);
   1538  return ret;
   1539 }
   1540 
   1541 // For spell checking: concatenate the start of the following line "line" into
   1542 // "buf", blanking-out special characters.  Copy less than "maxlen" bytes.
   1543 // Keep the blanks at the start of the next line, this is used in win_line()
   1544 // to skip those bytes if the word was OK.
   1545 void spell_cat_line(char *buf, char *line, int maxlen)
   1546 {
   1547  char *p = skipwhite(line);
   1548  while (vim_strchr("*#/\"\t", (uint8_t)(*p)) != NULL) {
   1549    p = skipwhite(p + 1);
   1550  }
   1551 
   1552  if (*p == NUL) {
   1553    return;
   1554  }
   1555 
   1556  // Only worth concatenating if there is something else than spaces to
   1557  // concatenate.
   1558  int n = (int)(p - line) + 1;
   1559  if (n < maxlen - 1) {
   1560    memset(buf, ' ', (size_t)n);
   1561    xstrlcpy(buf + n, p, (size_t)(maxlen - n));
   1562  }
   1563 }
   1564 
   1565 // Load word list(s) for "lang" from Vim spell file(s).
   1566 // "lang" must be the language without the region: e.g., "en".
   1567 static void spell_load_lang(char *lang)
   1568 {
   1569  char fname_enc[85];
   1570  int r;
   1571  spelload_T sl;
   1572 
   1573  // Copy the language name to pass it to spell_load_cb() as a cookie.
   1574  // It's truncated when an error is detected.
   1575  STRCPY(sl.sl_lang, lang);
   1576  sl.sl_slang = NULL;
   1577  sl.sl_nobreak = false;
   1578 
   1579  // Disallow deleting the current buffer.  Autocommands can do weird things
   1580  // and cause "lang" to be freed.
   1581  curbuf->b_locked++;
   1582 
   1583  // We may retry when no spell file is found for the language, an
   1584  // autocommand may load it then.
   1585  for (int round = 1; round <= 2; round++) {
   1586    // Find the first spell file for "lang" in 'runtimepath' and load it.
   1587    vim_snprintf(fname_enc, sizeof(fname_enc) - 5,
   1588                 "spell/%s.%s.spl", lang, spell_enc());
   1589    r = do_in_runtimepath(fname_enc, 0, spell_load_cb, &sl);
   1590 
   1591    if (r == FAIL && *sl.sl_lang != NUL) {
   1592      // Try loading the ASCII version.
   1593      vim_snprintf(fname_enc, sizeof(fname_enc) - 5,
   1594                   "spell/%s.ascii.spl", lang);
   1595      r = do_in_runtimepath(fname_enc, 0, spell_load_cb, &sl);
   1596 
   1597      if (r == FAIL && *sl.sl_lang != NUL && round == 1
   1598          && apply_autocmds(EVENT_SPELLFILEMISSING, lang,
   1599                            curbuf->b_fname, false, curbuf)) {
   1600        continue;
   1601      }
   1602      break;
   1603    }
   1604    break;
   1605  }
   1606 
   1607  if (r == FAIL) {
   1608    if (starting) {
   1609      // Prompt the user at VimEnter if spell files are missing. #3027
   1610      // Plugins aren't loaded yet, so nvim/spellfile.lua cannot handle this case.
   1611      char autocmd_buf[512] = { 0 };
   1612      snprintf(autocmd_buf, sizeof(autocmd_buf),
   1613               "autocmd VimEnter * call v:lua.require'nvim.spellfile'.get('%s')|set spell",
   1614               lang);
   1615      do_cmdline_cmd(autocmd_buf);
   1616    } else {
   1617      smsg(0, _("Warning: Cannot find word list \"%s.%s.spl\" or \"%s.ascii.spl\""),
   1618           lang, spell_enc(), lang);
   1619    }
   1620  } else if (sl.sl_slang != NULL) {
   1621    // At least one file was loaded, now load ALL the additions.
   1622    STRCPY(fname_enc + strlen(fname_enc) - 3, "add.spl");
   1623    do_in_runtimepath(fname_enc, DIP_ALL, spell_load_cb, &sl);
   1624  }
   1625 
   1626  curbuf->b_locked--;
   1627 }
   1628 
   1629 // Return the encoding used for spell checking: Use 'encoding', except that we
   1630 // use "latin1" for "latin9".  And limit to 60 characters (just in case).
   1631 char *spell_enc(void)
   1632 {
   1633  if (strlen(p_enc) < 60 && strcmp(p_enc, "iso-8859-15") != 0) {
   1634    return p_enc;
   1635  }
   1636  return "latin1";
   1637 }
   1638 
   1639 // Get the name of the .spl file for the internal wordlist into
   1640 // "fname[MAXPATHL]".
   1641 static void int_wordlist_spl(char *fname)
   1642 {
   1643  vim_snprintf(fname, MAXPATHL, SPL_FNAME_TMPL,
   1644               int_wordlist, spell_enc());
   1645 }
   1646 
   1647 /// Allocate a new slang_T for language "lang".  "lang" can be NULL.
   1648 /// Caller must fill "sl_next".
   1649 slang_T *slang_alloc(char *lang)
   1650  FUNC_ATTR_NONNULL_RET
   1651 {
   1652  slang_T *lp = xcalloc(1, sizeof(slang_T));
   1653 
   1654  if (lang != NULL) {
   1655    lp->sl_name = xstrdup(lang);
   1656  }
   1657  ga_init(&lp->sl_rep, sizeof(fromto_T), 10);
   1658  ga_init(&lp->sl_repsal, sizeof(fromto_T), 10);
   1659  lp->sl_compmax = MAXWLEN;
   1660  lp->sl_compsylmax = MAXWLEN;
   1661  hash_init(&lp->sl_wordcount);
   1662 
   1663  return lp;
   1664 }
   1665 
   1666 // Free the contents of an slang_T and the structure itself.
   1667 void slang_free(slang_T *lp)
   1668 {
   1669  xfree(lp->sl_name);
   1670  xfree(lp->sl_fname);
   1671  slang_clear(lp);
   1672  xfree(lp);
   1673 }
   1674 
   1675 /// Frees a salitem_T
   1676 static void free_salitem(salitem_T *smp)
   1677 {
   1678  xfree(smp->sm_lead);
   1679  // Don't free sm_oneof and sm_rules, they point into sm_lead.
   1680  xfree(smp->sm_to);
   1681  xfree(smp->sm_lead_w);
   1682  xfree(smp->sm_oneof_w);
   1683  xfree(smp->sm_to_w);
   1684 }
   1685 
   1686 /// Frees a fromto_T
   1687 static void free_fromto(fromto_T *ftp)
   1688 {
   1689  xfree(ftp->ft_from);
   1690  xfree(ftp->ft_to);
   1691 }
   1692 
   1693 // Clear an slang_T so that the file can be reloaded.
   1694 void slang_clear(slang_T *lp)
   1695 {
   1696  garray_T *gap;
   1697 
   1698  XFREE_CLEAR(lp->sl_fbyts);
   1699  XFREE_CLEAR(lp->sl_kbyts);
   1700  XFREE_CLEAR(lp->sl_pbyts);
   1701 
   1702  XFREE_CLEAR(lp->sl_fidxs);
   1703  XFREE_CLEAR(lp->sl_kidxs);
   1704  XFREE_CLEAR(lp->sl_pidxs);
   1705 
   1706  GA_DEEP_CLEAR(&lp->sl_rep, fromto_T, free_fromto);
   1707  GA_DEEP_CLEAR(&lp->sl_repsal, fromto_T, free_fromto);
   1708 
   1709  gap = &lp->sl_sal;
   1710  if (lp->sl_sofo) {
   1711    // "ga_len" is set to 1 without adding an item for latin1
   1712    GA_DEEP_CLEAR_PTR(gap);
   1713  } else {
   1714    // SAL items: free salitem_T items
   1715    GA_DEEP_CLEAR(gap, salitem_T, free_salitem);
   1716  }
   1717 
   1718  for (int i = 0; i < lp->sl_prefixcnt; i++) {
   1719    vim_regfree(lp->sl_prefprog[i]);
   1720  }
   1721  lp->sl_prefixcnt = 0;
   1722  XFREE_CLEAR(lp->sl_prefprog);
   1723  XFREE_CLEAR(lp->sl_info);
   1724  XFREE_CLEAR(lp->sl_midword);
   1725 
   1726  vim_regfree(lp->sl_compprog);
   1727  lp->sl_compprog = NULL;
   1728  XFREE_CLEAR(lp->sl_comprules);
   1729  XFREE_CLEAR(lp->sl_compstartflags);
   1730  XFREE_CLEAR(lp->sl_compallflags);
   1731 
   1732  XFREE_CLEAR(lp->sl_syllable);
   1733  ga_clear(&lp->sl_syl_items);
   1734 
   1735  ga_clear_strings(&lp->sl_comppat);
   1736 
   1737  hash_clear_all(&lp->sl_wordcount, WC_KEY_OFF);
   1738  hash_init(&lp->sl_wordcount);
   1739 
   1740  hash_clear_all(&lp->sl_map_hash, 0);
   1741 
   1742  // Clear info from .sug file.
   1743  slang_clear_sug(lp);
   1744 
   1745  lp->sl_compmax = MAXWLEN;
   1746  lp->sl_compminlen = 0;
   1747  lp->sl_compsylmax = MAXWLEN;
   1748  lp->sl_regions[0] = NUL;
   1749 }
   1750 
   1751 // Clear the info from the .sug file in "lp".
   1752 void slang_clear_sug(slang_T *lp)
   1753 {
   1754  XFREE_CLEAR(lp->sl_sbyts);
   1755  XFREE_CLEAR(lp->sl_sidxs);
   1756  close_spellbuf(lp->sl_sugbuf);
   1757  lp->sl_sugbuf = NULL;
   1758  lp->sl_sugloaded = false;
   1759  lp->sl_sugtime = 0;
   1760 }
   1761 
   1762 // Load one spell file and store the info into a slang_T.
   1763 // Invoked through do_in_runtimepath().
   1764 static bool spell_load_cb(int num_fnames, char **fnames, bool all, void *cookie)
   1765 {
   1766  spelload_T *slp = (spelload_T *)cookie;
   1767  for (int i = 0; i < num_fnames; i++) {
   1768    slang_T *slang = spell_load_file(fnames[i], slp->sl_lang, NULL, false);
   1769 
   1770    if (slang == NULL) {
   1771      continue;
   1772    }
   1773 
   1774    // When a previously loaded file has NOBREAK also use it for the
   1775    // ".add" files.
   1776    if (slp->sl_nobreak && slang->sl_add) {
   1777      slang->sl_nobreak = true;
   1778    } else if (slang->sl_nobreak) {
   1779      slp->sl_nobreak = true;
   1780    }
   1781 
   1782    slp->sl_slang = slang;
   1783 
   1784    if (!all) {
   1785      break;
   1786    }
   1787  }
   1788 
   1789  return num_fnames > 0;
   1790 }
   1791 
   1792 /// Add a word to the hashtable of common words.
   1793 /// If it's already there then the counter is increased.
   1794 ///
   1795 /// @param[in]  lp
   1796 /// @param[in]  word  added to common words hashtable
   1797 /// @param[in]  len  length of word or -1 for NUL terminated
   1798 /// @param[in]  count  1 to count once, 10 to init
   1799 void count_common_word(slang_T *lp, char *word, int len, uint8_t count)
   1800 {
   1801  char buf[MAXWLEN];
   1802  char *p;
   1803 
   1804  if (len == -1) {
   1805    p = word;
   1806  } else if (len >= MAXWLEN) {
   1807    return;
   1808  } else {
   1809    xmemcpyz(buf, word, (size_t)len);
   1810    p = buf;
   1811  }
   1812 
   1813  hash_T hash = hash_hash(p);
   1814  const size_t p_len = strlen(p);
   1815  hashitem_T *hi = hash_lookup(&lp->sl_wordcount, p, p_len, hash);
   1816  if (HASHITEM_EMPTY(hi)) {
   1817    wordcount_T *wc = xmalloc(offsetof(wordcount_T, wc_word) + p_len + 1);
   1818    memcpy(wc->wc_word, p, p_len + 1);
   1819    wc->wc_count = count;
   1820    hash_add_item(&lp->sl_wordcount, hi, wc->wc_word, hash);
   1821  } else {
   1822    wordcount_T *wc = HI2WC(hi);
   1823    wc->wc_count = (uint16_t)(wc->wc_count + count);
   1824    if (wc->wc_count < count) {    // check for overflow
   1825      wc->wc_count = MAXWORDCOUNT;
   1826    }
   1827  }
   1828 }
   1829 
   1830 // Returns true if byte "n" appears in "str".
   1831 // Like strchr() but independent of locale.
   1832 bool byte_in_str(uint8_t *str, int n)
   1833 {
   1834  for (uint8_t *p = str; *p != NUL; p++) {
   1835    if (*p == n) {
   1836      return true;
   1837    }
   1838  }
   1839  return false;
   1840 }
   1841 
   1842 // Truncate "slang->sl_syllable" at the first slash and put the following items
   1843 // in "slang->sl_syl_items".
   1844 int init_syl_tab(slang_T *slang)
   1845 {
   1846  ga_init(&slang->sl_syl_items, sizeof(syl_item_T), 4);
   1847  char *p = vim_strchr(slang->sl_syllable, '/');
   1848  while (p != NULL) {
   1849    *p++ = NUL;
   1850    if (*p == NUL) {        // trailing slash
   1851      break;
   1852    }
   1853    char *s = p;
   1854    p = vim_strchr(p, '/');
   1855    int l;
   1856    if (p == NULL) {
   1857      l = (int)strlen(s);
   1858    } else {
   1859      l = (int)(p - s);
   1860    }
   1861    if (l >= SY_MAXLEN) {
   1862      return SP_FORMERROR;
   1863    }
   1864 
   1865    syl_item_T *syl = GA_APPEND_VIA_PTR(syl_item_T, &slang->sl_syl_items);
   1866    xmemcpyz(syl->sy_chars, s, (size_t)l);
   1867    syl->sy_len = l;
   1868  }
   1869  return OK;
   1870 }
   1871 
   1872 // Count the number of syllables in "word".
   1873 // When "word" contains spaces the syllables after the last space are counted.
   1874 // Returns zero if syllables are not defines.
   1875 static int count_syllables(slang_T *slang, const char *word)
   1876  FUNC_ATTR_NONNULL_ALL
   1877 {
   1878  if (slang->sl_syllable == NULL) {
   1879    return 0;
   1880  }
   1881 
   1882  int cnt = 0;
   1883  bool skip = false;
   1884  int len;
   1885 
   1886  for (const char *p = word; *p != NUL; p += len) {
   1887    // When running into a space reset counter.
   1888    if (*p == ' ') {
   1889      len = 1;
   1890      cnt = 0;
   1891      continue;
   1892    }
   1893 
   1894    // Find longest match of syllable items.
   1895    len = 0;
   1896    for (int i = 0; i < slang->sl_syl_items.ga_len; i++) {
   1897      syl_item_T *syl = ((syl_item_T *)slang->sl_syl_items.ga_data) + i;
   1898      if (syl->sy_len > len
   1899          && strncmp(p, syl->sy_chars, (size_t)syl->sy_len) == 0) {
   1900        len = syl->sy_len;
   1901      }
   1902    }
   1903    if (len != 0) {     // found a match, count syllable
   1904      cnt++;
   1905      skip = false;
   1906    } else {
   1907      // No recognized syllable item, at least a syllable char then?
   1908      int c = utf_ptr2char(p);
   1909      len = utfc_ptr2len(p);
   1910      if (vim_strchr(slang->sl_syllable, c) == NULL) {
   1911        skip = false;               // No, search for next syllable
   1912      } else if (!skip) {
   1913        cnt++;                      // Yes, count it
   1914        skip = true;                // don't count following syllable chars
   1915      }
   1916    }
   1917  }
   1918  return cnt;
   1919 }
   1920 
   1921 /// Parse 'spelllang' and set w_s->b_langp accordingly.
   1922 /// @return  NULL if it's OK, an untranslated error message otherwise.
   1923 char *parse_spelllang(win_T *wp)
   1924 {
   1925  char region_cp[3];
   1926  char lang[MAXWLEN + 1];
   1927  char spf_name[MAXPATHL];
   1928  char *use_region = NULL;
   1929  bool dont_use_region = false;
   1930  bool nobreak = false;
   1931  static bool recursive = false;
   1932  char *ret_msg = NULL;
   1933 
   1934  bufref_T bufref;
   1935  set_bufref(&bufref, wp->w_buffer);
   1936 
   1937  // We don't want to do this recursively.  May happen when a language is
   1938  // not available and the SpellFileMissing autocommand opens a new buffer
   1939  // in which 'spell' is set.
   1940  if (recursive) {
   1941    return NULL;
   1942  }
   1943  recursive = true;
   1944 
   1945  garray_T ga;
   1946  ga_init(&ga, sizeof(langp_T), 2);
   1947  clear_midword(wp);
   1948 
   1949  // Make a copy of 'spelllang', the SpellFileMissing autocommands may change
   1950  // it under our fingers.
   1951  char *spl_copy = xstrdup(wp->w_s->b_p_spl);
   1952 
   1953  wp->w_s->b_cjk = 0;
   1954 
   1955  // Loop over comma separated language names.
   1956  for (char *splp = spl_copy; *splp != NUL;) {
   1957    // Get one language name.
   1958    copy_option_part(&splp, lang, MAXWLEN, ",");
   1959    char *region = NULL;
   1960    int len = (int)strlen(lang);
   1961 
   1962    if (!valid_spelllang(lang)) {
   1963      continue;
   1964    }
   1965 
   1966    if (strcmp(lang, "cjk") == 0) {
   1967      wp->w_s->b_cjk = 1;
   1968      continue;
   1969    }
   1970 
   1971    slang_T *slang;
   1972    bool filename;
   1973    // If the name ends in ".spl" use it as the name of the spell file.
   1974    // If there is a region name let "region" point to it and remove it
   1975    // from the name.
   1976    if (len > 4 && path_fnamecmp(lang + len - 4, ".spl") == 0) {
   1977      filename = true;
   1978 
   1979      // Locate a region and remove it from the file name.
   1980      char *p = vim_strchr(path_tail(lang), '_');
   1981      if (p != NULL && ASCII_ISALPHA(p[1]) && ASCII_ISALPHA(p[2])
   1982          && !ASCII_ISALPHA(p[3])) {
   1983        xstrlcpy(region_cp, p + 1, 3);
   1984        memmove(p, p + 3, (size_t)(len - (p - lang) - 2));
   1985        region = region_cp;
   1986      } else {
   1987        dont_use_region = true;
   1988      }
   1989 
   1990      // Check if we loaded this language before.
   1991      for (slang = first_lang; slang != NULL; slang = slang->sl_next) {
   1992        if (path_full_compare(lang, slang->sl_fname, false, true)
   1993            == kEqualFiles) {
   1994          break;
   1995        }
   1996      }
   1997    } else {
   1998      filename = false;
   1999      if (len > 3 && lang[len - 3] == '_') {
   2000        region = lang + len - 2;
   2001        lang[len - 3] = NUL;
   2002      } else {
   2003        dont_use_region = true;
   2004      }
   2005 
   2006      // Check if we loaded this language before.
   2007      for (slang = first_lang; slang != NULL; slang = slang->sl_next) {
   2008        if (STRICMP(lang, slang->sl_name) == 0) {
   2009          break;
   2010        }
   2011      }
   2012    }
   2013 
   2014    if (region != NULL) {
   2015      // If the region differs from what was used before then don't
   2016      // use it for 'spellfile'.
   2017      if (use_region != NULL && strcmp(region, use_region) != 0) {
   2018        dont_use_region = true;
   2019      }
   2020      use_region = region;
   2021    }
   2022 
   2023    // If not found try loading the language now.
   2024    if (slang == NULL) {
   2025      if (filename) {
   2026        spell_load_file(lang, lang, NULL, false);
   2027      } else {
   2028        spell_load_lang(lang);
   2029        // SpellFileMissing autocommands may do anything, including
   2030        // destroying the buffer we are using or closing the window.
   2031        if (!bufref_valid(&bufref) || !win_valid_any_tab(wp)) {
   2032          ret_msg = N_("E797: SpellFileMissing autocommand deleted buffer");
   2033          goto theend;
   2034        }
   2035      }
   2036    }
   2037 
   2038    // Loop over the languages, there can be several files for "lang".
   2039    for (slang = first_lang; slang != NULL; slang = slang->sl_next) {
   2040      if (filename
   2041          ? path_full_compare(lang, slang->sl_fname, false, true) == kEqualFiles
   2042          : STRICMP(lang, slang->sl_name) == 0) {
   2043        int region_mask = REGION_ALL;
   2044        if (!filename && region != NULL) {
   2045          // find region in sl_regions
   2046          int c = find_region(slang->sl_regions, region);
   2047          if (c == REGION_ALL) {
   2048            if (slang->sl_add) {
   2049              if (*slang->sl_regions != NUL) {
   2050                // This addition file is for other regions.
   2051                region_mask = 0;
   2052              }
   2053            } else {
   2054              // This is probably an error.  Give a warning and
   2055              // accept the words anyway.
   2056              smsg(0, _("Warning: region %s not supported"),
   2057                   region);
   2058            }
   2059          } else {
   2060            region_mask = 1 << c;
   2061          }
   2062        }
   2063 
   2064        if (region_mask != 0) {
   2065          langp_T *p_ = GA_APPEND_VIA_PTR(langp_T, &ga);
   2066          p_->lp_slang = slang;
   2067          p_->lp_region = region_mask;
   2068 
   2069          use_midword(slang, wp);
   2070          if (slang->sl_nobreak) {
   2071            nobreak = true;
   2072          }
   2073        }
   2074      }
   2075    }
   2076  }
   2077 
   2078  // round 0: load int_wordlist, if possible.
   2079  // round 1: load first name in 'spellfile'.
   2080  // round 2: load second name in 'spellfile.
   2081  // etc.
   2082  char *spf = curwin->w_s->b_p_spf;
   2083  for (int round = 0; round == 0 || *spf != NUL; round++) {
   2084    if (round == 0) {
   2085      // Internal wordlist, if there is one.
   2086      if (int_wordlist == NULL) {
   2087        continue;
   2088      }
   2089      int_wordlist_spl(spf_name);
   2090    } else {
   2091      // One entry in 'spellfile'.
   2092      copy_option_part(&spf, spf_name, MAXPATHL - 4, ",");
   2093      strcat(spf_name, ".spl");
   2094      int c;
   2095 
   2096      // If it was already found above then skip it.
   2097      for (c = 0; c < ga.ga_len; c++) {
   2098        char *p = LANGP_ENTRY(ga, c)->lp_slang->sl_fname;
   2099        if (p != NULL
   2100            && path_full_compare(spf_name, p, false, true) == kEqualFiles) {
   2101          break;
   2102        }
   2103      }
   2104      if (c < ga.ga_len) {
   2105        continue;
   2106      }
   2107    }
   2108 
   2109    slang_T *slang;
   2110 
   2111    // Check if it was loaded already.
   2112    for (slang = first_lang; slang != NULL; slang = slang->sl_next) {
   2113      if (path_full_compare(spf_name, slang->sl_fname, false, true)
   2114          == kEqualFiles) {
   2115        break;
   2116      }
   2117    }
   2118    if (slang == NULL) {
   2119      // Not loaded, try loading it now.  The language name includes the
   2120      // region name, the region is ignored otherwise.  for int_wordlist
   2121      // use an arbitrary name.
   2122      if (round == 0) {
   2123        STRCPY(lang, "internal wordlist");
   2124      } else {
   2125        xstrlcpy(lang, path_tail(spf_name), MAXWLEN + 1);
   2126        char *p = vim_strchr(lang, '.');
   2127        if (p != NULL) {
   2128          *p = NUL;             // truncate at ".encoding.add"
   2129        }
   2130      }
   2131      slang = spell_load_file(spf_name, lang, NULL, true);
   2132 
   2133      // If one of the languages has NOBREAK we assume the addition
   2134      // files also have this.
   2135      if (slang != NULL && nobreak) {
   2136        slang->sl_nobreak = true;
   2137      }
   2138    }
   2139    if (slang != NULL) {
   2140      int region_mask = REGION_ALL;
   2141      if (use_region != NULL && !dont_use_region) {
   2142        // find region in sl_regions
   2143        int c = find_region(slang->sl_regions, use_region);
   2144        if (c != REGION_ALL) {
   2145          region_mask = 1 << c;
   2146        } else if (*slang->sl_regions != NUL) {
   2147          // This spell file is for other regions.
   2148          region_mask = 0;
   2149        }
   2150      }
   2151 
   2152      if (region_mask != 0) {
   2153        langp_T *p_ = GA_APPEND_VIA_PTR(langp_T, &ga);
   2154        p_->lp_slang = slang;
   2155        p_->lp_sallang = NULL;
   2156        p_->lp_replang = NULL;
   2157        p_->lp_region = region_mask;
   2158 
   2159        use_midword(slang, wp);
   2160      }
   2161    }
   2162  }
   2163 
   2164  // Everything is fine, store the new b_langp value.
   2165  ga_clear(&wp->w_s->b_langp);
   2166  wp->w_s->b_langp = ga;
   2167 
   2168  // For each language figure out what language to use for sound folding and
   2169  // REP items.  If the language doesn't support it itself use another one
   2170  // with the same name.  E.g. for "en-math" use "en".
   2171  for (int i = 0; i < ga.ga_len; i++) {
   2172    langp_T *lp = LANGP_ENTRY(ga, i);
   2173 
   2174    // sound folding
   2175    if (!GA_EMPTY(&lp->lp_slang->sl_sal)) {
   2176      // language does sound folding itself
   2177      lp->lp_sallang = lp->lp_slang;
   2178    } else {
   2179      // find first similar language that does sound folding
   2180      for (int j = 0; j < ga.ga_len; j++) {
   2181        langp_T *lp2 = LANGP_ENTRY(ga, j);
   2182        if (!GA_EMPTY(&lp2->lp_slang->sl_sal)
   2183            && strncmp(lp->lp_slang->sl_name,
   2184                       lp2->lp_slang->sl_name, 2) == 0) {
   2185          lp->lp_sallang = lp2->lp_slang;
   2186          break;
   2187        }
   2188      }
   2189    }
   2190 
   2191    // REP items
   2192    if (!GA_EMPTY(&lp->lp_slang->sl_rep)) {
   2193      // language has REP items itself
   2194      lp->lp_replang = lp->lp_slang;
   2195    } else {
   2196      // find first similar language that has REP items
   2197      for (int j = 0; j < ga.ga_len; j++) {
   2198        langp_T *lp2 = LANGP_ENTRY(ga, j);
   2199        if (!GA_EMPTY(&lp2->lp_slang->sl_rep)
   2200            && strncmp(lp->lp_slang->sl_name,
   2201                       lp2->lp_slang->sl_name, 2) == 0) {
   2202          lp->lp_replang = lp2->lp_slang;
   2203          break;
   2204        }
   2205      }
   2206    }
   2207  }
   2208  redraw_later(wp, UPD_NOT_VALID);
   2209 
   2210 theend:
   2211  xfree(spl_copy);
   2212  recursive = false;
   2213  return ret_msg;
   2214 }
   2215 
   2216 // Clear the midword characters for buffer "buf".
   2217 static void clear_midword(win_T *wp)
   2218 {
   2219  CLEAR_FIELD(wp->w_s->b_spell_ismw);
   2220  XFREE_CLEAR(wp->w_s->b_spell_ismw_mb);
   2221 }
   2222 
   2223 /// Use the "sl_midword" field of language "lp" for buffer "buf".
   2224 /// They add up to any currently used midword characters.
   2225 static void use_midword(slang_T *lp, win_T *wp)
   2226  FUNC_ATTR_NONNULL_ALL
   2227 {
   2228  if (lp->sl_midword == NULL) {  // there aren't any
   2229    return;
   2230  }
   2231 
   2232  for (char *p = lp->sl_midword; *p != NUL;) {
   2233    const int c = utf_ptr2char(p);
   2234    const int l = utfc_ptr2len(p);
   2235    if (c < 256 && l <= 2) {
   2236      wp->w_s->b_spell_ismw[c] = true;
   2237    } else if (wp->w_s->b_spell_ismw_mb == NULL) {
   2238      // First multi-byte char in "b_spell_ismw_mb".
   2239      wp->w_s->b_spell_ismw_mb = xmemdupz(p, (size_t)l);
   2240    } else {
   2241      // Append multi-byte chars to "b_spell_ismw_mb".
   2242      const int n = (int)strlen(wp->w_s->b_spell_ismw_mb);
   2243      char *bp = xstrnsave(wp->w_s->b_spell_ismw_mb, (size_t)n + (size_t)l);
   2244      xfree(wp->w_s->b_spell_ismw_mb);
   2245      wp->w_s->b_spell_ismw_mb = bp;
   2246      xmemcpyz(bp + n, p, (size_t)l);
   2247    }
   2248    p += l;
   2249  }
   2250 }
   2251 
   2252 // Find the region "region[2]" in "rp" (points to "sl_regions").
   2253 // Each region is simply stored as the two characters of its name.
   2254 // Returns the index if found (first is 0), REGION_ALL if not found.
   2255 static int find_region(const char *rp, const char *region)
   2256 {
   2257  int i;
   2258 
   2259  for (i = 0;; i += 2) {
   2260    if (rp[i] == NUL) {
   2261      return REGION_ALL;
   2262    }
   2263    if (rp[i] == region[0] && rp[i + 1] == region[1]) {
   2264      break;
   2265    }
   2266  }
   2267  return i / 2;
   2268 }
   2269 
   2270 /// Return case type of word:
   2271 /// w word       0
   2272 /// Word         WF_ONECAP
   2273 /// W WORD       WF_ALLCAP
   2274 /// WoRd wOrd    WF_KEEPCAP
   2275 ///
   2276 /// @param[in]  word
   2277 /// @param[in]  end  End of word or NULL for NUL delimited string
   2278 ///
   2279 /// @returns  Case type of word
   2280 int captype(const char *word, const char *end)
   2281  FUNC_ATTR_NONNULL_ARG(1)
   2282 {
   2283  const char *p;
   2284 
   2285  // find first letter
   2286  for (p = word; !spell_iswordp_nmw(p, curwin); MB_PTR_ADV(p)) {
   2287    if (end == NULL ? *p == NUL : p >= end) {
   2288      return 0;             // only non-word characters, illegal word
   2289    }
   2290  }
   2291  int c = mb_ptr2char_adv(&p);
   2292  bool allcap;
   2293  bool firstcap = allcap = SPELL_ISUPPER(c);
   2294  bool past_second = false;              // past second word char
   2295 
   2296  // Need to check all letters to find a word with mixed upper/lower.
   2297  // But a word with an upper char only at start is a ONECAP.
   2298  for (; end == NULL ? *p != NUL : p < end; MB_PTR_ADV(p)) {
   2299    if (spell_iswordp_nmw(p, curwin)) {
   2300      c = utf_ptr2char(p);
   2301      if (!SPELL_ISUPPER(c)) {
   2302        // UUl -> KEEPCAP
   2303        if (past_second && allcap) {
   2304          return WF_KEEPCAP;
   2305        }
   2306        allcap = false;
   2307      } else if (!allcap) {
   2308        // UlU -> KEEPCAP
   2309        return WF_KEEPCAP;
   2310      }
   2311      past_second = true;
   2312    }
   2313  }
   2314 
   2315  if (allcap) {
   2316    return WF_ALLCAP;
   2317  }
   2318  if (firstcap) {
   2319    return WF_ONECAP;
   2320  }
   2321  return 0;
   2322 }
   2323 
   2324 // Delete the internal wordlist and its .spl file.
   2325 void spell_delete_wordlist(void)
   2326 {
   2327  if (int_wordlist == NULL) {
   2328    return;
   2329  }
   2330 
   2331  char fname[MAXPATHL] = { 0 };
   2332  os_remove(int_wordlist);
   2333  int_wordlist_spl(fname);
   2334  os_remove(fname);
   2335  XFREE_CLEAR(int_wordlist);
   2336 }
   2337 
   2338 // Free all languages.
   2339 void spell_free_all(void)
   2340 {
   2341  // Go through all buffers and handle 'spelllang'. <VN>
   2342  FOR_ALL_BUFFERS(buf) {
   2343    ga_clear(&buf->b_s.b_langp);
   2344  }
   2345 
   2346  while (first_lang != NULL) {
   2347    slang_T *slang = first_lang;
   2348    first_lang = slang->sl_next;
   2349    slang_free(slang);
   2350  }
   2351 
   2352  spell_delete_wordlist();
   2353 
   2354  XFREE_CLEAR(repl_to);
   2355  XFREE_CLEAR(repl_from);
   2356 }
   2357 
   2358 // Clear all spelling tables and reload them.
   2359 // Used after 'encoding' is set and when ":mkspell" was used.
   2360 void spell_reload(void)
   2361 {
   2362  // Initialize the table for spell_iswordp().
   2363  init_spell_chartab();
   2364 
   2365  // Unload all allocated memory.
   2366  spell_free_all();
   2367 
   2368  // Go through all buffers and handle 'spelllang'.
   2369  FOR_ALL_WINDOWS_IN_TAB(wp, curtab) {
   2370    // Only load the wordlists when 'spelllang' is set and there is a
   2371    // window for this buffer in which 'spell' is set.
   2372    if (*wp->w_s->b_p_spl != NUL) {
   2373      if (wp->w_p_spell) {
   2374        parse_spelllang(wp);
   2375        break;
   2376      }
   2377    }
   2378  }
   2379 }
   2380 
   2381 // Open a spell buffer.  This is a nameless buffer that is not in the buffer
   2382 // list and only contains text lines.  Can use a swapfile to reduce memory
   2383 // use.
   2384 // Most other fields are invalid!  Esp. watch out for string options being
   2385 // NULL and there is no undo info.
   2386 buf_T *open_spellbuf(void)
   2387 {
   2388  buf_T *buf = xcalloc(1, sizeof(buf_T));
   2389 
   2390  buf->b_spell = true;
   2391  buf->b_p_swf = true;        // may create a swap file
   2392  if (ml_open(buf) == FAIL) {
   2393    ELOG("Error opening a new memline");
   2394  }
   2395  ml_open_file(buf);          // create swap file now
   2396 
   2397  return buf;
   2398 }
   2399 
   2400 // Close the buffer used for spell info.
   2401 void close_spellbuf(buf_T *buf)
   2402 {
   2403  if (buf == NULL) {
   2404    return;
   2405  }
   2406 
   2407  ml_close(buf, true);
   2408  xfree(buf);
   2409 }
   2410 
   2411 // Init the chartab used for spelling for ASCII.
   2412 void clear_spell_chartab(spelltab_T *sp)
   2413 {
   2414  // Init everything to false (zero).
   2415  CLEAR_FIELD(sp->st_isw);
   2416  CLEAR_FIELD(sp->st_isu);
   2417 
   2418  for (int i = 0; i < 256; i++) {
   2419    sp->st_fold[i] = (uint8_t)i;
   2420    sp->st_upper[i] = (uint8_t)i;
   2421  }
   2422 
   2423  // We include digits. A word shouldn't start with a digit, but handling
   2424  // that is done separately.
   2425  for (int i = '0'; i <= '9'; i++) {
   2426    sp->st_isw[i] = true;
   2427  }
   2428  for (int i = 'A'; i <= 'Z'; i++) {
   2429    sp->st_isw[i] = true;
   2430    sp->st_isu[i] = true;
   2431    sp->st_fold[i] = (uint8_t)(i + 0x20);
   2432  }
   2433  for (int i = 'a'; i <= 'z'; i++) {
   2434    sp->st_isw[i] = true;
   2435    sp->st_upper[i] = (uint8_t)(i - 0x20);
   2436  }
   2437 }
   2438 
   2439 // Init the chartab used for spelling. Called once while starting up.
   2440 // The default is to use isalpha(), but the spell file should define the word
   2441 // characters to make it possible that 'encoding' differs from the current
   2442 // locale.  For utf-8 we don't use isalpha() but our own functions.
   2443 void init_spell_chartab(void)
   2444 {
   2445  did_set_spelltab = false;
   2446  clear_spell_chartab(&spelltab);
   2447  for (int i = 128; i < 256; i++) {
   2448    int f = utf_fold(i);
   2449    int u = mb_toupper(i);
   2450 
   2451    spelltab.st_isu[i] = mb_isupper(i);
   2452    spelltab.st_isw[i] = spelltab.st_isu[i] || mb_islower(i);
   2453    // The folded/upper-cased value is different between latin1 and
   2454    // utf8 for 0xb5, causing E763 for no good reason.  Use the latin1
   2455    // value for utf-8 to avoid this.
   2456    spelltab.st_fold[i] = (f < 256) ? (uint8_t)f : (uint8_t)i;
   2457    spelltab.st_upper[i] = (u < 256) ? (uint8_t)u : (uint8_t)i;
   2458  }
   2459 }
   2460 
   2461 /// Returns true if "p" points to a word character.
   2462 /// As a special case we see "midword" characters as word character when it is
   2463 /// followed by a word character.  This finds they'there but not 'they there'.
   2464 /// Thus this only works properly when past the first character of the word.
   2465 ///
   2466 /// @param wp Buffer used.
   2467 bool spell_iswordp(const char *p, const win_T *wp)
   2468  FUNC_ATTR_NONNULL_ALL
   2469 {
   2470  const int l = utfc_ptr2len(p);
   2471  const char *s = p;
   2472  if (l == 1) {
   2473    // be quick for ASCII
   2474    if (wp->w_s->b_spell_ismw[(uint8_t)(*p)]) {
   2475      s = p + 1;                      // skip a mid-word character
   2476    }
   2477  } else {
   2478    int c = utf_ptr2char(p);
   2479    if (c < 256
   2480        ? wp->w_s->b_spell_ismw[c]
   2481        : (wp->w_s->b_spell_ismw_mb != NULL
   2482           && vim_strchr(wp->w_s->b_spell_ismw_mb, c) != NULL)) {
   2483      s = p + l;
   2484    }
   2485  }
   2486 
   2487  int c = utf_ptr2char(s);
   2488  if (c > 255) {
   2489    return spell_mb_isword_class(mb_get_class(s), wp);
   2490  }
   2491  return spelltab.st_isw[c];
   2492 }
   2493 
   2494 // Returns true if "p" points to a word character.
   2495 // Unlike spell_iswordp() this doesn't check for "midword" characters.
   2496 bool spell_iswordp_nmw(const char *p, win_T *wp)
   2497 {
   2498  int c = utf_ptr2char(p);
   2499  if (c > 255) {
   2500    return spell_mb_isword_class(mb_get_class(p), wp);
   2501  }
   2502  return spelltab.st_isw[c];
   2503 }
   2504 
   2505 // Returns true if word class indicates a word character.
   2506 // Only for characters above 255.
   2507 // Unicode subscript and superscript are not considered word characters.
   2508 // See also utf_class() in mbyte.c.
   2509 static bool spell_mb_isword_class(int cl, const win_T *wp)
   2510  FUNC_ATTR_PURE FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT
   2511 {
   2512  if (wp->w_s->b_cjk) {
   2513    // East Asian characters are not considered word characters.
   2514    return cl == 2 || cl == 0x2800;
   2515  }
   2516  return cl >= 2 && cl != 0x2070 && cl != 0x2080 && cl != 3;
   2517 }
   2518 
   2519 // Returns true if "p" points to a word character.
   2520 // Wide version of spell_iswordp().
   2521 static bool spell_iswordp_w(const int *p, const win_T *wp)
   2522  FUNC_ATTR_NONNULL_ALL
   2523 {
   2524  const int *s;
   2525 
   2526  if (*p <
   2527      256 ? wp->w_s->b_spell_ismw[*p] : (wp->w_s->b_spell_ismw_mb != NULL
   2528                                         && vim_strchr(wp->w_s->b_spell_ismw_mb,
   2529                                                       *p) != NULL)) {
   2530    s = p + 1;
   2531  } else {
   2532    s = p;
   2533  }
   2534 
   2535  if (*s > 255) {
   2536    return spell_mb_isword_class(utf_class(*s), wp);
   2537  }
   2538  return spelltab.st_isw[*s];
   2539 }
   2540 
   2541 // Case-fold "str[len]" into "buf[buflen]".  The result is NUL terminated.
   2542 // Uses the character definitions from the .spl file.
   2543 // When using a multi-byte 'encoding' the length may change!
   2544 // Returns FAIL when something wrong.
   2545 int spell_casefold(const win_T *wp, const char *str, int len, char *buf, int buflen)
   2546  FUNC_ATTR_NONNULL_ALL
   2547 {
   2548  if (len >= buflen) {
   2549    buf[0] = NUL;
   2550    return FAIL;                // result will not fit
   2551  }
   2552 
   2553  int outi = 0;
   2554 
   2555  // Fold one character at a time.
   2556  for (const char *p = str; p < str + len;) {
   2557    if (outi + MB_MAXBYTES > buflen) {
   2558      buf[outi] = NUL;
   2559      return FAIL;
   2560    }
   2561    int c = mb_cptr2char_adv(&p);
   2562 
   2563    // Exception: greek capital sigma 0x03A3 folds to 0x03C3, except
   2564    // when it is the last character in a word, then it folds to
   2565    // 0x03C2.
   2566    if (c == 0x03a3 || c == 0x03c2) {
   2567      if (p == str + len || !spell_iswordp(p, wp)) {
   2568        c = 0x03c2;
   2569      } else {
   2570        c = 0x03c3;
   2571      }
   2572    } else {
   2573      c = SPELL_TOFOLD(c);
   2574    }
   2575 
   2576    outi += utf_char2bytes(c, buf + outi);
   2577  }
   2578  buf[outi] = NUL;
   2579 
   2580  return OK;
   2581 }
   2582 
   2583 // Check if the word at line "lnum" column "col" is required to start with a
   2584 // capital.  This uses 'spellcapcheck' of the buffer in window "wp".
   2585 bool check_need_cap(win_T *wp, linenr_T lnum, colnr_T col)
   2586 {
   2587  if (wp->w_s->b_cap_prog == NULL) {
   2588    return false;
   2589  }
   2590 
   2591  bool need_cap = false;
   2592  char *line = col ? ml_get_buf(wp->w_buffer, lnum) : NULL;
   2593  char *line_copy = NULL;
   2594  colnr_T endcol = 0;
   2595  if (col == 0 || getwhitecols(line) >= col) {
   2596    // At start of line, check if previous line is empty or sentence
   2597    // ends there.
   2598    if (lnum == 1) {
   2599      need_cap = true;
   2600    } else {
   2601      line = ml_get_buf(wp->w_buffer, lnum - 1);
   2602      if (*skipwhite(line) == NUL) {
   2603        need_cap = true;
   2604      } else {
   2605        // Append a space in place of the line break.
   2606        line_copy = concat_str(line, " ");
   2607        line = line_copy;
   2608        endcol = (colnr_T)strlen(line);
   2609      }
   2610    }
   2611  } else {
   2612    endcol = col;
   2613  }
   2614 
   2615  if (endcol > 0) {
   2616    // Check if sentence ends before the bad word.
   2617    regmatch_T regmatch = {
   2618      .regprog = wp->w_s->b_cap_prog,
   2619      .rm_ic = false
   2620    };
   2621    char *p = line + endcol;
   2622    while (true) {
   2623      MB_PTR_BACK(line, p);
   2624      if (p == line || spell_iswordp_nmw(p, wp)) {
   2625        break;
   2626      }
   2627      if (vim_regexec(&regmatch, p, 0)
   2628          && regmatch.endp[0] == line + endcol) {
   2629        need_cap = true;
   2630        break;
   2631      }
   2632    }
   2633    wp->w_s->b_cap_prog = regmatch.regprog;
   2634  }
   2635 
   2636  xfree(line_copy);
   2637 
   2638  return need_cap;
   2639 }
   2640 
   2641 // ":spellrepall"
   2642 void ex_spellrepall(exarg_T *eap)
   2643 {
   2644  pos_T pos = curwin->w_cursor;
   2645  bool save_ws = p_ws;
   2646  linenr_T prev_lnum = 0;
   2647 
   2648  if (repl_from == NULL || repl_to == NULL) {
   2649    emsg(_("E752: No previous spell replacement"));
   2650    return;
   2651  }
   2652  const size_t repl_from_len = strlen(repl_from);
   2653  const size_t repl_to_len = strlen(repl_to);
   2654  const int addlen = (int)(repl_to_len - repl_from_len);
   2655 
   2656  const size_t frompatsize = repl_from_len + 7;
   2657  char *frompat = xmalloc(frompatsize);
   2658  size_t frompatlen = (size_t)snprintf(frompat, frompatsize, "\\V\\<%s\\>", repl_from);
   2659  p_ws = false;
   2660 
   2661  sub_nsubs = 0;
   2662  sub_nlines = 0;
   2663  curwin->w_cursor.lnum = 0;
   2664  while (!got_int) {
   2665    if (do_search(NULL, '/', '/', frompat, frompatlen, 1, SEARCH_KEEP, NULL) == 0
   2666        || u_save_cursor() == FAIL) {
   2667      break;
   2668    }
   2669 
   2670    // Only replace when the right word isn't there yet.  This happens
   2671    // when changing "etc" to "etc.".
   2672    char *line = get_cursor_line_ptr();
   2673    if (addlen <= 0
   2674        || strncmp(line + curwin->w_cursor.col, repl_to, repl_to_len) != 0) {
   2675      char *p = xmalloc((size_t)get_cursor_line_len() + (size_t)addlen + 1);
   2676      memmove(p, line, (size_t)curwin->w_cursor.col);
   2677      STRCPY(p + curwin->w_cursor.col, repl_to);
   2678      strcat(p, line + curwin->w_cursor.col + repl_from_len);
   2679      ml_replace(curwin->w_cursor.lnum, p, false);
   2680      inserted_bytes(curwin->w_cursor.lnum, curwin->w_cursor.col,
   2681                     (int)repl_from_len, (int)repl_to_len);
   2682 
   2683      if (curwin->w_cursor.lnum != prev_lnum) {
   2684        sub_nlines++;
   2685        prev_lnum = curwin->w_cursor.lnum;
   2686      }
   2687      sub_nsubs++;
   2688    }
   2689    curwin->w_cursor.col += (colnr_T)repl_to_len;
   2690  }
   2691 
   2692  p_ws = save_ws;
   2693  curwin->w_cursor = pos;
   2694  xfree(frompat);
   2695 
   2696  if (sub_nsubs == 0) {
   2697    semsg(_("E753: Not found: %s"), repl_from);
   2698  } else {
   2699    do_sub_msg(false);
   2700  }
   2701 }
   2702 
   2703 /// Make a copy of "word", with the first letter upper or lower cased, to
   2704 /// "wcopy[MAXWLEN]".  "word" must not be empty.
   2705 /// The result is NUL terminated.
   2706 ///
   2707 /// @param[in]  word  source string to copy
   2708 /// @param[in,out]  wcopy  copied string, with case of first letter changed
   2709 /// @param[in]  upper  True to upper case, otherwise lower case
   2710 void onecap_copy(const char *word, char *wcopy, bool upper)
   2711 {
   2712  const char *p = word;
   2713  int c = mb_cptr2char_adv(&p);
   2714  if (upper) {
   2715    c = SPELL_TOUPPER(c);
   2716  } else {
   2717    c = SPELL_TOFOLD(c);
   2718  }
   2719  int l = utf_char2bytes(c, wcopy);
   2720  xstrlcpy(wcopy + l, p, (size_t)(MAXWLEN - l));
   2721 }
   2722 
   2723 // Make a copy of "word" with all the letters upper cased into
   2724 // "wcopy[MAXWLEN]".  The result is NUL terminated.
   2725 void allcap_copy(const char *word, char *wcopy)
   2726 {
   2727  char *d = wcopy;
   2728  for (const char *s = word; *s != NUL;) {
   2729    int c = mb_cptr2char_adv(&s);
   2730 
   2731    if (c == 0xdf) {
   2732      c = 'S';
   2733      if (d - wcopy >= MAXWLEN - 1) {
   2734        break;
   2735      }
   2736      *d++ = (char)c;
   2737    } else {
   2738      c = SPELL_TOUPPER(c);
   2739    }
   2740 
   2741    if (d - wcopy >= MAXWLEN - MB_MAXBYTES) {
   2742      break;
   2743    }
   2744    d += utf_char2bytes(c, d);
   2745  }
   2746  *d = NUL;
   2747 }
   2748 
   // Case-folding may change the number of bytes.  Count the characters in
   // "fword[flen]" and return the byte length of that many characters at the
   // start of "word".
   int nofold_len(char *fword, int flen, char *word)
   {
     const char *const fend = fword + flen;
     int nchars = 0;

     // Count the characters in the folded word.
     for (char *f = fword; f < fend; MB_PTR_ADV(f)) {
       nchars++;
     }

     // Step over the same number of characters in the original word.
     char *w = word;
     for (; nchars > 0; nchars--) {
       MB_PTR_ADV(w);
     }

     return (int)(w - word);
   }
   2764 
   2765 // Copy "fword" to "cword", fixing case according to "flags".
   2766 void make_case_word(char *fword, char *cword, int flags)
   2767 {
   2768  if (flags & WF_ALLCAP) {
   2769    // Make it all upper-case
   2770    allcap_copy(fword, cword);
   2771  } else if (flags & WF_ONECAP) {
   2772    // Make the first letter upper-case
   2773    onecap_copy(fword, cword, true);
   2774  } else {
   2775    // Use goodword as-is.
   2776    STRCPY(cword, fword);
   2777  }
   2778 }
   2779 
   2780 /// Soundfold a string, for soundfold()
   2781 ///
   2782 /// @param[in]  word  Word to soundfold.
   2783 ///
   2784 /// @return [allocated] soundfolded string or NULL in case of error. May return
   2785 ///                     copy of the input string if soundfolding is not
   2786 ///                     supported by any of the languages in &spellang.
   2787 char *eval_soundfold(const char *const word)
   2788  FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_MALLOC FUNC_ATTR_NONNULL_ALL
   2789 {
   2790  if (curwin->w_p_spell && *curwin->w_s->b_p_spl != NUL) {
   2791    // Use the sound-folding of the first language that supports it.
   2792    for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len; lpi++) {
   2793      langp_T *const lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
   2794      if (!GA_EMPTY(&lp->lp_slang->sl_sal)) {
   2795        // soundfold the word
   2796        char sound[MAXWLEN];
   2797        spell_soundfold(lp->lp_slang, (char *)word, false, sound);
   2798        return xstrdup(sound);
   2799      }
   2800    }
   2801  }
   2802 
   2803  // No language with sound folding, return word as-is.
   2804  return xstrdup(word);
   2805 }
   2806 
   2807 /// Turn "inword" into its sound-a-like equivalent in "res[MAXWLEN]".
   2808 ///
   2809 /// There are many ways to turn a word into a sound-a-like representation.  The
   2810 /// oldest is Soundex (1918!).   A nice overview can be found in "Approximate
   2811 /// swedish name matching - survey and test of different algorithms" by Klas
   2812 /// Erikson.
   2813 ///
   2814 /// We support two methods:
   2815 /// 1. SOFOFROM/SOFOTO do a simple character mapping.
   2816 /// 2. SAL items define a more advanced sound-folding (and much slower).
   2817 ///
   2818 /// @param[in]  slang
   2819 /// @param[in]  inword  word to soundfold
   2820 /// @param[in]  folded  whether inword is already case-folded
   2821 /// @param[in,out]  res  destination for soundfolded word
   2822 void spell_soundfold(slang_T *slang, char *inword, bool folded, char *res)
   2823 {
   2824  if (slang->sl_sofo) {
   2825    // SOFOFROM and SOFOTO used
   2826    spell_soundfold_sofo(slang, inword, res);
   2827  } else {
   2828    char fword[MAXWLEN];
   2829    char *word;
   2830    // SAL items used.  Requires the word to be case-folded.
   2831    if (folded) {
   2832      word = inword;
   2833    } else {
   2834      spell_casefold(curwin, inword, (int)strlen(inword), fword, MAXWLEN);
   2835      word = fword;
   2836    }
   2837 
   2838    spell_soundfold_wsal(slang, word, res);
   2839  }
   2840 }
   2841 
   2842 // Perform sound folding of "inword" into "res" according to SOFOFROM and
   2843 // SOFOTO lines.
   2844 static void spell_soundfold_sofo(slang_T *slang, const char *inword, char *res)
   2845 {
   2846  int ri = 0;
   2847 
   2848  int prevc = 0;
   2849 
   2850  // The sl_sal_first[] table contains the translation for chars up to
   2851  // 255, sl_sal the rest.
   2852  for (const char *s = inword; *s != NUL;) {
   2853    int c = mb_cptr2char_adv(&s);
   2854    if (utf_class(c) == 0) {
   2855      c = ' ';
   2856    } else if (c < 256) {
   2857      c = slang->sl_sal_first[c];
   2858    } else {
   2859      int *ip = ((int **)slang->sl_sal.ga_data)[c & 0xff];
   2860      if (ip == NULL) {               // empty list, can't match
   2861        c = NUL;
   2862      } else {
   2863        while (true) {                // find "c" in the list
   2864          if (*ip == 0) {             // not found
   2865            c = NUL;
   2866            break;
   2867          }
   2868          if (*ip == c) {             // match!
   2869            c = ip[1];
   2870            break;
   2871          }
   2872          ip += 2;
   2873        }
   2874      }
   2875    }
   2876 
   2877    if (c != NUL && c != prevc) {
   2878      ri += utf_char2bytes(c, res + ri);
   2879      if (ri + MB_MAXBYTES > MAXWLEN) {
   2880        break;
   2881      }
   2882      prevc = c;
   2883    }
   2884  }
   2885 
   2886  res[ri] = NUL;
   2887 }
   2888 
   2889 // Turn "inword" into its sound-a-like equivalent in "res[MAXWLEN]".
   2890 // Multi-byte version of spell_soundfold().
        // Applies the SAL rules stored in "slang->sl_sal".  The caller,
        // spell_soundfold(), passes a case-folded word.
        // NOTE(review): "word[]" has room for MAXWLEN ints and the conversion
        // loop below does not check "wordlen"; this assumes "inword" has fewer
        // than MAXWLEN characters -- confirm against the callers.
   2891 static void spell_soundfold_wsal(slang_T *slang, const char *inword, char *res)
   2892 {
   2893  int word[MAXWLEN] = { 0 };
   2894  bool did_white = false;
   2895 
   2896  // Convert the multi-byte string to a wide-character string.
   2897  // Remove accents, if wanted.  We actually remove all non-word characters.
   2898  // But keep white space.
   2899  int wordlen = 0;
   2900  for (const char *s = inword; *s != NUL;) {
   2901    const char *t = s;
   2902    int c = mb_cptr2char_adv(&s);
   2903    if (slang->sl_rem_accents) {
   2904      if (utf_class(c) == 0) {
               // A run of class-0 characters is reduced to a single space.
   2905        if (did_white) {
   2906          continue;
   2907        }
   2908        c = ' ';
   2909        did_white = true;
   2910      } else {
   2911        did_white = false;
   2912        if (!spell_iswordp_nmw(t, curwin)) {
   2913          continue;
   2914        }
   2915      }
   2916    }
   2917    word[wordlen++] = c;
   2918  }
   2919  word[wordlen] = NUL;
   2920 
   2921  salitem_T *smp = (salitem_T *)slang->sl_sal.ga_data;
   2922  int wres[MAXWLEN] = { 0 };
   2923  int k = 0;
   2924  int p0 = -333;
   2925  int c;
   2926  // This algorithm comes from Aspell phonet.cpp.
   2927  // Converted from C++ to C.  Added support for multi-byte chars.
   2928  // Changed to keep spaces.
   2929  int i = 0;
   2930  int reslen = 0;
   2931  int z = 0;
   2932  while ((c = word[i]) != NUL) {
   2933    // Start with the first rule that has the character in the word.
   2934    int n = slang->sl_sal_first[c & 0xff];
   2935    int z0 = 0;
   2936 
           // A negative "n" means there is no rule for this index byte.
   2937    if (n >= 0) {
   2938      int *ws;
   2939      // Check all rules for the same index byte.
   2940      // If c is 0x300 need extra check for the end of the array, as
   2941      // (c & 0xff) is NUL.
   2942      for (; ((ws = smp[n].sm_lead_w)[0] & 0xff) == (c & 0xff)
   2943           && ws[0] != NUL; n++) {
   2944        // Quickly skip entries that don't match the word.  Most
   2945        // entries are less than three chars, optimize for that.
   2946        if (c != ws[0]) {
   2947          continue;
   2948        }
   2949        k = smp[n].sm_leadlen;
   2950        if (k > 1) {
   2951          if (word[i + 1] != ws[1]) {
   2952            continue;
   2953          }
   2954          if (k > 2) {
   2955            int j;
   2956            for (j = 2; j < k; j++) {
   2957              if (word[i + j] != ws[j]) {
   2958                break;
   2959              }
   2960            }
   2961            if (j < k) {
   2962              continue;
   2963            }
   2964          }
   2965        }
   2966 
   2967        int *pf;
   2968        if ((pf = smp[n].sm_oneof_w) != NULL) {
   2969          // Check for match with one of the chars in "sm_oneof".
   2970          while (*pf != NUL && *pf != word[i + k]) {
   2971            pf++;
   2972          }
   2973          if (*pf == NUL) {
   2974            continue;
   2975          }
   2976          k++;
   2977        }
   2978        char *s = smp[n].sm_rules;
   2979        int pri = 5;            // default priority
   2980 
               // Remember the first rule character; it is compared with '-'
               // before follow-up rules are tried.
   2981        p0 = (uint8_t)(*s);
   2982        int k0 = k;
               // Each leading '-' in the rule shortens the matched length by
               // one, but not below one.
   2983        while (*s == '-' && k > 1) {
   2984          k--;
   2985          s++;
   2986        }
   2987        if (*s == '<') {
   2988          s++;
   2989        }
   2990        if (ascii_isdigit(*s)) {
   2991          // determine priority
   2992          pri = (uint8_t)(*s) - '0';
   2993          s++;
   2994        }
   2995        if (*s == '^' && *(s + 1) == '^') {
   2996          s++;
   2997        }
   2998 
               // The rule applies when no condition is left, or when its '^'
               // or '$' condition holds for the characters around the match.
   2999        if (*s == NUL
   3000            || (*s == '^'
   3001                && (i == 0 || !(word[i - 1] == ' '
   3002                                || spell_iswordp_w(word + i - 1, curwin)))
   3003                && (*(s + 1) != '$'
   3004                    || (!spell_iswordp_w(word + i + k0, curwin))))
   3005            || (*s == '$' && i > 0
   3006                && spell_iswordp_w(word + i - 1, curwin)
   3007                && (!spell_iswordp_w(word + i + k0, curwin)))) {
   3008          // search for followup rules, if:
   3009          // followup and k > 1  and  NO '-' in searchstring
   3010          int c0 = word[i + k - 1];
   3011          int n0 = slang->sl_sal_first[c0 & 0xff];
   3012 
   3013          if (slang->sl_followup && k > 1 && n0 >= 0
   3014              && p0 != '-' && word[i + k] != NUL) {
   3015            // Test follow-up rule for "word[i + k]"; loop over
   3016            // all entries with the same index byte.
   3017            for (; ((ws = smp[n0].sm_lead_w)[0] & 0xff)
   3018                 == (c0 & 0xff); n0++) {
   3019              // Quickly skip entries that don't match the word.
   3020              if (c0 != ws[0]) {
   3021                continue;
   3022              }
   3023              k0 = smp[n0].sm_leadlen;
   3024              if (k0 > 1) {
   3025                if (word[i + k] != ws[1]) {
   3026                  continue;
   3027                }
   3028                if (k0 > 2) {
   3029                  pf = word + i + k + 1;
   3030                  int j;
   3031                  for (j = 2; j < k0; j++) {
   3032                    if (*pf++ != ws[j]) {
   3033                      break;
   3034                    }
   3035                  }
   3036                  if (j < k0) {
   3037                    continue;
   3038                  }
   3039                }
   3040              }
   3041              k0 += k - 1;
   3042 
   3043              if ((pf = smp[n0].sm_oneof_w) != NULL) {
   3044                // Check for match with one of the chars in
   3045                // "sm_oneof".
   3046                while (*pf != NUL && *pf != word[i + k0]) {
   3047                  pf++;
   3048                }
   3049                if (*pf == NUL) {
   3050                  continue;
   3051                }
   3052                k0++;
   3053              }
   3054 
                     // From here on "p0" holds the priority of the follow-up
                     // rule (default 5).
   3055              p0 = 5;
   3056              s = smp[n0].sm_rules;
   3057              while (*s == '-') {
   3058                // "k0" gets NOT reduced because
   3059                // "if (k0 == k)"
   3060                s++;
   3061              }
   3062              if (*s == '<') {
   3063                s++;
   3064              }
   3065              if (ascii_isdigit(*s)) {
   3066                p0 = (uint8_t)(*s) - '0';
   3067                s++;
   3068              }
   3069 
   3070              if (*s == NUL
   3071                  // *s == '^' cuts
   3072                  || (*s == '$'
   3073                      && !spell_iswordp_w(word + i + k0,
   3074                                          curwin))) {
   3075                if (k0 == k) {
   3076                  // this is just a piece of the string
   3077                  continue;
   3078                }
   3079 
   3080                if (p0 < pri) {
   3081                  // priority too low
   3082                  continue;
   3083                }
   3084                // rule fits; stop search
   3085                break;
   3086              }
   3087            }
   3088 
                   // A fitting follow-up rule with sufficient priority was
                   // found: skip the current rule and try the next one.
   3089            if (p0 >= pri && (smp[n0].sm_lead_w[0] & 0xff)
   3090                == (c0 & 0xff)) {
   3091              continue;
   3092            }
   3093          }
   3094 
   3095          // replace string
   3096          ws = smp[n].sm_to_w;
   3097          s = smp[n].sm_rules;
   3098          p0 = (vim_strchr(s, '<') != NULL) ? 1 : 0;
   3099          if (p0 == 1 && z == 0) {
   3100            // rule with '<' is used
   3101            if (reslen > 0 && ws != NULL && *ws != NUL
   3102                && (wres[reslen - 1] == c
   3103                    || wres[reslen - 1] == *ws)) {
   3104              reslen--;
   3105            }
   3106            z0 = 1;
   3107            z = 1;
   3108            k0 = 0;
                   // Write the replacement into "word" itself, so that it is
                   // processed again by the main loop.
   3109            if (ws != NULL) {
   3110              while (*ws != NUL && word[i + k0] != NUL) {
   3111                word[i + k0] = *ws;
   3112                k0++;
   3113                ws++;
   3114              }
   3115            }
                   // The replacement was shorter than the match: move the tail
                   // of the word down.  The "+ 1" includes the slot at
                   // word[wordlen].
   3116            if (k > k0) {
   3117              memmove(word + i + k0, word + i + k, sizeof(int) * (size_t)(wordlen - (i + k) + 1));
   3118            }
   3119 
   3120            // new "actual letter"
   3121            c = word[i];
   3122          } else {
   3123            // no '<' rule used
   3124            i += k - 1;
   3125            z = 0;
                   // Append the replacement to "wres", except for its last
                   // character, which becomes the new "c".
   3126            if (ws != NULL) {
   3127              while (*ws != NUL && ws[1] != NUL
   3128                     && reslen < MAXWLEN) {
   3129                if (reslen == 0 || wres[reslen - 1] != *ws) {
   3130                  wres[reslen++] = *ws;
   3131                }
   3132                ws++;
   3133              }
   3134            }
   3135            // new "actual letter"
   3136            if (ws == NULL) {
   3137              c = NUL;
   3138            } else {
   3139              c = *ws;
   3140            }
                   // A "^^" rule: drop the part of the word handled so far and
                   // restart at index 0.
   3141            if (strstr(s, "^^") != NULL) {
   3142              if (c != NUL && reslen < MAXWLEN) {
   3143                wres[reslen++] = c;
   3144              }
   3145              memmove(word, word + i + 1, sizeof(int) * (size_t)(wordlen - (i + 1) + 1));
   3146              i = 0;
   3147              z0 = 1;
   3148            }
   3149          }
   3150          break;
   3151        }
   3152      }
   3153    } else if (ascii_iswhite(c)) {
   3154      c = ' ';
   3155      k = 1;
   3156    }
   3157 
           // "z0" is set when the word was rewritten in place; in that case
           // position "i" is processed again without storing "c".
   3158    if (z0 == 0) {
   3159      if (k && !p0 && reslen < MAXWLEN && c != NUL
   3160          && (!slang->sl_collapse || reslen == 0
   3161              || wres[reslen - 1] != c)) {
   3162        // condense only double letters
   3163        wres[reslen++] = c;
   3164      }
   3165 
   3166      i++;
   3167      z = 0;
   3168      k = 0;
   3169    }
   3170  }
   3171 
   3172  // Convert wide characters in "wres" to a multi-byte string in "res".
         // Stops early when fewer than MB_MAXBYTES bytes would be left.
   3173  int l = 0;
   3174  for (int n = 0; n < reslen; n++) {
   3175    l += utf_char2bytes(wres[n], res + l);
   3176    if (l + MB_MAXBYTES > MAXWLEN) {
   3177      break;
   3178    }
   3179  }
   3180  res[l] = NUL;
   3181 }
   3182 
   3183 // ":spellinfo"
   3184 void ex_spellinfo(exarg_T *eap)
   3185 {
   3186  if (no_spell_checking(curwin)) {
   3187    return;
   3188  }
   3189 
   3190  msg_ext_set_kind("list_cmd");
   3191  msg_start();
   3192  for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len && !got_int; lpi++) {
   3193    langp_T *const lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
   3194    msg_puts("file: ");
   3195    msg_puts(lp->lp_slang->sl_fname);
   3196    const char *const p = lp->lp_slang->sl_info;
   3197    if (lpi < curwin->w_s->b_langp.ga_len || p != NULL) {
   3198      msg_putchar('\n');
   3199    }
   3200    if (p != NULL) {
   3201      msg_puts(p);
   3202      if (lpi < curwin->w_s->b_langp.ga_len - 1) {
   3203        msg_putchar('\n');
   3204      }
   3205    }
   3206  }
   3207  msg_end();
   3208 }
   3209 
   3210 #define DUMPFLAG_KEEPCASE   1   // set in round 2 of spell_dump_compl(): keep-case tree
   3211 #define DUMPFLAG_COUNT      2   // include word count, used for ":spelldump!"
   3212 #define DUMPFLAG_ICASE      4   // ignore case when finding matches
   3213 #define DUMPFLAG_ONECAP     8   // pattern starts with capital
   3214 #define DUMPFLAG_ALLCAP     16  // pattern is all capitals
   3215 
   3216 // ":spelldump"
   3217 void ex_spelldump(exarg_T *eap)
   3218 {
   3219  if (no_spell_checking(curwin)) {
   3220    return;
   3221  }
   3222  OptVal spl = get_option_value(kOptSpelllang, OPT_LOCAL);
   3223 
   3224  // Create a new empty buffer in a new window.
   3225  do_cmdline_cmd("new");
   3226 
   3227  // enable spelling locally in the new window
   3228  set_option_value_give_err(kOptSpell, BOOLEAN_OPTVAL(true), OPT_LOCAL);
   3229  set_option_value_give_err(kOptSpelllang, spl, OPT_LOCAL);
   3230  optval_free(spl);
   3231 
   3232  if (!buf_is_empty(curbuf)) {
   3233    return;
   3234  }
   3235 
   3236  spell_dump_compl(NULL, 0, NULL, eap->forceit ? DUMPFLAG_COUNT : 0);
   3237 
   3238  // Delete the empty line that we started with.
   3239  if (curbuf->b_ml.ml_line_count > 1) {
   3240    ml_delete(curbuf->b_ml.ml_line_count);
   3241  }
   3242  redraw_later(curwin, UPD_NOT_VALID);
   3243 }
   3244 
   3245 /// Go through all possible words and:
   3246 /// 1. When "pat" is NULL: append a list of all words to the current buffer.
   3247 ///      "ic" and "dir" are not used.
   3248 /// 2. When "pat" is not NULL: add matching words to insert mode completion.
   3249 ///
        /// The words come from the fold-case and keep-case trees of every
        /// language loaded for the current window.
        ///
   3250 /// @param pat  leading part of the word
   3251 /// @param ic  ignore case
   3252 /// @param dir  direction for adding matches
   3253 /// @param dumpflags_arg  DUMPFLAG_*
   3254 void spell_dump_compl(char *pat, int ic, Direction *dir, int dumpflags_arg)
   3255 {
          // State of the tree walk, one entry per depth: "arridx" is the index
          // of the node in byts/idxs, "curi" the next sibling of that node to
          // visit and "word" collects the bytes on the path so far.
   3256  idx_T arridx[MAXWLEN];
   3257  int curi[MAXWLEN];
   3258  char word[MAXWLEN];
   3259  linenr_T lnum = 0;
   3260  char *region_names = NULL;         // region names being used
   3261  bool do_region = true;                    // dump region names and numbers
   3262  int dumpflags = dumpflags_arg;
   3263 
   3264  // When ignoring case or when the pattern starts with capital pass this on
   3265  // to dump_word().
   3266  if (pat != NULL) {
   3267    if (ic) {
   3268      dumpflags |= DUMPFLAG_ICASE;
   3269    } else {
   3270      int n = captype(pat, NULL);
   3271      if (n == WF_ONECAP) {
   3272        dumpflags |= DUMPFLAG_ONECAP;
   3273      } else if (n == WF_ALLCAP
   3274                 && (int)strlen(pat) > utfc_ptr2len(pat)) {
               // All-caps only counts when "pat" is longer than one character.
   3275        dumpflags |= DUMPFLAG_ALLCAP;
   3276      }
   3277    }
   3278  }
   3279 
   3280  // Find out if we can support regions: All languages must support the same
   3281  // regions or none at all.
   3282  for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len; lpi++) {
   3283    langp_T *lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
   3284    char *p = lp->lp_slang->sl_regions;
   3285    if (p[0] != 0) {
   3286      if (region_names == NULL) {           // first language with regions
   3287        region_names = p;
   3288      } else if (strcmp(region_names, p) != 0) {
   3289        do_region = false;                  // region names are different
   3290        break;
   3291      }
   3292    }
   3293  }
   3294 
          // The "/regions=" header line is only written when dumping.
   3295  if (do_region && region_names != NULL && pat == NULL) {
   3296    vim_snprintf(IObuff, IOSIZE, "/regions=%s", region_names);
   3297    ml_append(lnum++, IObuff, 0, false);
   3298  } else {
   3299    do_region = false;
   3300  }
   3301 
   3302  // Loop over all files loaded for the entries in 'spelllang'.
   3303  for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len; lpi++) {
   3304    langp_T *lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
   3305    slang_T *slang = lp->lp_slang;
   3306    if (slang->sl_fbyts == NULL) {          // reloading failed
   3307      continue;
   3308    }
   3309 
   3310    if (pat == NULL) {
   3311      vim_snprintf(IObuff, IOSIZE, "# file: %s", slang->sl_fname);
   3312      ml_append(lnum++, IObuff, 0, false);
   3313    }
   3314 
   3315    int patlen;
   3316    // When matching with a pattern and there are no prefixes only use
   3317    // parts of the tree that match "pat".
   3318    if (pat != NULL && slang->sl_pbyts == NULL) {
   3319      patlen = (int)strlen(pat);
   3320    } else {
             // With -1 the "depth <= patlen" test below never holds, so no
             // part of the tree is skipped.
   3321      patlen = -1;
   3322    }
   3323 
   3324    // round 1: case-folded tree
   3325    // round 2: keep-case tree
   3326    for (int round = 1; round <= 2; round++) {
   3327      uint8_t *byts;
   3328      idx_T *idxs;
   3329      if (round == 1) {
   3330        dumpflags &= ~DUMPFLAG_KEEPCASE;
   3331        byts = slang->sl_fbyts;
   3332        idxs = slang->sl_fidxs;
   3333      } else {
   3334        dumpflags |= DUMPFLAG_KEEPCASE;
   3335        byts = slang->sl_kbyts;
   3336        idxs = slang->sl_kidxs;
   3337      }
   3338      if (byts == NULL) {
   3339        continue;                       // array is empty
   3340      }
             // Start at the root node.  byts[arridx] holds the number of
             // siblings of a node, the siblings themselves start at offset 1.
   3341      int depth = 0;
   3342      arridx[0] = 0;
   3343      curi[0] = 1;
   3344      while (depth >= 0 && !got_int
   3345             && (pat == NULL || !ins_compl_interrupted())) {
   3346        if (curi[depth] > byts[arridx[depth]]) {
   3347          // Done all bytes at this node, go up one level.
   3348          depth--;
   3349          line_breakcheck();
   3350          ins_compl_check_keys(50, false);
   3351        } else {
   3352          // Do one more byte at this node.
   3353          int n = arridx[depth] + curi[depth];
   3354          curi[depth]++;
   3355          int c = byts[n];
   3356          if (c == 0 || depth >= MAXWLEN - 1) {
   3357            // End of word or reached maximum length, deal with the
   3358            // word.
   3359            // Don't use keep-case words in the fold-case tree,
   3360            // they will appear in the keep-case tree.
   3361            // Only use the word when the region matches.
   3362            int flags = (int)idxs[n];
   3363            if ((round == 2 || (flags & WF_KEEPCAP) == 0)
   3364                && (flags & WF_NEEDCOMP) == 0
   3365                && (do_region
   3366                    || (flags & WF_REGION) == 0
   3367                    || (((unsigned)flags >> 16)
   3368                        & (unsigned)lp->lp_region) != 0)) {
   3369              word[depth] = NUL;
   3370              if (!do_region) {
   3371                flags &= ~WF_REGION;
   3372              }
   3373 
   3374              // Dump the basic word if there is no prefix or
   3375              // when it's the first one.
                     // "curi[depth]" was already incremented above, so the
                     // value 2 means this is the first entry of the node.
   3376              c = (int)((unsigned)flags >> 24);
   3377              if (c == 0 || curi[depth] == 2) {
   3378                dump_word(slang, word, pat, dir, dumpflags, flags, lnum);
   3379                if (pat == NULL) {
   3380                  lnum++;
   3381                }
   3382              }
   3383 
   3384              // Apply the prefix, if there is one.
   3385              if (c != 0) {
   3386                lnum = dump_prefixes(slang, word, pat, dir,
   3387                                     dumpflags, flags, lnum);
   3388              }
   3389            }
   3390          } else {
   3391            // Normal char, go one level deeper.
   3392            word[depth++] = (char)c;
   3393            arridx[depth] = idxs[n];
   3394            curi[depth] = 1;
   3395 
   3396            // Check if this character matches with the pattern.
   3397            // If not skip the whole tree below it.
   3398            // Always ignore case here, dump_word() will check
   3399            // proper case later.  This isn't exactly right when
   3400            // length changes for multi-byte characters with
   3401            // ignore case...
   3402            assert(depth >= 0);
   3403            if (depth <= patlen
   3404                && mb_strnicmp(word, pat, (size_t)depth) != 0) {
                     // Step back up: the subtree below this byte is never
                     // entered.
   3405              depth--;
   3406            }
   3407          }
   3408        }
   3409      }
   3410    }
   3411  }
   3412 }
   3413 
   3414 /// Dumps one word: apply case modifications and append a line to the buffer.
   3415 /// When "lnum" is zero add insert mode completion.
   3416 static void dump_word(slang_T *slang, char *word, char *pat, Direction *dir, int dumpflags,
   3417                      int wordflags, linenr_T lnum)
   3418 {
   3419  bool keepcap = false;
   3420  char *p;
   3421  char cword[MAXWLEN];
   3422  char badword[MAXWLEN + 10];
   3423  int flags = wordflags;
   3424 
   3425  if (dumpflags & DUMPFLAG_ONECAP) {
   3426    flags |= WF_ONECAP;
   3427  }
   3428  if (dumpflags & DUMPFLAG_ALLCAP) {
   3429    flags |= WF_ALLCAP;
   3430  }
   3431 
   3432  if ((dumpflags & DUMPFLAG_KEEPCASE) == 0 && (flags & WF_CAPMASK) != 0) {
   3433    // Need to fix case according to "flags".
   3434    make_case_word(word, cword, flags);
   3435    p = cword;
   3436  } else {
   3437    p = word;
   3438    if ((dumpflags & DUMPFLAG_KEEPCASE)
   3439        && ((captype(word, NULL) & WF_KEEPCAP) == 0
   3440            || (flags & WF_FIXCAP) != 0)) {
   3441      keepcap = true;
   3442    }
   3443  }
   3444  char *tw = p;
   3445 
   3446  if (pat == NULL) {
   3447    // Add flags and regions after a slash.
   3448    if ((flags & (WF_BANNED | WF_RARE | WF_REGION)) || keepcap) {
   3449      STRCPY(badword, p);
   3450      strcat(badword, "/");
   3451      if (keepcap) {
   3452        strcat(badword, "=");
   3453      }
   3454      if (flags & WF_BANNED) {
   3455        strcat(badword, "!");
   3456      } else if (flags & WF_RARE) {
   3457        strcat(badword, "?");
   3458      }
   3459      if (flags & WF_REGION) {
   3460        for (int i = 0; i < 7; i++) {
   3461          if (flags & (0x10000 << i)) {
   3462            const size_t badword_len = strlen(badword);
   3463            snprintf(badword + badword_len,
   3464                     sizeof(badword) - badword_len,
   3465                     "%d", i + 1);
   3466          }
   3467        }
   3468      }
   3469      p = badword;
   3470    }
   3471 
   3472    if (dumpflags & DUMPFLAG_COUNT) {
   3473      hashitem_T *hi;
   3474 
   3475      // Include the word count for ":spelldump!".
   3476      hi = hash_find(&slang->sl_wordcount, tw);
   3477      if (!HASHITEM_EMPTY(hi)) {
   3478        vim_snprintf(IObuff, IOSIZE, "%s\t%d",
   3479                     tw, HI2WC(hi)->wc_count);
   3480        p = IObuff;
   3481      }
   3482    }
   3483 
   3484    ml_append(lnum, p, 0, false);
   3485  } else if (((dumpflags & DUMPFLAG_ICASE)
   3486              ? mb_strnicmp(p, pat, strlen(pat)) == 0
   3487              : strncmp(p, pat, strlen(pat)) == 0)
   3488             && ins_compl_add_infercase(p, (int)strlen(p),
   3489                                        p_ic, NULL, *dir, false, 0) == OK) {
   3490    // if dir was BACKWARD then honor it just once
   3491    *dir = FORWARD;
   3492  }
   3493 }
   3494 
   3495 /// For ":spelldump": Find matching prefixes for "word".  Prepend each to
   3496 /// "word" and append a line to the buffer.
   3497 /// When "lnum" is zero add insert mode completion.
   3498 ///
   3499 /// @param word  case-folded word
   3500 /// @param flags  flags with prefix ID
   3501 ///
   3502 /// @return  the updated line number.
   3503 static linenr_T dump_prefixes(slang_T *slang, char *word, char *pat, Direction *dir, int dumpflags,
   3504                              int flags, linenr_T startlnum)
   3505 {
   3506  idx_T arridx[MAXWLEN];
   3507  int curi[MAXWLEN];
   3508  char prefix[MAXWLEN];
   3509  char word_up[MAXWLEN];
   3510  bool has_word_up = false;
   3511  linenr_T lnum = startlnum;
   3512 
   3513  // If the word starts with a lower-case letter make the word with an
   3514  // upper-case letter in word_up[].
   3515  int c = utf_ptr2char(word);
   3516  if (SPELL_TOUPPER(c) != c) {
   3517    onecap_copy(word, word_up, true);
   3518    has_word_up = true;
   3519  }
   3520 
   3521  uint8_t *byts = slang->sl_pbyts;
   3522  idx_T *idxs = slang->sl_pidxs;
   3523  if (byts != NULL) {           // array not is empty
   3524    // Loop over all prefixes, building them byte-by-byte in prefix[].
   3525    // When at the end of a prefix check that it supports "flags".
   3526    int depth = 0;
   3527    arridx[0] = 0;
   3528    curi[0] = 1;
   3529    while (depth >= 0 && !got_int) {
   3530      int n = arridx[depth];
   3531      int len = byts[n];
   3532      if (curi[depth] > len) {
   3533        // Done all bytes at this node, go up one level.
   3534        depth--;
   3535        line_breakcheck();
   3536      } else {
   3537        // Do one more byte at this node.
   3538        n += curi[depth];
   3539        curi[depth]++;
   3540        c = byts[n];
   3541        if (c == 0) {
   3542          // End of prefix, find out how many IDs there are.
   3543          int i;
   3544          for (i = 1; i < len; i++) {
   3545            if (byts[n + i] != 0) {
   3546              break;
   3547            }
   3548          }
   3549          curi[depth] += i - 1;
   3550 
   3551          c = valid_word_prefix(i, n, flags, word, slang, false);
   3552          if (c != 0) {
   3553            xstrlcpy(prefix + depth, word, (size_t)(MAXWLEN - depth));
   3554            dump_word(slang, prefix, pat, dir, dumpflags,
   3555                      (c & WF_RAREPFX) ? (flags | WF_RARE) : flags, lnum);
   3556            if (lnum != 0) {
   3557              lnum++;
   3558            }
   3559          }
   3560 
   3561          // Check for prefix that matches the word when the
   3562          // first letter is upper-case, but only if the prefix has
   3563          // a condition.
   3564          if (has_word_up) {
   3565            c = valid_word_prefix(i, n, flags, word_up, slang, true);
   3566            if (c != 0) {
   3567              xstrlcpy(prefix + depth, word_up, (size_t)(MAXWLEN - depth));
   3568              dump_word(slang, prefix, pat, dir, dumpflags,
   3569                        (c & WF_RAREPFX) ? (flags | WF_RARE) : flags, lnum);
   3570              if (lnum != 0) {
   3571                lnum++;
   3572              }
   3573            }
   3574          }
   3575        } else {
   3576          // Normal char, go one level deeper.
   3577          prefix[depth++] = (char)c;
   3578          arridx[depth] = idxs[n];
   3579          curi[depth] = 1;
   3580        }
   3581      }
   3582    }
   3583  }
   3584 
   3585  return lnum;
   3586 }
   3587 
   3588 // Move "p" to the end of word "start".
   3589 // Uses the spell-checking word characters.
   3590 char *spell_to_word_end(char *start, win_T *win)
   3591 {
   3592  char *p = start;
   3593 
   3594  while (*p != NUL && spell_iswordp(p, win)) {
   3595    MB_PTR_ADV(p);
   3596  }
   3597  return p;
   3598 }
   3599 
   3600 // For Insert mode completion CTRL-X s:
   3601 // Find start of the word in front of column "startcol".
   3602 // We don't check if it is badly spelled, with completion we can only change
   3603 // the word in front of the cursor.
   3604 // Returns the column number of the word.
   3605 int spell_word_start(int startcol)
   3606 {
   3607  if (no_spell_checking(curwin)) {
   3608    return startcol;
   3609  }
   3610 
   3611  char *line = get_cursor_line_ptr();
   3612  char *p;
   3613 
   3614  // Find a word character before "startcol".
   3615  for (p = line + startcol; p > line;) {
   3616    MB_PTR_BACK(line, p);
   3617    if (spell_iswordp_nmw(p, curwin)) {
   3618      break;
   3619    }
   3620  }
   3621 
   3622  int col = 0;
   3623 
   3624  // Go back to start of the word.
   3625  while (p > line) {
   3626    col = (int)(p - line);
   3627    MB_PTR_BACK(line, p);
   3628    if (!spell_iswordp(p, curwin)) {
   3629      break;
   3630    }
   3631    col = 0;
   3632  }
   3633 
   3634  return col;
   3635 }
   3636 
   3637 // Need to check for 'spellcapcheck' now, the word is removed before
   3638 // expand_spelling() is called.  Therefore the ugly global variable.
   3639 static bool spell_expand_need_cap;
   3640 
   3641 void spell_expand_check_cap(colnr_T col)
   3642 {
   3643  spell_expand_need_cap = check_need_cap(curwin, curwin->w_cursor.lnum, col);
   3644 }
   3645 
   3646 // Get list of spelling suggestions.
   3647 // Used for Insert mode completion CTRL-X ?.
   3648 // Returns the number of matches.  The matches are in "matchp[]", array of
   3649 // allocated strings.
   3650 int expand_spelling(linenr_T lnum, char *pat, char ***matchp)
   3651 {
   3652  garray_T ga;
   3653 
   3654  spell_suggest_list(&ga, pat, 100, spell_expand_need_cap, true);
   3655  *matchp = ga.ga_data;
   3656  return ga.ga_len;
   3657 }
   3658 
   3659 /// @return  true if "val" is a valid 'spelllang' value.
   3660 bool valid_spelllang(const char *val)
   3661  FUNC_ATTR_NONNULL_ALL FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT
   3662 {
   3663  return valid_name(val, ".-_,@");
   3664 }
   3665 
   3666 /// @return  true if "val" is a valid 'spellfile' value.
   3667 bool valid_spellfile(const char *val)
   3668  FUNC_ATTR_NONNULL_ALL FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT
   3669 {
   3670  char spf_name[MAXPATHL];
   3671  char *spf = (char *)val;
   3672  while (*spf != NUL) {
   3673    size_t l = copy_option_part(&spf, spf_name, MAXPATHL, ",");
   3674    if (l >= MAXPATHL - 4 || l < 4 || strcmp(spf_name + l - 4, ".add") != 0) {
   3675      return false;
   3676    }
   3677    for (char *s = spf_name; *s != NUL; s++) {
   3678      if (!vim_is_fname_char((uint8_t)(*s))) {
   3679        return false;
   3680      }
   3681    }
   3682  }
   3683  return true;
   3684 }
   3685 
   3686 const char *did_set_spell_option(void)
   3687 {
   3688  const char *errmsg = NULL;
   3689 
   3690  FOR_ALL_WINDOWS_IN_TAB(wp, curtab) {
   3691    if (wp->w_buffer == curbuf && wp->w_p_spell) {
   3692      errmsg = parse_spelllang(wp);
   3693      break;
   3694    }
   3695  }
   3696  return errmsg;
   3697 }
   3698 
   3699 /// Set curbuf->b_cap_prog to the regexp program for 'spellcapcheck'.
   3700 /// Return error message when failed, NULL when OK.
   3701 const char *compile_cap_prog(synblock_T *synblock)
   3702  FUNC_ATTR_NONNULL_ALL
   3703 {
   3704  regprog_T *rp = synblock->b_cap_prog;
   3705 
   3706  if (synblock->b_p_spc == NULL || *synblock->b_p_spc == NUL) {
   3707    synblock->b_cap_prog = NULL;
   3708  } else {
   3709    // Prepend a ^ so that we only match at one column
   3710    char *re = concat_str("^", synblock->b_p_spc);
   3711    synblock->b_cap_prog = vim_regcomp(re, RE_MAGIC);
   3712    xfree(re);
   3713    if (synblock->b_cap_prog == NULL) {
   3714      synblock->b_cap_prog = rp;         // restore the previous program
   3715      return e_invarg;
   3716    }
   3717  }
   3718 
   3719  vim_regfree(rp);
   3720  return NULL;
   3721 }