neovim

Neovim text editor
git clone https://git.dasho.dev/neovim.git
Log | Files | Refs | README

spellfile.c (187236B)


      1 // spellfile.c: code for reading and writing spell files.
      2 //
      3 // See spell.c for information about spell checking.
      4 
      5 // Vim spell file format: <HEADER>
      6 //                        <SECTIONS>
      7 //                        <LWORDTREE>
      8 //                        <KWORDTREE>
      9 //                        <PREFIXTREE>
     10 //
     11 // <HEADER>: <fileID> <versionnr>
     12 //
     13 // <fileID>     8 bytes    "VIMspell"
     14 // <versionnr>  1 byte      VIMSPELLVERSION
     15 //
     16 //
     17 // Sections make it possible to add information to the .spl file without
     18 // making it incompatible with previous versions.  There are two kinds of
     19 // sections:
     20 // 1. Not essential for correct spell checking.  E.g. for making suggestions.
     21 //    These are skipped when not supported.
     22 // 2. Optional information, but essential for spell checking when present.
     23 //    E.g. conditions for affixes.  When this section is present but not
     24 //    supported an error message is given.
     25 //
     26 // <SECTIONS>: <section> ... <sectionend>
     27 //
     28 // <section>: <sectionID> <sectionflags> <sectionlen> (section contents)
     29 //
     30 // <sectionID>    1 byte    number from 0 to 254 identifying the section
     31 //
     32 // <sectionflags> 1 byte    SNF_REQUIRED: this section is required for correct
     33 //                                          spell checking
     34 //
     35 // <sectionlen>   4 bytes   length of section contents, MSB first
     36 //
     37 // <sectionend>   1 byte    SN_END
     38 //
     39 //
     40 // sectionID == SN_INFO: <infotext>
     41 // <infotext>    N bytes    free format text with spell file info (version,
     42 //                          website, etc)
     43 //
     44 // sectionID == SN_REGION: <regionname> ...
     45 // <regionname>  2 bytes    Up to MAXREGIONS region names: ca, au, etc.
     46 //                          Lower case.
     47 //                          First <regionname> is region 1.
     48 //
     49 // sectionID == SN_CHARFLAGS: <charflagslen> <charflags>
     50 //                              <folcharslen> <folchars>
     51 // <charflagslen> 1 byte    Number of bytes in <charflags> (should be 128).
     52 // <charflags>  N bytes     List of flags (first one is for character 128):
     53 //                          0x01  word character        CF_WORD
     54 //                          0x02  upper-case character  CF_UPPER
     55 // <folcharslen>  2 bytes   Number of bytes in <folchars>.
     56 // <folchars>     N bytes   Folded characters, first one is for character 128.
     57 //
     58 // sectionID == SN_MIDWORD: <midword>
     59 // <midword>     N bytes    Characters that are word characters only when used
     60 //                          in the middle of a word.
     61 //
     62 // sectionID == SN_PREFCOND: <prefcondcnt> <prefcond> ...
     63 // <prefcondcnt> 2 bytes    Number of <prefcond> items following.
     64 // <prefcond> : <condlen> <condstr>
     65 // <condlen>    1 byte      Length of <condstr>.
     66 // <condstr>    N bytes     Condition for the prefix.
     67 //
     68 // sectionID == SN_REP: <repcount> <rep> ...
     69 // <repcount>    2 bytes    number of <rep> items, MSB first.
     70 // <rep> : <repfromlen> <repfrom> <reptolen> <repto>
     71 // <repfromlen>  1 byte     length of <repfrom>
     72 // <repfrom>     N bytes    "from" part of replacement
     73 // <reptolen>    1 byte     length of <repto>
     74 // <repto>       N bytes    "to" part of replacement
     75 //
     76 // sectionID == SN_REPSAL: <repcount> <rep> ...
     77 //   just like SN_REP but for soundfolded words
     78 //
     79 // sectionID == SN_SAL: <salflags> <salcount> <sal> ...
     80 // <salflags>    1 byte     flags for soundsalike conversion:
     81 //                          SAL_F0LLOWUP
     82 //                          SAL_COLLAPSE
     83 //                          SAL_REM_ACCENTS
     84 // <salcount>    2 bytes    number of <sal> items following
     85 // <sal> : <salfromlen> <salfrom> <saltolen> <salto>
     86 // <salfromlen>  1 byte     length of <salfrom>
     87 // <salfrom>     N bytes    "from" part of soundsalike
     88 // <saltolen>    1 byte     length of <salto>
     89 // <salto>       N bytes    "to" part of soundsalike
     90 //
     91 // sectionID == SN_SOFO: <sofofromlen> <sofofrom> <sofotolen> <sofoto>
     92 // <sofofromlen> 2 bytes    length of <sofofrom>
     93 // <sofofrom>    N bytes    "from" part of soundfold
     94 // <sofotolen>   2 bytes    length of <sofoto>
     95 // <sofoto>      N bytes    "to" part of soundfold
     96 //
     97 // sectionID == SN_SUGFILE: <timestamp>
     98 // <timestamp>   8 bytes    time in seconds that must match with .sug file
     99 //
    100 // sectionID == SN_NOSPLITSUGS: nothing
    101 //
    102 // sectionID == SN_NOCOMPOUNDSUGS: nothing
    103 //
    104 // sectionID == SN_WORDS: <word> ...
    105 // <word>        N bytes    NUL terminated common word
    106 //
    107 // sectionID == SN_MAP: <mapstr>
    108 // <mapstr>      N bytes    String with sequences of similar characters,
    109 //                          separated by slashes.
    110 //
    111 // sectionID == SN_COMPOUND: <compmax> <compminlen> <compsylmax> <compoptions>
    112 //                              <comppatcount> <comppattern> ... <compflags>
    113 // <compmax>     1 byte     Maximum nr of words in compound word.
    114 // <compminlen>  1 byte     Minimal word length for compounding.
    115 // <compsylmax>  1 byte     Maximum nr of syllables in compound word.
    116 // <compoptions> 2 bytes    COMP_ flags.
    117 // <comppatcount> 2 bytes   number of <comppattern> following
    118 // <compflags>   N bytes    Flags from COMPOUNDRULE items, separated by
    119 //                          slashes.
    120 //
    121 // <comppattern>: <comppatlen> <comppattext>
    122 // <comppatlen>  1 byte     length of <comppattext>
    123 // <comppattext> N bytes    end or begin chars from CHECKCOMPOUNDPATTERN
    124 //
    125 // sectionID == SN_NOBREAK: (empty, its presence is what matters)
    126 //
    127 // sectionID == SN_SYLLABLE: <syllable>
    128 // <syllable>    N bytes    String from SYLLABLE item.
    129 //
    130 // <LWORDTREE>: <wordtree>
    131 //
    132 // <KWORDTREE>: <wordtree>
    133 //
    134 // <PREFIXTREE>: <wordtree>
    135 //
    136 //
    137 // <wordtree>: <nodecount> <nodedata> ...
    138 //
    139 // <nodecount>  4 bytes     Number of nodes following.  MSB first.
    140 //
    141 // <nodedata>: <siblingcount> <sibling> ...
    142 //
    143 // <siblingcount> 1 byte    Number of siblings in this node.  The siblings
    144 //                          follow in sorted order.
    145 //
    146 // <sibling>: <byte> [ <nodeidx> <xbyte>
    147 //                    | <flags> [<flags2>] [<region>] [<affixID>]
    148 //                    | [<pflags>] <affixID> <prefcondnr> ]
    149 //
    150 // <byte>       1 byte      Byte value of the sibling.  Special cases:
    151 //                          BY_NOFLAGS: End of word without flags and for all
    152 //                                      regions.
    153 //                                      For PREFIXTREE <affixID> and
    154 //                                      <prefcondnr> follow.
    155 //                          BY_FLAGS:   End of word, <flags> follow.
    156 //                                      For PREFIXTREE <pflags>, <affixID>
    157 //                                      and <prefcondnr> follow.
    158 //                          BY_FLAGS2:  End of word, <flags> and <flags2>
    159 //                                      follow.  Not used in PREFIXTREE.
    160 //                          BY_INDEX:   Child of sibling is shared, <nodeidx>
    161 //                                      and <xbyte> follow.
    162 //
    163 // <nodeidx>    3 bytes     Index of child for this sibling, MSB first.
    164 //
    165 // <xbyte>      1 byte      Byte value of the sibling.
    166 //
    167 // <flags>      1 byte      Bitmask of:
    168 //                          WF_ALLCAP   word must have only capitals
    169 //                          WF_ONECAP   first char of word must be capital
    170 //                          WF_KEEPCAP  keep-case word
    171 //                          WF_FIXCAP   keep-case word, all caps not allowed
    172 //                          WF_RARE     rare word
    173 //                          WF_BANNED   bad word
    174 //                          WF_REGION   <region> follows
    175 //                          WF_AFX      <affixID> follows
    176 //
    177 // <flags2>     1 byte      Bitmask of:
    178 //                          WF_HAS_AFF >> 8   word includes affix
    179 //                          WF_NEEDCOMP >> 8  word only valid in compound
    180 //                          WF_NOSUGGEST >> 8  word not used for suggestions
    181 //                          WF_COMPROOT >> 8  word already a compound
    182 //                          WF_NOCOMPBEF >> 8 no compounding before this word
    183 //                          WF_NOCOMPAFT >> 8 no compounding after this word
    184 //
    185 // <pflags>     1 byte      Bitmask of:
    186 //                          WFP_RARE    rare prefix
    187 //                          WFP_NC      non-combining prefix
    188 //                          WFP_UP      letter after prefix made upper case
    189 //
    190 // <region>     1 byte      Bitmask for regions in which word is valid.  When
    191 //                          omitted it's valid in all regions.
    192 //                          Lowest bit is for region 1.
    193 //
    194 // <affixID>    1 byte      ID of affix that can be used with this word.  In
    195 //                          PREFIXTREE used for the required prefix ID.
    196 //
// <prefcondnr> 2 bytes     Prefix condition number, index in the <prefcond>
//                          list from the SN_PREFCOND section.
    199 //
    200 // All text characters are in 'encoding', but stored as single bytes.
    201 
    202 // Vim .sug file format:  <SUGHEADER>
    203 //                        <SUGWORDTREE>
    204 //                        <SUGTABLE>
    205 //
    206 // <SUGHEADER>: <fileID> <versionnr> <timestamp>
    207 //
    208 // <fileID>     6 bytes     "VIMsug"
    209 // <versionnr>  1 byte      VIMSUGVERSION
    210 // <timestamp>  8 bytes     timestamp that must match with .spl file
    211 //
    212 //
    213 // <SUGWORDTREE>: <wordtree>  (see above, no flags or region used)
    214 //
    215 //
    216 // <SUGTABLE>: <sugwcount> <sugline> ...
    217 //
    218 // <sugwcount>  4 bytes     number of <sugline> following
    219 //
    220 // <sugline>: <sugnr> ... NUL
    221 //
// <sugnr>:     X bytes     word number that results in this soundfolded word,
//                          stored as an offset to the previous number in as
//                          few bytes as possible (see offset2bytes())
    225 
    226 #include <assert.h>
    227 #include <ctype.h>
    228 #include <inttypes.h>
    229 #include <limits.h>
    230 #include <stdbool.h>
    231 #include <stddef.h>
    232 #include <stdio.h>
    233 #include <stdlib.h>
    234 #include <string.h>
    235 #include <time.h>
    236 
    237 #include "nvim/arglist.h"
    238 #include "nvim/ascii_defs.h"
    239 #include "nvim/buffer.h"
    240 #include "nvim/buffer_defs.h"
    241 #include "nvim/charset.h"
    242 #include "nvim/drawscreen.h"
    243 #include "nvim/errors.h"
    244 #include "nvim/ex_cmds_defs.h"
    245 #include "nvim/fileio.h"
    246 #include "nvim/garray.h"
    247 #include "nvim/garray_defs.h"
    248 #include "nvim/gettext_defs.h"
    249 #include "nvim/globals.h"
    250 #include "nvim/hashtab.h"
    251 #include "nvim/hashtab_defs.h"
    252 #include "nvim/macros_defs.h"
    253 #include "nvim/mbyte.h"
    254 #include "nvim/mbyte_defs.h"
    255 #include "nvim/memline.h"
    256 #include "nvim/memory.h"
    257 #include "nvim/message.h"
    258 #include "nvim/option.h"
    259 #include "nvim/option_defs.h"
    260 #include "nvim/option_vars.h"
    261 #include "nvim/os/fs.h"
    262 #include "nvim/os/input.h"
    263 #include "nvim/os/os.h"
    264 #include "nvim/os/os_defs.h"
    265 #include "nvim/os/stdpaths_defs.h"
    266 #include "nvim/os/time.h"
    267 #include "nvim/os/time_defs.h"
    268 #include "nvim/path.h"
    269 #include "nvim/pos_defs.h"
    270 #include "nvim/regexp.h"
    271 #include "nvim/runtime.h"
    272 #include "nvim/runtime_defs.h"
    273 #include "nvim/spell.h"
    274 #include "nvim/spell_defs.h"
    275 #include "nvim/spellfile.h"
    276 #include "nvim/strings.h"
    277 #include "nvim/types_defs.h"
    278 #include "nvim/ui.h"
    279 #include "nvim/undo.h"
    280 #include "nvim/vim_defs.h"
    281 
    282 // Special byte values for <byte>.  Some are only used in the tree for
    283 // postponed prefixes, some only in the other trees.  This is a bit messy...
    284 enum {
    285  BY_NOFLAGS = 0,  // end of word without flags or region; for postponed prefix: no <pflags>
    286  BY_INDEX = 1,    // child is shared, index follows
    287  BY_FLAGS = 2,    // end of word, <flags> byte follows; for postponed prefix: <pflags> follows
    288  BY_FLAGS2 = 3,   // end of word, <flags> and <flags2> bytes follow; never used in prefix tree
    289  BY_SPECIAL = BY_FLAGS2,  // highest special byte value
    290 };
    291 
    292 #define ZERO_FLAG   65009       // used when flag is zero: "0"
    293 
    294 // Flags used in .spl file for soundsalike flags.
    295 enum {
    296  SAL_F0LLOWUP = 1,
    297  SAL_COLLAPSE = 2,
    298  SAL_REM_ACCENTS = 4,
    299 };
    300 
    301 #define VIMSPELLMAGIC "VIMspell"  // string at start of Vim spell file
    302 #define VIMSPELLMAGICL (sizeof(VIMSPELLMAGIC) - 1)
    303 #define VIMSPELLVERSION 50
    304 
    305 // Section IDs.  Only renumber them when VIMSPELLVERSION changes!
    306 enum {
    307  SN_REGION = 0,           // <regionname> section
    308  SN_CHARFLAGS = 1,        // charflags section
    309  SN_MIDWORD = 2,          // <midword> section
    310  SN_PREFCOND = 3,         // <prefcond> section
    311  SN_REP = 4,              // REP items section
    312  SN_SAL = 5,              // SAL items section
    313  SN_SOFO = 6,             // soundfolding section
    314  SN_MAP = 7,              // MAP items section
    315  SN_COMPOUND = 8,         // compound words section
    316  SN_SYLLABLE = 9,         // syllable section
    317  SN_NOBREAK = 10,         // NOBREAK section
    318  SN_SUGFILE = 11,         // timestamp for .sug file
    319  SN_REPSAL = 12,          // REPSAL items section
    320  SN_WORDS = 13,           // common words
    321  SN_NOSPLITSUGS = 14,     // don't split word for suggestions
    322  SN_INFO = 15,            // info section
    323  SN_NOCOMPOUNDSUGS = 16,  // don't compound for suggestions
    324  SN_END = 255,            // end of sections
    325 };
    326 
    327 #define SNF_REQUIRED    1       // <sectionflags>: required section
    328 
    329 enum {
    330  CF_WORD = 0x01,
    331  CF_UPPER = 0x02,
    332 };
    333 
    334 static const char *e_spell_trunc = N_("E758: Truncated spell file");
    335 static const char e_error_while_reading_sug_file_str[]
    336  = N_("E782: Error while reading .sug file: %s");
    337 static const char e_duplicate_char_in_map_entry[]
    338  = N_("E783: Duplicate char in MAP entry");
    339 static const char *e_illegal_character_in_word = N_("E1280: Illegal character in word");
    340 static const char *e_afftrailing = N_("Trailing text in %s line %d: %s");
    341 static const char *e_affname = N_("Affix name too long in %s line %d: %s");
    342 static const char *msg_compressing = N_("Compressing word tree...");
    343 
    344 #define MAXLINELEN  500         // Maximum length in bytes of a line in a .aff
    345                                // and .dic file.
    346 // Main structure to store the contents of a ".aff" file.
    347 typedef struct {
    348  char *af_enc;                 // "SET", normalized, alloc'ed string or NULL
    349  int af_flagtype;              // AFT_CHAR, AFT_LONG, AFT_NUM or AFT_CAPLONG
    350  unsigned af_rare;             // RARE ID for rare word
    351  unsigned af_keepcase;         // KEEPCASE ID for keep-case word
    352  unsigned af_bad;              // BAD ID for banned word
    353  unsigned af_needaffix;        // NEEDAFFIX ID
    354  unsigned af_circumfix;        // CIRCUMFIX ID
    355  unsigned af_needcomp;         // NEEDCOMPOUND ID
    356  unsigned af_comproot;         // COMPOUNDROOT ID
    357  unsigned af_compforbid;       // COMPOUNDFORBIDFLAG ID
    358  unsigned af_comppermit;       // COMPOUNDPERMITFLAG ID
    359  unsigned af_nosuggest;        // NOSUGGEST ID
    360  int af_pfxpostpone;           // postpone prefixes without chop string and
    361                                // without flags
    362  bool af_ignoreextra;          // IGNOREEXTRA present
    363  hashtab_T af_pref;            // hashtable for prefixes, affheader_T
    364  hashtab_T af_suff;            // hashtable for suffixes, affheader_T
    365  hashtab_T af_comp;            // hashtable for compound flags, compitem_T
    366 } afffile_T;
    367 
    368 #define AFT_CHAR        0       // flags are one character
    369 #define AFT_LONG        1       // flags are two characters
    370 #define AFT_CAPLONG     2       // flags are one or two characters
    371 #define AFT_NUM         3       // flags are numbers, comma separated
    372 
    373 typedef struct affentry_S affentry_T;
    374 // Affix entry from ".aff" file.  Used for prefixes and suffixes.
    375 struct affentry_S {
    376  affentry_T *ae_next;          // next affix with same name/number
    377  char *ae_chop;                // text to chop off basic word (can be NULL)
    378  char *ae_add;                 // text to add to basic word (can be NULL)
    379  char *ae_flags;               // flags on the affix (can be NULL)
    380  char *ae_cond;                // condition (NULL for ".")
    381  regprog_T *ae_prog;           // regexp program for ae_cond or NULL
    382  char ae_compforbid;           // COMPOUNDFORBIDFLAG found
    383  char ae_comppermit;           // COMPOUNDPERMITFLAG found
    384 };
    385 
    386 #define AH_KEY_LEN 17          // 2 x 8 bytes + NUL
    387 
    388 // Affix header from ".aff" file.  Used for af_pref and af_suff.
    389 typedef struct {
    390  char ah_key[AH_KEY_LEN];      // key for hashtab == name of affix
    391  unsigned ah_flag;             // affix name as number, uses "af_flagtype"
    392  int ah_newID;                 // prefix ID after renumbering; 0 if not used
    393  int ah_combine;               // suffix may combine with prefix
    394  int ah_follows;               // another affix block should be following
    395  affentry_T *ah_first;         // first affix entry
    396 } affheader_T;
    397 
    398 #define HI2AH(hi)   ((affheader_T *)(hi)->hi_key)
    399 
    400 // Flag used in compound items.
    401 typedef struct {
    402  char ci_key[AH_KEY_LEN];      // key for hashtab == name of compound
    403  unsigned ci_flag;             // affix name as number, uses "af_flagtype"
    404  int ci_newID;                 // affix ID after renumbering.
    405 } compitem_T;
    406 
    407 #define HI2CI(hi)   ((compitem_T *)(hi)->hi_key)
    408 
    409 // Structure that is used to store the items in the word tree.  This avoids
    410 // the need to keep track of each allocated thing, everything is freed all at
    411 // once after ":mkspell" is done.
    412 // Note: "sb_next" must be just before "sb_data" to make sure the alignment of
    413 // "sb_data" is correct for systems where pointers must be aligned on
    414 // pointer-size boundaries and sizeof(pointer) > sizeof(int) (e.g., Sparc).
    415 #define  SBLOCKSIZE 16000       // size of sb_data
    416 typedef struct sblock_S sblock_T;
    417 struct sblock_S {
    418  int sb_used;                  // nr of bytes already in use
    419  sblock_T *sb_next;         // next block in list
    420  char sb_data[];            // data
    421 };
    422 
    423 // A node in the tree.
    424 typedef struct wordnode_S wordnode_T;
    425 struct wordnode_S {
    426  union {   // shared to save space
    427    uint8_t hashkey[6];         // the hash key, only used while compressing
    428    int index;                  // index in written nodes (valid after first
    429                                // round)
    430  } wn_u1;
    431  union {   // shared to save space
    432    wordnode_T *next;           // next node with same hash key
    433    wordnode_T *wnode;          // parent node that will write this node
    434  } wn_u2;
    435  wordnode_T *wn_child;        // child (next byte in word)
    436  wordnode_T *wn_sibling;      // next sibling (alternate byte in word,
    437                               //   always sorted)
    438  int wn_refs;                 // Nr. of references to this node.  Only
    439                               //   relevant for first node in a list of
    440                               //   siblings, in following siblings it is
    441                               //   always one.
    442  uint8_t wn_byte;             // Byte for this node. NUL for word end
    443 
    444  // Info for when "wn_byte" is NUL.
    445  // In PREFIXTREE "wn_region" is used for the prefcondnr.
    446  // In the soundfolded word tree "wn_flags" has the MSW of the wordnr and
    447  // "wn_region" the LSW of the wordnr.
    448  uint8_t wn_affixID;           // supported/required prefix ID or 0
    449  uint16_t wn_flags;            // WF_ flags
    450  int16_t wn_region;            // region mask
    451 
    452 #ifdef SPELL_PRINTTREE
    453  int wn_nr;                    // sequence nr for printing
    454 #endif
    455 };
    456 
    457 #define WN_MASK  0xffff         // mask relevant bits of "wn_flags"
    458 
    459 #define HI2WN(hi)    (wordnode_T *)((hi)->hi_key)
    460 
    461 // Info used while reading the spell files.
    462 typedef struct {
    463  wordnode_T *si_foldroot;     // tree with case-folded words
    464  int si_foldwcount;           // nr of words in si_foldroot
    465 
    466  wordnode_T *si_keeproot;     // tree with keep-case words
    467  int si_keepwcount;           // nr of words in si_keeproot
    468 
    469  wordnode_T *si_prefroot;     // tree with postponed prefixes
    470 
    471  int si_sugtree;              // creating the soundfolding trie
    472 
    473  sblock_T *si_blocks;       // memory blocks used
    474  int si_blocks_cnt;           // memory blocks allocated
    475  int si_did_emsg;              // true when ran out of memory
    476 
    477  int si_compress_cnt;         // words to add before lowering
    478                               // compression limit
    479  wordnode_T *si_first_free;   // List of nodes that have been freed during
    480                               // compression, linked by "wn_child" field.
    481  int si_free_count;           // number of nodes in si_first_free
    482 #ifdef SPELL_PRINTTREE
    483  int si_wordnode_nr;           // sequence nr for nodes
    484 #endif
    485  buf_T *si_spellbuf;     // buffer used to store soundfold word table
    486 
    487  int si_ascii;                 // handling only ASCII words
    488  int si_add;                   // addition file
    489  int si_clear_chartab;             // when true clear char tables
    490  int si_region;                // region mask
    491  vimconv_T si_conv;            // for conversion to 'encoding'
    492  int si_memtot;                // runtime memory used
    493  int si_verbose;               // verbose messages
    494  int si_msg_count;             // number of words added since last message
    495  char *si_info;                // info text chars or NULL
    496  int si_region_count;          // number of regions supported (1 when there
    497                                // are no regions)
    498  char si_region_name[MAXREGIONS * 2 + 1];
    499  // region names; used only if
    500  // si_region_count > 1)
    501 
    502  garray_T si_rep;              // list of fromto_T entries from REP lines
    503  garray_T si_repsal;           // list of fromto_T entries from REPSAL lines
    504  garray_T si_sal;              // list of fromto_T entries from SAL lines
    505  char *si_sofofr;              // SOFOFROM text
    506  char *si_sofoto;              // SOFOTO text
    507  int si_nosugfile;             // NOSUGFILE item found
    508  int si_nosplitsugs;           // NOSPLITSUGS item found
    509  int si_nocompoundsugs;        // NOCOMPOUNDSUGS item found
    510  int si_followup;              // soundsalike: ?
    511  int si_collapse;              // soundsalike: ?
    512  hashtab_T si_commonwords;     // hashtable for common words
    513  time_t si_sugtime;            // timestamp for .sug file
    514  int si_rem_accents;           // soundsalike: remove accents
    515  garray_T si_map;              // MAP info concatenated
    516  char *si_midword;             // MIDWORD chars or NULL
    517  int si_compmax;               // max nr of words for compounding
    518  int si_compminlen;            // minimal length for compounding
    519  int si_compsylmax;            // max nr of syllables for compounding
    520  int si_compoptions;           // COMP_ flags
    521  garray_T si_comppat;          // CHECKCOMPOUNDPATTERN items, each stored as
    522                                // a string
    523  char *si_compflags;           // flags used for compounding
    524  char si_nobreak;              // NOBREAK
    525  char *si_syllable;            // syllable string
    526  garray_T si_prefcond;         // table with conditions for postponed
    527                                // prefixes, each stored as a string
    528  int si_newprefID;             // current value for ah_newID
    529  int si_newcompID;             // current value for compound ID
    530 } spellinfo_T;
    531 
    532 #include "spellfile.c.generated.h"
    533 
    534 /// Read n bytes from fd to buf, returning on errors
    535 ///
    536 /// @param[out]  buf  Buffer to read to, must be at least n bytes long.
    537 /// @param[in]  n  Amount of bytes to read.
    538 /// @param  fd  FILE* to read from.
    539 /// @param  exit_code  Code to run before returning.
    540 ///
    541 /// @return Allows to proceed if everything is OK, returns SP_TRUNCERROR if
    542 ///         there are not enough bytes, returns SP_OTHERERROR if reading failed.
    543 #define SPELL_READ_BYTES(buf, n, fd, exit_code) \
    544  do { \
    545    const size_t n__SPRB = (n); \
    546    FILE *const fd__SPRB = (fd); \
    547    char *const buf__SPRB = (buf); \
    548    const size_t read_bytes__SPRB = fread(buf__SPRB, 1, n__SPRB, fd__SPRB); \
    549    if (read_bytes__SPRB != n__SPRB) { \
    550      exit_code; \
    551      return feof(fd__SPRB) ? SP_TRUNCERROR : SP_OTHERERROR; \
    552    } \
    553  } while (0)
    554 
    555 /// Like #SPELL_READ_BYTES, but also error out if NUL byte was read
    556 ///
    557 /// @return Allows to proceed if everything is OK, returns SP_TRUNCERROR if
    558 ///         there are not enough bytes, returns SP_OTHERERROR if reading failed,
    559 ///         returns SP_FORMERROR if read out a NUL byte.
    560 #define SPELL_READ_NONNUL_BYTES(buf, n, fd, exit_code) \
    561  do { \
    562    const size_t n__SPRNB = (n); \
    563    FILE *const fd__SPRNB = (fd); \
    564    char *const buf__SPRNB = (buf); \
    565    SPELL_READ_BYTES(buf__SPRNB, n__SPRNB, fd__SPRNB, exit_code); \
    566    if (memchr(buf__SPRNB, NUL, (size_t)n__SPRNB)) { \
    567      exit_code; \
    568      return SP_FORMERROR; \
    569    } \
    570  } while (0)
    571 
    572 /// Check that spell file starts with a magic string
    573 ///
    574 /// Does not check for version of the file.
    575 ///
    576 /// @param  fd  File to check.
    577 ///
    578 /// @return 0 in case of success, SP_TRUNCERROR if file contains not enough
    579 ///         bytes, SP_FORMERROR if it does not match magic string and
    580 ///         SP_OTHERERROR if reading file failed.
    581 static inline int spell_check_magic_string(FILE *const fd)
    582  FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_ALWAYS_INLINE
    583 {
    584  char buf[VIMSPELLMAGICL];
    585  SPELL_READ_BYTES(buf, VIMSPELLMAGICL, fd,; );
    586  if (memcmp(buf, VIMSPELLMAGIC, VIMSPELLMAGICL) != 0) {
    587    return SP_FORMERROR;
    588  }
    589  return 0;
    590 }
    591 
    592 /// Load one spell file and store the info into a slang_T.
    593 ///
    594 /// This is invoked in three ways:
    595 /// - From spell_load_cb() to load a spell file for the first time.  "lang" is
    596 ///   the language name, "old_lp" is NULL.  Will allocate an slang_T.
    597 /// - To reload a spell file that was changed.  "lang" is NULL and "old_lp"
    598 ///   points to the existing slang_T.
    599 /// - Just after writing a .spl file; it's read back to produce the .sug file.
    600 ///   "old_lp" is NULL and "lang" is NULL.  Will allocate an slang_T.
    601 ///
    602 /// @param silent  no error if file doesn't exist
    603 ///
    604 /// @return  the slang_T the spell file was loaded into.  NULL for error.
    605 slang_T *spell_load_file(char *fname, char *lang, slang_T *old_lp, bool silent)
    606 {
    607  char *p;
    608  slang_T *lp = NULL;
    609  int res;
    610  bool did_estack_push = false;
    611  ESTACK_CHECK_DECLARATION;
    612 
    613  FILE *fd = os_fopen(fname, "r");
    614  if (fd == NULL) {
    615    if (!silent) {
    616      semsg(_(e_notopen), fname);
    617    } else if (p_verbose > 2) {
    618      verbose_enter();
    619      smsg(0, e_notopen, fname);
    620      verbose_leave();
    621    }
    622    goto endFAIL;
    623  }
    624  if (p_verbose > 2) {
    625    verbose_enter();
    626    smsg(0, _("Reading spell file \"%s\""), fname);
    627    verbose_leave();
    628  }
    629 
    630  if (old_lp == NULL) {
    631    lp = slang_alloc(lang);
    632 
    633    // Remember the file name, used to reload the file when it's updated.
    634    lp->sl_fname = xstrdup(fname);
    635 
    636    // Check for .add.spl.
    637    lp->sl_add = strstr(path_tail(fname), SPL_FNAME_ADD) != NULL;
    638  } else {
    639    lp = old_lp;
    640  }
    641 
    642  // Set sourcing_name, so that error messages mention the file name.
    643  estack_push(ETYPE_SPELL, fname, 0);
    644  ESTACK_CHECK_SETUP;
    645  did_estack_push = true;
    646 
    647  // <HEADER>: <fileID>
    648  const int scms_ret = spell_check_magic_string(fd);
    649  switch (scms_ret) {
    650  case SP_FORMERROR:
    651  case SP_TRUNCERROR:
    652    semsg("%s", _("E757: This does not look like a spell file"));
    653    goto endFAIL;
    654  case SP_OTHERERROR:
    655    semsg(_("E5042: Failed to read spell file %s: %s"),
    656          fname, strerror(ferror(fd)));
    657    goto endFAIL;
    658  case 0:
    659    break;
    660  }
    661  int c = getc(fd);                                         // <versionnr>
    662  if (c < VIMSPELLVERSION) {
    663    emsg(_("E771: Old spell file, needs to be updated"));
    664    goto endFAIL;
    665  } else if (c > VIMSPELLVERSION) {
    666    emsg(_("E772: Spell file is for newer version of Vim"));
    667    goto endFAIL;
    668  }
    669 
    670  // <SECTIONS>: <section> ... <sectionend>
    671  // <section>: <sectionID> <sectionflags> <sectionlen> (section contents)
    672  while (true) {
    673    int n = getc(fd);                           // <sectionID> or <sectionend>
    674    if (n == SN_END) {
    675      break;
    676    }
    677    c = getc(fd);                                       // <sectionflags>
    678    int len = get4c(fd);                                    // <sectionlen>
    679    if (len < 0) {
    680      goto truncerr;
    681    }
    682 
    683    res = 0;
    684    switch (n) {
    685    case SN_INFO:
    686      XFREE_CLEAR(lp->sl_info);
    687      lp->sl_info = read_string(fd, (size_t)len);  // <infotext>
    688      if (lp->sl_info == NULL) {
    689        goto endFAIL;
    690      }
    691      break;
    692 
    693    case SN_REGION:
    694      res = read_region_section(fd, lp, len);
    695      break;
    696 
    697    case SN_CHARFLAGS:
    698      res = read_charflags_section(fd);
    699      break;
    700 
    701    case SN_MIDWORD:
    702      lp->sl_midword = read_string(fd, (size_t)len);  // <midword>
    703      if (lp->sl_midword == NULL) {
    704        goto endFAIL;
    705      }
    706      break;
    707 
    708    case SN_PREFCOND:
    709      res = read_prefcond_section(fd, lp);
    710      break;
    711 
    712    case SN_REP:
    713      res = read_rep_section(fd, &lp->sl_rep, lp->sl_rep_first);
    714      break;
    715 
    716    case SN_REPSAL:
    717      res = read_rep_section(fd, &lp->sl_repsal, lp->sl_repsal_first);
    718      break;
    719 
    720    case SN_SAL:
    721      res = read_sal_section(fd, lp);
    722      break;
    723 
    724    case SN_SOFO:
    725      res = read_sofo_section(fd, lp);
    726      break;
    727 
    728    case SN_MAP:
    729      p = read_string(fd, (size_t)len);  // <mapstr>
    730      if (p == NULL) {
    731        goto endFAIL;
    732      }
    733      set_map_str(lp, p);
    734      xfree(p);
    735      break;
    736 
    737    case SN_WORDS:
    738      res = read_words_section(fd, lp, len);
    739      break;
    740 
    741    case SN_SUGFILE:
    742      lp->sl_sugtime = get8ctime(fd);                   // <timestamp>
    743      break;
    744 
    745    case SN_NOSPLITSUGS:
    746      lp->sl_nosplitsugs = true;
    747      break;
    748 
    749    case SN_NOCOMPOUNDSUGS:
    750      lp->sl_nocompoundsugs = true;
    751      break;
    752 
    753    case SN_COMPOUND:
    754      res = read_compound(fd, lp, len);
    755      break;
    756 
    757    case SN_NOBREAK:
    758      lp->sl_nobreak = true;
    759      break;
    760 
    761    case SN_SYLLABLE:
    762      lp->sl_syllable = read_string(fd, (size_t)len);  // <syllable>
    763      if (lp->sl_syllable == NULL) {
    764        goto endFAIL;
    765      }
    766      if (init_syl_tab(lp) != OK) {
    767        goto endFAIL;
    768      }
    769      break;
    770 
    771    default:
    772      // Unsupported section.  When it's required give an error
    773      // message.  When it's not required skip the contents.
    774      if (c & SNF_REQUIRED) {
    775        emsg(_("E770: Unsupported section in spell file"));
    776        goto endFAIL;
    777      }
    778      while (--len >= 0) {
    779        if (getc(fd) < 0) {
    780          goto truncerr;
    781        }
    782      }
    783      break;
    784    }
    785 someerror:
    786    if (res == SP_FORMERROR) {
    787      emsg(_(e_format));
    788      goto endFAIL;
    789    }
    790    if (res == SP_TRUNCERROR) {
    791 truncerr:
    792      emsg(_(e_spell_trunc));
    793      goto endFAIL;
    794    }
    795    if (res == SP_OTHERERROR) {
    796      goto endFAIL;
    797    }
    798  }
    799 
    800  // <LWORDTREE>
    801  res = spell_read_tree(fd, &lp->sl_fbyts, &lp->sl_fbyts_len,
    802                        &lp->sl_fidxs, false, 0);
    803  if (res != 0) {
    804    goto someerror;
    805  }
    806 
    807  // <KWORDTREE>
    808  res = spell_read_tree(fd, &lp->sl_kbyts, NULL, &lp->sl_kidxs, false, 0);
    809  if (res != 0) {
    810    goto someerror;
    811  }
    812 
    813  // <PREFIXTREE>
    814  res = spell_read_tree(fd, &lp->sl_pbyts, NULL, &lp->sl_pidxs, true,
    815                        lp->sl_prefixcnt);
    816  if (res != 0) {
    817    goto someerror;
    818  }
    819 
    820  // For a new file link it in the list of spell files.
    821  if (old_lp == NULL && lang != NULL) {
    822    lp->sl_next = first_lang;
    823    first_lang = lp;
    824  }
    825 
    826  goto endOK;
    827 
    828 endFAIL:
    829  if (lang != NULL) {
    830    // truncating the name signals the error to spell_load_lang()
    831    *lang = NUL;
    832  }
    833  if (lp != NULL && old_lp == NULL) {
    834    slang_free(lp);
    835  }
    836  lp = NULL;
    837 
    838 endOK:
    839  if (fd != NULL) {
    840    fclose(fd);
    841  }
    842  if (did_estack_push) {
    843    ESTACK_CHECK_NOW;
    844    estack_pop();
    845  }
    846 
    847  return lp;
    848 }
    849 
    850 // Fill in the wordcount fields for a trie.
    851 // Returns the total number of words.
    852 static void tree_count_words(const uint8_t *byts, idx_T *idxs)
    853 {
    854  idx_T arridx[MAXWLEN];
    855  int curi[MAXWLEN];
    856  int wordcount[MAXWLEN];
    857 
    858  arridx[0] = 0;
    859  curi[0] = 1;
    860  wordcount[0] = 0;
    861  int depth = 0;
    862  while (depth >= 0 && !got_int) {
    863    if (curi[depth] > byts[arridx[depth]]) {
    864      // Done all bytes at this node, go up one level.
    865      idxs[arridx[depth]] = wordcount[depth];
    866      if (depth > 0) {
    867        wordcount[depth - 1] += wordcount[depth];
    868      }
    869 
    870      depth--;
    871      fast_breakcheck();
    872    } else {
    873      // Do one more byte at this node.
    874      idx_T n = arridx[depth] + curi[depth];
    875      curi[depth]++;
    876 
    877      int c = byts[n];
    878      if (c == 0) {
    879        // End of word, count it.
    880        wordcount[depth]++;
    881 
    882        // Skip over any other NUL bytes (same word with different
    883        // flags).
    884        while (byts[n + 1] == 0) {
    885          n++;
    886          curi[depth]++;
    887        }
    888      } else {
    889        // Normal char, go one level deeper to count the words.
    890        depth++;
    891        arridx[depth] = idxs[n];
    892        curi[depth] = 1;
    893        wordcount[depth] = 0;
    894      }
    895    }
    896  }
    897 }
    898 
    899 /// Load the .sug files for languages that have one and weren't loaded yet.
    900 void suggest_load_files(void)
    901 {
    902  char buf[MAXWLEN];
    903  garray_T ga;
    904 
    905  // Do this for all languages that support sound folding.
    906  for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len; lpi++) {
    907    langp_T *lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
    908    slang_T *slang = lp->lp_slang;
    909    if (slang->sl_sugtime != 0 && !slang->sl_sugloaded) {
    910      // Change ".spl" to ".sug" and open the file.  When the file isn't
    911      // found silently skip it.  Do set "sl_sugloaded" so that we
    912      // don't try again and again.
    913      slang->sl_sugloaded = true;
    914 
    915      char *dotp = strrchr(slang->sl_fname, '.');
    916      if (dotp == NULL || path_fnamecmp(dotp, ".spl") != 0) {
    917        continue;
    918      }
    919      STRCPY(dotp, ".sug");
    920      FILE *fd = os_fopen(slang->sl_fname, "r");
    921      if (fd == NULL) {
    922        goto nextone;
    923      }
    924 
    925      // <SUGHEADER>: <fileID> <versionnr> <timestamp>
    926      for (int i = 0; i < VIMSUGMAGICL; i++) {
    927        buf[i] = (char)getc(fd);                              // <fileID>
    928      }
    929      if (strncmp(buf, VIMSUGMAGIC, VIMSUGMAGICL) != 0) {
    930        semsg(_("E778: This does not look like a .sug file: %s"),
    931              slang->sl_fname);
    932        goto nextone;
    933      }
    934      int c = getc(fd);                                     // <versionnr>
    935      if (c < VIMSUGVERSION) {
    936        semsg(_("E779: Old .sug file, needs to be updated: %s"),
    937              slang->sl_fname);
    938        goto nextone;
    939      } else if (c > VIMSUGVERSION) {
    940        semsg(_("E780: .sug file is for newer version of Vim: %s"),
    941              slang->sl_fname);
    942        goto nextone;
    943      }
    944 
    945      // Check the timestamp, it must be exactly the same as the one in
    946      // the .spl file.  Otherwise the word numbers won't match.
    947      time_t timestamp = get8ctime(fd);                        // <timestamp>
    948      if (timestamp != slang->sl_sugtime) {
    949        semsg(_("E781: .sug file doesn't match .spl file: %s"),
    950              slang->sl_fname);
    951        goto nextone;
    952      }
    953 
    954      // <SUGWORDTREE>: <wordtree>
    955      // Read the trie with the soundfolded words.
    956      if (spell_read_tree(fd, &slang->sl_sbyts, NULL, &slang->sl_sidxs,
    957                          false, 0) != 0) {
    958 someerror:
    959        semsg(_(e_error_while_reading_sug_file_str),
    960              slang->sl_fname);
    961        slang_clear_sug(slang);
    962        goto nextone;
    963      }
    964 
    965      // <SUGTABLE>: <sugwcount> <sugline> ...
    966      //
    967      // Read the table with word numbers.  We use a file buffer for
    968      // this, because it's so much like a file with lines.  Makes it
    969      // possible to swap the info and save on memory use.
    970      slang->sl_sugbuf = open_spellbuf();
    971 
    972      // <sugwcount>
    973      int wcount = get4c(fd);
    974      if (wcount < 0) {
    975        goto someerror;
    976      }
    977 
    978      // Read all the wordnr lists into the buffer, one NUL terminated
    979      // list per line.
    980      ga_init(&ga, 1, 100);
    981      for (int wordnr = 0; wordnr < wcount; wordnr++) {
    982        ga.ga_len = 0;
    983        while (true) {
    984          c = getc(fd);                                     // <sugline>
    985          if (c < 0) {
    986            goto someerror;
    987          }
    988          GA_APPEND(uint8_t, &ga, (uint8_t)c);
    989          if (c == NUL) {
    990            break;
    991          }
    992        }
    993        if (ml_append_buf(slang->sl_sugbuf, (linenr_T)wordnr,
    994                          ga.ga_data, ga.ga_len, true) == FAIL) {
    995          goto someerror;
    996        }
    997      }
    998      ga_clear(&ga);
    999 
   1000      // Need to put word counts in the word tries, so that we can find
   1001      // a word by its number.
   1002      tree_count_words(slang->sl_fbyts, slang->sl_fidxs);
   1003      tree_count_words(slang->sl_sbyts, slang->sl_sidxs);
   1004 
   1005 nextone:
   1006      if (fd != NULL) {
   1007        fclose(fd);
   1008      }
   1009      STRCPY(dotp, ".spl");
   1010    }
   1011  }
   1012 }
   1013 
   1014 // Read a length field from "fd" in "cnt_bytes" bytes.
   1015 // Allocate memory, read the string into it and add a NUL at the end.
   1016 // Returns NULL when the count is zero.
   1017 // Sets "*cntp" to SP_*ERROR when there is an error, length of the result
   1018 // otherwise.
   1019 static char *read_cnt_string(FILE *fd, int cnt_bytes, int *cntp)
   1020 {
   1021  int cnt = 0;
   1022 
   1023  // read the length bytes, MSB first
   1024  for (int i = 0; i < cnt_bytes; i++) {
   1025    const int c = getc(fd);
   1026 
   1027    if (c == EOF) {
   1028      *cntp = SP_TRUNCERROR;
   1029      return NULL;
   1030    }
   1031    cnt = (int)(((unsigned)cnt << 8) + (unsigned)c);
   1032  }
   1033  *cntp = cnt;
   1034  if (cnt == 0) {
   1035    return NULL;            // nothing to read, return NULL
   1036  }
   1037  char *str = read_string(fd, (size_t)cnt);
   1038  if (str == NULL) {
   1039    *cntp = SP_OTHERERROR;
   1040  }
   1041  return str;
   1042 }
   1043 
   1044 // Read SN_REGION: <regionname> ...
   1045 // Return SP_*ERROR flags.
   1046 static int read_region_section(FILE *fd, slang_T *lp, int len)
   1047 {
   1048  if (len > MAXREGIONS * 2) {
   1049    return SP_FORMERROR;
   1050  }
   1051  SPELL_READ_NONNUL_BYTES(lp->sl_regions, (size_t)len, fd,; );
   1052  lp->sl_regions[len] = NUL;
   1053  return 0;
   1054 }
   1055 
   1056 // Read SN_CHARFLAGS section: <charflagslen> <charflags>
   1057 //                              <folcharslen> <folchars>
   1058 // Return SP_*ERROR flags.
   1059 static int read_charflags_section(FILE *fd)
   1060 {
   1061  int flagslen, follen;
   1062 
   1063  // <charflagslen> <charflags>
   1064  char *flags = read_cnt_string(fd, 1, &flagslen);
   1065  if (flagslen < 0) {
   1066    return flagslen;
   1067  }
   1068 
   1069  // <folcharslen> <folchars>
   1070  char *fol = read_cnt_string(fd, 2, &follen);
   1071  if (follen < 0) {
   1072    xfree(flags);
   1073    return follen;
   1074  }
   1075 
   1076  // Set the word-char flags and fill SPELL_ISUPPER() table.
   1077  if (flags != NULL && fol != NULL) {
   1078    set_spell_charflags(flags, flagslen, fol);
   1079  }
   1080 
   1081  xfree(flags);
   1082  xfree(fol);
   1083 
   1084  // When <charflagslen> is zero then <fcharlen> must also be zero.
   1085  if ((flags == NULL) != (fol == NULL)) {
   1086    return SP_FORMERROR;
   1087  }
   1088  return 0;
   1089 }
   1090 
   1091 // Read SN_PREFCOND section.
   1092 // Return SP_*ERROR flags.
   1093 static int read_prefcond_section(FILE *fd, slang_T *lp)
   1094 {
   1095  // <prefcondcnt> <prefcond> ...
   1096  const int cnt = get2c(fd);  // <prefcondcnt>
   1097  if (cnt <= 0) {
   1098    return SP_FORMERROR;
   1099  }
   1100 
   1101  lp->sl_prefprog = xcalloc((size_t)cnt, sizeof(regprog_T *));
   1102  lp->sl_prefixcnt = cnt;
   1103 
   1104  for (int i = 0; i < cnt; i++) {
   1105    // <prefcond> : <condlen> <condstr>
   1106    const int n = getc(fd);  // <condlen>
   1107    if (n < 0 || n >= MAXWLEN) {
   1108      return SP_FORMERROR;
   1109    }
   1110 
   1111    // When <condlen> is zero we have an empty condition.  Otherwise
   1112    // compile the regexp program used to check for the condition.
   1113    if (n > 0) {
   1114      char buf[MAXWLEN + 1];
   1115      buf[0] = '^';  // always match at one position only
   1116      SPELL_READ_NONNUL_BYTES(buf + 1, (size_t)n, fd,; );
   1117      buf[n + 1] = NUL;
   1118      lp->sl_prefprog[i] = vim_regcomp(buf, RE_MAGIC | RE_STRING);
   1119    }
   1120  }
   1121  return 0;
   1122 }
   1123 
   1124 // Read REP or REPSAL items section from "fd": <repcount> <rep> ...
   1125 // Return SP_*ERROR flags.
   1126 static int read_rep_section(FILE *fd, garray_T *gap, int16_t *first)
   1127 {
   1128  fromto_T *ftp;
   1129 
   1130  int cnt = get2c(fd);                                      // <repcount>
   1131  if (cnt < 0) {
   1132    return SP_TRUNCERROR;
   1133  }
   1134 
   1135  ga_grow(gap, cnt);
   1136 
   1137  // <rep> : <repfromlen> <repfrom> <reptolen> <repto>
   1138  for (; gap->ga_len < cnt; gap->ga_len++) {
   1139    int c;
   1140    ftp = &((fromto_T *)gap->ga_data)[gap->ga_len];
   1141    ftp->ft_from = read_cnt_string(fd, 1, &c);
   1142    if (c < 0) {
   1143      return c;
   1144    }
   1145    if (c == 0) {
   1146      return SP_FORMERROR;
   1147    }
   1148    ftp->ft_to = read_cnt_string(fd, 1, &c);
   1149    if (c <= 0) {
   1150      xfree(ftp->ft_from);
   1151      if (c < 0) {
   1152        return c;
   1153      }
   1154      return SP_FORMERROR;
   1155    }
   1156  }
   1157 
   1158  // Fill the first-index table.
   1159  for (int i = 0; i < 256; i++) {
   1160    first[i] = -1;
   1161  }
   1162  for (int i = 0; i < gap->ga_len; i++) {
   1163    ftp = &((fromto_T *)gap->ga_data)[i];
   1164    if (first[(uint8_t)(*ftp->ft_from)] == -1) {
   1165      first[(uint8_t)(*ftp->ft_from)] = (int16_t)i;
   1166    }
   1167  }
   1168  return 0;
   1169 }
   1170 
   1171 // Read SN_SAL section: <salflags> <salcount> <sal> ...
   1172 // Return SP_*ERROR flags.
   1173 static int read_sal_section(FILE *fd, slang_T *slang)
   1174 {
   1175  slang->sl_sofo = false;
   1176 
   1177  const int flags = getc(fd);                   // <salflags>
   1178  if (flags & SAL_F0LLOWUP) {
   1179    slang->sl_followup = true;
   1180  }
   1181  if (flags & SAL_COLLAPSE) {
   1182    slang->sl_collapse = true;
   1183  }
   1184  if (flags & SAL_REM_ACCENTS) {
   1185    slang->sl_rem_accents = true;
   1186  }
   1187 
   1188  int cnt = get2c(fd);                              // <salcount>
   1189  if (cnt < 0) {
   1190    return SP_TRUNCERROR;
   1191  }
   1192 
   1193  garray_T *gap = &slang->sl_sal;
   1194  ga_init(gap, sizeof(salitem_T), 10);
   1195  ga_grow(gap, cnt + 1);
   1196 
   1197  // <sal> : <salfromlen> <salfrom> <saltolen> <salto>
   1198  for (; gap->ga_len < cnt; gap->ga_len++) {
   1199    int c = NUL;
   1200 
   1201    salitem_T *smp = &((salitem_T *)gap->ga_data)[gap->ga_len];
   1202    int ccnt = getc(fd);                            // <salfromlen>
   1203    if (ccnt < 0) {
   1204      return SP_TRUNCERROR;
   1205    }
   1206    char *p = xmalloc((size_t)ccnt + 2);
   1207    smp->sm_lead = p;
   1208 
   1209    // Read up to the first special char into sm_lead.
   1210    int i = 0;
   1211    for (; i < ccnt; i++) {
   1212      c = getc(fd);                             // <salfrom>
   1213      if (vim_strchr("0123456789(-<^$", c) != NULL) {
   1214        break;
   1215      }
   1216      *p++ = (char)(uint8_t)c;
   1217    }
   1218    smp->sm_leadlen = (int)(p - smp->sm_lead);
   1219    *p++ = NUL;
   1220 
   1221    // Put (abc) chars in sm_oneof, if any.
   1222    if (c == '(') {
   1223      smp->sm_oneof = p;
   1224      for (++i; i < ccnt; i++) {
   1225        c = getc(fd);                           // <salfrom>
   1226        if (c == ')') {
   1227          break;
   1228        }
   1229        *p++ = (char)(uint8_t)c;
   1230      }
   1231      *p++ = NUL;
   1232      if (++i < ccnt) {
   1233        c = getc(fd);
   1234      }
   1235    } else {
   1236      smp->sm_oneof = NULL;
   1237    }
   1238 
   1239    // Any following chars go in sm_rules.
   1240    smp->sm_rules = p;
   1241    if (i < ccnt) {
   1242      // store the char we got while checking for end of sm_lead
   1243      *p++ = (char)(uint8_t)c;
   1244    }
   1245    i++;
   1246    if (i < ccnt) {
   1247      SPELL_READ_NONNUL_BYTES(                  // <salfrom>
   1248                                                p, (size_t)(ccnt - i), fd,
   1249                                                xfree(smp->sm_lead));
   1250      p += (ccnt - i);
   1251    }
   1252    *p++ = NUL;
   1253 
   1254    // <saltolen> <salto>
   1255    smp->sm_to = read_cnt_string(fd, 1, &ccnt);
   1256    if (ccnt < 0) {
   1257      xfree(smp->sm_lead);
   1258      return ccnt;
   1259    }
   1260 
   1261    // convert the multi-byte strings to wide char strings
   1262    smp->sm_lead_w = mb_str2wide(smp->sm_lead);
   1263    smp->sm_leadlen = mb_charlen(smp->sm_lead);
   1264    if (smp->sm_oneof == NULL) {
   1265      smp->sm_oneof_w = NULL;
   1266    } else {
   1267      smp->sm_oneof_w = mb_str2wide(smp->sm_oneof);
   1268    }
   1269    if (smp->sm_to == NULL) {
   1270      smp->sm_to_w = NULL;
   1271    } else {
   1272      smp->sm_to_w = mb_str2wide(smp->sm_to);
   1273    }
   1274  }
   1275 
   1276  if (!GA_EMPTY(gap)) {
   1277    // Add one extra entry to mark the end with an empty sm_lead.  Avoids
   1278    // that we need to check the index every time.
   1279    salitem_T *smp = &((salitem_T *)gap->ga_data)[gap->ga_len];
   1280    char *p = xmalloc(1);
   1281    p[0] = NUL;
   1282    smp->sm_lead = p;
   1283    smp->sm_lead_w = mb_str2wide(smp->sm_lead);
   1284    smp->sm_leadlen = 0;
   1285    smp->sm_oneof = NULL;
   1286    smp->sm_oneof_w = NULL;
   1287    smp->sm_rules = p;
   1288    smp->sm_to = NULL;
   1289    smp->sm_to_w = NULL;
   1290    gap->ga_len++;
   1291  }
   1292 
   1293  // Fill the first-index table.
   1294  set_sal_first(slang);
   1295 
   1296  return 0;
   1297 }
   1298 
   1299 // Read SN_WORDS: <word> ...
   1300 // Return SP_*ERROR flags.
   1301 static int read_words_section(FILE *fd, slang_T *lp, int len)
   1302 {
   1303  int done = 0;
   1304  int i;
   1305  uint8_t word[MAXWLEN];
   1306 
   1307  while (done < len) {
   1308    // Read one word at a time.
   1309    for (i = 0;; i++) {
   1310      int c = getc(fd);
   1311      if (c == EOF) {
   1312        return SP_TRUNCERROR;
   1313      }
   1314      word[i] = (uint8_t)c;
   1315      if (word[i] == NUL) {
   1316        break;
   1317      }
   1318      if (i == MAXWLEN - 1) {
   1319        return SP_FORMERROR;
   1320      }
   1321    }
   1322 
   1323    // Init the count to 10.
   1324    count_common_word(lp, (char *)word, -1, 10);
   1325    done += i + 1;
   1326  }
   1327  return 0;
   1328 }
   1329 
   1330 // SN_SOFO: <sofofromlen> <sofofrom> <sofotolen> <sofoto>
   1331 // Return SP_*ERROR flags.
   1332 static int read_sofo_section(FILE *fd, slang_T *slang)
   1333 {
   1334  int cnt;
   1335  int res;
   1336 
   1337  slang->sl_sofo = true;
   1338 
   1339  // <sofofromlen> <sofofrom>
   1340  char *from = read_cnt_string(fd, 2, &cnt);
   1341  if (cnt < 0) {
   1342    return cnt;
   1343  }
   1344 
   1345  // <sofotolen> <sofoto>
   1346  char *to = read_cnt_string(fd, 2, &cnt);
   1347  if (cnt < 0) {
   1348    xfree(from);
   1349    return cnt;
   1350  }
   1351 
   1352  // Store the info in slang->sl_sal and/or slang->sl_sal_first.
   1353  if (from != NULL && to != NULL) {
   1354    res = set_sofo(slang, from, to);
   1355  } else if (from != NULL || to != NULL) {
   1356    res = SP_FORMERROR;        // only one of two strings is an error
   1357  } else {
   1358    res = 0;
   1359  }
   1360 
   1361  xfree(from);
   1362  xfree(to);
   1363  return res;
   1364 }
   1365 
   1366 // Read the compound section from the .spl file:
   1367 //      <compmax> <compminlen> <compsylmax> <compoptions> <compflags>
   1368 // Returns SP_*ERROR flags.
   1369 static int read_compound(FILE *fd, slang_T *slang, int len)
   1370 {
   1371  int todo = len;
   1372  int cnt;
   1373 
   1374  if (todo < 2) {
   1375    return SP_FORMERROR;        // need at least two bytes
   1376  }
   1377  todo--;
   1378  int c = getc(fd);                                         // <compmax>
   1379  if (c < 2) {
   1380    c = MAXWLEN;
   1381  }
   1382  slang->sl_compmax = c;
   1383 
   1384  todo--;
   1385  c = getc(fd);                                         // <compminlen>
   1386  if (c < 1) {
   1387    c = 0;
   1388  }
   1389  slang->sl_compminlen = c;
   1390 
   1391  todo--;
   1392  c = getc(fd);                                         // <compsylmax>
   1393  if (c < 1) {
   1394    c = MAXWLEN;
   1395  }
   1396  slang->sl_compsylmax = c;
   1397 
   1398  c = getc(fd);                                         // <compoptions>
   1399  if (c != 0) {
   1400    ungetc(c, fd);          // be backwards compatible with Vim 7.0b
   1401  } else {
   1402    todo--;
   1403    c = getc(fd);           // only use the lower byte for now
   1404    todo--;
   1405    slang->sl_compoptions = c;
   1406 
   1407    garray_T *gap = &slang->sl_comppat;
   1408    c = get2c(fd);                                      // <comppatcount>
   1409    if (c < 0) {
   1410      return SP_TRUNCERROR;
   1411    }
   1412    todo -= 2;
   1413    ga_init(gap, sizeof(char *), c);
   1414    ga_grow(gap, c);
   1415    while (--c >= 0) {
   1416      ((char **)(gap->ga_data))[gap->ga_len++] = read_cnt_string(fd, 1, &cnt);
   1417      // <comppatlen> <comppattext>
   1418      if (cnt < 0) {
   1419        return cnt;
   1420      }
   1421      todo -= cnt + 1;
   1422    }
   1423  }
   1424  if (todo < 0) {
   1425    return SP_FORMERROR;
   1426  }
   1427 
   1428  // Turn the COMPOUNDRULE items into a regexp pattern:
   1429  // "a[bc]/a*b+" -> "^\(a[bc]\|a*b\+\)$".
   1430  // Inserting backslashes may double the length, "^\(\)$<Nul>" is 7 bytes.
   1431  // Conversion to utf-8 may double the size.
   1432  c = todo * 2 + 7;
   1433  c += todo * 2;
   1434  char *pat = xmalloc((size_t)c);
   1435 
   1436  // We also need a list of all flags that can appear at the start and one
   1437  // for all flags.
   1438  uint8_t *cp = xmalloc((size_t)todo + 1);
   1439  slang->sl_compstartflags = cp;
   1440  *cp = NUL;
   1441 
   1442  uint8_t *ap = xmalloc((size_t)todo + 1);
   1443  slang->sl_compallflags = ap;
   1444  *ap = NUL;
   1445 
   1446  // And a list of all patterns in their original form, for checking whether
   1447  // compounding may work in match_compoundrule().  This is freed when we
   1448  // encounter a wildcard, the check doesn't work then.
   1449  uint8_t *crp = xmalloc((size_t)todo + 1);
   1450  slang->sl_comprules = crp;
   1451 
   1452  char *pp = pat;
   1453  *pp++ = '^';
   1454  *pp++ = '\\';
   1455  *pp++ = '(';
   1456 
   1457  int atstart = 1;
   1458  while (todo-- > 0) {
   1459    c = getc(fd);                                       // <compflags>
   1460    if (c == EOF) {
   1461      xfree(pat);
   1462      return SP_TRUNCERROR;
   1463    }
   1464 
   1465    // Add all flags to "sl_compallflags".
   1466    if (vim_strchr("?*+[]/", c) == NULL
   1467        && !byte_in_str(slang->sl_compallflags, c)) {
   1468      *ap++ = (uint8_t)c;
   1469      *ap = NUL;
   1470    }
   1471 
   1472    if (atstart != 0) {
   1473      // At start of item: copy flags to "sl_compstartflags".  For a
   1474      // [abc] item set "atstart" to 2 and copy up to the ']'.
   1475      if (c == '[') {
   1476        atstart = 2;
   1477      } else if (c == ']') {
   1478        atstart = 0;
   1479      } else {
   1480        if (!byte_in_str(slang->sl_compstartflags, c)) {
   1481          *cp++ = (uint8_t)c;
   1482          *cp = NUL;
   1483        }
   1484        if (atstart == 1) {
   1485          atstart = 0;
   1486        }
   1487      }
   1488    }
   1489 
   1490    // Copy flag to "sl_comprules", unless we run into a wildcard.
   1491    if (crp != NULL) {
   1492      if (c == '?' || c == '+' || c == '*') {
   1493        XFREE_CLEAR(slang->sl_comprules);
   1494        crp = NULL;
   1495      } else {
   1496        *crp++ = (uint8_t)c;
   1497      }
   1498    }
   1499 
   1500    if (c == '/') {         // slash separates two items
   1501      *pp++ = '\\';
   1502      *pp++ = '|';
   1503      atstart = 1;
   1504    } else {              // normal char, "[abc]" and '*' are copied as-is
   1505      if (c == '?' || c == '+' || c == '~') {
   1506        *pp++ = '\\';               // "a?" becomes "a\?", "a+" becomes "a\+"
   1507      }
   1508      pp += utf_char2bytes(c, pp);
   1509    }
   1510  }
   1511 
   1512  *pp++ = '\\';
   1513  *pp++ = ')';
   1514  *pp++ = '$';
   1515  *pp = NUL;
   1516 
   1517  if (crp != NULL) {
   1518    *crp = NUL;
   1519  }
   1520 
   1521  slang->sl_compprog = vim_regcomp(pat, RE_MAGIC + RE_STRING + RE_STRICT);
   1522  xfree(pat);
   1523  if (slang->sl_compprog == NULL) {
   1524    return SP_FORMERROR;
   1525  }
   1526 
   1527  return 0;
   1528 }
   1529 
   1530 // Set the SOFOFROM and SOFOTO items in language "lp".
   1531 // Returns SP_*ERROR flags when there is something wrong.
   1532 static int set_sofo(slang_T *lp, const char *from, const char *to)
   1533 {
   1534  const char *s;
   1535  const char *p;
   1536 
   1537  // Use "sl_sal" as an array with 256 pointers to a list of wide
   1538  // characters.  The index is the low byte of the character.
   1539  // The list contains from-to pairs with a terminating NUL.
   1540  // sl_sal_first[] is used for latin1 "from" characters.
   1541  garray_T *gap = &lp->sl_sal;
   1542  ga_init(gap, sizeof(int *), 1);
   1543  ga_grow(gap, 256);
   1544  memset(gap->ga_data, 0, sizeof(int *) * 256);
   1545  gap->ga_len = 256;
   1546 
   1547  // First count the number of items for each list.  Temporarily use
   1548  // sl_sal_first[] for this.
   1549  for (p = from, s = to; *p != NUL && *s != NUL;) {
   1550    const int c = mb_cptr2char_adv(&p);
   1551    s += utf_ptr2len(s);
   1552    if (c >= 256) {
   1553      lp->sl_sal_first[c & 0xff]++;
   1554    }
   1555  }
   1556  if (*p != NUL || *s != NUL) {  // lengths differ
   1557    return SP_FORMERROR;
   1558  }
   1559 
   1560  // Allocate the lists.
   1561  for (int i = 0; i < 256; i++) {
   1562    if (lp->sl_sal_first[i] > 0) {
   1563      p = xmalloc(sizeof(int) * (size_t)(lp->sl_sal_first[i] * 2 + 1));
   1564      ((int **)gap->ga_data)[i] = (int *)p;
   1565      *(int *)p = 0;
   1566    }
   1567  }
   1568 
   1569  // Put the characters up to 255 in sl_sal_first[] the rest in a sl_sal
   1570  // list.
   1571  memset(lp->sl_sal_first, 0, sizeof(salfirst_T) * 256);
   1572  for (p = from, s = to; *p != NUL && *s != NUL;) {
   1573    const int c = mb_cptr2char_adv(&p);
   1574    const int i = mb_cptr2char_adv(&s);
   1575    if (c >= 256) {
   1576      // Append the from-to chars at the end of the list with
   1577      // the low byte.
   1578      int *inp = ((int **)gap->ga_data)[c & 0xff];
   1579      while (*inp != 0) {
   1580        inp++;
   1581      }
   1582      *inp++ = c;                     // from char
   1583      *inp++ = i;                     // to char
   1584      *inp++ = NUL;                   // NUL at the end
   1585    } else {
   1586      // mapping byte to char is done in sl_sal_first[]
   1587      lp->sl_sal_first[c] = i;
   1588    }
   1589  }
   1590 
   1591  return 0;
   1592 }
   1593 
   1594 // Fill the first-index table for "lp".
   1595 static void set_sal_first(slang_T *lp)
   1596 {
   1597  garray_T *gap = &lp->sl_sal;
   1598 
   1599  salfirst_T *sfirst = lp->sl_sal_first;
   1600  for (int i = 0; i < 256; i++) {
   1601    sfirst[i] = -1;
   1602  }
   1603  salitem_T *smp = (salitem_T *)gap->ga_data;
   1604  for (int i = 0; i < gap->ga_len; i++) {
   1605    // Use the lowest byte of the first character.  For latin1 it's
   1606    // the character, for other encodings it should differ for most
   1607    // characters.
   1608    int c = *smp[i].sm_lead_w & 0xff;
   1609    if (sfirst[c] == -1) {
   1610      sfirst[c] = i;
   1611 
   1612      // Make sure all entries with this byte are following each
   1613      // other.  Move the ones that are in the wrong position.  Do
   1614      // keep the same ordering!
   1615      while (i + 1 < gap->ga_len
   1616             && (*smp[i + 1].sm_lead_w & 0xff) == c) {
   1617        // Skip over entry with same index byte.
   1618        i++;
   1619      }
   1620 
   1621      for (int n = 1; i + n < gap->ga_len; n++) {
   1622        if ((*smp[i + n].sm_lead_w & 0xff) == c) {
   1623          salitem_T tsal;
   1624 
   1625          // Move entry with same index byte after the entries
   1626          // we already found.
   1627          i++;
   1628          n--;
   1629          tsal = smp[i + n];
   1630          memmove(smp + i + 1, smp + i, sizeof(salitem_T) * (size_t)n);
   1631          smp[i] = tsal;
   1632        }
   1633      }
   1634    }
   1635  }
   1636 }
   1637 
   1638 // Turn a multi-byte string into a wide character string.
   1639 // Return it in allocated memory.
   1640 static int *mb_str2wide(const char *s)
   1641 {
   1642  int i = 0;
   1643 
   1644  int *res = xmalloc(((size_t)mb_charlen(s) + 1) * sizeof(int));
   1645  for (const char *p = s; *p != NUL;) {
   1646    res[i++] = mb_ptr2char_adv(&p);
   1647  }
   1648  res[i] = NUL;
   1649 
   1650  return res;
   1651 }
   1652 
   1653 /// Reads a tree from the .spl or .sug file.
   1654 /// Allocates the memory and stores pointers in "bytsp" and "idxsp".
   1655 /// This is skipped when the tree has zero length.
   1656 ///
   1657 /// @param prefixtree  true for the prefix tree
   1658 /// @param prefixcnt  when "prefixtree" is true: prefix count
   1659 ///
   1660 /// @return  zero when OK, SP_ value for an error.
   1661 static int spell_read_tree(FILE *fd, uint8_t **bytsp, int *bytsp_len, idx_T **idxsp,
   1662                           bool prefixtree, int prefixcnt)
   1663  FUNC_ATTR_NONNULL_ARG(1, 2, 4)
   1664 {
   1665  // The tree size was computed when writing the file, so that we can
   1666  // allocate it as one long block. <nodecount>
   1667  int len = get4c(fd);
   1668  if (len < 0) {
   1669    return SP_TRUNCERROR;
   1670  }
   1671  if ((size_t)len >= SIZE_MAX / sizeof(int)) {
   1672    // Invalid length, multiply with sizeof(int) would overflow.
   1673    return SP_FORMERROR;
   1674  }
   1675  if (len <= 0) {
   1676    return 0;
   1677  }
   1678 
   1679  // Allocate the byte array.
   1680  uint8_t *bp = xmalloc((size_t)len);
   1681  *bytsp = bp;
   1682  if (bytsp_len != NULL) {
   1683    *bytsp_len = len;
   1684  }
   1685 
   1686  // Allocate the index array.
   1687  idx_T *ip = xcalloc((size_t)len, sizeof(*ip));
   1688  *idxsp = ip;
   1689 
   1690  // Recursively read the tree and store it in the array.
   1691  int idx = read_tree_node(fd, bp, ip, len, 0, prefixtree, prefixcnt);
   1692  if (idx < 0) {
   1693    return idx;
   1694  }
   1695  return 0;
   1696 }
   1697 
   1698 /// Read one row of siblings from the spell file and store it in the byte array
   1699 /// "byts" and index array "idxs".  Recursively read the children.
   1700 ///
   1701 /// NOTE: The code here must match put_node()!
   1702 ///
   1703 /// Returns the index (>= 0) following the siblings.
   1704 /// Returns SP_TRUNCERROR if the file is shorter than expected.
   1705 /// Returns SP_FORMERROR if there is a format error.
   1706 ///
   1707 /// @param maxidx  size of arrays
   1708 /// @param startidx  current index in "byts" and "idxs"
   1709 /// @param prefixtree  true for reading PREFIXTREE
   1710 /// @param maxprefcondnr  maximum for <prefcondnr>
   1711 static idx_T read_tree_node(FILE *fd, uint8_t *byts, idx_T *idxs, int maxidx, idx_T startidx,
   1712                            bool prefixtree, int maxprefcondnr)
   1713 {
   1714  idx_T idx = startidx;
   1715 #define SHARED_MASK     0x8000000
   1716 
   1717  int len = getc(fd);                                       // <siblingcount>
   1718  if (len <= 0) {
   1719    return SP_TRUNCERROR;
   1720  }
   1721 
   1722  if (startidx + len >= maxidx) {
   1723    return SP_FORMERROR;
   1724  }
   1725  byts[idx++] = (uint8_t)len;
   1726 
   1727  // Read the byte values, flag/region bytes and shared indexes.
   1728  for (int i = 1; i <= len; i++) {
   1729    int c = getc(fd);                                       // <byte>
   1730    if (c < 0) {
   1731      return SP_TRUNCERROR;
   1732    }
   1733    if (c <= BY_SPECIAL) {
   1734      if (c == BY_NOFLAGS && !prefixtree) {
   1735        // No flags, all regions.
   1736        idxs[idx] = 0;
   1737      } else if (c != BY_INDEX) {
   1738        if (prefixtree) {
   1739          // Read the optional pflags byte, the prefix ID and the
   1740          // condition nr.  In idxs[] store the prefix ID in the low
   1741          // byte, the condition index shifted up 8 bits, the flags
   1742          // shifted up 24 bits.
   1743          if (c == BY_FLAGS) {
   1744            c = getc(fd) << 24;                         // <pflags>
   1745          } else {
   1746            c = 0;
   1747          }
   1748 
   1749          c |= getc(fd);                                // <affixID>
   1750 
   1751          int n = get2c(fd);                                // <prefcondnr>
   1752          if (n >= maxprefcondnr) {
   1753            return SP_FORMERROR;
   1754          }
   1755          c |= (n << 8);
   1756        } else {    // c must be BY_FLAGS or BY_FLAGS2
   1757                    // Read flags and optional region and prefix ID.  In
   1758                    // idxs[] the flags go in the low two bytes, region above
   1759                    // that and prefix ID above the region.
   1760          int c2 = c;
   1761          c = getc(fd);                                 // <flags>
   1762          if (c2 == BY_FLAGS2) {
   1763            c = (getc(fd) << 8) + c;                    // <flags2>
   1764          }
   1765          if (c & WF_REGION) {
   1766            c = (getc(fd) << 16) + c;                   // <region>
   1767          }
   1768          if (c & WF_AFX) {
   1769            c = (getc(fd) << 24) + c;                   // <affixID>
   1770          }
   1771        }
   1772 
   1773        idxs[idx] = c;
   1774        c = 0;
   1775      } else {  // c == BY_INDEX
   1776        // <nodeidx>
   1777        int n = get3c(fd);
   1778        if (n < 0 || n >= maxidx) {
   1779          return SP_FORMERROR;
   1780        }
   1781        idxs[idx] = n + SHARED_MASK;
   1782        c = getc(fd);                                   // <xbyte>
   1783      }
   1784    }
   1785    byts[idx++] = (uint8_t)c;
   1786  }
   1787 
   1788  // Recursively read the children for non-shared siblings.
   1789  // Skip the end-of-word ones (zero byte value) and the shared ones (and
   1790  // remove SHARED_MASK)
   1791  for (int i = 1; i <= len; i++) {
   1792    if (byts[startidx + i] != 0) {
   1793      if (idxs[startidx + i] & SHARED_MASK) {
   1794        idxs[startidx + i] &= ~SHARED_MASK;
   1795      } else {
   1796        idxs[startidx + i] = idx;
   1797        idx = read_tree_node(fd, byts, idxs, maxidx, idx, prefixtree, maxprefcondnr);
   1798        if (idx < 0) {
   1799          break;
   1800        }
   1801      }
   1802    }
   1803  }
   1804 
   1805  return idx;
   1806 }
   1807 
   1808 /// Reload the spell file "fname" if it's loaded.
   1809 ///
   1810 /// @param added_word  invoked through "zg"
   1811 static void spell_reload_one(char *fname, bool added_word)
   1812 {
   1813  bool didit = false;
   1814 
   1815  for (slang_T *slang = first_lang; slang != NULL; slang = slang->sl_next) {
   1816    if (path_full_compare(fname, slang->sl_fname, false, true) == kEqualFiles) {
   1817      slang_clear(slang);
   1818      if (spell_load_file(fname, NULL, slang, false) == NULL) {
   1819        // reloading failed, clear the language
   1820        slang_clear(slang);
   1821      }
   1822      redraw_all_later(UPD_SOME_VALID);
   1823      didit = true;
   1824    }
   1825  }
   1826 
   1827  // When "zg" was used and the file wasn't loaded yet, should redo
   1828  // 'spelllang' to load it now.
   1829  if (added_word && !didit) {
   1830    parse_spelllang(curwin);
   1831  }
   1832 }
   1833 
// Functions for ":mkspell".

// In the postponed prefixes tree wn_flags is used to store the WFP_ flags,
// but it must be negative to indicate the prefix tree to tree_add_word().
// Use a negative number with the lower 8 bits zero.
#define PFX_FLAGS       (-256)

// flags for "condit" argument of store_aff_word()
// NOTE(review): the values are distinct powers of two, so they look like bits
// that may be combined -- confirm at the call sites.
#define CONDIT_COMB     1       // affix must combine
#define CONDIT_CFIX     2       // affix must have CIRCUMFIX flag
#define CONDIT_SUF      4       // add a suffix for matching flags
#define CONDIT_AFF      8       // word already has an affix

// Tunable parameters for when the tree is compressed.  Filled from the
// 'mkspellmem' option.
// The initializers are the values in effect until spell_check_msm() accepts
// an option value; it then overwrites all three.
static int compress_start = 30000;     // memory / SBLOCKSIZE
static int compress_inc = 100;         // memory / SBLOCKSIZE
static int compress_added = 500000;    // word count
   1852 
   1853 // Check the 'mkspellmem' option.  Return FAIL if it's wrong.
   1854 // Sets "sps_flags".
   1855 int spell_check_msm(void)
   1856 {
   1857  char *p = p_msm;
   1858 
   1859  if (!ascii_isdigit(*p)) {
   1860    return FAIL;
   1861  }
   1862  // block count = (value * 1024) / SBLOCKSIZE (but avoid overflow)
   1863  int start = (getdigits_int(&p, true, 0) * 10) / (SBLOCKSIZE / 102);
   1864  if (*p != ',') {
   1865    return FAIL;
   1866  }
   1867  p++;
   1868  if (!ascii_isdigit(*p)) {
   1869    return FAIL;
   1870  }
   1871  int incr = (getdigits_int(&p, true, 0) * 102) / (SBLOCKSIZE / 10);
   1872  if (*p != ',') {
   1873    return FAIL;
   1874  }
   1875  p++;
   1876  if (!ascii_isdigit(*p)) {
   1877    return FAIL;
   1878  }
   1879  int added = getdigits_int(&p, true, 0) * 1024;
   1880  if (*p != NUL) {
   1881    return FAIL;
   1882  }
   1883 
   1884  if (start == 0 || incr == 0 || added == 0 || incr > start) {
   1885    return FAIL;
   1886  }
   1887 
   1888  compress_start = start;
   1889  compress_inc = incr;
   1890  compress_added = added;
   1891  return OK;
   1892 }
   1893 
   1894 #ifdef SPELL_PRINTTREE
// For debugging the tree code: print the current tree in a (more or less)
// readable format, so that we can see what happens when adding a word and/or
// compressing the tree.
// Based on code from Olaf Seibert.
# define PRINTLINESIZE   1000
# define PRINTWIDTH      6

// Format "a1" and "a2" with "fmt" into line buffer "l", starting at the
// column for "depth" (PRINTWIDTH bytes per level) and limited to what is
// left of PRINTLINESIZE after that offset.
# define PRINTSOME(l, depth, fmt, a1, a2) vim_snprintf(l + depth * PRINTWIDTH, \
                                                      PRINTLINESIZE - PRINTWIDTH * depth, fmt, a1, \
                                                      a2)

// The three output lines that spell_print_node() fills and passes to msg().
static char line1[PRINTLINESIZE];
static char line2[PRINTLINESIZE];
static char line3[PRINTLINESIZE];
   1909 
   1910 static void spell_clear_flags(wordnode_T *node)
   1911 {
   1912  wordnode_T *np;
   1913 
   1914  for (np = node; np != NULL; np = np->wn_sibling) {
   1915    np->wn_u1.index = false;
   1916    spell_clear_flags(np->wn_child);
   1917  }
   1918 }
   1919 
   1920 static void spell_print_node(wordnode_T *node, int depth)
   1921 {
   1922  if (node->wn_u1.index) {
   1923    // Done this node before, print the reference.
   1924    PRINTSOME(line1, depth, "(%d)", node->wn_nr, 0);
   1925    PRINTSOME(line2, depth, "    ", 0, 0);
   1926    PRINTSOME(line3, depth, "    ", 0, 0);
   1927    msg(line1, 0);
   1928    msg(line2, 0);
   1929    msg(line3, 0);
   1930  } else {
   1931    node->wn_u1.index = true;
   1932 
   1933    if (node->wn_byte != NUL) {
   1934      if (node->wn_child != NULL) {
   1935        PRINTSOME(line1, depth, " %c -> ", node->wn_byte, 0);
   1936      } else {
   1937        // Cannot happen?
   1938        PRINTSOME(line1, depth, " %c ???", node->wn_byte, 0);
   1939      }
   1940    } else {
   1941      PRINTSOME(line1, depth, " $    ", 0, 0);
   1942    }
   1943 
   1944    PRINTSOME(line2, depth, "%d/%d    ", node->wn_nr, node->wn_refs);
   1945 
   1946    if (node->wn_sibling != NULL) {
   1947      PRINTSOME(line3, depth, " |    ", 0, 0);
   1948    } else {
   1949      PRINTSOME(line3, depth, "      ", 0, 0);
   1950    }
   1951 
   1952    if (node->wn_byte == NUL) {
   1953      msg(line1, 0);
   1954      msg(line2, 0);
   1955      msg(line3, 0);
   1956    }
   1957 
   1958    // do the children
   1959    if (node->wn_byte != NUL && node->wn_child != NULL) {
   1960      spell_print_node(node->wn_child, depth + 1);
   1961    }
   1962 
   1963    // do the siblings
   1964    if (node->wn_sibling != NULL) {
   1965      // get rid of all parent details except |
   1966      STRCPY(line1, line3);
   1967      STRCPY(line2, line3);
   1968      spell_print_node(node->wn_sibling, depth);
   1969    }
   1970  }
   1971 }
   1972 
   1973 static void spell_print_tree(wordnode_T *root)
   1974 {
   1975  if (root == NULL) {
   1976    return;
   1977  }
   1978 
   1979  // Clear the "wn_u1.index" fields, used to remember what has been done.
   1980  spell_clear_flags(root);
   1981 
   1982  // Recursively print the tree.
   1983  spell_print_node(root, 0);
   1984 }
   1985 
   1986 #endif  // SPELL_PRINTTREE
   1987 
   1988 // Reads the affix file "fname".
   1989 // Returns an afffile_T, NULL for complete failure.
   1990 static afffile_T *spell_read_aff(spellinfo_T *spin, char *fname)
   1991 {
   1992  char rline[MAXLINELEN];
   1993  char *line;
   1994  char *pc = NULL;
   1995 #define MAXITEMCNT  30
   1996  char *(items[MAXITEMCNT]);
   1997  char *p;
   1998  int lnum = 0;
   1999  affheader_T *cur_aff = NULL;
   2000  bool did_postpone_prefix = false;
   2001  int aff_todo = 0;
   2002  hashtab_T *tp;
   2003  char *low = NULL;
   2004  char *fol = NULL;
   2005  char *upp = NULL;
   2006  bool found_map = false;
   2007  hashitem_T *hi;
   2008  int compminlen = 0;              // COMPOUNDMIN value
   2009  int compsylmax = 0;              // COMPOUNDSYLMAX value
   2010  int compoptions = 0;             // COMP_ flags
   2011  int compmax = 0;                 // COMPOUNDWORDMAX value
   2012  char *compflags = NULL;          // COMPOUNDFLAG and COMPOUNDRULE
   2013                                   // concatenated
   2014  char *midword = NULL;            // MIDWORD value
   2015  char *syllable = NULL;           // SYLLABLE value
   2016  char *sofofrom = NULL;           // SOFOFROM value
   2017  char *sofoto = NULL;             // SOFOTO value
   2018 
   2019  // Open the file.
   2020  FILE *fd = os_fopen(fname, "r");
   2021  if (fd == NULL) {
   2022    semsg(_(e_notopen), fname);
   2023    return NULL;
   2024  }
   2025 
   2026  vim_snprintf(IObuff, IOSIZE, _("Reading affix file %s..."), fname);
   2027  spell_message(spin, IObuff);
   2028 
   2029  // Only do REP lines when not done in another .aff file already.
   2030  bool do_rep = GA_EMPTY(&spin->si_rep);
   2031 
   2032  // Only do REPSAL lines when not done in another .aff file already.
   2033  bool do_repsal = GA_EMPTY(&spin->si_repsal);
   2034 
   2035  // Only do SAL lines when not done in another .aff file already.
   2036  bool do_sal = GA_EMPTY(&spin->si_sal);
   2037 
   2038  // Only do MAP lines when not done in another .aff file already.
   2039  bool do_mapline = GA_EMPTY(&spin->si_map);
   2040 
   2041  // Allocate and init the afffile_T structure.
   2042  afffile_T *aff = getroom(spin, sizeof(*aff), true);
   2043  hash_init(&aff->af_pref);
   2044  hash_init(&aff->af_suff);
   2045  hash_init(&aff->af_comp);
   2046 
   2047  // Read all the lines in the file one by one.
   2048  while (!vim_fgets(rline, MAXLINELEN, fd) && !got_int) {
   2049    line_breakcheck();
   2050    lnum++;
   2051 
   2052    // Skip comment lines.
   2053    if (*rline == '#') {
   2054      continue;
   2055    }
   2056 
   2057    // Convert from "SET" to 'encoding' when needed.
   2058    xfree(pc);
   2059    if (spin->si_conv.vc_type != CONV_NONE) {
   2060      pc = string_convert(&spin->si_conv, rline, NULL);
   2061      if (pc == NULL) {
   2062        smsg(0, _("Conversion failure for word in %s line %d: %s"),
   2063             fname, lnum, rline);
   2064        continue;
   2065      }
   2066      line = pc;
   2067    } else {
   2068      pc = NULL;
   2069      line = rline;
   2070    }
   2071 
   2072    // Split the line up in white separated items.  Put a NUL after each
   2073    // item.
   2074    int itemcnt = 0;
   2075    for (p = line;;) {
   2076      while (*p != NUL && (uint8_t)(*p) <= ' ') {  // skip white space and CR/NL
   2077        p++;
   2078      }
   2079      if (*p == NUL) {
   2080        break;
   2081      }
   2082      if (itemcnt == MAXITEMCNT) {          // too many items
   2083        break;
   2084      }
   2085      items[itemcnt++] = p;
   2086      // A few items have arbitrary text argument, don't split them.
   2087      if (itemcnt == 2 && spell_info_item(items[0])) {
   2088        while ((uint8_t)(*p) >= ' ' || *p == TAB) {  // skip until CR/NL
   2089          p++;
   2090        }
   2091      } else {
   2092        while ((uint8_t)(*p) > ' ') {  // skip until white space or CR/NL
   2093          p++;
   2094        }
   2095      }
   2096      if (*p == NUL) {
   2097        break;
   2098      }
   2099      *p++ = NUL;
   2100    }
   2101 
   2102    // Handle non-empty lines.
   2103    if (itemcnt > 0) {
   2104      if (is_aff_rule(items, itemcnt, "SET", 2) && aff->af_enc == NULL) {
   2105        // Setup for conversion from "ENC" to 'encoding'.
   2106        aff->af_enc = enc_canonize(items[1]);
   2107        if (!spin->si_ascii
   2108            && convert_setup(&spin->si_conv, aff->af_enc, p_enc) == FAIL) {
   2109          smsg(0, _("Conversion in %s not supported: from %s to %s"),
   2110               fname, aff->af_enc, p_enc);
   2111        }
   2112        spin->si_conv.vc_fail = true;
   2113      } else if (is_aff_rule(items, itemcnt, "FLAG", 2)
   2114                 && aff->af_flagtype == AFT_CHAR) {
   2115        if (strcmp(items[1], "long") == 0) {
   2116          aff->af_flagtype = AFT_LONG;
   2117        } else if (strcmp(items[1], "num") == 0) {
   2118          aff->af_flagtype = AFT_NUM;
   2119        } else if (strcmp(items[1], "caplong") == 0) {
   2120          aff->af_flagtype = AFT_CAPLONG;
   2121        } else {
   2122          smsg(0, _("Invalid value for FLAG in %s line %d: %s"),
   2123               fname, lnum, items[1]);
   2124        }
   2125        if (aff->af_rare != 0
   2126            || aff->af_keepcase != 0
   2127            || aff->af_bad != 0
   2128            || aff->af_needaffix != 0
   2129            || aff->af_circumfix != 0
   2130            || aff->af_needcomp != 0
   2131            || aff->af_comproot != 0
   2132            || aff->af_nosuggest != 0
   2133            || compflags != NULL
   2134            || aff->af_suff.ht_used > 0
   2135            || aff->af_pref.ht_used > 0) {
   2136          smsg(0, _("FLAG after using flags in %s line %d: %s"),
   2137               fname, lnum, items[1]);
   2138        }
   2139      } else if (spell_info_item(items[0]) && itemcnt > 1) {
   2140        p = getroom(spin,
   2141                    (spin->si_info == NULL ? 0 : strlen(spin->si_info))
   2142                    + strlen(items[0])
   2143                    + strlen(items[1]) + 3, false);
   2144        if (spin->si_info != NULL) {
   2145          STRCPY(p, spin->si_info);
   2146          strcat(p, "\n");
   2147        }
   2148        strcat(p, items[0]);
   2149        strcat(p, " ");
   2150        strcat(p, items[1]);
   2151        spin->si_info = p;
   2152      } else if (is_aff_rule(items, itemcnt, "MIDWORD", 2) && midword == NULL) {
   2153        midword = getroom_save(spin, items[1]);
   2154      } else if (is_aff_rule(items, itemcnt, "TRY", 2)) {
   2155        // ignored, we look in the tree for what chars may appear
   2156      } else if ((is_aff_rule(items, itemcnt, "RAR", 2)  // TODO(vim): remove "RAR" later
   2157                  || is_aff_rule(items, itemcnt, "RARE", 2))
   2158                 && aff->af_rare == 0) {
   2159        aff->af_rare = affitem2flag(aff->af_flagtype, items[1], fname, lnum);
   2160      } else if ((is_aff_rule(items, itemcnt, "KEP", 2)  // TODO(vim): remove "KEP" later
   2161                  || is_aff_rule(items, itemcnt, "KEEPCASE", 2))
   2162                 && aff->af_keepcase == 0) {
   2163        aff->af_keepcase = affitem2flag(aff->af_flagtype, items[1], fname, lnum);
   2164      } else if ((is_aff_rule(items, itemcnt, "BAD", 2)
   2165                  || is_aff_rule(items, itemcnt, "FORBIDDENWORD", 2))
   2166                 && aff->af_bad == 0) {
   2167        aff->af_bad = affitem2flag(aff->af_flagtype, items[1], fname, lnum);
   2168      } else if (is_aff_rule(items, itemcnt, "NEEDAFFIX", 2)
   2169                 && aff->af_needaffix == 0) {
   2170        aff->af_needaffix = affitem2flag(aff->af_flagtype, items[1], fname, lnum);
   2171      } else if (is_aff_rule(items, itemcnt, "CIRCUMFIX", 2)
   2172                 && aff->af_circumfix == 0) {
   2173        aff->af_circumfix = affitem2flag(aff->af_flagtype, items[1], fname, lnum);
   2174      } else if (is_aff_rule(items, itemcnt, "NOSUGGEST", 2)
   2175                 && aff->af_nosuggest == 0) {
   2176        aff->af_nosuggest = affitem2flag(aff->af_flagtype, items[1], fname, lnum);
   2177      } else if ((is_aff_rule(items, itemcnt, "NEEDCOMPOUND", 2)
   2178                  || is_aff_rule(items, itemcnt, "ONLYINCOMPOUND", 2))
   2179                 && aff->af_needcomp == 0) {
   2180        aff->af_needcomp = affitem2flag(aff->af_flagtype, items[1], fname, lnum);
   2181      } else if (is_aff_rule(items, itemcnt, "COMPOUNDROOT", 2)
   2182                 && aff->af_comproot == 0) {
   2183        aff->af_comproot = affitem2flag(aff->af_flagtype, items[1], fname, lnum);
   2184      } else if (is_aff_rule(items, itemcnt, "COMPOUNDFORBIDFLAG", 2)
   2185                 && aff->af_compforbid == 0) {
   2186        aff->af_compforbid = affitem2flag(aff->af_flagtype, items[1], fname, lnum);
   2187        if (aff->af_pref.ht_used > 0) {
   2188          smsg(0,
   2189               _("Defining COMPOUNDFORBIDFLAG after PFX item may give wrong results in %s line %d"),
   2190               fname, lnum);
   2191        }
   2192      } else if (is_aff_rule(items, itemcnt, "COMPOUNDPERMITFLAG", 2)
   2193                 && aff->af_comppermit == 0) {
   2194        aff->af_comppermit = affitem2flag(aff->af_flagtype, items[1], fname, lnum);
   2195        if (aff->af_pref.ht_used > 0) {
   2196          smsg(0,
   2197               _("Defining COMPOUNDPERMITFLAG after PFX item may give wrong results in %s line %d"),
   2198               fname, lnum);
   2199        }
   2200      } else if (is_aff_rule(items, itemcnt, "COMPOUNDFLAG", 2)
   2201                 && compflags == NULL) {
   2202        // Turn flag "c" into COMPOUNDRULE compatible string "c+",
   2203        // "Na" into "Na+", "1234" into "1234+".
   2204        p = getroom(spin, strlen(items[1]) + 2, false);
   2205        STRCPY(p, items[1]);
   2206        strcat(p, "+");
   2207        compflags = p;
   2208      } else if (is_aff_rule(items, itemcnt, "COMPOUNDRULES", 2)) {
   2209        // We don't use the count, but do check that it's a number and
   2210        // not COMPOUNDRULE mistyped.
   2211        if (atoi(items[1]) == 0) {
   2212          smsg(0, _("Wrong COMPOUNDRULES value in %s line %d: %s"),
   2213               fname, lnum, items[1]);
   2214        }
   2215      } else if (is_aff_rule(items, itemcnt, "COMPOUNDRULE", 2)) {
   2216        // Don't use the first rule if it is a number.
   2217        if (compflags != NULL || *skipdigits(items[1]) != NUL) {
   2218          // Concatenate this string to previously defined ones,
   2219          // using a slash to separate them.
   2220          int l = (int)strlen(items[1]) + 1;
   2221          if (compflags != NULL) {
   2222            l += (int)strlen(compflags) + 1;
   2223          }
   2224          p = getroom(spin, (size_t)l, false);
   2225          if (compflags != NULL) {
   2226            STRCPY(p, compflags);
   2227            strcat(p, "/");
   2228          }
   2229          strcat(p, items[1]);
   2230          compflags = p;
   2231        }
   2232      } else if (is_aff_rule(items, itemcnt, "COMPOUNDWORDMAX", 2)
   2233                 && compmax == 0) {
   2234        compmax = atoi(items[1]);
   2235        if (compmax == 0) {
   2236          smsg(0, _("Wrong COMPOUNDWORDMAX value in %s line %d: %s"),
   2237               fname, lnum, items[1]);
   2238        }
   2239      } else if (is_aff_rule(items, itemcnt, "COMPOUNDMIN", 2)
   2240                 && compminlen == 0) {
   2241        compminlen = atoi(items[1]);
   2242        if (compminlen == 0) {
   2243          smsg(0, _("Wrong COMPOUNDMIN value in %s line %d: %s"),
   2244               fname, lnum, items[1]);
   2245        }
   2246      } else if (is_aff_rule(items, itemcnt, "COMPOUNDSYLMAX", 2)
   2247                 && compsylmax == 0) {
   2248        compsylmax = atoi(items[1]);
   2249        if (compsylmax == 0) {
   2250          smsg(0, _("Wrong COMPOUNDSYLMAX value in %s line %d: %s"),
   2251               fname, lnum, items[1]);
   2252        }
   2253      } else if (is_aff_rule(items, itemcnt, "CHECKCOMPOUNDDUP", 1)) {
   2254        compoptions |= COMP_CHECKDUP;
   2255      } else if (is_aff_rule(items, itemcnt, "CHECKCOMPOUNDREP", 1)) {
   2256        compoptions |= COMP_CHECKREP;
   2257      } else if (is_aff_rule(items, itemcnt, "CHECKCOMPOUNDCASE", 1)) {
   2258        compoptions |= COMP_CHECKCASE;
   2259      } else if (is_aff_rule(items, itemcnt, "CHECKCOMPOUNDTRIPLE", 1)) {
   2260        compoptions |= COMP_CHECKTRIPLE;
   2261      } else if (is_aff_rule(items, itemcnt, "CHECKCOMPOUNDPATTERN", 2)) {
   2262        if (atoi(items[1]) == 0) {
   2263          smsg(0, _("Wrong CHECKCOMPOUNDPATTERN value in %s line %d: %s"),
   2264               fname, lnum, items[1]);
   2265        }
   2266      } else if (is_aff_rule(items, itemcnt, "CHECKCOMPOUNDPATTERN", 3)) {
   2267        garray_T *gap = &spin->si_comppat;
   2268        int i;
   2269 
   2270        // Only add the couple if it isn't already there.
   2271        for (i = 0; i < gap->ga_len - 1; i += 2) {
   2272          if (strcmp(((char **)(gap->ga_data))[i], items[1]) == 0
   2273              && strcmp(((char **)(gap->ga_data))[i + 1], items[2]) == 0) {
   2274            break;
   2275          }
   2276        }
   2277        if (i >= gap->ga_len) {
   2278          ga_grow(gap, 2);
   2279          ((char **)(gap->ga_data))[gap->ga_len++] = getroom_save(spin, items[1]);
   2280          ((char **)(gap->ga_data))[gap->ga_len++] = getroom_save(spin, items[2]);
   2281        }
   2282      } else if (is_aff_rule(items, itemcnt, "SYLLABLE", 2)
   2283                 && syllable == NULL) {
   2284        syllable = getroom_save(spin, items[1]);
   2285      } else if (is_aff_rule(items, itemcnt, "NOBREAK", 1)) {
   2286        spin->si_nobreak = true;
   2287      } else if (is_aff_rule(items, itemcnt, "NOSPLITSUGS", 1)) {
   2288        spin->si_nosplitsugs = true;
   2289      } else if (is_aff_rule(items, itemcnt, "NOCOMPOUNDSUGS", 1)) {
   2290        spin->si_nocompoundsugs = true;
   2291      } else if (is_aff_rule(items, itemcnt, "NOSUGFILE", 1)) {
   2292        spin->si_nosugfile = true;
   2293      } else if (is_aff_rule(items, itemcnt, "PFXPOSTPONE", 1)) {
   2294        aff->af_pfxpostpone = true;
   2295      } else if (is_aff_rule(items, itemcnt, "IGNOREEXTRA", 1)) {
   2296        aff->af_ignoreextra = true;
   2297      } else if ((strcmp(items[0], "PFX") == 0
   2298                  || strcmp(items[0], "SFX") == 0)
   2299                 && aff_todo == 0
   2300                 && itemcnt >= 4) {
   2301        int lasti = 4;
   2302        char key[AH_KEY_LEN];
   2303 
   2304        if (*items[0] == 'P') {
   2305          tp = &aff->af_pref;
   2306        } else {
   2307          tp = &aff->af_suff;
   2308        }
   2309 
   2310        // Myspell allows the same affix name to be used multiple
   2311        // times.  The affix files that do this have an undocumented
   2312        // "S" flag on all but the last block, thus we check for that
   2313        // and store it in ah_follows.
   2314        xstrlcpy(key, items[1], AH_KEY_LEN);
   2315        hi = hash_find(tp, key);
   2316        if (!HASHITEM_EMPTY(hi)) {
   2317          cur_aff = HI2AH(hi);
   2318          if (cur_aff->ah_combine != (*items[2] == 'Y')) {
   2319            smsg(0, _("Different combining flag in continued affix block in %s line %d: %s"),
   2320                 fname, lnum, items[1]);
   2321          }
   2322          if (!cur_aff->ah_follows) {
   2323            smsg(0, _("Duplicate affix in %s line %d: %s"),
   2324                 fname, lnum, items[1]);
   2325          }
   2326        } else {
   2327          // New affix letter.
   2328          cur_aff = getroom(spin, sizeof(*cur_aff), true);
   2329          cur_aff->ah_flag = affitem2flag(aff->af_flagtype, items[1], fname, lnum);
   2330          if (cur_aff->ah_flag == 0 || strlen(items[1]) >= AH_KEY_LEN) {
   2331            break;
   2332          }
   2333          if (cur_aff->ah_flag == aff->af_bad
   2334              || cur_aff->ah_flag == aff->af_rare
   2335              || cur_aff->ah_flag == aff->af_keepcase
   2336              || cur_aff->ah_flag == aff->af_needaffix
   2337              || cur_aff->ah_flag == aff->af_circumfix
   2338              || cur_aff->ah_flag == aff->af_nosuggest
   2339              || cur_aff->ah_flag == aff->af_needcomp
   2340              || cur_aff->ah_flag == aff->af_comproot) {
   2341            smsg(0, _("Affix also used for BAD/RARE/KEEPCASE/NEEDAFFIX/NEEDCOMPOUND/NOSUGGEST "
   2342                      "in %s line %d: %s"),
   2343                 fname, lnum, items[1]);
   2344          }
   2345          STRCPY(cur_aff->ah_key, items[1]);
   2346          hash_add(tp, cur_aff->ah_key);
   2347 
   2348          cur_aff->ah_combine = (*items[2] == 'Y');
   2349        }
   2350 
   2351        // Check for the "S" flag, which apparently means that another
   2352        // block with the same affix name is following.
   2353        if (itemcnt > lasti && strcmp(items[lasti], "S") == 0) {
   2354          lasti++;
   2355          cur_aff->ah_follows = true;
   2356        } else {
   2357          cur_aff->ah_follows = false;
   2358        }
   2359 
   2360        // Myspell allows extra text after the item, but that might
   2361        // mean mistakes go unnoticed.  Require a comment-starter,
   2362        // unless IGNOREEXTRA is used.  Hunspell uses a "-" item.
   2363        if (itemcnt > lasti
   2364            && !aff->af_ignoreextra
   2365            && *items[lasti] != '#') {
   2366          smsg(0, _(e_afftrailing), fname, lnum, items[lasti]);
   2367        }
   2368 
   2369        if (strcmp(items[2], "Y") != 0 && strcmp(items[2], "N") != 0) {
   2370          smsg(0, _("Expected Y or N in %s line %d: %s"),
   2371               fname, lnum, items[2]);
   2372        }
   2373 
   2374        if (*items[0] == 'P' && aff->af_pfxpostpone) {
   2375          if (cur_aff->ah_newID == 0) {
   2376            // Use a new number in the .spl file later, to be able
   2377            // to handle multiple .aff files.
   2378            check_renumber(spin);
   2379            cur_aff->ah_newID = ++spin->si_newprefID;
   2380 
   2381            // We only really use ah_newID if the prefix is
   2382            // postponed.  We know that only after handling all
   2383            // the items.
   2384            did_postpone_prefix = false;
   2385          } else {
   2386            // Did use the ID in a previous block.
   2387            did_postpone_prefix = true;
   2388          }
   2389        }
   2390 
   2391        aff_todo = atoi(items[3]);
   2392      } else if ((strcmp(items[0], "PFX") == 0
   2393                  || strcmp(items[0], "SFX") == 0)
   2394                 && aff_todo > 0
   2395                 && strcmp(cur_aff->ah_key, items[1]) == 0
   2396                 && itemcnt >= 5) {
   2397        affentry_T *aff_entry;
   2398        int lasti = 5;
   2399 
   2400        // Myspell allows extra text after the item, but that might
   2401        // mean mistakes go unnoticed.  Require a comment-starter.
   2402        // Hunspell uses a "-" item.
   2403        if (itemcnt > lasti && *items[lasti] != '#'
   2404            && (strcmp(items[lasti], "-") != 0
   2405                || itemcnt != lasti + 1)) {
   2406          smsg(0, _(e_afftrailing), fname, lnum, items[lasti]);
   2407        }
   2408 
   2409        // New item for an affix letter.
   2410        aff_todo--;
   2411        aff_entry = getroom(spin, sizeof(*aff_entry), true);
   2412 
   2413        if (strcmp(items[2], "0") != 0) {
   2414          aff_entry->ae_chop = getroom_save(spin, items[2]);
   2415        }
   2416        if (strcmp(items[3], "0") != 0) {
   2417          aff_entry->ae_add = getroom_save(spin, items[3]);
   2418 
   2419          // Recognize flags on the affix: abcd/XYZ
   2420          aff_entry->ae_flags = vim_strchr(aff_entry->ae_add, '/');
   2421          if (aff_entry->ae_flags != NULL) {
   2422            *aff_entry->ae_flags++ = NUL;
   2423            aff_process_flags(aff, aff_entry);
   2424          }
   2425        }
   2426 
   2427        // Don't use an affix entry with non-ASCII characters when
   2428        // "spin->si_ascii" is true.
   2429        if (!spin->si_ascii || !(has_non_ascii(aff_entry->ae_chop)
   2430                                 || has_non_ascii(aff_entry->ae_add))) {
   2431          aff_entry->ae_next = cur_aff->ah_first;
   2432          cur_aff->ah_first = aff_entry;
   2433 
   2434          if (strcmp(items[4], ".") != 0) {
   2435            char buf[MAXLINELEN];
   2436 
   2437            aff_entry->ae_cond = getroom_save(spin, items[4]);
   2438            snprintf(buf, sizeof(buf), *items[0] == 'P' ? "^%s" : "%s$", items[4]);
   2439            aff_entry->ae_prog = vim_regcomp(buf, RE_MAGIC + RE_STRING + RE_STRICT);
   2440            if (aff_entry->ae_prog == NULL) {
   2441              smsg(0, _("Broken condition in %s line %d: %s"),
   2442                   fname, lnum, items[4]);
   2443            }
   2444          }
   2445 
   2446          // For postponed prefixes we need an entry in si_prefcond
   2447          // for the condition.  Use an existing one if possible.
   2448          // Can't be done for an affix with flags, ignoring
   2449          // COMPOUNDFORBIDFLAG and COMPOUNDPERMITFLAG.
   2450          if (*items[0] == 'P' && aff->af_pfxpostpone
   2451              && aff_entry->ae_flags == NULL) {
   2452            bool upper = false;
   2453            // When the chop string is one lower-case letter and
   2454            // the add string ends in the upper-case letter we set
   2455            // the "upper" flag, clear "ae_chop" and remove the
   2456            // letters from "ae_add".  The condition must either
   2457            // be empty or start with the same letter.
   2458            if (aff_entry->ae_chop != NULL
   2459                && aff_entry->ae_add != NULL
   2460                && aff_entry->ae_chop[utfc_ptr2len(aff_entry->ae_chop)] ==
   2461                NUL) {
   2462              int c = utf_ptr2char(aff_entry->ae_chop);
   2463              int c_up = SPELL_TOUPPER(c);
   2464              if (c_up != c
   2465                  && (aff_entry->ae_cond == NULL
   2466                      || utf_ptr2char(aff_entry->ae_cond) == c)) {
   2467                p = aff_entry->ae_add + strlen(aff_entry->ae_add);
   2468                MB_PTR_BACK(aff_entry->ae_add, p);
   2469                if (utf_ptr2char(p) == c_up) {
   2470                  upper = true;
   2471                  aff_entry->ae_chop = NULL;
   2472                  *p = NUL;
   2473 
   2474                  // The condition is matched with the
   2475                  // actual word, thus must check for the
   2476                  // upper-case letter.
   2477                  if (aff_entry->ae_cond != NULL) {
   2478                    char buf[MAXLINELEN];
   2479                    onecap_copy(items[4], buf, true);
   2480                    aff_entry->ae_cond = getroom_save(spin, buf);
   2481                    if (aff_entry->ae_cond != NULL) {
   2482                      snprintf(buf, MAXLINELEN, "^%s", aff_entry->ae_cond);
   2483                      vim_regfree(aff_entry->ae_prog);
   2484                      aff_entry->ae_prog = vim_regcomp(buf, RE_MAGIC + RE_STRING);
   2485                    }
   2486                  }
   2487                }
   2488              }
   2489            }
   2490 
   2491            if (aff_entry->ae_chop == NULL) {
   2492              int idx;
   2493 
   2494              // Find a previously used condition.
   2495              for (idx = spin->si_prefcond.ga_len - 1; idx >= 0; idx--) {
   2496                p = ((char **)spin->si_prefcond.ga_data)[idx];
   2497                if (str_equal(p, aff_entry->ae_cond)) {
   2498                  break;
   2499                }
   2500              }
   2501              if (idx < 0) {
   2502                // Not found, add a new condition.
   2503                idx = spin->si_prefcond.ga_len;
   2504                char **pp = GA_APPEND_VIA_PTR(char *, &spin->si_prefcond);
   2505                *pp = (aff_entry->ae_cond == NULL)
   2506                      ? NULL : getroom_save(spin, aff_entry->ae_cond);
   2507              }
   2508 
   2509              // Add the prefix to the prefix tree.
   2510              if (aff_entry->ae_add == NULL) {
   2511                p = "";
   2512              } else {
   2513                p = aff_entry->ae_add;
   2514              }
   2515 
   2516              // PFX_FLAGS is a negative number, so that
   2517              // tree_add_word() knows this is the prefix tree.
   2518              int n = PFX_FLAGS;
   2519              if (!cur_aff->ah_combine) {
   2520                n |= WFP_NC;
   2521              }
   2522              if (upper) {
   2523                n |= WFP_UP;
   2524              }
   2525              if (aff_entry->ae_comppermit) {
   2526                n |= WFP_COMPPERMIT;
   2527              }
   2528              if (aff_entry->ae_compforbid) {
   2529                n |= WFP_COMPFORBID;
   2530              }
   2531              tree_add_word(spin, p, spin->si_prefroot, n,
   2532                            idx, cur_aff->ah_newID);
   2533              did_postpone_prefix = true;
   2534            }
   2535 
   2536            // Didn't actually use ah_newID, backup si_newprefID.
   2537            if (aff_todo == 0 && !did_postpone_prefix) {
   2538              spin->si_newprefID--;
   2539              cur_aff->ah_newID = 0;
   2540            }
   2541          }
   2542        }
   2543      } else if (is_aff_rule(items, itemcnt, "FOL", 2) && fol == NULL) {
   2544        fol = xstrdup(items[1]);
   2545      } else if (is_aff_rule(items, itemcnt, "LOW", 2) && low == NULL) {
   2546        low = xstrdup(items[1]);
   2547      } else if (is_aff_rule(items, itemcnt, "UPP", 2) && upp == NULL) {
   2548        upp = xstrdup(items[1]);
   2549      } else if (is_aff_rule(items, itemcnt, "REP", 2)
   2550                 || is_aff_rule(items, itemcnt, "REPSAL", 2)) {
   2551        // Ignore REP/REPSAL count
   2552        if (!isdigit((uint8_t)(*items[1]))) {
   2553          smsg(0, _("Expected REP(SAL) count in %s line %d"),
   2554               fname, lnum);
   2555        }
   2556      } else if ((strcmp(items[0], "REP") == 0
   2557                  || strcmp(items[0], "REPSAL") == 0)
   2558                 && itemcnt >= 3) {
   2559        // REP/REPSAL item
   2560        // Myspell ignores extra arguments, we require it starts with
   2561        // # to detect mistakes.
   2562        if (itemcnt > 3 && items[3][0] != '#') {
   2563          smsg(0, _(e_afftrailing), fname, lnum, items[3]);
   2564        }
   2565        if (items[0][3] == 'S' ? do_repsal : do_rep) {
   2566          // Replace underscore with space (can't include a space
   2567          // directly).
   2568          for (p = items[1]; *p != NUL; MB_PTR_ADV(p)) {
   2569            if (*p == '_') {
   2570              *p = ' ';
   2571            }
   2572          }
   2573          for (p = items[2]; *p != NUL; MB_PTR_ADV(p)) {
   2574            if (*p == '_') {
   2575              *p = ' ';
   2576            }
   2577          }
   2578          add_fromto(spin, items[0][3] == 'S'
   2579                     ? &spin->si_repsal
   2580                     : &spin->si_rep, items[1], items[2]);
   2581        }
   2582      } else if (is_aff_rule(items, itemcnt, "MAP", 2)) {
   2583        // MAP item or count
   2584        if (!found_map) {
   2585          // First line contains the count.
   2586          found_map = true;
   2587          if (!isdigit((uint8_t)(*items[1]))) {
   2588            smsg(0, _("Expected MAP count in %s line %d"),
   2589                 fname, lnum);
   2590          }
   2591        } else if (do_mapline) {
   2592          // Check that every character appears only once.
   2593          for (p = items[1]; *p != NUL;) {
   2594            int c = mb_ptr2char_adv((const char **)&p);
   2595            if ((!GA_EMPTY(&spin->si_map)
   2596                 && vim_strchr(spin->si_map.ga_data, c)
   2597                 != NULL)
   2598                || vim_strchr(p, c) != NULL) {
   2599              smsg(0, _("Duplicate character in MAP in %s line %d"),
   2600                   fname, lnum);
   2601            }
   2602          }
   2603 
   2604          // We simply concatenate all the MAP strings, separated by
   2605          // slashes.
   2606          ga_concat(&spin->si_map, items[1]);
   2607          ga_append(&spin->si_map, '/');
   2608        }
   2609      }
   2610      // Accept "SAL from to" and "SAL from to  #comment".
   2611      else if (is_aff_rule(items, itemcnt, "SAL", 3)) {
   2612        if (do_sal) {
   2613          // SAL item (sounds-a-like)
   2614          // Either one of the known keys or a from-to pair.
   2615          if (strcmp(items[1], "followup") == 0) {
   2616            spin->si_followup = sal_to_bool(items[2]);
   2617          } else if (strcmp(items[1], "collapse_result") == 0) {
   2618            spin->si_collapse = sal_to_bool(items[2]);
   2619          } else if (strcmp(items[1], "remove_accents") == 0) {
   2620            spin->si_rem_accents = sal_to_bool(items[2]);
   2621          } else {
   2622            // when "to" is "_" it means empty
   2623            add_fromto(spin, &spin->si_sal, items[1],
   2624                       strcmp(items[2], "_") == 0 ? ""
   2625                                                  : items[2]);
   2626          }
   2627        }
   2628      } else if (is_aff_rule(items, itemcnt, "SOFOFROM", 2)
   2629                 && sofofrom == NULL) {
   2630        sofofrom = getroom_save(spin, items[1]);
   2631      } else if (is_aff_rule(items, itemcnt, "SOFOTO", 2)
   2632                 && sofoto == NULL) {
   2633        sofoto = getroom_save(spin, items[1]);
   2634      } else if (strcmp(items[0], "COMMON") == 0) {
   2635        for (int i = 1; i < itemcnt; i++) {
   2636          if (HASHITEM_EMPTY(hash_find(&spin->si_commonwords, items[i]))) {
   2637            p = xstrdup(items[i]);
   2638            hash_add(&spin->si_commonwords, p);
   2639          }
   2640        }
   2641      } else {
   2642        smsg(0, _("Unrecognized or duplicate item in %s line %d: %s"),
   2643             fname, lnum, items[0]);
   2644      }
   2645    }
   2646  }
   2647 
   2648  if (fol != NULL || low != NULL || upp != NULL) {
   2649    if (spin->si_clear_chartab) {
   2650      // Clear the char type tables, don't want to use any of the
   2651      // currently used spell properties.
   2652      init_spell_chartab();
   2653      spin->si_clear_chartab = false;
   2654    }
   2655 
   2656    xfree(fol);
   2657    xfree(low);
   2658    xfree(upp);
   2659  }
   2660 
   2661  // Use compound specifications of the .aff file for the spell info.
   2662  if (compmax != 0) {
   2663    aff_check_number(spin->si_compmax, compmax, "COMPOUNDWORDMAX");
   2664    spin->si_compmax = compmax;
   2665  }
   2666 
   2667  if (compminlen != 0) {
   2668    aff_check_number(spin->si_compminlen, compminlen, "COMPOUNDMIN");
   2669    spin->si_compminlen = compminlen;
   2670  }
   2671 
   2672  if (compsylmax != 0) {
   2673    if (syllable == NULL) {
   2674      smsg(0, "%s", _("COMPOUNDSYLMAX used without SYLLABLE"));
   2675    }
   2676    aff_check_number(spin->si_compsylmax, compsylmax, "COMPOUNDSYLMAX");
   2677    spin->si_compsylmax = compsylmax;
   2678  }
   2679 
   2680  if (compoptions != 0) {
   2681    aff_check_number(spin->si_compoptions, compoptions, "COMPOUND options");
   2682    spin->si_compoptions |= compoptions;
   2683  }
   2684 
   2685  if (compflags != NULL) {
   2686    process_compflags(spin, aff, compflags);
   2687  }
   2688 
   2689  // Check that we didn't use too many renumbered flags.
   2690  if (spin->si_newcompID < spin->si_newprefID) {
   2691    if (spin->si_newcompID == 127 || spin->si_newcompID == 255) {
   2692      msg(_("Too many postponed prefixes"), 0);
   2693    } else if (spin->si_newprefID == 0 || spin->si_newprefID == 127) {
   2694      msg(_("Too many compound flags"), 0);
   2695    } else {
   2696      msg(_("Too many postponed prefixes and/or compound flags"), 0);
   2697    }
   2698  }
   2699 
   2700  if (syllable != NULL) {
   2701    aff_check_string(spin->si_syllable, syllable, "SYLLABLE");
   2702    spin->si_syllable = syllable;
   2703  }
   2704 
   2705  if (sofofrom != NULL || sofoto != NULL) {
   2706    if (sofofrom == NULL || sofoto == NULL) {
   2707      smsg(0, _("Missing SOFO%s line in %s"),
   2708           sofofrom == NULL ? "FROM" : "TO", fname);
   2709    } else if (!GA_EMPTY(&spin->si_sal)) {
   2710      smsg(0, _("Both SAL and SOFO lines in %s"), fname);
   2711    } else {
   2712      aff_check_string(spin->si_sofofr, sofofrom, "SOFOFROM");
   2713      aff_check_string(spin->si_sofoto, sofoto, "SOFOTO");
   2714      spin->si_sofofr = sofofrom;
   2715      spin->si_sofoto = sofoto;
   2716    }
   2717  }
   2718 
   2719  if (midword != NULL) {
   2720    aff_check_string(spin->si_midword, midword, "MIDWORD");
   2721    spin->si_midword = midword;
   2722  }
   2723 
   2724  xfree(pc);
   2725  fclose(fd);
   2726  return aff;
   2727 }
   2728 
   /// Check whether the items of an .aff line form the rule "rulename".
   ///
   /// @param items     items of the line, items[0] is the rule name
   /// @param itemcnt   number of entries in "items"
   /// @param rulename  name to compare items[0] with
   /// @param mincount  number of items the rule consists of
   ///
   /// @return  true when items[0] equals "rulename" and there are exactly
   ///          "mincount" items, or there are more items and the first extra one
   ///          starts with '#' (a comment).
   static bool is_aff_rule(char **items, int itemcnt, char *rulename, int mincount)
   {
     if (strcmp(items[0], rulename) != 0) {
       return false;
     }
     if (itemcnt == mincount) {
       return true;
     }
     // Extra items are only accepted when they are a comment.
     return itemcnt > mincount && items[mincount][0] == '#';
   }
   2737 
   2738 // For affix "entry" move COMPOUNDFORBIDFLAG and COMPOUNDPERMITFLAG from
   2739 // ae_flags to ae_comppermit and ae_compforbid.
   2740 static void aff_process_flags(afffile_T *affile, affentry_T *entry)
   2741 {
   2742  if (entry->ae_flags != NULL
   2743      && (affile->af_compforbid != 0 || affile->af_comppermit != 0)) {
   2744    for (char *p = entry->ae_flags; *p != NUL;) {
   2745      char *prevp = p;
   2746      unsigned flag = get_affitem(affile->af_flagtype, &p);
   2747      if (flag == affile->af_comppermit || flag == affile->af_compforbid) {
   2748        STRMOVE(prevp, p);
   2749        p = prevp;
   2750        if (flag == affile->af_comppermit) {
   2751          entry->ae_comppermit = true;
   2752        } else {
   2753          entry->ae_compforbid = true;
   2754        }
   2755      }
   2756      if (affile->af_flagtype == AFT_NUM && *p == ',') {
   2757        p++;
   2758      }
   2759    }
   2760    if (*entry->ae_flags == NUL) {
   2761      entry->ae_flags = NULL;           // nothing left
   2762    }
   2763  }
   2764 }
   2765 
   /// Check whether "s" names one of the informational items of an affix file.
   ///
   /// @param s  item name to check
   ///
   /// @return  true if "s" is exactly one of NAME, HOME, VERSION, AUTHOR, EMAIL
   ///          or COPYRIGHT.
   static bool spell_info_item(char *s)
   {
     static const char *const info_names[] = {
       "NAME", "HOME", "VERSION", "AUTHOR", "EMAIL", "COPYRIGHT",
     };

     for (size_t i = 0; i < sizeof(info_names) / sizeof(info_names[0]); i++) {
       if (strcmp(s, info_names[i]) == 0) {
         return true;
       }
     }
     return false;
   }
   2776 
   2777 // Turn an affix flag name into a number, according to the FLAG type.
   2778 // returns zero for failure.
   2779 static unsigned affitem2flag(int flagtype, char *item, char *fname, int lnum)
   2780 {
   2781  char *p = item;
   2782 
   2783  unsigned res = get_affitem(flagtype, &p);
   2784  if (res == 0) {
   2785    if (flagtype == AFT_NUM) {
   2786      smsg(0, _("Flag is not a number in %s line %d: %s"),
   2787           fname, lnum, item);
   2788    } else {
   2789      smsg(0, _("Illegal flag in %s line %d: %s"),
   2790           fname, lnum, item);
   2791    }
   2792  }
   2793  if (*p != NUL) {
   2794    smsg(0, _(e_affname), fname, lnum, item);
   2795    return 0;
   2796  }
   2797 
   2798  return res;
   2799 }
   2800 
   2801 // Get one affix name from "*pp" and advance the pointer.
   2802 // Returns ZERO_FLAG for "0".
   2803 // Returns zero for an error, still advances the pointer then.
   2804 static unsigned get_affitem(int flagtype, char **pp)
   2805 {
   2806  int res;
   2807 
   2808  if (flagtype == AFT_NUM) {
   2809    if (!ascii_isdigit(**pp)) {
   2810      (*pp)++;            // always advance, avoid getting stuck
   2811      return 0;
   2812    }
   2813    res = getdigits_int(pp, true, 0);
   2814    if (res == 0) {
   2815      res = ZERO_FLAG;
   2816    }
   2817  } else {
   2818    res = mb_ptr2char_adv((const char **)pp);
   2819    if (flagtype == AFT_LONG || (flagtype == AFT_CAPLONG
   2820                                 && res >= 'A' && res <= 'Z')) {
   2821      if (**pp == NUL) {
   2822        return 0;
   2823      }
   2824      res = mb_ptr2char_adv((const char **)pp) + (res << 16);
   2825    }
   2826  }
   2827  return (unsigned)res;
   2828 }
   2829 
   2830 /// Process the "compflags" string used in an affix file and append it to
   2831 /// spin->si_compflags.
   2832 /// The processing involves changing the affix names to ID numbers, so that
   2833 /// they fit in one byte.
   2834 static void process_compflags(spellinfo_T *spin, afffile_T *aff, char *compflags)
   2835 {
   2836  compitem_T *ci;
   2837  int id;
   2838  char key[AH_KEY_LEN];
   2839 
   2840  // Make room for the old and the new compflags, concatenated with a / in
   2841  // between.  Processing it makes it shorter, but we don't know by how
   2842  // much, thus allocate the maximum.
   2843  int len = (int)strlen(compflags) + 1;
   2844  if (spin->si_compflags != NULL) {
   2845    len += (int)strlen(spin->si_compflags) + 1;
   2846  }
   2847  char *p = getroom(spin, (size_t)len, false);
   2848  if (spin->si_compflags != NULL) {
   2849    STRCPY(p, spin->si_compflags);
   2850    strcat(p, "/");
   2851  }
   2852  spin->si_compflags = p;
   2853  uint8_t *tp = (uint8_t *)p + strlen(p);
   2854 
   2855  for (p = compflags; *p != NUL;) {
   2856    if (vim_strchr("/?*+[]", (uint8_t)(*p)) != NULL) {
   2857      // Copy non-flag characters directly.
   2858      *tp++ = (uint8_t)(*p++);
   2859    } else {
   2860      // First get the flag number, also checks validity.
   2861      char *prevp = p;
   2862      unsigned flag = get_affitem(aff->af_flagtype, &p);
   2863      if (flag != 0) {
   2864        // Find the flag in the hashtable.  If it was used before, use
   2865        // the existing ID.  Otherwise add a new entry.
   2866        xmemcpyz(key, prevp, (size_t)(p - prevp));
   2867        hashitem_T *hi = hash_find(&aff->af_comp, key);
   2868        if (!HASHITEM_EMPTY(hi)) {
   2869          id = HI2CI(hi)->ci_newID;
   2870        } else {
   2871          ci = getroom(spin, sizeof(compitem_T), true);
   2872          STRCPY(ci->ci_key, key);
   2873          ci->ci_flag = flag;
   2874          // Avoid using a flag ID that has a special meaning in a
   2875          // regexp (also inside []).
   2876          do {
   2877            check_renumber(spin);
   2878            id = spin->si_newcompID--;
   2879          } while (vim_strchr("/?*+[]\\-^", id) != NULL);
   2880          ci->ci_newID = id;
   2881          hash_add(&aff->af_comp, ci->ci_key);
   2882        }
   2883        *tp++ = (uint8_t)id;
   2884      }
   2885      if (aff->af_flagtype == AFT_NUM && *p == ',') {
   2886        p++;
   2887      }
   2888    }
   2889  }
   2890 
   2891  *tp = NUL;
   2892 }
   2893 
   2894 // Check that the new IDs for postponed affixes and compounding don't overrun
   2895 // each other.  We have almost 255 available, but start at 0-127 to avoid
   2896 // using two bytes for utf-8.  When the 0-127 range is used up go to 128-255.
   2897 // When that is used up an error message is given.
   2898 static void check_renumber(spellinfo_T *spin)
   2899 {
   2900  if (spin->si_newprefID == spin->si_newcompID && spin->si_newcompID < 128) {
   2901    spin->si_newprefID = 127;
   2902    spin->si_newcompID = 255;
   2903  }
   2904 }
   2905 
   2906 // Returns true if flag "flag" appears in affix list "afflist".
   2907 static bool flag_in_afflist(int flagtype, char *afflist, unsigned flag)
   2908 {
   2909  switch (flagtype) {
   2910  case AFT_CHAR:
   2911    return vim_strchr(afflist, (int)flag) != NULL;
   2912 
   2913  case AFT_CAPLONG:
   2914  case AFT_LONG:
   2915    for (char *p = afflist; *p != NUL;) {
   2916      unsigned n = (unsigned)mb_ptr2char_adv((const char **)&p);
   2917      if ((flagtype == AFT_LONG || (n >= 'A' && n <= 'Z'))
   2918          && *p != NUL) {
   2919        n = (unsigned)mb_ptr2char_adv((const char **)&p) + (n << 16);
   2920      }
   2921      if (n == flag) {
   2922        return true;
   2923      }
   2924    }
   2925    break;
   2926 
   2927  case AFT_NUM:
   2928    for (char *p = afflist; *p != NUL;) {
   2929      int digits = getdigits_int(&p, true, 0);
   2930      assert(digits >= 0);
   2931      unsigned n = (unsigned)digits;
   2932      if (n == 0) {
   2933        n = ZERO_FLAG;
   2934      }
   2935      if (n == flag) {
   2936        return true;
   2937      }
   2938      if (*p != NUL) {          // skip over comma
   2939        p++;
   2940      }
   2941    }
   2942    break;
   2943  }
   2944  return false;
   2945 }
   2946 
   /// Give a warning when the numbers "spinval" and "affval" are both set and
   /// not the same.
   ///
   /// @param spinval  value already present in the spell info, zero when not set
   /// @param affval   value found in the affix file being read
   /// @param name     name of the item, used in the message
   static void aff_check_number(int spinval, int affval, char *name)
   {
     if (spinval == 0 || spinval == affval) {
       return;
     }
     smsg(0, _("%s value differs from what is used in another .aff file"),
          name);
   }
   2955 
   /// Give a warning when the strings "spinval" and "affval" are both set and
   /// not the same.
   ///
   /// @param spinval  value already present in the spell info, NULL when not set
   /// @param affval   value found in the affix file being read, must not be NULL
   /// @param name     name of the item, used in the message
   static void aff_check_string(char *spinval, char *affval, char *name)
   {
     if (spinval == NULL) {
       return;
     }
     if (strcmp(spinval, affval) == 0) {
       return;
     }
     smsg(0, _("%s value differs from what is used in another .aff file"),
          name);
   }
   2964 
   /// Compare two strings, either of which may be NULL.
   ///
   /// @param s1  first string or NULL
   /// @param s2  second string or NULL
   ///
   /// @return  true if "s1" and "s2" have the same contents or are both NULL;
   ///          false when only one of them is NULL.
   static bool str_equal(char *s1, char *s2)
   {
     if (s1 != NULL && s2 != NULL) {
       return strcmp(s1, s2) == 0;
     }
     // At least one is NULL: equal only when both are.
     return s1 == s2;
   }
   2974 
   2975 /// Add a from-to item to "gap".  Used for REP and SAL items.
   2976 /// They are stored case-folded.
   2977 static void add_fromto(spellinfo_T *spin, garray_T *gap, char *from, char *to)
   2978 {
   2979  char word[MAXWLEN];
   2980 
   2981  fromto_T *ftp = GA_APPEND_VIA_PTR(fromto_T, gap);
   2982  spell_casefold(curwin, from, (int)strlen(from), word, MAXWLEN);
   2983  ftp->ft_from = getroom_save(spin, word);
   2984  spell_casefold(curwin, to, (int)strlen(to), word, MAXWLEN);
   2985  ftp->ft_to = getroom_save(spin, word);
   2986 }
   2987 
   /// Convert a boolean argument in a SAL line to true or false.
   ///
   /// @param s  text of the argument
   ///
   /// @return  true when "s" is exactly "1" or "true", false for anything else.
   static bool sal_to_bool(char *s)
   {
     if (strcmp(s, "1") == 0) {
       return true;
     }
     return strcmp(s, "true") == 0;
   }
   2993 
   2994 // Free the structure filled by spell_read_aff().
   2995 static void spell_free_aff(afffile_T *aff)
   2996 {
   2997  xfree(aff->af_enc);
   2998 
   2999  // All this trouble to free the "ae_prog" items...
   3000  for (hashtab_T *ht = &aff->af_pref;; ht = &aff->af_suff) {
   3001    int todo = (int)ht->ht_used;
   3002    for (hashitem_T *hi = ht->ht_array; todo > 0; hi++) {
   3003      if (!HASHITEM_EMPTY(hi)) {
   3004        todo--;
   3005        affheader_T *ah = HI2AH(hi);
   3006        for (affentry_T *ae = ah->ah_first; ae != NULL; ae = ae->ae_next) {
   3007          vim_regfree(ae->ae_prog);
   3008        }
   3009      }
   3010    }
   3011    if (ht == &aff->af_suff) {
   3012      break;
   3013    }
   3014  }
   3015 
   3016  hash_clear(&aff->af_pref);
   3017  hash_clear(&aff->af_suff);
   3018  hash_clear(&aff->af_comp);
   3019 }
   3020 
   3021 // Read dictionary file "fname".
        //
        // Every word in the file is stored with store_word().  When the word has
        // an affix list ("word/flags") store_aff_word() is also called, for the
        // suffixes and for the prefixes, using the affix info in "affile".
        // A hashtable local to this function is used to report duplicate words.
        //
        // A missing word count, a conversion failure and a duplicate word only
        // result in a message.
        //
   3022 // Returns OK or FAIL;
        // FAIL is returned when the file can't be opened or when storing a word
        // failed.
   3023 static int spell_read_dic(spellinfo_T *spin, char *fname, afffile_T *affile)
   3024 {
   3025  hashtab_T ht;
   3026  char line[MAXLINELEN];
         // List of prefix IDs followed by compound flags, stored with the word.
   3027  char store_afflist[MAXWLEN];
         // "pc" is the converted line, NULL when no conversion is done.
         // "w" points to the text that is used: "pc" or "line".
   3028  char *pc;
   3029  char *w;
   3030  int lnum = 1;
   3031  int non_ascii = 0;
   3032  int retval = OK;
   3033  char message[MAXLINELEN + MAXWLEN];
   3034  int duplicate = 0;
   3035  Timestamp last_msg_time = 0;
   3036 
   3037  // Open the file.
   3038  FILE *fd = os_fopen(fname, "r");
   3039  if (fd == NULL) {
   3040    semsg(_(e_notopen), fname);
   3041    return FAIL;
   3042  }
   3043 
   3044  // The hashtable is only used to detect duplicated words.
   3045  hash_init(&ht);
   3046 
   3047  vim_snprintf(IObuff, IOSIZE,
   3048               _("Reading dictionary file %s..."), fname);
   3049  spell_message(spin, IObuff);
   3050 
   3051  // start with a message for the first line
   3052  spin->si_msg_count = 999999;
   3053 
   3054  // Read and ignore the first line: word count.
         // It is only checked that the line starts with a digit.  When it does
         // not an error message is given and reading continues.
   3055  if (vim_fgets(line, MAXLINELEN, fd) || !ascii_isdigit(*skipwhite(line))) {
   3056    semsg(_("E760: No word count in %s"), fname);
   3057  }
   3058 
   3059  // Read all the lines in the file one by one.
   3060  // The words are converted to 'encoding' here, before being added to
   3061  // the hashtable.
         // The loop ends when vim_fgets() returns true or when interrupted.
   3062  while (!vim_fgets(line, MAXLINELEN, fd) && !got_int) {
   3063    line_breakcheck();
   3064    lnum++;
   3065    if (line[0] == '#' || line[0] == '/') {
   3066      continue;         // comment line
   3067    }
   3068    // Remove CR, LF and white space from the end.  White space halfway through
   3069    // the word is kept to allow multi-word terms like "et al.".
   3070    int l = (int)strlen(line);
   3071    while (l > 0 && (uint8_t)line[l - 1] <= ' ') {
   3072      l--;
   3073    }
   3074    if (l == 0) {
   3075      continue;         // empty line
   3076    }
   3077    line[l] = NUL;
   3078 
   3079    // Convert from "SET" to 'encoding' when needed.
   3080    if (spin->si_conv.vc_type != CONV_NONE) {
   3081      pc = string_convert(&spin->si_conv, line, NULL);
   3082      if (pc == NULL) {
   3083        smsg(0, _("Conversion failure for word in %s line %d: %s"),
   3084             fname, lnum, line);
   3085        continue;
   3086      }
   3087      w = pc;
   3088    } else {
   3089      pc = NULL;
   3090      w = line;
   3091    }
   3092 
   3093    // Truncate the word at the "/", set "afflist" to what follows.
   3094    // Replace "\/" by "/" and "\\" by "\".
           // "afflist" stays NULL when the line has no unescaped "/".
   3095    char *afflist = NULL;
   3096    for (char *p = w; *p != NUL; MB_PTR_ADV(p)) {
   3097      if (*p == '\\' && (p[1] == '\\' || p[1] == '/')) {
               // Drop the backslash; the loop then advances past the character
               // that was escaped.
   3098        STRMOVE(p, p + 1);
   3099      } else if (*p == '/') {
   3100        *p = NUL;
   3101        afflist = p + 1;
   3102        break;
   3103      }
   3104    }
   3105 
   3106    // Skip non-ASCII words when "spin->si_ascii" is true.
   3107    if (spin->si_ascii && has_non_ascii(w)) {
   3108      non_ascii++;
   3109      xfree(pc);
   3110      continue;
   3111    }
   3112 
   3113    // This takes time, print a message every 10000 words, but not more
   3114    // often than once per second.
   3115    if (spin->si_verbose && spin->si_msg_count > 10000) {
   3116      spin->si_msg_count = 0;
   3117      if (os_time() > last_msg_time) {
   3118        last_msg_time = os_time();
   3119        vim_snprintf(message, sizeof(message),
   3120                     _("line %6d, word %6d - %s"),
   3121                     lnum, spin->si_foldwcount + spin->si_keepwcount, w);
   3122        msg_start();
   3123        msg_outtrans_long(message, 0);
   3124        msg_clr_eos();
   3125        msg_didout = false;
   3126        msg_col = 0;
   3127        ui_flush();
   3128      }
   3129    }
   3130 
   3131    // Store the word in the hashtable to be able to find duplicates.
           // "dw" is a copy of the word made with getroom_save(); it is also the
           // word passed to store_word() and store_aff_word() below.
   3132    char *dw = getroom_save(spin, w);
   3133    if (dw == NULL) {
   3134      retval = FAIL;
   3135      xfree(pc);
   3136      break;
   3137    }
   3138 
   3139    hash_T hash = hash_hash(dw);
   3140    hashitem_T *hi = hash_lookup(&ht, dw, strlen(dw), hash);
   3141    if (!HASHITEM_EMPTY(hi)) {
             // A duplicate is only reported, the word is still stored below.
             // Without 'verbose' only the first duplicate gets a message.
   3142      if (p_verbose > 0) {
   3143        smsg(0, _("Duplicate word in %s line %d: %s"),
   3144             fname, lnum, dw);
   3145      } else if (duplicate == 0) {
   3146        smsg(0, _("First duplicate word in %s line %d: %s"),
   3147             fname, lnum, dw);
   3148      }
   3149      duplicate++;
   3150    } else {
   3151      hash_add_item(&ht, hi, dw, hash);
   3152    }
   3153 
   3154    int flags = 0;
   3155    store_afflist[0] = NUL;
           // Length of the prefix ID list at the start of "store_afflist".
   3156    int pfxlen = 0;
   3157    bool need_affix = false;
   3158    if (afflist != NULL) {
   3159      // Extract flags from the affix list.
   3160      flags |= get_affix_flags(affile, afflist);
   3161 
   3162      if (affile->af_needaffix != 0
   3163          && flag_in_afflist(affile->af_flagtype, afflist,
   3164                             affile->af_needaffix)) {
   3165        need_affix = true;
   3166      }
   3167 
   3168      if (affile->af_pfxpostpone) {
   3169        // Need to store the list of prefix IDs with the word.
   3170        pfxlen = get_pfxlist(affile, afflist, store_afflist);
   3171      }
   3172 
   3173      if (spin->si_compflags != NULL) {
   3174        // Need to store the list of compound flags with the word.
   3175        // Concatenate them to the list of prefix IDs.
   3176        get_compflags(affile, afflist, store_afflist + pfxlen);
   3177      }
   3178    }
   3179 
   3180    // Add the word to the word tree(s).
   3181    if (store_word(spin, dw, flags, spin->si_region,
   3182                   store_afflist, need_affix) == FAIL) {
   3183      retval = FAIL;
   3184    }
   3185 
   3186    if (afflist != NULL) {
   3187      // Find all matching suffixes and add the resulting words.
   3188      // Additionally do matching prefixes that combine.
   3189      if (store_aff_word(spin, dw, afflist, affile,
   3190                         &affile->af_suff, &affile->af_pref,
   3191                         CONDIT_SUF, flags, store_afflist, pfxlen) == FAIL) {
   3192        retval = FAIL;
   3193      }
   3194 
   3195      // Find all matching prefixes and add the resulting words.
   3196      if (store_aff_word(spin, dw, afflist, affile,
   3197                         &affile->af_pref, NULL,
   3198                         CONDIT_SUF, flags, store_afflist, pfxlen) == FAIL) {
   3199        retval = FAIL;
   3200      }
   3201    }
   3202 
           // Free the converted line; "pc" is NULL when there was no conversion.
   3203    xfree(pc);
   3204  }
   3205 
         // Summary messages for what was counted while reading.
   3206  if (duplicate > 0) {
   3207    smsg(0, _("%d duplicate word(s) in %s"), duplicate, fname);
   3208  }
   3209  if (spin->si_ascii && non_ascii > 0) {
   3210    smsg(0, _("Ignored %d word(s) with non-ASCII characters in %s"),
   3211         non_ascii, fname);
   3212  }
   3213  hash_clear(&ht);
   3214 
   3215  fclose(fd);
   3216  return retval;
   3217 }
   3218 
   3219 // Check for affix flags in "afflist" that are turned into word flags.
   3220 // Return WF_ flags.
   3221 static int get_affix_flags(afffile_T *affile, char *afflist)
   3222 {
   3223  int flags = 0;
   3224 
   3225  if (affile->af_keepcase != 0
   3226      && flag_in_afflist(affile->af_flagtype, afflist,
   3227                         affile->af_keepcase)) {
   3228    flags |= WF_KEEPCAP | WF_FIXCAP;
   3229  }
   3230  if (affile->af_rare != 0
   3231      && flag_in_afflist(affile->af_flagtype, afflist, affile->af_rare)) {
   3232    flags |= WF_RARE;
   3233  }
   3234  if (affile->af_bad != 0
   3235      && flag_in_afflist(affile->af_flagtype, afflist, affile->af_bad)) {
   3236    flags |= WF_BANNED;
   3237  }
   3238  if (affile->af_needcomp != 0
   3239      && flag_in_afflist(affile->af_flagtype, afflist,
   3240                         affile->af_needcomp)) {
   3241    flags |= WF_NEEDCOMP;
   3242  }
   3243  if (affile->af_comproot != 0
   3244      && flag_in_afflist(affile->af_flagtype, afflist,
   3245                         affile->af_comproot)) {
   3246    flags |= WF_COMPROOT;
   3247  }
   3248  if (affile->af_nosuggest != 0
   3249      && flag_in_afflist(affile->af_flagtype, afflist,
   3250                         affile->af_nosuggest)) {
   3251    flags |= WF_NOSUGGEST;
   3252  }
   3253  return flags;
   3254 }
   3255 
   3256 // Get the list of prefix IDs from the affix list "afflist".
   3257 // Used for PFXPOSTPONE.
   3258 // Put the resulting flags in "store_afflist[MAXWLEN]" with a terminating NUL
   3259 // and return the number of affixes.
   3260 static int get_pfxlist(afffile_T *affile, char *afflist, char *store_afflist)
   3261 {
   3262  int cnt = 0;
   3263  char key[AH_KEY_LEN];
   3264 
   3265  for (char *p = afflist; *p != NUL;) {
   3266    char *prevp = p;
   3267    if (get_affitem(affile->af_flagtype, &p) != 0) {
   3268      // A flag is a postponed prefix flag if it appears in "af_pref"
   3269      // and its ID is not zero.
   3270      xmemcpyz(key, prevp, (size_t)(p - prevp));
   3271      hashitem_T *hi = hash_find(&affile->af_pref, key);
   3272      if (!HASHITEM_EMPTY(hi)) {
   3273        int id = HI2AH(hi)->ah_newID;
   3274        if (id != 0) {
   3275          store_afflist[cnt++] = (char)(uint8_t)id;
   3276        }
   3277      }
   3278    }
   3279    if (affile->af_flagtype == AFT_NUM && *p == ',') {
   3280      p++;
   3281    }
   3282  }
   3283 
   3284  store_afflist[cnt] = NUL;
   3285  return cnt;
   3286 }
   3287 
   3288 // Get the list of compound IDs from the affix list "afflist" that are used
   3289 // for compound words.
   3290 // Puts the flags in "store_afflist[]".
   3291 static void get_compflags(afffile_T *affile, char *afflist, char *store_afflist)
   3292 {
   3293  int cnt = 0;
   3294  char key[AH_KEY_LEN];
   3295 
   3296  for (char *p = afflist; *p != NUL;) {
   3297    char *prevp = p;
   3298    if (get_affitem(affile->af_flagtype, &p) != 0) {
   3299      // A flag is a compound flag if it appears in "af_comp".
   3300      xmemcpyz(key, prevp, (size_t)(p - prevp));
   3301      hashitem_T *hi = hash_find(&affile->af_comp, key);
   3302      if (!HASHITEM_EMPTY(hi)) {
   3303        store_afflist[cnt++] = (char)(uint8_t)HI2CI(hi)->ci_newID;
   3304      }
   3305    }
   3306    if (affile->af_flagtype == AFT_NUM && *p == ',') {
   3307      p++;
   3308    }
   3309  }
   3310 
   3311  store_afflist[cnt] = NUL;
   3312 }
   3313 
   3314 /// Apply affixes to a word and store the resulting words.
   3315 /// "ht" is the hashtable with affentry_T that need to be applied, either
   3316 /// prefixes or suffixes.
   3317 /// "xht", when not NULL, is the prefix hashtable, to be used additionally on
   3318 /// the resulting words for combining affixes.
   3319 ///
   3320 /// @param spin  spell info
   3321 /// @param word  basic word start
   3322 /// @param afflist  list of names of supported affixes
   3323 /// @param condit  CONDIT_SUF et al.
   3324 /// @param flags  flags for the word
   3325 /// @param pfxlist  list of prefix IDs
   3326 /// @param pfxlen  nr of flags in "pfxlist" for prefixes, rest is compound flags
   3327 ///
   3328 /// @return  FAIL when out of memory.
   3329 static int store_aff_word(spellinfo_T *spin, char *word, char *afflist, afffile_T *affile,
   3330                          hashtab_T *ht, hashtab_T *xht, int condit, int flags, char *pfxlist,
   3331                          int pfxlen)
   3332 {
   3333  affentry_T *ae;
   3334  char newword[MAXWLEN];
   3335  int retval = OK;
   3336  int j;
   3337  char store_afflist[MAXWLEN];
   3338  char pfx_pfxlist[MAXWLEN];
   3339  size_t wordlen = strlen(word);
   3340 
   3341  int todo = (int)ht->ht_used;
   3342  for (hashitem_T *hi = ht->ht_array; todo > 0 && retval == OK; hi++) {
   3343    if (!HASHITEM_EMPTY(hi)) {
   3344      todo--;
   3345      affheader_T *ah = HI2AH(hi);
   3346 
   3347      // Check that the affix combines, if required, and that the word
   3348      // supports this affix.
   3349      if (((condit & CONDIT_COMB) == 0 || ah->ah_combine)
   3350          && flag_in_afflist(affile->af_flagtype, afflist,
   3351                             ah->ah_flag)) {
   3352        // Loop over all affix entries with this name.
   3353        for (ae = ah->ah_first; ae != NULL; ae = ae->ae_next) {
   3354          // Check the condition.  It's not logical to match case
   3355          // here, but it is required for compatibility with
   3356          // Myspell.
   3357          // Another requirement from Myspell is that the chop
   3358          // string is shorter than the word itself.
   3359          // For prefixes, when "PFXPOSTPONE" was used, only do
   3360          // prefixes with a chop string and/or flags.
   3361          // When a previously added affix had CIRCUMFIX this one
   3362          // must have it too, if it had not then this one must not
   3363          // have one either.
   3364          if ((xht != NULL || !affile->af_pfxpostpone
   3365               || ae->ae_chop != NULL
   3366               || ae->ae_flags != NULL)
   3367              && (ae->ae_chop == NULL
   3368                  || strlen(ae->ae_chop) < wordlen)
   3369              && (ae->ae_prog == NULL
   3370                  || vim_regexec_prog(&ae->ae_prog, false, word, 0))
   3371              && (((condit & CONDIT_CFIX) == 0)
   3372                  == ((condit & CONDIT_AFF) == 0
   3373                      || ae->ae_flags == NULL
   3374                      || !flag_in_afflist(affile->af_flagtype,
   3375                                          ae->ae_flags, affile->af_circumfix)))) {
   3376            // Match.  Remove the chop and add the affix.
   3377            if (xht == NULL) {
   3378              // prefix: chop/add at the start of the word
   3379              if (ae->ae_add == NULL) {
   3380                *newword = NUL;
   3381              } else {
   3382                xstrlcpy(newword, ae->ae_add, MAXWLEN);
   3383              }
   3384              char *p = word;
   3385              if (ae->ae_chop != NULL) {
   3386                // Skip chop string.
   3387                int i = mb_charlen(ae->ae_chop);
   3388                for (; i > 0; i--) {
   3389                  MB_PTR_ADV(p);
   3390                }
   3391              }
   3392              strcat(newword, p);
   3393            } else {
   3394              // suffix: chop/add at the end of the word
   3395              xstrlcpy(newword, word, MAXWLEN);
   3396              if (ae->ae_chop != NULL) {
   3397                // Remove chop string.
   3398                char *p = newword + strlen(newword);
   3399                int i = mb_charlen(ae->ae_chop);
   3400                for (; i > 0; i--) {
   3401                  MB_PTR_BACK(newword, p);
   3402                }
   3403                *p = NUL;
   3404              }
   3405              if (ae->ae_add != NULL) {
   3406                strcat(newword, ae->ae_add);
   3407              }
   3408            }
   3409 
   3410            int use_flags = flags;
   3411            char *use_pfxlist = pfxlist;
   3412            int use_pfxlen = pfxlen;
   3413            bool need_affix = false;
   3414            int use_condit = condit | CONDIT_COMB | CONDIT_AFF;
   3415            if (ae->ae_flags != NULL) {
   3416              // Extract flags from the affix list.
   3417              use_flags |= get_affix_flags(affile, ae->ae_flags);
   3418 
   3419              if (affile->af_needaffix != 0
   3420                  && flag_in_afflist(affile->af_flagtype, ae->ae_flags,
   3421                                     affile->af_needaffix)) {
   3422                need_affix = true;
   3423              }
   3424 
   3425              // When there is a CIRCUMFIX flag the other affix
   3426              // must also have it and we don't add the word
   3427              // with one affix.
   3428              if (affile->af_circumfix != 0
   3429                  && flag_in_afflist(affile->af_flagtype, ae->ae_flags,
   3430                                     affile->af_circumfix)) {
   3431                use_condit |= CONDIT_CFIX;
   3432                if ((condit & CONDIT_CFIX) == 0) {
   3433                  need_affix = true;
   3434                }
   3435              }
   3436 
   3437              if (affile->af_pfxpostpone
   3438                  || spin->si_compflags != NULL) {
   3439                if (affile->af_pfxpostpone) {
   3440                  // Get prefix IDS from the affix list.
   3441                  use_pfxlen = get_pfxlist(affile, ae->ae_flags, store_afflist);
   3442                } else {
   3443                  use_pfxlen = 0;
   3444                }
   3445                use_pfxlist = store_afflist;
   3446 
   3447                // Combine the prefix IDs. Avoid adding the
   3448                // same ID twice.
   3449                for (int i = 0; i < pfxlen; i++) {
   3450                  for (j = 0; j < use_pfxlen; j++) {
   3451                    if (pfxlist[i] == use_pfxlist[j]) {
   3452                      break;
   3453                    }
   3454                  }
   3455                  if (j == use_pfxlen) {
   3456                    use_pfxlist[use_pfxlen++] = pfxlist[i];
   3457                  }
   3458                }
   3459 
   3460                if (spin->si_compflags != NULL) {
   3461                  // Get compound IDS from the affix list.
   3462                  get_compflags(affile, ae->ae_flags,
   3463                                use_pfxlist + use_pfxlen);
   3464                } else {
   3465                  use_pfxlist[use_pfxlen] = NUL;
   3466                }
   3467 
   3468                // Combine the list of compound flags.
   3469                // Concatenate them to the prefix IDs list.
   3470                // Avoid adding the same ID twice.
   3471                for (int i = pfxlen; pfxlist[i] != NUL; i++) {
   3472                  for (j = use_pfxlen; use_pfxlist[j] != NUL; j++) {
   3473                    if (pfxlist[i] == use_pfxlist[j]) {
   3474                      break;
   3475                    }
   3476                  }
   3477                  if (use_pfxlist[j] == NUL) {
   3478                    use_pfxlist[j++] = pfxlist[i];
   3479                    use_pfxlist[j] = NUL;
   3480                  }
   3481                }
   3482              }
   3483            }
   3484 
   3485            // Obey a "COMPOUNDFORBIDFLAG" of the affix: don't
   3486            // use the compound flags.
   3487            if (use_pfxlist != NULL && ae->ae_compforbid) {
   3488              xmemcpyz(pfx_pfxlist, use_pfxlist, (size_t)use_pfxlen);
   3489              use_pfxlist = pfx_pfxlist;
   3490            }
   3491 
   3492            // When there are postponed prefixes...
   3493            if (spin->si_prefroot != NULL
   3494                && spin->si_prefroot->wn_sibling != NULL) {
   3495              // ... add a flag to indicate an affix was used.
   3496              use_flags |= WF_HAS_AFF;
   3497 
   3498              // ... don't use a prefix list if combining
   3499              // affixes is not allowed.  But do use the
   3500              // compound flags after them.
   3501              if (!ah->ah_combine && use_pfxlist != NULL) {
   3502                use_pfxlist += use_pfxlen;
   3503              }
   3504            }
   3505 
   3506            // When compounding is supported and there is no
   3507            // "COMPOUNDPERMITFLAG" then forbid compounding on the
   3508            // side where the affix is applied.
   3509            if (spin->si_compflags != NULL && !ae->ae_comppermit) {
   3510              if (xht != NULL) {
   3511                use_flags |= WF_NOCOMPAFT;
   3512              } else {
   3513                use_flags |= WF_NOCOMPBEF;
   3514              }
   3515            }
   3516 
   3517            // Store the modified word.
   3518            if (store_word(spin, newword, use_flags,
   3519                           spin->si_region, use_pfxlist,
   3520                           need_affix) == FAIL) {
   3521              retval = FAIL;
   3522            }
   3523 
   3524            // When added a prefix or a first suffix and the affix
   3525            // has flags may add a(nother) suffix.  RECURSIVE!
   3526            if ((condit & CONDIT_SUF) && ae->ae_flags != NULL) {
   3527              if (store_aff_word(spin, newword, ae->ae_flags,
   3528                                 affile, &affile->af_suff, xht,
   3529                                 use_condit & (xht == NULL
   3530                                               ? ~0 : ~CONDIT_SUF),
   3531                                 use_flags, use_pfxlist, pfxlen) == FAIL) {
   3532                retval = FAIL;
   3533              }
   3534            }
   3535 
   3536            // When added a suffix and combining is allowed also
   3537            // try adding a prefix additionally.  Both for the
   3538            // word flags and for the affix flags.  RECURSIVE!
   3539            if (xht != NULL && ah->ah_combine) {
   3540              if (store_aff_word(spin, newword,
   3541                                 afflist, affile,
   3542                                 xht, NULL, use_condit,
   3543                                 use_flags, use_pfxlist,
   3544                                 pfxlen) == FAIL
   3545                  || (ae->ae_flags != NULL
   3546                      && store_aff_word(spin, newword,
   3547                                        ae->ae_flags, affile,
   3548                                        xht, NULL, use_condit,
   3549                                        use_flags, use_pfxlist,
   3550                                        pfxlen) == FAIL)) {
   3551                retval = FAIL;
   3552              }
   3553            }
   3554          }
   3555        }
   3556      }
   3557    }
   3558  }
   3559 
   3560  return retval;
   3561 }
   3562 
   3563 // Read a file with a list of words.
   3564 static int spell_read_wordfile(spellinfo_T *spin, char *fname)
   3565 {
   3566  linenr_T lnum = 0;
   3567  char rline[MAXLINELEN];
   3568  char *line;
   3569  char *pc = NULL;
   3570  int retval = OK;
   3571  bool did_word = false;
   3572  int non_ascii = 0;
   3573 
   3574  // Open the file.
   3575  FILE *fd = os_fopen(fname, "r");
   3576  if (fd == NULL) {
   3577    semsg(_(e_notopen), fname);
   3578    return FAIL;
   3579  }
   3580 
   3581  vim_snprintf(IObuff, IOSIZE, _("Reading word file %s..."), fname);
   3582  spell_message(spin, IObuff);
   3583 
   3584  // Read all the lines in the file one by one.
   3585  while (!vim_fgets(rline, MAXLINELEN, fd) && !got_int) {
   3586    line_breakcheck();
   3587    lnum++;
   3588 
   3589    // Skip comment lines.
   3590    if (*rline == '#') {
   3591      continue;
   3592    }
   3593 
   3594    // Remove CR, LF and white space from the end.
   3595    int l = (int)strlen(rline);
   3596    while (l > 0 && (uint8_t)rline[l - 1] <= ' ') {
   3597      l--;
   3598    }
   3599    if (l == 0) {
   3600      continue;         // empty or blank line
   3601    }
   3602    rline[l] = NUL;
   3603 
   3604    // Convert from "/encoding={encoding}" to 'encoding' when needed.
   3605    xfree(pc);
   3606    if (spin->si_conv.vc_type != CONV_NONE) {
   3607      pc = string_convert(&spin->si_conv, rline, NULL);
   3608      if (pc == NULL) {
   3609        smsg(0, _("Conversion failure for word in %s line %" PRIdLINENR ": %s"),
   3610             fname, lnum, rline);
   3611        continue;
   3612      }
   3613      line = pc;
   3614    } else {
   3615      pc = NULL;
   3616      line = rline;
   3617    }
   3618 
   3619    if (*line == '/') {
   3620      line++;
   3621      if (strncmp(line, "encoding=", 9) == 0) {
   3622        if (spin->si_conv.vc_type != CONV_NONE) {
   3623          smsg(0, _("Duplicate /encoding= line ignored in %s line %" PRIdLINENR ": %s"),
   3624               fname, lnum, line - 1);
   3625        } else if (did_word) {
   3626          smsg(0, _("/encoding= line after word ignored in %s line %" PRIdLINENR ": %s"),
   3627               fname, lnum, line - 1);
   3628        } else {
   3629          // Setup for conversion to 'encoding'.
   3630          line += 9;
   3631          char *enc = enc_canonize(line);
   3632          if (!spin->si_ascii
   3633              && convert_setup(&spin->si_conv, enc, p_enc) == FAIL) {
   3634            smsg(0, _("Conversion in %s not supported: from %s to %s"),
   3635                 fname, line, p_enc);
   3636          }
   3637          xfree(enc);
   3638          spin->si_conv.vc_fail = true;
   3639        }
   3640        continue;
   3641      }
   3642 
   3643      if (strncmp(line, "regions=", 8) == 0) {
   3644        if (spin->si_region_count > 1) {
   3645          smsg(0, _("Duplicate /regions= line ignored in %s line %" PRIdLINENR ": %s"),
   3646               fname, lnum, line);
   3647        } else {
   3648          line += 8;
   3649          if (strlen(line) > MAXREGIONS * 2) {
   3650            smsg(0, _("Too many regions in %s line %" PRIdLINENR ": %s"),
   3651                 fname, lnum, line);
   3652          } else {
   3653            spin->si_region_count = (int)strlen(line) / 2;
   3654            STRCPY(spin->si_region_name, line);
   3655 
   3656            // Adjust the mask for a word valid in all regions.
   3657            spin->si_region = (1 << spin->si_region_count) - 1;
   3658          }
   3659        }
   3660        continue;
   3661      }
   3662 
   3663      smsg(0, _("/ line ignored in %s line %" PRIdLINENR ": %s"),
   3664           fname, lnum, line - 1);
   3665      continue;
   3666    }
   3667 
   3668    int flags = 0;
   3669    int regionmask = spin->si_region;
   3670 
   3671    // Check for flags and region after a slash.
   3672    char *p = vim_strchr(line, '/');
   3673    if (p != NULL) {
   3674      *p++ = NUL;
   3675      while (*p != NUL) {
   3676        if (*p == '=') {                // keep-case word
   3677          flags |= WF_KEEPCAP | WF_FIXCAP;
   3678        } else if (*p == '!') {                  // Bad, bad, wicked word.
   3679          flags |= WF_BANNED;
   3680        } else if (*p == '?') {                  // Rare word.
   3681          flags |= WF_RARE;
   3682        } else if (ascii_isdigit((uint8_t)(*p))) {              // region number(s)
   3683          if ((flags & WF_REGION) == 0) {           // first one
   3684            regionmask = 0;
   3685          }
   3686          flags |= WF_REGION;
   3687 
   3688          l = (uint8_t)(*p) - '0';
   3689          if (l == 0 || l > spin->si_region_count) {
   3690            smsg(0, _("Invalid region nr in %s line %" PRIdLINENR ": %s"),
   3691                 fname, lnum, p);
   3692            break;
   3693          }
   3694          regionmask |= 1 << (l - 1);
   3695        } else {
   3696          smsg(0, _("Unrecognized flags in %s line %" PRIdLINENR ": %s"),
   3697               fname, lnum, p);
   3698          break;
   3699        }
   3700        p++;
   3701      }
   3702    }
   3703 
   3704    // Skip non-ASCII words when "spin->si_ascii" is true.
   3705    if (spin->si_ascii && has_non_ascii(line)) {
   3706      non_ascii++;
   3707      continue;
   3708    }
   3709 
   3710    // Normal word: store it.
   3711    if (store_word(spin, line, flags, regionmask, NULL, false) == FAIL) {
   3712      retval = FAIL;
   3713      break;
   3714    }
   3715    did_word = true;
   3716  }
   3717 
   3718  xfree(pc);
   3719  fclose(fd);
   3720 
   3721  if (spin->si_ascii && non_ascii > 0) {
   3722    vim_snprintf(IObuff, IOSIZE,
   3723                 _("Ignored %d words with non-ASCII characters"), non_ascii);
   3724    spell_message(spin, IObuff);
   3725  }
   3726 
   3727  return retval;
   3728 }
   3729 
   3730 /// Get part of an sblock_T, "len" bytes long.
   3731 /// This avoids calling free() for every little struct we use (and keeping
   3732 /// track of them).
   3733 /// The memory is cleared to all zeros.
   3734 ///
   3735 /// @param len Length needed (<= SBLOCKSIZE).
   3736 /// @param align Align for pointer.
   3737 /// @return Pointer into block data.
   3738 static void *getroom(spellinfo_T *spin, size_t len, bool align)
   3739  FUNC_ATTR_NONNULL_RET
   3740 {
   3741  sblock_T *bl = spin->si_blocks;
   3742 
   3743  assert(len <= SBLOCKSIZE);
   3744 
   3745  if (align && bl != NULL) {
   3746    // Round size up for alignment.  On some systems structures need to be
   3747    // aligned to the size of a pointer (e.g., SPARC).
   3748    bl->sb_used = (int)(((size_t)bl->sb_used + sizeof(char *) - 1) & ~(sizeof(char *) - 1));
   3749  }
   3750 
   3751  if (bl == NULL || (size_t)bl->sb_used + len > SBLOCKSIZE) {
   3752    // Allocate a block of memory. It is not freed until much later.
   3753    bl = xcalloc(1, offsetof(sblock_T, sb_data) + SBLOCKSIZE + 1);
   3754    bl->sb_next = spin->si_blocks;
   3755    spin->si_blocks = bl;
   3756    bl->sb_used = 0;
   3757    spin->si_blocks_cnt++;
   3758  }
   3759 
   3760  char *p = bl->sb_data + bl->sb_used;
   3761  bl->sb_used += (int)len;
   3762 
   3763  return p;
   3764 }
   3765 
   3766 /// Make a copy of a string into memory allocated with getroom().
   3767 ///
   3768 /// @return  NULL when out of memory.
   3769 static char *getroom_save(spellinfo_T *spin, char *s)
   3770 {
   3771  const size_t s_size = strlen(s) + 1;
   3772  return memcpy(getroom(spin, s_size, false), s, s_size);
   3773 }
   3774 
   3775 // Free the list of allocated sblock_T.
   3776 static void free_blocks(sblock_T *bl)
   3777 {
   3778  while (bl != NULL) {
   3779    sblock_T *next = bl->sb_next;
   3780    xfree(bl);
   3781    bl = next;
   3782  }
   3783 }
   3784 
   3785 // Allocate the root of a word tree.
   3786 // Returns NULL when out of memory.
   3787 static wordnode_T *wordtree_alloc(spellinfo_T *spin)
   3788  FUNC_ATTR_NONNULL_RET
   3789 {
   3790  return (wordnode_T *)getroom(spin, sizeof(wordnode_T), true);
   3791 }
   3792 
   3793 /// Return true if "word" contains valid word characters.
   3794 /// Control characters and trailing '/' are invalid.  Space is OK.
   3795 static bool valid_spell_word(const char *word, const char *end)
   3796 {
   3797  if (!utf_valid_string(word, end)) {
   3798    return false;
   3799  }
   3800  for (const char *p = word; *p != NUL && p < end; p += utfc_ptr2len(p)) {
   3801    if ((uint8_t)(*p) < ' ' || (p[0] == '/' && p[1] == NUL)) {
   3802      return false;
   3803    }
   3804  }
   3805  return true;
   3806 }
   3807 
   3808 /// Store a word in the tree(s).
   3809 /// Always store it in the case-folded tree.  For a keep-case word this is
   3810 /// useful when the word can also be used with all caps (no WF_FIXCAP flag) and
   3811 /// used to find suggestions.
   3812 /// For a keep-case word also store it in the keep-case tree.
   3813 /// When "pfxlist" is not NULL store the word for each postponed prefix ID and
   3814 /// compound flag.
   3815 ///
   3816 /// @param flags  extra flags, wf_banned
   3817 /// @param region  supported region(s)
   3818 /// @param pfxlist  list of prefix ids or null
   3819 /// @param need_affix  only store word with affix id
   3820 static int store_word(spellinfo_T *spin, char *word, int flags, int region, const char *pfxlist,
   3821                      bool need_affix)
   3822 {
   3823  int len = (int)strlen(word);
   3824  int ct = captype(word, word + len);
   3825  char foldword[MAXWLEN];
   3826  int res = OK;
   3827 
   3828  // Avoid adding illegal bytes to the word tree.
   3829  if (!valid_spell_word(word, word + len)) {
   3830    return FAIL;
   3831  }
   3832 
   3833  spell_casefold(curwin, word, len, foldword, MAXWLEN);
   3834  for (const char *p = pfxlist; res == OK; p++) {
   3835    if (!need_affix || (p != NULL && *p != NUL)) {
   3836      res = tree_add_word(spin, foldword, spin->si_foldroot, ct | flags,
   3837                          region, p == NULL ? 0 : *p);
   3838    }
   3839    if (p == NULL || *p == NUL) {
   3840      break;
   3841    }
   3842  }
   3843  spin->si_foldwcount++;
   3844 
   3845  if (res == OK && (ct == WF_KEEPCAP || (flags & WF_KEEPCAP))) {
   3846    for (const char *p = pfxlist; res == OK; p++) {
   3847      if (!need_affix || (p != NULL && *p != NUL)) {
   3848        res = tree_add_word(spin, word, spin->si_keeproot, flags,
   3849                            region, p == NULL ? 0 : *p);
   3850      }
   3851      if (p == NULL || *p == NUL) {
   3852        break;
   3853      }
   3854    }
   3855    spin->si_keepwcount++;
   3856  }
   3857  return res;
   3858 }
   3859 
   3860 // Add word "word" to a word tree at "root".
   3861 // When "flags" < 0 we are adding to the prefix tree where "flags" is used for
   3862 // "rare" and "region" is the condition nr.
   3863 // Returns FAIL when out of memory.
   3864 static int tree_add_word(spellinfo_T *spin, const char *word, wordnode_T *root, int flags,
   3865                         int region, int affixID)
   3866 {
   3867  wordnode_T *node = root;
   3868  wordnode_T **prev = NULL;
   3869 
   3870  // Add each byte of the word to the tree, including the NUL at the end.
   3871  for (int i = 0;; i++) {
   3872    // When there is more than one reference to this node we need to make
   3873    // a copy, so that we can modify it.  Copy the whole list of siblings
   3874    // (we don't optimize for a partly shared list of siblings).
   3875    if (node != NULL && node->wn_refs > 1) {
   3876      node->wn_refs--;
   3877      wordnode_T **copyprev = prev;
   3878      for (wordnode_T *copyp = node; copyp != NULL; copyp = copyp->wn_sibling) {
   3879        // Allocate a new node and copy the info.
   3880        wordnode_T *np = get_wordnode(spin);
   3881        if (np == NULL) {
   3882          return FAIL;
   3883        }
   3884        np->wn_child = copyp->wn_child;
   3885        if (np->wn_child != NULL) {
   3886          np->wn_child->wn_refs++;              // child gets extra ref
   3887        }
   3888        np->wn_byte = copyp->wn_byte;
   3889        if (np->wn_byte == NUL) {
   3890          np->wn_flags = copyp->wn_flags;
   3891          np->wn_region = copyp->wn_region;
   3892          np->wn_affixID = copyp->wn_affixID;
   3893        }
   3894 
   3895        // Link the new node in the list, there will be one ref.
   3896        np->wn_refs = 1;
   3897        if (copyprev != NULL) {
   3898          *copyprev = np;
   3899        }
   3900        copyprev = &np->wn_sibling;
   3901 
   3902        // Let "node" point to the head of the copied list.
   3903        if (copyp == node) {
   3904          node = np;
   3905        }
   3906      }
   3907    }
   3908 
   3909    // Look for the sibling that has the same character.  They are sorted
   3910    // on byte value, thus stop searching when a sibling is found with a
   3911    // higher byte value.  For zero bytes (end of word) the sorting is
   3912    // done on flags and then on affixID.
   3913    while (node != NULL
   3914           && (node->wn_byte < (uint8_t)word[i]
   3915               || (node->wn_byte == NUL
   3916                   && (flags < 0
   3917                       ? node->wn_affixID < (unsigned)affixID
   3918                       : (node->wn_flags < (unsigned)(flags & WN_MASK)
   3919                          || (node->wn_flags == (flags & WN_MASK)
   3920                              && (spin->si_sugtree
   3921                                  ? (node->wn_region & 0xffff) < region
   3922                                  : node->wn_affixID
   3923                                  < (unsigned)affixID))))))) {
   3924      prev = &node->wn_sibling;
   3925      node = *prev;
   3926    }
   3927    if (node == NULL
   3928        || node->wn_byte != (uint8_t)word[i]
   3929        || (word[i] == NUL
   3930            && (flags < 0
   3931                || spin->si_sugtree
   3932                || node->wn_flags != (flags & WN_MASK)
   3933                || node->wn_affixID != affixID))) {
   3934      // Allocate a new node.
   3935      wordnode_T *np = get_wordnode(spin);
   3936      if (np == NULL) {
   3937        return FAIL;
   3938      }
   3939      np->wn_byte = (uint8_t)word[i];
   3940 
   3941      // If "node" is NULL this is a new child or the end of the sibling
   3942      // list: ref count is one.  Otherwise use ref count of sibling and
   3943      // make ref count of sibling one (matters when inserting in front
   3944      // of the list of siblings).
   3945      if (node == NULL) {
   3946        np->wn_refs = 1;
   3947      } else {
   3948        np->wn_refs = node->wn_refs;
   3949        node->wn_refs = 1;
   3950      }
   3951      if (prev != NULL) {
   3952        *prev = np;
   3953      }
   3954      np->wn_sibling = node;
   3955      node = np;
   3956    }
   3957 
   3958    if (word[i] == NUL) {
   3959      node->wn_flags = (uint16_t)flags;
   3960      node->wn_region |= (int16_t)region;
   3961      node->wn_affixID = (uint8_t)affixID;
   3962      break;
   3963    }
   3964    prev = &node->wn_child;
   3965    node = *prev;
   3966  }
   3967 #ifdef SPELL_PRINTTREE
   3968  smsg(0, "Added \"%s\"", word);
   3969  spell_print_tree(root->wn_sibling);
   3970 #endif
   3971 
   3972  // count nr of words added since last message
   3973  spin->si_msg_count++;
   3974 
   3975  if (spin->si_compress_cnt > 1) {
   3976    if (--spin->si_compress_cnt == 1) {
   3977      // Did enough words to lower the block count limit.
   3978      spin->si_blocks_cnt += compress_inc;
   3979    }
   3980  }
   3981 
   3982  // When we have allocated lots of memory we need to compress the word tree
   3983  // to free up some room.  But compression is slow, and we might actually
   3984  // need that room, thus only compress in the following situations:
   3985  // 1. When not compressed before (si_compress_cnt == 0): when using
   3986  //    "compress_start" blocks.
   3987  // 2. When compressed before and used "compress_inc" blocks before
   3988  //    adding "compress_added" words (si_compress_cnt > 1).
   3989  // 3. When compressed before, added "compress_added" words
   3990  //    (si_compress_cnt == 1) and the number of free nodes drops below the
   3991  //    maximum word length.
   3992 #ifndef SPELL_COMPRESS_ALWAYS
   3993  if (spin->si_compress_cnt == 1
   3994      ? spin->si_free_count < MAXWLEN
   3995      : spin->si_blocks_cnt >= compress_start)
   3996 #endif
   3997  {
   3998    // Decrement the block counter.  The effect is that we compress again
   3999    // when the freed up room has been used and another "compress_inc"
   4000    // blocks have been allocated.  Unless "compress_added" words have
   4001    // been added, then the limit is put back again.
   4002    spin->si_blocks_cnt -= compress_inc;
   4003    spin->si_compress_cnt = compress_added;
   4004 
   4005    if (spin->si_verbose) {
   4006      msg_start();
   4007      msg_puts(_(msg_compressing));
   4008      msg_clr_eos();
   4009      msg_didout = false;
   4010      msg_col = 0;
   4011      ui_flush();
   4012    }
   4013 
   4014    // Compress both trees.  Either they both have many nodes, which makes
   4015    // compression useful, or one of them is small, which means
   4016    // compression goes fast.  But when filling the soundfold word tree
   4017    // there is no keep-case tree.
   4018    wordtree_compress(spin, spin->si_foldroot, "case-folded");
   4019    if (affixID >= 0) {
   4020      wordtree_compress(spin, spin->si_keeproot, "keep-case");
   4021    }
   4022  }
   4023 
   4024  return OK;
   4025 }
   4026 
   4027 // Get a wordnode_T, either from the list of previously freed nodes or
   4028 // allocate a new one.
   4029 // Returns NULL when out of memory.
   4030 static wordnode_T *get_wordnode(spellinfo_T *spin)
   4031 {
   4032  wordnode_T *n;
   4033 
   4034  if (spin->si_first_free == NULL) {
   4035    n = (wordnode_T *)getroom(spin, sizeof(wordnode_T), true);
   4036  } else {
   4037    n = spin->si_first_free;
   4038    spin->si_first_free = n->wn_child;
   4039    CLEAR_POINTER(n);
   4040    spin->si_free_count--;
   4041  }
   4042 #ifdef SPELL_PRINTTREE
   4043  if (n != NULL) {
   4044    n->wn_nr = ++spin->si_wordnode_nr;
   4045  }
   4046 #endif
   4047  return n;
   4048 }
   4049 
   4050 // Decrement the reference count on a node (which is the head of a list of
   4051 // siblings).  If the reference count becomes zero free the node and its
   4052 // siblings.
   4053 // Returns the number of nodes actually freed.
   4054 static int deref_wordnode(spellinfo_T *spin, wordnode_T *node)
   4055  FUNC_ATTR_NONNULL_ALL
   4056 {
   4057  int cnt = 0;
   4058 
   4059  if (--node->wn_refs == 0) {
   4060    for (wordnode_T *np = node; np != NULL; np = np->wn_sibling) {
   4061      if (np->wn_child != NULL) {
   4062        cnt += deref_wordnode(spin, np->wn_child);
   4063      }
   4064      free_wordnode(spin, np);
   4065      cnt++;
   4066    }
   4067    cnt++;          // length field
   4068  }
   4069  return cnt;
   4070 }
   4071 
   4072 // Free a wordnode_T for re-use later.
   4073 // Only the "wn_child" field becomes invalid.
   4074 static void free_wordnode(spellinfo_T *spin, wordnode_T *n)
   4075  FUNC_ATTR_NONNULL_ALL
   4076 {
   4077  n->wn_child = spin->si_first_free;
   4078  spin->si_first_free = n;
   4079  spin->si_free_count++;
   4080 }
   4081 
   4082 // Compress a tree: find tails that are identical and can be shared.
   4083 static void wordtree_compress(spellinfo_T *spin, wordnode_T *root, const char *name)
   4084  FUNC_ATTR_NONNULL_ALL
   4085 {
   4086  hashtab_T ht;
   4087  int tot = 0;
   4088  long perc;
   4089 
   4090  // Skip the root itself, it's not actually used.  The first sibling is the
   4091  // start of the tree.
   4092  if (root->wn_sibling == NULL) {
   4093    return;
   4094  }
   4095 
   4096  hash_init(&ht);
   4097  const int n = node_compress(spin, root->wn_sibling, &ht, &tot);
   4098 
   4099 #ifndef SPELL_PRINTTREE
   4100  if (spin->si_verbose || p_verbose > 2)
   4101 #endif
   4102  {
   4103    if (tot > 1000000) {
   4104      perc = (tot - n) / (tot / 100);
   4105    } else if (tot == 0) {
   4106      perc = 0;
   4107    } else {
   4108      perc = (tot - n) * 100 / tot;
   4109    }
   4110    vim_snprintf(IObuff, IOSIZE,
   4111                 _("Compressed %s: %d of %d nodes; %d (%ld%%) remaining"),
   4112                 name, n, tot, tot - n, perc);
   4113    spell_message(spin, IObuff);
   4114  }
   4115 #ifdef SPELL_PRINTTREE
   4116  spell_print_tree(root->wn_sibling);
   4117 #endif
   4118  hash_clear(&ht);
   4119 }
   4120 
   4121 /// Compress a node, its siblings and its children, depth first.
   4122 /// Returns the number of compressed nodes.
   4123 ///
   4124 /// @param tot  total count of nodes before compressing, incremented while going through the tree
   4125 static int node_compress(spellinfo_T *spin, wordnode_T *node, hashtab_T *ht, int *tot)
   4126  FUNC_ATTR_NONNULL_ALL
   4127 {
   4128  wordnode_T *tp;
   4129  wordnode_T *child;
   4130  int len = 0;
   4131  unsigned n;
   4132  int compressed = 0;
   4133 
   4134  // Go through the list of siblings.  Compress each child and then try
   4135  // finding an identical child to replace it.
   4136  // Note that with "child" we mean not just the node that is pointed to,
   4137  // but the whole list of siblings of which the child node is the first.
   4138  for (wordnode_T *np = node; np != NULL && !got_int; np = np->wn_sibling) {
   4139    len++;
   4140    if ((child = np->wn_child) != NULL) {
   4141      // Compress the child first.  This fills hashkey.
   4142      compressed += node_compress(spin, child, ht, tot);
   4143 
   4144      // Try to find an identical child.
   4145      hash_T hash = hash_hash((char *)child->wn_u1.hashkey);
   4146      hashitem_T *hi = hash_lookup(ht, (const char *)child->wn_u1.hashkey,
   4147                                   strlen((char *)child->wn_u1.hashkey), hash);
   4148      if (!HASHITEM_EMPTY(hi)) {
   4149        // There are children we encountered before with a hash value
   4150        // identical to the current child.  Now check if there is one
   4151        // that is really identical.
   4152        for (tp = HI2WN(hi); tp != NULL; tp = tp->wn_u2.next) {
   4153          if (node_equal(child, tp)) {
   4154            // Found one!  Now use that child in place of the
   4155            // current one.  This means the current child and all
   4156            // its siblings is unlinked from the tree.
   4157            tp->wn_refs++;
   4158            compressed += deref_wordnode(spin, child);
   4159            np->wn_child = tp;
   4160            break;
   4161          }
   4162        }
   4163        if (tp == NULL) {
   4164          // No other child with this hash value equals the child of
   4165          // the node, add it to the linked list after the first
   4166          // item.
   4167          tp = HI2WN(hi);
   4168          child->wn_u2.next = tp->wn_u2.next;
   4169          tp->wn_u2.next = child;
   4170        }
   4171      } else {
   4172        // No other child has this hash value, add it to the
   4173        // hashtable.
   4174        hash_add_item(ht, hi, (char *)child->wn_u1.hashkey, hash);
   4175      }
   4176    }
   4177  }
   4178  *tot += len + 1;      // add one for the node that stores the length
   4179 
   4180  // Make a hash key for the node and its siblings, so that we can quickly
   4181  // find a lookalike node.  This must be done after compressing the sibling
   4182  // list, otherwise the hash key would become invalid by the compression.
   4183  node->wn_u1.hashkey[0] = (uint8_t)len;
   4184  unsigned nr = 0;
   4185  for (wordnode_T *np = node; np != NULL; np = np->wn_sibling) {
   4186    if (np->wn_byte == NUL) {
   4187      // end node: use wn_flags, wn_region and wn_affixID
   4188      n = (unsigned)(np->wn_flags + (np->wn_region << 8) + (np->wn_affixID << 16));
   4189    } else {
   4190      // byte node: use the byte value and the child pointer
   4191      n = (unsigned)(np->wn_byte + ((uintptr_t)np->wn_child << 8));
   4192    }
   4193    nr = nr * 101 + n;
   4194  }
   4195 
   4196  // Avoid NUL bytes, it terminates the hash key.
   4197  n = nr & 0xff;
   4198  node->wn_u1.hashkey[1] = n == 0 ? 1 : (uint8_t)n;
   4199  n = (nr >> 8) & 0xff;
   4200  node->wn_u1.hashkey[2] = n == 0 ? 1 : (uint8_t)n;
   4201  n = (nr >> 16) & 0xff;
   4202  node->wn_u1.hashkey[3] = n == 0 ? 1 : (uint8_t)n;
   4203  n = (nr >> 24) & 0xff;
   4204  node->wn_u1.hashkey[4] = n == 0 ? 1 : (uint8_t)n;
   4205  node->wn_u1.hashkey[5] = NUL;
   4206 
   4207  // Check for CTRL-C pressed now and then.
   4208  veryfast_breakcheck();
   4209 
   4210  return compressed;
   4211 }
   4212 
   4213 // Returns true when two nodes have identical siblings and children.
   4214 static bool node_equal(wordnode_T *n1, wordnode_T *n2)
   4215 {
   4216  wordnode_T *p1;
   4217  wordnode_T *p2;
   4218 
   4219  for (p1 = n1, p2 = n2; p1 != NULL && p2 != NULL;
   4220       p1 = p1->wn_sibling, p2 = p2->wn_sibling) {
   4221    if (p1->wn_byte != p2->wn_byte
   4222        || (p1->wn_byte == NUL
   4223            ? (p1->wn_flags != p2->wn_flags
   4224               || p1->wn_region != p2->wn_region
   4225               || p1->wn_affixID != p2->wn_affixID)
   4226            : (p1->wn_child != p2->wn_child))) {
   4227      break;
   4228    }
   4229  }
   4230 
   4231  return p1 == NULL && p2 == NULL;
   4232 }
   4233 
   4234 /// Function given to qsort() to sort the REP items on "from" string.
   4235 static int rep_compare(const void *s1, const void *s2)
   4236 {
   4237  fromto_T *p1 = (fromto_T *)s1;
   4238  fromto_T *p2 = (fromto_T *)s2;
   4239 
   4240  return strcmp(p1->ft_from, p2->ft_from);
   4241 }
   4242 
   4243 /// Write the Vim .spl file "fname".
   4244 ///
   4245 /// @return  OK/FAIL.
   4246 static int write_vim_spell(spellinfo_T *spin, char *fname)
   4247 {
   4248  int retval = OK;
   4249  int regionmask;
   4250 
   4251  FILE *fd = os_fopen(fname, "w");
   4252  if (fd == NULL) {
   4253    semsg(_(e_notopen), fname);
   4254    return FAIL;
   4255  }
   4256 
   4257  // <HEADER>: <fileID> <versionnr>
   4258  // <fileID>
   4259  size_t fwv = fwrite(VIMSPELLMAGIC, VIMSPELLMAGICL, 1, fd);
   4260  if (fwv != 1) {
   4261    // Catch first write error, don't try writing more.
   4262    goto theend;
   4263  }
   4264 
   4265  putc(VIMSPELLVERSION, fd);                                // <versionnr>
   4266 
   4267  // <SECTIONS>: <section> ... <sectionend>
   4268 
   4269  // SN_INFO: <infotext>
   4270  if (spin->si_info != NULL) {
   4271    putc(SN_INFO, fd);                                  // <sectionID>
   4272    putc(0, fd);                                        // <sectionflags>
   4273    size_t i = strlen(spin->si_info);
   4274    put_bytes(fd, i, 4);                                // <sectionlen>
   4275    fwv &= fwrite(spin->si_info, i, 1, fd);             // <infotext>
   4276  }
   4277 
   4278  // SN_REGION: <regionname> ...
   4279  // Write the region names only if there is more than one.
   4280  if (spin->si_region_count > 1) {
   4281    putc(SN_REGION, fd);                                // <sectionID>
   4282    putc(SNF_REQUIRED, fd);                             // <sectionflags>
   4283    size_t l = (size_t)spin->si_region_count * 2;
   4284    put_bytes(fd, l, 4);                                // <sectionlen>
   4285    fwv &= fwrite(spin->si_region_name, l, 1, fd);
   4286    // <regionname> ...
   4287    regionmask = (1 << spin->si_region_count) - 1;
   4288  } else {
   4289    regionmask = 0;
   4290  }
   4291 
   4292  // SN_CHARFLAGS: <charflagslen> <charflags> <folcharslen> <folchars>
   4293  //
   4294  // The table with character flags and the table for case folding.
   4295  // This makes sure the same characters are recognized as word characters
   4296  // when generating and when using a spell file.
   4297  // Skip this for ASCII, the table may conflict with the one used for
   4298  // 'encoding'.
   4299  // Also skip this for an .add.spl file, the main spell file must contain
   4300  // the table (avoids that it conflicts).  File is shorter too.
   4301  if (!spin->si_ascii && !spin->si_add) {
   4302    char folchars[128 * 8];
   4303 
   4304    putc(SN_CHARFLAGS, fd);                             // <sectionID>
   4305    putc(SNF_REQUIRED, fd);                             // <sectionflags>
   4306 
   4307    // Form the <folchars> string first, we need to know its length.
   4308    size_t l = 0;
   4309    for (size_t i = 128; i < 256; i++) {
   4310      l += (size_t)utf_char2bytes(spelltab.st_fold[i], folchars + l);
   4311    }
   4312    put_bytes(fd, 1 + 128 + 2 + l, 4);                  // <sectionlen>
   4313 
   4314    fputc(128, fd);                                     // <charflagslen>
   4315    for (size_t i = 128; i < 256; i++) {
   4316      int flags = 0;
   4317      if (spelltab.st_isw[i]) {
   4318        flags |= CF_WORD;
   4319      }
   4320      if (spelltab.st_isu[i]) {
   4321        flags |= CF_UPPER;
   4322      }
   4323      fputc(flags, fd);                                 // <charflags>
   4324    }
   4325 
   4326    put_bytes(fd, l, 2);                                // <folcharslen>
   4327    fwv &= fwrite(folchars, l, 1, fd);                  // <folchars>
   4328  }
   4329 
   4330  // SN_MIDWORD: <midword>
   4331  if (spin->si_midword != NULL) {
   4332    putc(SN_MIDWORD, fd);                               // <sectionID>
   4333    putc(SNF_REQUIRED, fd);                             // <sectionflags>
   4334 
   4335    size_t i = strlen(spin->si_midword);
   4336    put_bytes(fd, i, 4);                                // <sectionlen>
   4337    fwv &= fwrite(spin->si_midword, i, 1, fd);
   4338    // <midword>
   4339  }
   4340 
   4341  // SN_PREFCOND: <prefcondcnt> <prefcond> ...
   4342  if (!GA_EMPTY(&spin->si_prefcond)) {
   4343    putc(SN_PREFCOND, fd);                              // <sectionID>
   4344    putc(SNF_REQUIRED, fd);                             // <sectionflags>
   4345 
   4346    size_t l = (size_t)write_spell_prefcond(NULL, &spin->si_prefcond, &fwv);
   4347    put_bytes(fd, l, 4);                                // <sectionlen>
   4348 
   4349    write_spell_prefcond(fd, &spin->si_prefcond, &fwv);
   4350  }
   4351 
   4352  // SN_REP: <repcount> <rep> ...
   4353  // SN_SAL: <salflags> <salcount> <sal> ...
   4354  // SN_REPSAL: <repcount> <rep> ...
   4355 
   4356  // round 1: SN_REP section
   4357  // round 2: SN_SAL section (unless SN_SOFO is used)
   4358  // round 3: SN_REPSAL section
   4359  for (unsigned round = 1; round <= 3; round++) {
   4360    garray_T *gap;
   4361    if (round == 1) {
   4362      gap = &spin->si_rep;
   4363    } else if (round == 2) {
   4364      // Don't write SN_SAL when using a SN_SOFO section
   4365      if (spin->si_sofofr != NULL && spin->si_sofoto != NULL) {
   4366        continue;
   4367      }
   4368      gap = &spin->si_sal;
   4369    } else {
   4370      gap = &spin->si_repsal;
   4371    }
   4372 
   4373    // Don't write the section if there are no items.
   4374    if (GA_EMPTY(gap)) {
   4375      continue;
   4376    }
   4377 
   4378    // Sort the REP/REPSAL items.
   4379    if (round != 2) {
   4380      qsort(gap->ga_data, (size_t)gap->ga_len,
   4381            sizeof(fromto_T), rep_compare);
   4382    }
   4383 
   4384    int sect_id = round == 1 ? SN_REP : (round == 2 ? SN_SAL : SN_REPSAL);
   4385    putc(sect_id, fd);                                  // <sectionID>
   4386 
   4387    // This is for making suggestions, section is not required.
   4388    putc(0, fd);                                        // <sectionflags>
   4389 
   4390    // Compute the length of what follows.
   4391    size_t l = 2;  // count <repcount> or <salcount>
   4392    assert(gap->ga_len >= 0);
   4393    for (size_t i = 0; i < (size_t)gap->ga_len; i++) {
   4394      fromto_T *ftp = &((fromto_T *)gap->ga_data)[i];
   4395      l += 1 + strlen(ftp->ft_from);  // count <*fromlen> and <*from>
   4396      l += 1 + strlen(ftp->ft_to);    // count <*tolen> and <*to>
   4397    }
   4398    if (round == 2) {
   4399      l++;                            // count <salflags>
   4400    }
   4401    put_bytes(fd, l, 4);                                // <sectionlen>
   4402 
   4403    if (round == 2) {
   4404      int i = 0;
   4405      if (spin->si_followup) {
   4406        i |= SAL_F0LLOWUP;
   4407      }
   4408      if (spin->si_collapse) {
   4409        i |= SAL_COLLAPSE;
   4410      }
   4411      if (spin->si_rem_accents) {
   4412        i |= SAL_REM_ACCENTS;
   4413      }
   4414      putc(i, fd);                                      // <salflags>
   4415    }
   4416 
   4417    put_bytes(fd, (uintmax_t)gap->ga_len, 2);    // <repcount> or <salcount>
   4418    for (size_t i = 0; i < (size_t)gap->ga_len; i++) {
   4419      // <rep> : <repfromlen> <repfrom> <reptolen> <repto>
   4420      // <sal> : <salfromlen> <salfrom> <saltolen> <salto>
   4421      fromto_T *ftp = &((fromto_T *)gap->ga_data)[i];
   4422      for (unsigned rr = 1; rr <= 2; rr++) {
   4423        char *p = rr == 1 ? ftp->ft_from : ftp->ft_to;
   4424        l = strlen(p);
   4425        assert(l < INT_MAX);
   4426        putc((int)l, fd);
   4427        if (l > 0) {
   4428          fwv &= fwrite(p, l, 1, fd);
   4429        }
   4430      }
   4431    }
   4432  }
   4433 
   4434  // SN_SOFO: <sofofromlen> <sofofrom> <sofotolen> <sofoto>
   4435  // This is for making suggestions, section is not required.
   4436  if (spin->si_sofofr != NULL && spin->si_sofoto != NULL) {
   4437    putc(SN_SOFO, fd);                                  // <sectionID>
   4438    putc(0, fd);                                        // <sectionflags>
   4439 
   4440    size_t l = strlen(spin->si_sofofr);
   4441    put_bytes(fd, l + strlen(spin->si_sofoto) + 4, 4);  // <sectionlen>
   4442 
   4443    put_bytes(fd, l, 2);                                // <sofofromlen>
   4444    fwv &= fwrite(spin->si_sofofr, l, 1, fd);           // <sofofrom>
   4445 
   4446    l = strlen(spin->si_sofoto);
   4447    put_bytes(fd, l, 2);                                // <sofotolen>
   4448    fwv &= fwrite(spin->si_sofoto, l, 1, fd);           // <sofoto>
   4449  }
   4450 
   4451  // SN_WORDS: <word> ...
   4452  // This is for making suggestions, section is not required.
   4453  if (spin->si_commonwords.ht_used > 0) {
   4454    putc(SN_WORDS, fd);                                 // <sectionID>
   4455    putc(0, fd);                                        // <sectionflags>
   4456 
   4457    // round 1: count the bytes
   4458    // round 2: write the bytes
   4459    for (unsigned round = 1; round <= 2; round++) {
   4460      size_t todo;
   4461      size_t len = 0;
   4462      hashitem_T *hi;
   4463 
   4464      todo = spin->si_commonwords.ht_used;
   4465      for (hi = spin->si_commonwords.ht_array; todo > 0; hi++) {
   4466        if (!HASHITEM_EMPTY(hi)) {
   4467          size_t l = strlen(hi->hi_key) + 1;
   4468          len += l;
   4469          if (round == 2) {                             // <word>
   4470            fwv &= fwrite(hi->hi_key, l, 1, fd);
   4471          }
   4472          todo--;
   4473        }
   4474      }
   4475      if (round == 1) {
   4476        put_bytes(fd, len, 4);                          // <sectionlen>
   4477      }
   4478    }
   4479  }
   4480 
   4481  // SN_MAP: <mapstr>
   4482  // This is for making suggestions, section is not required.
   4483  if (!GA_EMPTY(&spin->si_map)) {
   4484    putc(SN_MAP, fd);                                   // <sectionID>
   4485    putc(0, fd);                                        // <sectionflags>
   4486    size_t l = (size_t)spin->si_map.ga_len;
   4487    put_bytes(fd, l, 4);                                // <sectionlen>
   4488    fwv &= fwrite(spin->si_map.ga_data, l, 1, fd);      // <mapstr>
   4489  }
   4490 
   4491  // SN_SUGFILE: <timestamp>
   4492  // This is used to notify that a .sug file may be available and at the
   4493  // same time allows for checking that a .sug file that is found matches
   4494  // with this .spl file.  That's because the word numbers must be exactly
   4495  // right.
   4496  if (!spin->si_nosugfile
   4497      && (!GA_EMPTY(&spin->si_sal)
   4498          || (spin->si_sofofr != NULL && spin->si_sofoto != NULL))) {
   4499    putc(SN_SUGFILE, fd);                               // <sectionID>
   4500    putc(0, fd);                                        // <sectionflags>
   4501    put_bytes(fd, 8, 4);                                // <sectionlen>
   4502 
   4503    // Set si_sugtime and write it to the file.
   4504    spin->si_sugtime = time(NULL);
   4505    put_time(fd, spin->si_sugtime);                     // <timestamp>
   4506  }
   4507 
   4508  // SN_NOSPLITSUGS: nothing
   4509  // This is used to notify that no suggestions with word splits are to be
   4510  // made.
   4511  if (spin->si_nosplitsugs) {
   4512    putc(SN_NOSPLITSUGS, fd);                           // <sectionID>
   4513    putc(0, fd);                                        // <sectionflags>
   4514    put_bytes(fd, 0, 4);                                // <sectionlen>
   4515  }
   4516 
   4517  // SN_NOCOMPUNDSUGS: nothing
   4518  // This is used to notify that no suggestions with compounds are to be
   4519  // made.
   4520  if (spin->si_nocompoundsugs) {
   4521    putc(SN_NOCOMPOUNDSUGS, fd);                        // <sectionID>
   4522    putc(0, fd);                                        // <sectionflags>
   4523    put_bytes(fd, 0, 4);                                // <sectionlen>
   4524  }
   4525 
   4526  // SN_COMPOUND: compound info.
   4527  // We don't mark it required, when not supported all compound words will
   4528  // be bad words.
   4529  if (spin->si_compflags != NULL) {
   4530    putc(SN_COMPOUND, fd);                              // <sectionID>
   4531    putc(0, fd);                                        // <sectionflags>
   4532 
   4533    size_t l = strlen(spin->si_compflags);
   4534    assert(spin->si_comppat.ga_len >= 0);
   4535    for (size_t i = 0; i < (size_t)spin->si_comppat.ga_len; i++) {
   4536      l += strlen(((char **)(spin->si_comppat.ga_data))[i]) + 1;
   4537    }
   4538    put_bytes(fd, l + 7, 4);                            // <sectionlen>
   4539 
   4540    putc(spin->si_compmax, fd);                         // <compmax>
   4541    putc(spin->si_compminlen, fd);                      // <compminlen>
   4542    putc(spin->si_compsylmax, fd);                      // <compsylmax>
   4543    putc(0, fd);                // for Vim 7.0b compatibility
   4544    putc(spin->si_compoptions, fd);                     // <compoptions>
   4545    put_bytes(fd, (uintmax_t)spin->si_comppat.ga_len, 2);  // <comppatcount>
   4546    for (size_t i = 0; i < (size_t)spin->si_comppat.ga_len; i++) {
   4547      char *p = ((char **)(spin->si_comppat.ga_data))[i];
   4548      assert(strlen(p) < INT_MAX);
   4549      putc((int)strlen(p), fd);                         // <comppatlen>
   4550      fwv &= fwrite(p, strlen(p), 1, fd);               // <comppattext>
   4551    }
   4552    // <compflags>
   4553    fwv &= fwrite(spin->si_compflags, strlen(spin->si_compflags), 1, fd);
   4554  }
   4555 
   4556  // SN_NOBREAK: NOBREAK flag
   4557  if (spin->si_nobreak) {
   4558    putc(SN_NOBREAK, fd);                               // <sectionID>
   4559    putc(0, fd);                                        // <sectionflags>
   4560 
   4561    // It's empty, the presence of the section flags the feature.
   4562    put_bytes(fd, 0, 4);                                // <sectionlen>
   4563  }
   4564 
   4565  // SN_SYLLABLE: syllable info.
   4566  // We don't mark it required, when not supported syllables will not be
   4567  // counted.
   4568  if (spin->si_syllable != NULL) {
   4569    putc(SN_SYLLABLE, fd);                              // <sectionID>
   4570    putc(0, fd);                                        // <sectionflags>
   4571 
   4572    size_t l = strlen(spin->si_syllable);
   4573    put_bytes(fd, l, 4);                                // <sectionlen>
   4574    fwv &= fwrite(spin->si_syllable, l, 1, fd);         // <syllable>
   4575  }
   4576 
   4577  // end of <SECTIONS>
   4578  putc(SN_END, fd);                                     // <sectionend>
   4579 
   4580  // <LWORDTREE>  <KWORDTREE>  <PREFIXTREE>
   4581  spin->si_memtot = 0;
   4582  for (unsigned round = 1; round <= 3; round++) {
   4583    wordnode_T *tree;
   4584    if (round == 1) {
   4585      tree = spin->si_foldroot->wn_sibling;
   4586    } else if (round == 2) {
   4587      tree = spin->si_keeproot->wn_sibling;
   4588    } else {
   4589      tree = spin->si_prefroot->wn_sibling;
   4590    }
   4591 
   4592    // Clear the index and wnode fields in the tree.
   4593    clear_node(tree);
   4594 
   4595    // Count the number of nodes.  Needed to be able to allocate the
   4596    // memory when reading the nodes.  Also fills in index for shared
   4597    // nodes.
   4598    size_t nodecount = (size_t)put_node(NULL, tree, 0, regionmask, round == 3);
   4599 
   4600    // number of nodes in 4 bytes
   4601    put_bytes(fd, nodecount, 4);                        // <nodecount>
   4602    assert(nodecount + nodecount * sizeof(int) < INT_MAX);
   4603    spin->si_memtot += (int)(nodecount + nodecount * sizeof(int));
   4604 
   4605    // Write the nodes.
   4606    put_node(fd, tree, 0, regionmask, round == 3);
   4607  }
   4608 
   4609  // Write another byte to check for errors (file system full).
   4610  if (putc(0, fd) == EOF) {
   4611    retval = FAIL;
   4612  }
   4613 theend:
   4614  if (fclose(fd) == EOF) {
   4615    retval = FAIL;
   4616  }
   4617 
   4618  if (fwv != 1) {
   4619    retval = FAIL;
   4620  }
   4621  if (retval == FAIL) {
   4622    emsg(_(e_write));
   4623  }
   4624 
   4625  return retval;
   4626 }
   4627 
   4628 // Clear the index and wnode fields of "node", it siblings and its
   4629 // children.  This is needed because they are a union with other items to save
   4630 // space.
   4631 static void clear_node(wordnode_T *node)
   4632 {
   4633  if (node != NULL) {
   4634    for (wordnode_T *np = node; np != NULL; np = np->wn_sibling) {
   4635      np->wn_u1.index = 0;
   4636      np->wn_u2.wnode = NULL;
   4637 
   4638      if (np->wn_byte != NUL) {
   4639        clear_node(np->wn_child);
   4640      }
   4641    }
   4642  }
   4643 }
   4644 
   4645 /// Dump a word tree at node "node".
   4646 ///
   4647 /// This first writes the list of possible bytes (siblings).  Then for each
   4648 /// byte recursively write the children.
   4649 ///
   4650 /// NOTE: The code here must match the code in read_tree_node(), since
   4651 /// assumptions are made about the indexes (so that we don't have to write them
   4652 /// in the file).
   4653 ///
   4654 /// @param fd  NULL when only counting
   4655 /// @param prefixtree  true for PREFIXTREE
   4656 ///
   4657 /// @return  the number of nodes used.
   4658 static int put_node(FILE *fd, wordnode_T *node, int idx, int regionmask, bool prefixtree)
   4659 {
   4660  // If "node" is zero the tree is empty.
   4661  if (node == NULL) {
   4662    return 0;
   4663  }
   4664 
   4665  // Store the index where this node is written.
   4666  node->wn_u1.index = idx;
   4667 
   4668  // Count the number of siblings.
   4669  int siblingcount = 0;
   4670  for (wordnode_T *np = node; np != NULL; np = np->wn_sibling) {
   4671    siblingcount++;
   4672  }
   4673 
   4674  // Write the sibling count.
   4675  if (fd != NULL) {
   4676    putc(siblingcount, fd);                             // <siblingcount>
   4677  }
   4678  // Write each sibling byte and optionally extra info.
   4679  for (wordnode_T *np = node; np != NULL; np = np->wn_sibling) {
   4680    if (np->wn_byte == 0) {
   4681      if (fd != NULL) {
   4682        // For a NUL byte (end of word) write the flags etc.
   4683        if (prefixtree) {
   4684          // In PREFIXTREE write the required affixID and the
   4685          // associated condition nr (stored in wn_region).  The
   4686          // byte value is misused to store the "rare" and "not
   4687          // combining" flags
   4688          if (np->wn_flags == (uint16_t)PFX_FLAGS) {
   4689            putc(BY_NOFLAGS, fd);                       // <byte>
   4690          } else {
   4691            putc(BY_FLAGS, fd);                         // <byte>
   4692            putc(np->wn_flags, fd);                     // <pflags>
   4693          }
   4694          putc(np->wn_affixID, fd);                     // <affixID>
   4695          put_bytes(fd, (uintmax_t)np->wn_region, 2);   // <prefcondnr>
   4696        } else {
   4697          // For word trees we write the flag/region items.
   4698          int flags = np->wn_flags;
   4699          if (regionmask != 0 && np->wn_region != regionmask) {
   4700            flags |= WF_REGION;
   4701          }
   4702          if (np->wn_affixID != 0) {
   4703            flags |= WF_AFX;
   4704          }
   4705          if (flags == 0) {
   4706            // word without flags or region
   4707            putc(BY_NOFLAGS, fd);                               // <byte>
   4708          } else {
   4709            if (np->wn_flags >= 0x100) {
   4710              putc(BY_FLAGS2, fd);                              // <byte>
   4711              putc(flags, fd);                                  // <flags>
   4712              putc((int)((unsigned)flags >> 8), fd);            // <flags2>
   4713            } else {
   4714              putc(BY_FLAGS, fd);                               // <byte>
   4715              putc(flags, fd);                                  // <flags>
   4716            }
   4717            if (flags & WF_REGION) {
   4718              putc(np->wn_region, fd);                          // <region>
   4719            }
   4720            if (flags & WF_AFX) {
   4721              putc(np->wn_affixID, fd);                         // <affixID>
   4722            }
   4723          }
   4724        }
   4725      }
   4726    } else {
   4727      if (np->wn_child->wn_u1.index != 0
   4728          && np->wn_child->wn_u2.wnode != node) {
   4729        // The child is written elsewhere, write the reference.
   4730        if (fd != NULL) {
   4731          putc(BY_INDEX, fd);                                      // <byte>
   4732          put_bytes(fd, (uintmax_t)np->wn_child->wn_u1.index, 3);  // <nodeidx>
   4733        }
   4734      } else if (np->wn_child->wn_u2.wnode == NULL) {
   4735        // We will write the child below and give it an index.
   4736        np->wn_child->wn_u2.wnode = node;
   4737      }
   4738 
   4739      if (fd != NULL) {
   4740        if (putc(np->wn_byte, fd) == EOF) {       // <byte> or <xbyte>
   4741          emsg(_(e_write));
   4742          return 0;
   4743        }
   4744      }
   4745    }
   4746  }
   4747 
   4748  // Space used in the array when reading: one for each sibling and one for
   4749  // the count.
   4750  int newindex = idx + siblingcount + 1;
   4751 
   4752  // Recursively dump the children of each sibling.
   4753  for (wordnode_T *np = node; np != NULL; np = np->wn_sibling) {
   4754    if (np->wn_byte != 0 && np->wn_child->wn_u2.wnode == node) {
   4755      newindex = put_node(fd, np->wn_child, newindex, regionmask,
   4756                          prefixtree);
   4757    }
   4758  }
   4759 
   4760  return newindex;
   4761 }
   4762 
   4763 // ":mkspell [-ascii] outfile  infile ..."
   4764 // ":mkspell [-ascii] addfile"
   4765 void ex_mkspell(exarg_T *eap)
   4766 {
   4767  int fcount;
   4768  char **fnames;
   4769  char *arg = eap->arg;
   4770  bool ascii = false;
   4771 
   4772  if (strncmp(arg, "-ascii", 6) == 0) {
   4773    ascii = true;
   4774    arg = skipwhite(arg + 6);
   4775  }
   4776 
   4777  // Expand all the remaining arguments (e.g., $VIMRUNTIME).
   4778  if (get_arglist_exp(arg, &fcount, &fnames, false) != OK) {
   4779    return;
   4780  }
   4781 
   4782  mkspell(fcount, fnames, ascii, eap->forceit, false);
   4783  FreeWild(fcount, fnames);
   4784 }
   4785 
   4786 // Create the .sug file.
   4787 // Uses the soundfold info in "spin".
   4788 // Writes the file with the name "wfname", with ".spl" changed to ".sug".
   4789 static void spell_make_sugfile(spellinfo_T *spin, char *wfname)
   4790 {
   4791  char *fname = NULL;
   4792  slang_T *slang;
   4793  bool free_slang = false;
   4794 
   4795  // Read back the .spl file that was written.  This fills the required
   4796  // info for soundfolding.  This also uses less memory than the
   4797  // pointer-linked version of the trie.  And it avoids having two versions
   4798  // of the code for the soundfolding stuff.
   4799  // It might have been done already by spell_reload_one().
   4800  for (slang = first_lang; slang != NULL; slang = slang->sl_next) {
   4801    if (path_full_compare(wfname, slang->sl_fname, false, true)
   4802        == kEqualFiles) {
   4803      break;
   4804    }
   4805  }
   4806  if (slang == NULL) {
   4807    spell_message(spin, _("Reading back spell file..."));
   4808    slang = spell_load_file(wfname, NULL, NULL, false);
   4809    if (slang == NULL) {
   4810      return;
   4811    }
   4812    free_slang = true;
   4813  }
   4814 
   4815  // Clear the info in "spin" that is used.
   4816  spin->si_blocks = NULL;
   4817  spin->si_blocks_cnt = 0;
   4818  spin->si_compress_cnt = 0;        // will stay at 0 all the time
   4819  spin->si_free_count = 0;
   4820  spin->si_first_free = NULL;
   4821  spin->si_foldwcount = 0;
   4822 
   4823  // Go through the trie of good words, soundfold each word and add it to
   4824  // the soundfold trie.
   4825  spell_message(spin, _("Performing soundfolding..."));
   4826  if (sug_filltree(spin, slang) == FAIL) {
   4827    goto theend;
   4828  }
   4829 
   4830  // Create the table which links each soundfold word with a list of the
   4831  // good words it may come from.  Creates buffer "spin->si_spellbuf".
   4832  // This also removes the wordnr from the NUL byte entries to make
   4833  // compression possible.
   4834  if (sug_maketable(spin) == FAIL) {
   4835    goto theend;
   4836  }
   4837 
   4838  smsg(0, _("Number of words after soundfolding: %" PRId64),
   4839       (int64_t)spin->si_spellbuf->b_ml.ml_line_count);
   4840 
   4841  // Compress the soundfold trie.
   4842  spell_message(spin, _(msg_compressing));
   4843  wordtree_compress(spin, spin->si_foldroot, "case-folded");
   4844 
   4845  // Write the .sug file.
   4846  // Make the file name by changing ".spl" to ".sug".
   4847  fname = xmalloc(MAXPATHL);
   4848  xstrlcpy(fname, wfname, MAXPATHL);
   4849  int len = (int)strlen(fname);
   4850  fname[len - 2] = 'u';
   4851  fname[len - 1] = 'g';
   4852  sug_write(spin, fname);
   4853 
   4854 theend:
   4855  xfree(fname);
   4856  if (free_slang) {
   4857    slang_free(slang);
   4858  }
   4859  free_blocks(spin->si_blocks);
   4860  close_spellbuf(spin->si_spellbuf);
   4861 }
   4862 
// Build the soundfold trie for language "slang".
//
// Walks the case-folded word trie of "slang" (the "sl_fbyts"/"sl_fidxs"
// arrays) with an explicit stack, sound-folds every word found and adds the
// result to a freshly allocated trie in "spin->si_foldroot".  While walking,
// the entries of "sl_fidxs" that belong to a node's count byte are overwritten
// with the number of words counted below that node.
//
// Returns FAIL when the trie arrays are missing or tree_add_word() fails,
// OK otherwise (also when the walk was stopped because "got_int" was set).
static int sug_filltree(spellinfo_T *spin, slang_T *slang)
{
  // Per-depth state of the walk:
  idx_T arridx[MAXWLEN];      // index of the node (its count byte) in byts/idxs
  int curi[MAXWLEN];          // offset of the next sibling to visit in that node
  char tword[MAXWLEN];        // the word spelled out by the path so far
  char tsalword[MAXWLEN];     // sound-folded version of "tword"
  unsigned words_done = 0;    // running word number, stored with each word
  int wordcount[MAXWLEN];     // number of words found so far below each depth

  // We use si_foldroot for the soundfolded trie.
  spin->si_foldroot = wordtree_alloc(spin);

  // Let tree_add_word() know we're adding to the soundfolded tree
  spin->si_sugtree = true;

  // Go through the whole case-folded tree, soundfold each word and put it
  // in the trie.  Bail out if the tree is empty.
  uint8_t *byts = slang->sl_fbyts;
  idx_T *idxs = slang->sl_fidxs;
  if (byts == NULL || idxs == NULL) {
    return FAIL;
  }

  // Start at the root node; offset 1 skips the count byte itself.
  arridx[0] = 0;
  curi[0] = 1;
  wordcount[0] = 0;

  int depth = 0;
  while (depth >= 0 && !got_int) {
    if (curi[depth] > byts[arridx[depth]]) {
      // Done all bytes at this node, go up one level.
      // Remember the word count of this node and add it to the parent.
      idxs[arridx[depth]] = wordcount[depth];
      if (depth > 0) {
        wordcount[depth - 1] += wordcount[depth];
      }

      depth--;
      line_breakcheck();
    } else {
      // Do one more byte at this node.
      idx_T n = arridx[depth] + curi[depth];
      curi[depth]++;

      int c = byts[n];
      if (c == 0) {
        // A NUL byte marks the end of a word.
        // Sound-fold the word.
        tword[depth] = NUL;
        spell_soundfold(slang, tword, true, tsalword);

        // We use the "flags" field for the MSB of the wordnr,
        // "region" for the LSB of the wordnr.
        if (tree_add_word(spin, tsalword, spin->si_foldroot,
                          (int)(words_done >> 16), words_done & 0xffff,
                          0) == FAIL) {
          return FAIL;
        }

        words_done++;
        wordcount[depth]++;

        // Reset the block count each time to avoid compression
        // kicking in.
        spin->si_blocks_cnt = 0;

        // Skip over any other NUL bytes (same word with different
        // flags).  But don't go over the end.
        while (n + 1 < slang->sl_fbyts_len && byts[n + 1] == 0) {
          n++;
          curi[depth]++;
        }
      } else {
        // Normal char, go one level deeper.
        tword[depth++] = (char)(uint8_t)c;
        arridx[depth] = idxs[n];
        curi[depth] = 1;
        wordcount[depth] = 0;
      }
    }
  }

  smsg(0, _("Total number of words: %d"), words_done);

  return OK;
}
   4948 
   4949 // Make the table that links each word in the soundfold trie to the words it
   4950 // can be produced from.
   4951 // This is not unlike lines in a file, thus use a memfile to be able to access
   4952 // the table efficiently.
   4953 // Returns FAIL when out of memory.
   4954 static int sug_maketable(spellinfo_T *spin)
   4955 {
   4956  garray_T ga;
   4957  int res = OK;
   4958 
   4959  // Allocate a buffer, open a memline for it and create the swap file
   4960  // (uses a temp file, not a .swp file).
   4961  spin->si_spellbuf = open_spellbuf();
   4962 
   4963  // Use a buffer to store the line info, avoids allocating many small
   4964  // pieces of memory.
   4965  ga_init(&ga, 1, 100);
   4966 
   4967  // recursively go through the tree
   4968  if (sug_filltable(spin, spin->si_foldroot->wn_sibling, 0, &ga) == -1) {
   4969    res = FAIL;
   4970  }
   4971 
   4972  ga_clear(&ga);
   4973  return res;
   4974 }
   4975 
   4976 /// Fill the table for one node and its children.
   4977 /// Returns the wordnr at the start of the node.
   4978 /// Returns -1 when out of memory.
   4979 ///
   4980 /// @param gap  place to store line of numbers
   4981 static int sug_filltable(spellinfo_T *spin, wordnode_T *node, int startwordnr, garray_T *gap)
   4982 {
   4983  int wordnr = startwordnr;
   4984 
   4985  for (wordnode_T *p = node; p != NULL; p = p->wn_sibling) {
   4986    if (p->wn_byte == NUL) {
   4987      gap->ga_len = 0;
   4988      int prev_nr = 0;
   4989      for (wordnode_T *np = p; np != NULL && np->wn_byte == NUL; np = np->wn_sibling) {
   4990        ga_grow(gap, 10);
   4991 
   4992        int nr = (np->wn_flags << 16) + (np->wn_region & 0xffff);
   4993        // Compute the offset from the previous nr and store the
   4994        // offset in a way that it takes a minimum number of bytes.
   4995        // It's a bit like utf-8, but without the need to mark
   4996        // following bytes.
   4997        nr -= prev_nr;
   4998        prev_nr += nr;
   4999        gap->ga_len += offset2bytes(nr, (char *)gap->ga_data + gap->ga_len);
   5000      }
   5001 
   5002      // add the NUL byte
   5003      ((char *)gap->ga_data)[gap->ga_len++] = NUL;
   5004 
   5005      if (ml_append_buf(spin->si_spellbuf, (linenr_T)wordnr,
   5006                        gap->ga_data, gap->ga_len, true) == FAIL) {
   5007        return -1;
   5008      }
   5009      wordnr++;
   5010 
   5011      // Remove extra NUL entries, we no longer need them. We don't
   5012      // bother freeing the nodes, they won't be reused anyway.
   5013      while (p->wn_sibling != NULL && p->wn_sibling->wn_byte == NUL) {
   5014        p->wn_sibling = p->wn_sibling->wn_sibling;
   5015      }
   5016 
   5017      // Clear the flags on the remaining NUL node, so that compression
   5018      // works a lot better.
   5019      p->wn_flags = 0;
   5020      p->wn_region = 0;
   5021    } else {
   5022      wordnr = sug_filltable(spin, p->wn_child, wordnr, gap);
   5023      if (wordnr == -1) {
   5024        return -1;
   5025      }
   5026    }
   5027  }
   5028  return wordnr;
   5029 }
   5030 
   5031 // Convert an offset into a minimal number of bytes.
   5032 // Similar to utf_char2byters, but use 8 bits in followup bytes and avoid NUL
   5033 // bytes.
   5034 static int offset2bytes(int nr, char *buf_in)
   5035 {
   5036  uint8_t *buf = (uint8_t *)buf_in;
   5037 
   5038  // Split the number in parts of base 255.  We need to avoid NUL bytes.
   5039  int b1 = nr % 255 + 1;
   5040  int rem = nr / 255;
   5041  int b2 = rem % 255 + 1;
   5042  rem = rem / 255;
   5043  int b3 = rem % 255 + 1;
   5044  int b4 = rem / 255 + 1;
   5045 
   5046  if (b4 > 1 || b3 > 0x1f) {    // 4 bytes
   5047    buf[0] = (uint8_t)(0xe0 + b4);
   5048    buf[1] = (uint8_t)b3;
   5049    buf[2] = (uint8_t)b2;
   5050    buf[3] = (uint8_t)b1;
   5051    return 4;
   5052  }
   5053  if (b3 > 1 || b2 > 0x3f) {   // 3 bytes
   5054    buf[0] = (uint8_t)(0xc0 + b3);
   5055    buf[1] = (uint8_t)b2;
   5056    buf[2] = (uint8_t)b1;
   5057    return 3;
   5058  }
   5059  if (b2 > 1 || b1 > 0x7f) {   // 2 bytes
   5060    buf[0] = (uint8_t)(0x80 + b2);
   5061    buf[1] = (uint8_t)b1;
   5062    return 2;
   5063  }
   5064  // 1 byte
   5065  buf[0] = (uint8_t)b1;
   5066  return 1;
   5067 }
   5068 
   5069 // Write the .sug file in "fname".
   5070 static void sug_write(spellinfo_T *spin, char *fname)
   5071 {
   5072  // Create the file.  Note that an existing file is silently overwritten!
   5073  FILE *fd = os_fopen(fname, "w");
   5074  if (fd == NULL) {
   5075    semsg(_(e_notopen), fname);
   5076    return;
   5077  }
   5078 
   5079  vim_snprintf(IObuff, IOSIZE,
   5080               _("Writing suggestion file %s..."), fname);
   5081  spell_message(spin, IObuff);
   5082 
   5083  // <SUGHEADER>: <fileID> <versionnr> <timestamp>
   5084  if (fwrite(VIMSUGMAGIC, VIMSUGMAGICL, 1, fd) != 1) {  // <fileID>
   5085    emsg(_(e_write));
   5086    goto theend;
   5087  }
   5088  putc(VIMSUGVERSION, fd);                              // <versionnr>
   5089 
   5090  // Write si_sugtime to the file.
   5091  put_time(fd, spin->si_sugtime);                       // <timestamp>
   5092 
   5093  // <SUGWORDTREE>
   5094  spin->si_memtot = 0;
   5095  wordnode_T *tree = spin->si_foldroot->wn_sibling;
   5096 
   5097  // Clear the index and wnode fields in the tree.
   5098  clear_node(tree);
   5099 
   5100  // Count the number of nodes.  Needed to be able to allocate the
   5101  // memory when reading the nodes.  Also fills in index for shared
   5102  // nodes.
   5103  size_t nodecount = (size_t)put_node(NULL, tree, 0, 0, false);
   5104 
   5105  // number of nodes in 4 bytes
   5106  put_bytes(fd, nodecount, 4);                          // <nodecount>
   5107  assert(nodecount + nodecount * sizeof(int) < INT_MAX);
   5108  spin->si_memtot += (int)(nodecount + nodecount * sizeof(int));
   5109 
   5110  // Write the nodes.
   5111  put_node(fd, tree, 0, 0, false);
   5112 
   5113  // <SUGTABLE>: <sugwcount> <sugline> ...
   5114  linenr_T wcount = spin->si_spellbuf->b_ml.ml_line_count;
   5115  assert(wcount >= 0);
   5116  put_bytes(fd, (uintmax_t)wcount, 4);                  // <sugwcount>
   5117 
   5118  for (linenr_T lnum = 1; lnum <= wcount; lnum++) {
   5119    // <sugline>: <sugnr> ... NUL
   5120    char *line = ml_get_buf(spin->si_spellbuf, lnum);
   5121    int len = ml_get_buf_len(spin->si_spellbuf, lnum) + 1;
   5122    if (fwrite(line, (size_t)len, 1, fd) == 0) {
   5123      emsg(_(e_write));
   5124      goto theend;
   5125    }
   5126    spin->si_memtot += len;
   5127  }
   5128 
   5129  // Write another byte to check for errors.
   5130  if (putc(0, fd) == EOF) {
   5131    emsg(_(e_write));
   5132  }
   5133 
   5134  vim_snprintf(IObuff, IOSIZE,
   5135               _("Estimated runtime memory use: %d bytes"), spin->si_memtot);
   5136  spell_message(spin, IObuff);
   5137 
   5138 theend:
   5139  // close the file
   5140  fclose(fd);
   5141 }
   5142 
/// Create a Vim spell file from one or more word lists.
/// "fnames[0]" is the output file name.
/// "fnames[fcount - 1]" is the last input file name.
/// Exception: when "fnames[0]" ends in ".add" it's used as the input file name
/// and ".spl" is appended to make the output file name.
///
/// Steps: work out the output file name, validate the arguments, read every
/// input (.aff plus .dic, or a plain word list) into the word trees, compress
/// the trees, write the .spl file, reload it and, when "si_sugtime" was set
/// while reading, create the .sug file as well.
///
/// @param fcount  number of entries in "fnames"
/// @param fnames  output and input file names, see above
/// @param ascii  -ascii argument given
/// @param over_write  overwrite existing output file
/// @param added_word  invoked through "zg"
static void mkspell(int fcount, char **fnames, bool ascii, bool over_write, bool added_word)
{
  char *fname = NULL;
  afffile_T *(afile[MAXREGIONS]);
  bool error = false;
  spellinfo_T spin;

  CLEAR_FIELD(spin);
  // Be quiet when called for a single added word.
  spin.si_verbose = !added_word;
  spin.si_ascii = ascii;
  spin.si_followup = true;
  spin.si_rem_accents = true;
  ga_init(&spin.si_rep, (int)sizeof(fromto_T), 20);
  ga_init(&spin.si_repsal, (int)sizeof(fromto_T), 20);
  ga_init(&spin.si_sal, (int)sizeof(fromto_T), 20);
  ga_init(&spin.si_map, (int)sizeof(char), 100);
  ga_init(&spin.si_comppat, (int)sizeof(char *), 20);
  ga_init(&spin.si_prefcond, (int)sizeof(char *), 50);
  hash_init(&spin.si_commonwords);
  spin.si_newcompID = 127;      // start compound ID at first maximum

  // default: fnames[0] is output file, following are input files
  // When "fcount" is 1 there is only one file.
  char **innames = &fnames[fcount == 1 ? 0 : 1];
  int incount = fcount - 1;

  char *wfname = xmalloc(MAXPATHL);

  if (fcount >= 1) {
    int len = (int)strlen(fnames[0]);
    if (fcount == 1 && len > 4 && strcmp(fnames[0] + len - 4, ".add") == 0) {
      // For ":mkspell path/en.latin1.add" output file is
      // "path/en.latin1.add.spl".
      incount = 1;
      vim_snprintf(wfname, MAXPATHL, "%s.spl", fnames[0]);
    } else if (fcount == 1) {
      // For ":mkspell path/vim" output file is "path/vim.latin1.spl".
      incount = 1;
      vim_snprintf(wfname, MAXPATHL, SPL_FNAME_TMPL,
                   fnames[0], spin.si_ascii ? "ascii" : spell_enc());
    } else if (len > 4 && strcmp(fnames[0] + len - 4, ".spl") == 0) {
      // Name ends in ".spl", use as the file name.
      xstrlcpy(wfname, fnames[0], MAXPATHL);
    } else {
      // Name should be language, make the file name from it.
      vim_snprintf(wfname, MAXPATHL, SPL_FNAME_TMPL,
                   fnames[0], spin.si_ascii ? "ascii" : spell_enc());
    }

    // Check for .ascii.spl.
    if (strstr(path_tail(wfname), SPL_FNAME_ASCII) != NULL) {
      spin.si_ascii = true;
    }

    // Check for .add.spl.
    if (strstr(path_tail(wfname), SPL_FNAME_ADD) != NULL) {
      spin.si_add = true;
    }
  }

  // Note: "wfname" is only filled in when "fcount" is at least one, which is
  // why "incount" must be checked before "wfname" is looked at.
  if (incount <= 0) {
    emsg(_(e_invarg));          // need at least output and input names
  } else if (vim_strchr(path_tail(wfname), '_') != NULL) {
    emsg(_("E751: Output file name must not have region name"));
  } else if (incount > MAXREGIONS) {
    semsg(_("E754: Only up to %d regions supported"), MAXREGIONS);
  } else {
    // Check for overwriting before doing things that may take a lot of
    // time.
    if (!over_write && os_path_exists(wfname)) {
      emsg(_(e_exists));
      goto theend;
    }
    if (os_isdir(wfname)) {
      semsg(_(e_isadir2), wfname);
      goto theend;
    }

    fname = xmalloc(MAXPATHL);

    // Init the aff and dic pointers.
    // Get the region names if there are more than 2 arguments.
    for (int i = 0; i < incount; i++) {
      afile[i] = NULL;

      if (incount > 1) {
        // The input name must end in "_xx", where "xx" is the region name.
        int len = (int)strlen(innames[i]);
        if (strlen(path_tail(innames[i])) < 5
            || innames[i][len - 3] != '_') {
          semsg(_("E755: Invalid region in %s"), innames[i]);
          goto theend;
        }
        spin.si_region_name[i * 2] = (char)(uint8_t)TOLOWER_ASC(innames[i][len - 2]);
        spin.si_region_name[i * 2 + 1] = (char)(uint8_t)TOLOWER_ASC(innames[i][len - 1]);
      }
    }
    spin.si_region_count = incount;

    spin.si_foldroot = wordtree_alloc(&spin);
    spin.si_keeproot = wordtree_alloc(&spin);
    spin.si_prefroot = wordtree_alloc(&spin);

    // When not producing a .add.spl file clear the character table when
    // we encounter one in the .aff file.  This means we dump the current
    // one in the .spl file if the .aff file doesn't define one.  That's
    // better than guessing the contents, the table will match a
    // previously loaded spell file.
    if (!spin.si_add) {
      spin.si_clear_chartab = true;
    }

    // Read all the .aff and .dic files.
    // Text is converted to 'encoding'.
    // Words are stored in the case-folded and keep-case trees.
    for (int i = 0; i < incount && !error; i++) {
      spin.si_conv.vc_type = CONV_NONE;
      // Each input file gets its own bit in the region mask.
      spin.si_region = 1 << i;

      vim_snprintf(fname, MAXPATHL, "%s.aff", innames[i]);
      if (os_path_exists(fname)) {
        // Read the .aff file.  Will init "spin->si_conv" based on the
        // "SET" line.
        afile[i] = spell_read_aff(&spin, fname);
        if (afile[i] == NULL) {
          error = true;
        } else {
          // Read the .dic file and store the words in the trees.
          vim_snprintf(fname, MAXPATHL, "%s.dic", innames[i]);
          if (spell_read_dic(&spin, fname, afile[i]) == FAIL) {
            error = true;
          }
        }
      } else {
        // No .aff file, try reading the file as a word list.  Store
        // the words in the trees.
        if (spell_read_wordfile(&spin, innames[i]) == FAIL) {
          error = true;
        }
      }

      // Free any conversion stuff.
      convert_setup(&spin.si_conv, NULL, NULL);
    }

    if (spin.si_compflags != NULL && spin.si_nobreak) {
      msg(_("Warning: both compounding and NOBREAK specified"), 0);
    }

    if (!error && !got_int) {
      // Combine tails in the tree.
      spell_message(&spin, _(msg_compressing));
      wordtree_compress(&spin, spin.si_foldroot, "case-folded");
      wordtree_compress(&spin, spin.si_keeproot, "keep-case");
      wordtree_compress(&spin, spin.si_prefroot, "prefixes");
    }

    if (!error && !got_int) {
      // Write the info in the spell file.
      vim_snprintf(IObuff, IOSIZE,
                   _("Writing spell file %s..."), wfname);
      spell_message(&spin, IObuff);

      error = write_vim_spell(&spin, wfname) == FAIL;

      // Note: these messages are given even when writing failed.
      spell_message(&spin, _("Done!"));
      vim_snprintf(IObuff, IOSIZE,
                   _("Estimated runtime memory use: %d bytes"), spin.si_memtot);
      spell_message(&spin, IObuff);

      // If the file is loaded need to reload it.
      if (!error) {
        spell_reload_one(wfname, added_word);
      }
    }

    // Free the allocated memory.
    ga_clear(&spin.si_rep);
    ga_clear(&spin.si_repsal);
    ga_clear(&spin.si_sal);
    ga_clear(&spin.si_map);
    ga_clear(&spin.si_comppat);
    ga_clear(&spin.si_prefcond);
    hash_clear_all(&spin.si_commonwords, 0);

    // Free the .aff file structures.
    for (int i = 0; i < incount; i++) {
      if (afile[i] != NULL) {
        spell_free_aff(afile[i]);
      }
    }

    // Free all the bits and pieces at once.
    free_blocks(spin.si_blocks);

    // If there is soundfolding info and no NOSUGFILE item create the
    // .sug file with the soundfolded word trie.
    // spell_make_sugfile() resets the block list in "spin" itself before
    // using it, thus it is OK that the blocks were just freed.
    if (spin.si_sugtime != 0 && !error && !got_int) {
      spell_make_sugfile(&spin, wfname);
    }
  }

theend:
  xfree(fname);
  xfree(wfname);
}
   5357 
   5358 // Display a message for spell file processing when 'verbose' is set or using
   5359 // ":mkspell".  "str" can be IObuff.
   5360 static void spell_message(const spellinfo_T *spin, char *str)
   5361  FUNC_ATTR_NONNULL_ALL
   5362 {
   5363  if (spin->si_verbose || p_verbose > 2) {
   5364    if (!spin->si_verbose) {
   5365      verbose_enter();
   5366    }
   5367    msg(str, 0);
   5368    ui_flush();
   5369    if (!spin->si_verbose) {
   5370      verbose_leave();
   5371    }
   5372  }
   5373 }
   5374 
   5375 // ":[count]spellgood  {word}"
   5376 // ":[count]spellwrong {word}"
   5377 // ":[count]spellundo  {word}"
   5378 // ":[count]spellrare  {word}"
   5379 void ex_spell(exarg_T *eap)
   5380 {
   5381  spell_add_word(eap->arg, (int)strlen(eap->arg),
   5382                 eap->cmdidx == CMD_spellwrong
   5383                 ? SPELL_ADD_BAD
   5384                 : eap->cmdidx == CMD_spellrare ? SPELL_ADD_RARE : SPELL_ADD_GOOD,
   5385                 eap->forceit ? 0 : (int)eap->line2,
   5386                 eap->cmdidx == CMD_spellundo);
   5387 }
   5388 
   5389 /// Add "word[len]" to 'spellfile' as a good or bad word.
   5390 ///
   5391 /// @param what  SPELL_ADD_ values
   5392 /// @param idx  "zG" and "zW": zero, otherwise index in 'spellfile'
   5393 /// @param bool  // true for "zug", "zuG", "zuw" and "zuW"
   5394 void spell_add_word(char *word, int len, SpellAddType what, int idx, bool undo)
   5395 {
   5396  FILE *fd = NULL;
   5397  buf_T *buf = NULL;
   5398  bool new_spf = false;
   5399  char *fname;
   5400  char *fnamebuf = NULL;
   5401  char line[MAXWLEN * 2];
   5402  char *spf;
   5403 
   5404  if (!valid_spell_word(word, word + len)) {
   5405    emsg(_(e_illegal_character_in_word));
   5406    return;
   5407  }
   5408 
   5409  if (idx == 0) {           // use internal wordlist
   5410    if (int_wordlist == NULL) {
   5411      int_wordlist = vim_tempname();
   5412      if (int_wordlist == NULL) {
   5413        return;
   5414      }
   5415    }
   5416    fname = int_wordlist;
   5417  } else {
   5418    int i;
   5419    // If 'spellfile' isn't set figure out a good default value.
   5420    if (*curwin->w_s->b_p_spf == NUL) {
   5421      init_spellfile();
   5422      new_spf = true;
   5423    }
   5424 
   5425    if (*curwin->w_s->b_p_spf == NUL) {
   5426      semsg(_(e_notset), "spellfile");
   5427      return;
   5428    }
   5429    fnamebuf = xmalloc(MAXPATHL);
   5430 
   5431    for (spf = curwin->w_s->b_p_spf, i = 1; *spf != NUL; i++) {
   5432      copy_option_part(&spf, fnamebuf, MAXPATHL, ",");
   5433      if (i == idx) {
   5434        break;
   5435      }
   5436      if (*spf == NUL) {
   5437        semsg(_("E765: 'spellfile' does not have %d entries"), idx);
   5438        xfree(fnamebuf);
   5439        return;
   5440      }
   5441    }
   5442 
   5443    // Check that the user isn't editing the .add file somewhere.
   5444    buf = buflist_findname_exp(fnamebuf);
   5445    if (buf != NULL && buf->b_ml.ml_mfp == NULL) {
   5446      buf = NULL;
   5447    }
   5448    if (buf != NULL && bufIsChanged(buf)) {
   5449      emsg(_(e_bufloaded));
   5450      xfree(fnamebuf);
   5451      return;
   5452    }
   5453 
   5454    fname = fnamebuf;
   5455  }
   5456 
   5457  if (what == SPELL_ADD_BAD || undo) {
   5458    int fpos_next = 0;
   5459    int fpos = 0;
   5460    // When the word appears as good word we need to remove that one,
   5461    // since its flags sort before the one with WF_BANNED.
   5462    fd = os_fopen(fname, "r");
   5463    if (fd != NULL) {
   5464      while (!vim_fgets(line, MAXWLEN * 2, fd)) {
   5465        fpos = fpos_next;
   5466        fpos_next = (int)ftell(fd);
   5467        if (fpos_next < 0) {
   5468          break;  // should never happen
   5469        }
   5470        if (strncmp(word, line, (size_t)len) == 0
   5471            && (line[len] == '/' || (uint8_t)line[len] < ' ')) {
   5472          // Found duplicate word.  Remove it by writing a '#' at
   5473          // the start of the line.  Mixing reading and writing
   5474          // doesn't work for all systems, close the file first.
   5475          fclose(fd);
   5476          fd = os_fopen(fname, "r+");
   5477          if (fd == NULL) {
   5478            break;
   5479          }
   5480          if (fseek(fd, fpos, SEEK_SET) == 0) {
   5481            fputc('#', fd);
   5482            if (undo) {
   5483              home_replace(NULL, fname, NameBuff, MAXPATHL, true);
   5484              smsg(0, _("Word '%.*s' removed from %s"), len, word, NameBuff);
   5485            }
   5486          }
   5487          if (fseek(fd, fpos_next, SEEK_SET) != 0) {
   5488            PERROR(_("Seek error in spellfile"));
   5489            break;
   5490          }
   5491        }
   5492      }
   5493      if (fd != NULL) {
   5494        fclose(fd);
   5495      }
   5496    }
   5497  }
   5498 
   5499  if (!undo) {
   5500    fd = os_fopen(fname, "a");
   5501    if (fd == NULL && new_spf) {
   5502      char *p;
   5503 
   5504      // We just initialized the 'spellfile' option and can't open the
   5505      // file.  We may need to create the "spell" directory first.  We
   5506      // already checked the runtime directory is writable in
   5507      // init_spellfile().
   5508      if (!dir_of_file_exists(fname)
   5509          && (p = path_tail_with_sep(fname)) != fname) {
   5510        char c = *p;
   5511 
   5512        // The directory doesn't exist.  Try creating it and opening
   5513        // the file again.
   5514        *p = NUL;
   5515        os_mkdir(fname, 0755);
   5516        *p = c;
   5517        fd = os_fopen(fname, "a");
   5518      }
   5519    }
   5520 
   5521    if (fd == NULL) {
   5522      semsg(_(e_notopen), fname);
   5523    } else {
   5524      if (what == SPELL_ADD_BAD) {
   5525        fprintf(fd, "%.*s/!\n", len, word);
   5526      } else if (what == SPELL_ADD_RARE) {
   5527        fprintf(fd, "%.*s/?\n", len, word);
   5528      } else {
   5529        fprintf(fd, "%.*s\n", len, word);
   5530      }
   5531      fclose(fd);
   5532 
   5533      home_replace(NULL, fname, NameBuff, MAXPATHL, true);
   5534      smsg(0, _("Word '%.*s' added to %s"), len, word, NameBuff);
   5535    }
   5536  }
   5537 
   5538  if (fd != NULL) {
   5539    // Update the .add.spl file.
   5540    mkspell(1, &fname, false, true, true);
   5541 
   5542    // If the .add file is edited somewhere, reload it.
   5543    if (buf != NULL) {
   5544      buf_reload(buf, buf->b_orig_mode, false);
   5545    }
   5546 
   5547    redraw_all_later(UPD_SOME_VALID);
   5548  }
   5549  xfree(fnamebuf);
   5550 }
   5551 
   5552 // Initialize 'spellfile' for the current buffer.
   5553 //
   5554 // If the location does not exist, create it. Defaults to
   5555 // stdpath("data") + "/site/spell/{spelllang}.{encoding}.add".
   5556 static void init_spellfile(void)
   5557 {
   5558  char *lend;
   5559  bool aspath = false;
   5560  char *lstart = curbuf->b_s.b_p_spl;
   5561 
   5562  if (*curwin->w_s->b_p_spl == NUL || GA_EMPTY(&curwin->w_s->b_langp)) {
   5563    return;
   5564  }
   5565 
   5566  // Find the end of the language name.  Exclude the region.  If there
   5567  // is a path separator remember the start of the tail.
   5568  for (lend = curwin->w_s->b_p_spl; *lend != NUL
   5569       && vim_strchr(",._", (uint8_t)(*lend)) == NULL; lend++) {
   5570    if (vim_ispathsep(*lend)) {
   5571      aspath = true;
   5572      lstart = lend + 1;
   5573    }
   5574  }
   5575 
   5576  char *buf = xmalloc(MAXPATHL);
   5577  size_t buf_len = MAXPATHL;
   5578 
   5579  if (!aspath) {
   5580    char *xdg_path = get_xdg_home(kXDGDataHome);
   5581    xstrlcpy(buf, xdg_path, buf_len);
   5582    xfree(xdg_path);
   5583 
   5584    xstrlcat(buf, "/site/spell", buf_len);
   5585 
   5586    char *failed_dir;
   5587    if (os_mkdir_recurse(buf, 0755, &failed_dir, NULL) != 0) {
   5588      xfree(buf);
   5589      xfree(failed_dir);
   5590      return;
   5591    }
   5592  } else {
   5593    if ((size_t)(lend - curbuf->b_s.b_p_spl) >= buf_len) {
   5594      xfree(buf);
   5595      return;
   5596    }
   5597    xmemcpyz(buf, curbuf->b_s.b_p_spl, (size_t)(lend - curbuf->b_s.b_p_spl));
   5598  }
   5599 
   5600  // Append spelllang
   5601  vim_snprintf(buf + strlen(buf), buf_len - strlen(buf), "/%.*s", (int)(lend - lstart), lstart);
   5602 
   5603  // Append ".ascii.add" or ".{enc}.add"
   5604  char *fname = LANGP_ENTRY(curwin->w_s->b_langp, 0)->lp_slang->sl_fname;
   5605  const char *enc_suffix =
   5606    (fname != NULL && strstr(path_tail(fname), ".ascii.") != NULL) ? "ascii" : spell_enc();
   5607  vim_snprintf(buf + strlen(buf), buf_len - strlen(buf), ".%s.add", enc_suffix);
   5608 
   5609  set_option_value_give_err(kOptSpellfile, CSTR_AS_OPTVAL(buf), OPT_LOCAL);
   5610  xfree(buf);
   5611 }
   5612 
   5613 /// Set the spell character tables from strings in the .spl file.
   5614 ///
   5615 /// @param cnt  length of "flags"
   5616 static void set_spell_charflags(const char *flags_in, int cnt, const char *fol)
   5617 {
   5618  const uint8_t *flags = (uint8_t *)flags_in;
   5619  // We build the new tables here first, so that we can compare with the
   5620  // previous one.
   5621  spelltab_T new_st;
   5622  const char *p = fol;
   5623 
   5624  clear_spell_chartab(&new_st);
   5625 
   5626  for (int i = 0; i < 128; i++) {
   5627    if (i < cnt) {
   5628      new_st.st_isw[i + 128] = (flags[i] & CF_WORD) != 0;
   5629      new_st.st_isu[i + 128] = (flags[i] & CF_UPPER) != 0;
   5630    }
   5631 
   5632    if (*p != NUL) {
   5633      int c = mb_ptr2char_adv(&p);
   5634      new_st.st_fold[i + 128] = (uint8_t)c;
   5635      if (i + 128 != c && new_st.st_isu[i + 128] && c < 256) {
   5636        new_st.st_upper[c] = (uint8_t)(i + 128);
   5637      }
   5638    }
   5639  }
   5640 
   5641  set_spell_finish(&new_st);
   5642 }
   5643 
   5644 static int set_spell_finish(spelltab_T *new_st)
   5645 {
   5646  if (did_set_spelltab) {
   5647    // check that it's the same table
   5648    for (int i = 0; i < 256; i++) {
   5649      if (spelltab.st_isw[i] != new_st->st_isw[i]
   5650          || spelltab.st_isu[i] != new_st->st_isu[i]
   5651          || spelltab.st_fold[i] != new_st->st_fold[i]
   5652          || spelltab.st_upper[i] != new_st->st_upper[i]) {
   5653        emsg(_("E763: Word characters differ between spell files"));
   5654        return FAIL;
   5655      }
   5656    }
   5657  } else {
   5658    // copy the new spelltab into the one being used
   5659    spelltab = *new_st;
   5660    did_set_spelltab = true;
   5661  }
   5662 
   5663  return OK;
   5664 }
   5665 
   5666 // Write the table with prefix conditions to the .spl file.
   5667 // When "fd" is NULL only count the length of what is written.
   5668 static int write_spell_prefcond(FILE *fd, garray_T *gap, size_t *fwv)
   5669 {
   5670  assert(gap->ga_len >= 0);
   5671 
   5672  if (fd != NULL) {
   5673    put_bytes(fd, (uintmax_t)gap->ga_len, 2);           // <prefcondcnt>
   5674  }
   5675  size_t totlen = 2 + (size_t)gap->ga_len;  // <prefcondcnt> and <condlen> bytes
   5676  for (int i = 0; i < gap->ga_len; i++) {
   5677    // <prefcond> : <condlen> <condstr>
   5678    char *p = ((char **)gap->ga_data)[i];
   5679    if (p != NULL) {
   5680      size_t len = strlen(p);
   5681      if (fd != NULL) {
   5682        assert(len <= INT_MAX);
   5683        fputc((int)len, fd);
   5684        *fwv &= fwrite(p, len, 1, fd);
   5685      }
   5686      totlen += len;
   5687    } else if (fd != NULL) {
   5688      fputc(0, fd);
   5689    }
   5690  }
   5691 
   5692  assert(totlen <= INT_MAX);
   5693  return (int)totlen;
   5694 }
   5695 
   5696 // Use map string "map" for languages "lp".
   5697 static void set_map_str(slang_T *lp, const char *map)
   5698 {
   5699  int headc = 0;
   5700 
   5701  if (*map == NUL) {
   5702    lp->sl_has_map = false;
   5703    return;
   5704  }
   5705  lp->sl_has_map = true;
   5706 
   5707  // Init the array and hash tables empty.
   5708  for (int i = 0; i < 256; i++) {
   5709    lp->sl_map_array[i] = 0;
   5710  }
   5711  hash_init(&lp->sl_map_hash);
   5712 
   5713  // The similar characters are stored separated with slashes:
   5714  // "aaa/bbb/ccc/".  Fill sl_map_array[c] with the character before c and
   5715  // before the same slash.  For characters above 255 sl_map_hash is used.
   5716  for (const char *p = map; *p != NUL;) {
   5717    int c = mb_cptr2char_adv(&p);
   5718    if (c == '/') {
   5719      headc = 0;
   5720    } else {
   5721      if (headc == 0) {
   5722        headc = c;
   5723      }
   5724 
   5725      // Characters above 255 don't fit in sl_map_array[], put them in
   5726      // the hash table.  Each entry is the char, a NUL the headchar and
   5727      // a NUL.
   5728      if (c >= 256) {
   5729        int cl = utf_char2len(c);
   5730        int headcl = utf_char2len(headc);
   5731        hash_T hash;
   5732        hashitem_T *hi;
   5733 
   5734        char *b = xmalloc((size_t)(cl + headcl) + 2);
   5735        utf_char2bytes(c, b);
   5736        b[cl] = NUL;
   5737        utf_char2bytes(headc, b + cl + 1);
   5738        b[cl + 1 + headcl] = NUL;
   5739        hash = hash_hash(b);
   5740        hi = hash_lookup(&lp->sl_map_hash, b, strlen(b), hash);
   5741        if (HASHITEM_EMPTY(hi)) {
   5742          hash_add_item(&lp->sl_map_hash, hi, b, hash);
   5743        } else {
   5744          // This should have been checked when generating the .spl
   5745          // file.
   5746          emsg(_(e_duplicate_char_in_map_entry));
   5747          xfree(b);
   5748        }
   5749      } else {
   5750        lp->sl_map_array[c] = headc;
   5751      }
   5752    }
   5753  }
   5754 }