neovim

Neovim text editor
git clone https://git.dasho.dev/neovim.git
Log | Files | Refs | README

regexp.c (454742B)


      1 // Handling of regular expressions: vim_regcomp(), vim_regexec(), vim_regsub()
      2 
      3 // By default: do not create debugging logs or files related to regular
      4 // expressions, even when compiling with -DDEBUG.
      5 // Uncomment the second line to get the regexp debugging.
      6 // #undef REGEXP_DEBUG
      7 // #define REGEXP_DEBUG
      8 
      9 #include <assert.h>
     10 #include <ctype.h>
     11 #include <inttypes.h>
     12 #include <limits.h>
     13 #include <stdbool.h>
     14 #include <stddef.h>
     15 #include <stdlib.h>
     16 #include <string.h>
     17 #include <uv.h>
     18 
     19 #include "nvim/ascii_defs.h"
     20 #include "nvim/buffer_defs.h"
     21 #include "nvim/charset.h"
     22 #include "nvim/errors.h"
     23 #include "nvim/eval.h"
     24 #include "nvim/eval/typval.h"
     25 #include "nvim/eval/userfunc.h"
     26 #include "nvim/garray.h"
     27 #include "nvim/garray_defs.h"
     28 #include "nvim/gettext_defs.h"
     29 #include "nvim/globals.h"
     30 #include "nvim/keycodes.h"
     31 #include "nvim/macros_defs.h"
     32 #include "nvim/mark.h"
     33 #include "nvim/mark_defs.h"
     34 #include "nvim/mbyte.h"
     35 #include "nvim/mbyte_defs.h"
     36 #include "nvim/memline.h"
     37 #include "nvim/memory.h"
     38 #include "nvim/message.h"
     39 #include "nvim/option_vars.h"
     40 #include "nvim/os/input.h"
     41 #include "nvim/plines.h"
     42 #include "nvim/pos_defs.h"
     43 #include "nvim/profile.h"
     44 #include "nvim/regexp.h"
     45 #include "nvim/regexp_defs.h"
     46 #include "nvim/strings.h"
     47 #include "nvim/types_defs.h"
     48 #include "nvim/vim_defs.h"
     49 
     50 typedef enum {
     51  RGLF_LINE = 0x01,
     52  RGLF_LENGTH = 0x02,
     53  RGLF_SUBMATCH = 0x04,
     54 } reg_getline_flags_T;
     55 
     56 enum {
     57  /// In the NFA engine: how many braces are allowed.
     58  /// TODO(RE): Use dynamic memory allocation instead of static, like here
     59  NFA_MAX_BRACES = 20,
     60 };
     61 
     62 enum {
     63  /// In the NFA engine: how many states are allowed.
     64  NFA_MAX_STATES = 100000,
     65  NFA_TOO_EXPENSIVE = -1,
     66 };
     67 
     68 /// Which regexp engine to use? Needed for vim_regcomp().
     69 /// Must match with 'regexpengine'.
     70 enum {
     71  AUTOMATIC_ENGINE    = 0,
     72  BACKTRACKING_ENGINE = 1,
     73  NFA_ENGINE          = 2,
     74 };
     75 
     76 /// Structure returned by vim_regcomp() to pass on to vim_regexec().
     77 /// This is the general structure. For the actual matcher, two specific
     78 /// structures are used. See code below.
     79 struct regprog {
     80  regengine_T *engine;
     81  unsigned regflags;
     82  unsigned re_engine;  ///< Automatic, backtracking or NFA engine.
     83  unsigned re_flags;   ///< Second argument for vim_regcomp().
     84  bool re_in_use;      ///< prog is being executed
     85 };
     86 
     87 /// Structure used by the back track matcher.
     88 /// These fields are only to be used in regexp.c!
     89 /// See regexp.c for an explanation.
     90 typedef struct {
     91  // These four members implement regprog_T.
     92  regengine_T *engine;
     93  unsigned regflags;
     94  unsigned re_engine;
     95  unsigned re_flags;
     96  bool re_in_use;
     97 
     98  int regstart;
     99  uint8_t reganch;
    100  uint8_t *regmust;
    101  int regmlen;
    102  uint8_t reghasz;
    103  uint8_t program[];
    104 } bt_regprog_T;
    105 
    106 /// Structure representing a NFA state.
    107 /// An NFA state may have no outgoing edge, when it is a NFA_MATCH state.
    108 typedef struct nfa_state nfa_state_T;
    109 struct nfa_state {
    110  int c;
    111  nfa_state_T *out;
    112  nfa_state_T *out1;
    113  int id;
    114  int lastlist[2];  ///< 0: normal, 1: recursive
    115  int val;
    116 };
    117 
    118 /// Structure used by the NFA matcher.
    119 typedef struct {
    120  // These four members implement regprog_T.
    121  regengine_T *engine;
    122  unsigned regflags;
    123  unsigned re_engine;
    124  unsigned re_flags;
    125  bool re_in_use;
    126 
    127  nfa_state_T *start;   ///< points into state[]
    128 
    129  int reganch;          ///< pattern starts with ^
    130  int regstart;         ///< char at start of pattern
    131  uint8_t *match_text;  ///< plain text to match with
    132 
    133  int has_zend;         ///< pattern contains \ze
    134  int has_backref;      ///< pattern contains \1 .. \9
    135  int reghasz;
    136  char *pattern;
    137  int nsubexp;          ///< number of ()
    138  int nstate;
    139  nfa_state_T state[];
    140 } nfa_regprog_T;
    141 
    142 struct regengine {
    143  /// bt_regcomp or nfa_regcomp
    144  regprog_T *(*regcomp)(uint8_t *, int);
    145  /// bt_regfree or nfa_regfree
    146  void (*regfree)(regprog_T *);
    147  /// bt_regexec_nl or nfa_regexec_nl
    148  int (*regexec_nl)(regmatch_T *, uint8_t *, colnr_T, bool);
    149  /// bt_regexec_mult or nfa_regexec_mult
    150  int (*regexec_multi)(regmmatch_T *, win_T *, buf_T *, linenr_T, colnr_T, proftime_T *, int *);
    151 #ifdef REGEXP_DEBUG
    152  uint8_t *expr;
    153 #endif
    154 };
    155 
    156 // Structure used to save the current input state, when it needs to be
    157 // restored after trying a match.  Used by reg_save() and reg_restore().
    158 // Also stores the length of "backpos".
    159 typedef struct {
    160  union {
    161    uint8_t *ptr;       // rex.input pointer, for single-line regexp
    162    lpos_T pos;        // rex.input pos, for multi-line regexp
    163  } rs_u;
    164  int rs_len;
    165 } regsave_T;
    166 
    167 // struct to save start/end pointer/position in for \(\)
    168 typedef struct {
    169  union {
    170    uint8_t *ptr;
    171    lpos_T pos;
    172  } se_u;
    173 } save_se_T;
    174 
    175 // Values for rs_state in regitem_T.
    176 typedef enum regstate_E {
    177  RS_NOPEN = 0,         // NOPEN and NCLOSE
    178  RS_MOPEN,             // MOPEN + [0-9]
    179  RS_MCLOSE,            // MCLOSE + [0-9]
    180  RS_ZOPEN,             // ZOPEN + [0-9]
    181  RS_ZCLOSE,            // ZCLOSE + [0-9]
    182  RS_BRANCH,            // BRANCH
    183  RS_BRCPLX_MORE,       // BRACE_COMPLEX and trying one more match
    184  RS_BRCPLX_LONG,       // BRACE_COMPLEX and trying longest match
    185  RS_BRCPLX_SHORT,      // BRACE_COMPLEX and trying shortest match
    186  RS_NOMATCH,           // NOMATCH
    187  RS_BEHIND1,           // BEHIND / NOBEHIND matching rest
    188  RS_BEHIND2,           // BEHIND / NOBEHIND matching behind part
    189  RS_STAR_LONG,         // STAR/PLUS/BRACE_SIMPLE longest match
    190  RS_STAR_SHORT,  // STAR/PLUS/BRACE_SIMPLE shortest match
    191 } regstate_T;
    192 
    193 // When there are alternatives a regstate_T is put on the regstack to remember
    194 // what we are doing.
    195 // Before it may be another type of item, depending on rs_state, to remember
    196 // more things.
    197 typedef struct regitem_S {
    198  regstate_T rs_state;         // what we are doing, one of RS_ above
    199  int16_t rs_no;            // submatch nr or BEHIND/NOBEHIND
    200  uint8_t *rs_scan;         // current node in program
    201  union {
    202    save_se_T sesave;
    203    regsave_T regsave;
    204  } rs_un;                      // room for saving rex.input
    205 } regitem_T;
    206 
    207 // used for BEHIND and NOBEHIND matching
    208 typedef struct regbehind_S {
    209  regsave_T save_after;
    210  regsave_T save_behind;
    211  int save_need_clear_subexpr;
    212  save_se_T save_start[NSUBEXP];
    213  save_se_T save_end[NSUBEXP];
    214 } regbehind_T;
    215 
    216 // Since the out pointers in the list are always
    217 // uninitialized, we use the pointers themselves
    218 // as storage for the Ptrlists.
    219 typedef union Ptrlist Ptrlist;
    220 union Ptrlist {
    221  Ptrlist *next;
    222  nfa_state_T *s;
    223 };
    224 
    225 struct Frag {
    226  nfa_state_T *start;
    227  Ptrlist *out;
    228 };
    229 typedef struct Frag Frag_T;
    230 
    231 typedef struct {
    232  int in_use;       ///< number of subexpr with useful info
    233 
    234  // When REG_MULTI is true list.multi is used, otherwise list.line.
    235  union {
    236    struct multipos {
    237      linenr_T start_lnum;
    238      linenr_T end_lnum;
    239      colnr_T start_col;
    240      colnr_T end_col;
    241    } multi[NSUBEXP];
    242    struct linepos {
    243      uint8_t *start;
    244      uint8_t *end;
    245    } line[NSUBEXP];
    246  } list;
    247  colnr_T orig_start_col;  // list.multi[0].start_col without \zs
    248 } regsub_T;
    249 
    250 typedef struct {
    251  regsub_T norm;      // \( .. \) matches
    252  regsub_T synt;      // \z( .. \) matches
    253 } regsubs_T;
    254 
    255 // nfa_pim_T stores a Postponed Invisible Match.
    256 typedef struct nfa_pim_S nfa_pim_T;
    257 struct nfa_pim_S {
    258  int result;                   // NFA_PIM_*, see below
    259  nfa_state_T *state;           // the invisible match start state
    260  regsubs_T subs;               // submatch info, only party used
    261  union {
    262    lpos_T pos;
    263    uint8_t *ptr;
    264  } end;                        // where the match must end
    265 };
    266 
    267 // nfa_thread_T contains execution information of a NFA state
    268 typedef struct {
    269  nfa_state_T *state;
    270  int count;
    271  nfa_pim_T pim;                // if pim.result != NFA_PIM_UNUSED: postponed
    272                                // invisible match
    273  regsubs_T subs;               // submatch info, only party used
    274 } nfa_thread_T;
    275 
    276 // nfa_list_T contains the alternative NFA execution states.
    277 typedef struct {
    278  nfa_thread_T *t;           ///< allocated array of states
    279  int n;                        ///< nr of states currently in "t"
    280  int len;                      ///< max nr of states in "t"
    281  int id;                       ///< ID of the list
    282  int has_pim;                  ///< true when any state has a PIM
    283 } nfa_list_T;
    284 
    285 #ifdef REGEXP_DEBUG
    286 // show/save debugging data when BT engine is used
    287 # define BT_REGEXP_DUMP
    288 // save the debugging data to a file instead of displaying it
    289 # define BT_REGEXP_LOG
    290 # define BT_REGEXP_DEBUG_LOG
    291 # define BT_REGEXP_DEBUG_LOG_NAME       "bt_regexp_debug.log"
    292 #endif
    293 
    294 // Magic characters have a special meaning, they don't match literally.
    295 // Magic characters are negative.  This separates them from literal characters
    296 // (possibly multi-byte).  Only ASCII characters can be Magic.
    297 #define Magic(x)        ((int)(x) - 256)
    298 #define un_Magic(x)     ((x) + 256)
    299 #define is_Magic(x)     ((x) < 0)
    300 
    301 typedef void (*fptr_T)(int *, int);
    302 
    303 static int no_Magic(int x)
    304 {
    305  if (is_Magic(x)) {
    306    return un_Magic(x);
    307  }
    308  return x;
    309 }
    310 
    311 static int toggle_Magic(int x)
    312 {
    313  if (is_Magic(x)) {
    314    return un_Magic(x);
    315  }
    316  return Magic(x);
    317 }
    318 
    319 // The first byte of the BT regexp internal "program" is actually this magic
    320 // number; the start node begins in the second byte.  It's used to catch the
    321 // most severe mutilation of the program by the caller.
    322 #define REGMAGIC        0234
    323 
    324 // Utility definitions.
    325 #define UCHARAT(p)      ((int)(*(uint8_t *)(p)))
    326 
    327 // Used for an error (down from) vim_regcomp(): give the error message, set
    328 // rc_did_emsg and return NULL
    329 #define EMSG_RET_NULL(m) return (emsg(m), rc_did_emsg = true, (void *)NULL)
    330 #define IEMSG_RET_NULL(m) return (iemsg(m), rc_did_emsg = true, (void *)NULL)
    331 #define EMSG_RET_FAIL(m) return (emsg(m), rc_did_emsg = true, FAIL)
    332 #define EMSG2_RET_NULL(m, c) \
    333  return (semsg((m), (c) ? "" : "\\"), rc_did_emsg = true, (void *)NULL)
    334 #define EMSG3_RET_NULL(m, c, a) \
    335  return (semsg((m), (c) ? "" : "\\", (a)), rc_did_emsg = true, (void *)NULL)
    336 #define EMSG2_RET_FAIL(m, c) \
    337  return (semsg((m), (c) ? "" : "\\"), rc_did_emsg = true, FAIL)
    338 #define EMSG_ONE_RET_NULL EMSG2_RET_NULL(_(e_invalid_item_in_str_brackets), reg_magic == MAGIC_ALL)
    339 
    340 #define MAX_LIMIT       (32767 << 16)
    341 
    342 static const char e_invalid_character_after_str_at[]
    343  = N_("E59: Invalid character after %s@");
    344 static const char e_invalid_use_of_underscore[]
    345  = N_("E63: Invalid use of \\_");
    346 static const char e_pattern_uses_more_memory_than_maxmempattern[]
    347  = N_("E363: Pattern uses more memory than 'maxmempattern'");
    348 static const char e_invalid_item_in_str_brackets[]
    349  = N_("E369: Invalid item in %s%%[]");
    350 static const char e_missing_delimiter_after_search_pattern_str[]
    351  = N_("E654: Missing delimiter after search pattern: %s");
    352 static const char e_missingbracket[] = N_("E769: Missing ] after %s[");
    353 static const char e_reverse_range[] = N_("E944: Reverse range in character class");
    354 static const char e_large_class[] = N_("E945: Range too large in character class");
    355 static const char e_unmatchedpp[] = N_("E53: Unmatched %s%%(");
    356 static const char e_unmatchedp[] = N_("E54: Unmatched %s(");
    357 static const char e_unmatchedpar[] = N_("E55: Unmatched %s)");
    358 static const char e_z_not_allowed[] = N_("E66: \\z( not allowed here");
    359 static const char e_z1_not_allowed[] = N_("E67: \\z1 - \\z9 not allowed here");
    360 static const char e_missing_sb[] = N_("E69: Missing ] after %s%%[");
    361 static const char e_empty_sb[] = N_("E70: Empty %s%%[]");
    362 static const char e_recursive[] = N_("E956: Cannot use pattern recursively");
    363 static const char e_regexp_number_after_dot_pos_search_chr[]
    364  = N_("E1204: No Number allowed after .: '\\%%%c'");
    365 static const char e_nfa_regexp_missing_value_in_chr[]
    366  = N_("E1273: (NFA regexp) missing value in '\\%%%c'");
    367 static const char e_atom_engine_must_be_at_start_of_pattern[]
    368  = N_("E1281: Atom '\\%%#=%c' must be at the start of the pattern");
    369 static const char e_substitute_nesting_too_deep[] = N_("E1290: substitute nesting too deep");
    370 static const char e_unicode_val_too_large[]
    371  = N_("E1541: Value too large, max Unicode codepoint is U+10FFFF");
    372 
    373 #define NOT_MULTI       0
    374 #define MULTI_ONE       1
    375 #define MULTI_MULT      2
    376 
    377 // return values for regmatch()
    378 #define RA_FAIL         1       // something failed, abort
    379 #define RA_CONT         2       // continue in inner loop
    380 #define RA_BREAK        3       // break inner loop
    381 #define RA_MATCH        4       // successful match
    382 #define RA_NOMATCH      5       // didn't match
    383 
    384 /// Return NOT_MULTI if c is not a "multi" operator.
    385 /// Return MULTI_ONE if c is a single "multi" operator.
    386 /// Return MULTI_MULT if c is a multi "multi" operator.
    387 static int re_multi_type(int c)
    388 {
    389  if (c == Magic('@') || c == Magic('=') || c == Magic('?')) {
    390    return MULTI_ONE;
    391  }
    392  if (c == Magic('*') || c == Magic('+') || c == Magic('{')) {
    393    return MULTI_MULT;
    394  }
    395  return NOT_MULTI;
    396 }
    397 
    398 static char *reg_prev_sub = NULL;
    399 static size_t reg_prev_sublen = 0;
    400 
    401 // REGEXP_INRANGE contains all characters which are always special in a []
    402 // range after '\'.
    403 // REGEXP_ABBR contains all characters which act as abbreviations after '\'.
    404 // These are:
    405 //  \n  - New line (NL).
    406 //  \r  - Carriage Return (CR).
    407 //  \t  - Tab (TAB).
    408 //  \e  - Escape (ESC).
    409 //  \b  - Backspace (Ctrl_H).
    410 //  \d  - Character code in decimal, eg \d123
    411 //  \o  - Character code in octal, eg \o80
    412 //  \x  - Character code in hex, eg \x4a
    413 //  \u  - Multibyte character code, eg \u20ac
    414 //  \U  - Long multibyte character code, eg \U12345678
    415 static char REGEXP_INRANGE[] = "]^-n\\";
    416 static char REGEXP_ABBR[] = "nrtebdoxuU";
    417 
    418 // Translate '\x' to its control character, except "\n", which is Magic.
    419 static int backslash_trans(int c)
    420 {
    421  switch (c) {
    422  case 'r':
    423    return CAR;
    424  case 't':
    425    return TAB;
    426  case 'e':
    427    return ESC;
    428  case 'b':
    429    return BS;
    430  }
    431  return c;
    432 }
    433 
    434 enum {
    435  CLASS_ALNUM = 0,
    436  CLASS_ALPHA,
    437  CLASS_BLANK,
    438  CLASS_CNTRL,
    439  CLASS_DIGIT,
    440  CLASS_GRAPH,
    441  CLASS_LOWER,
    442  CLASS_PRINT,
    443  CLASS_PUNCT,
    444  CLASS_SPACE,
    445  CLASS_UPPER,
    446  CLASS_XDIGIT,
    447  CLASS_TAB,
    448  CLASS_RETURN,
    449  CLASS_BACKSPACE,
    450  CLASS_ESCAPE,
    451  CLASS_IDENT,
    452  CLASS_KEYWORD,
    453  CLASS_FNAME,
    454  CLASS_NONE = 99,
    455 };
    456 
    457 /// Check for a character class name "[:name:]".  "pp" points to the '['.
    458 /// Returns one of the CLASS_ items. CLASS_NONE means that no item was
    459 /// recognized.  Otherwise "pp" is advanced to after the item.
    460 static int get_char_class(char **pp)
    461 {
    462  // must be sorted by the 'value' field because it is used by bsearch()!
    463  static keyvalue_T char_class_tab[] = {
    464    KEYVALUE_ENTRY(CLASS_ALNUM, "alnum:]"),
    465    KEYVALUE_ENTRY(CLASS_ALPHA, "alpha:]"),
    466    KEYVALUE_ENTRY(CLASS_BACKSPACE, "backspace:]"),
    467    KEYVALUE_ENTRY(CLASS_BLANK, "blank:]"),
    468    KEYVALUE_ENTRY(CLASS_CNTRL, "cntrl:]"),
    469    KEYVALUE_ENTRY(CLASS_DIGIT, "digit:]"),
    470    KEYVALUE_ENTRY(CLASS_ESCAPE, "escape:]"),
    471    KEYVALUE_ENTRY(CLASS_FNAME, "fname:]"),
    472    KEYVALUE_ENTRY(CLASS_GRAPH, "graph:]"),
    473    KEYVALUE_ENTRY(CLASS_IDENT, "ident:]"),
    474    KEYVALUE_ENTRY(CLASS_KEYWORD, "keyword:]"),
    475    KEYVALUE_ENTRY(CLASS_LOWER, "lower:]"),
    476    KEYVALUE_ENTRY(CLASS_PRINT, "print:]"),
    477    KEYVALUE_ENTRY(CLASS_PUNCT, "punct:]"),
    478    KEYVALUE_ENTRY(CLASS_RETURN, "return:]"),
    479    KEYVALUE_ENTRY(CLASS_SPACE, "space:]"),
    480    KEYVALUE_ENTRY(CLASS_TAB, "tab:]"),
    481    KEYVALUE_ENTRY(CLASS_UPPER, "upper:]"),
    482    KEYVALUE_ENTRY(CLASS_XDIGIT, "xdigit:]")
    483  };
    484 
    485  // check that the value of "pp" has a chance of matching
    486  if ((*pp)[1] == ':' && ASCII_ISLOWER((*pp)[2])
    487      && ASCII_ISLOWER((*pp)[3]) && ASCII_ISLOWER((*pp)[4])) {
    488    // this function can be called repeatedly with the same value for "pp"
    489    // so we cache the last found entry.
    490    static keyvalue_T *last_entry = NULL;
    491 
    492    keyvalue_T target = {
    493      .key = 0,
    494      .value = *pp + 2,
    495      .length = 0,  // not used, see cmp_keyvalue_value_n()
    496    };
    497 
    498    keyvalue_T *entry;
    499    if (last_entry != NULL && cmp_keyvalue_value_n(&target, last_entry) == 0) {
    500      entry = last_entry;
    501    } else {
    502      entry = (keyvalue_T *)bsearch(&target, &char_class_tab,
    503                                    ARRAY_SIZE(char_class_tab),
    504                                    sizeof(char_class_tab[0]), cmp_keyvalue_value_n);
    505    }
    506    if (entry != NULL) {
    507      last_entry = entry;
    508      *pp += entry->length + 2;
    509      return entry->key;
    510    }
    511  }
    512  return CLASS_NONE;
    513 }
    514 
    515 // Specific version of character class functions.
    516 // Using a table to keep this fast.
    517 static int16_t class_tab[256];
    518 
    519 #define     RI_DIGIT    0x01
    520 #define     RI_HEX      0x02
    521 #define     RI_OCTAL    0x04
    522 #define     RI_WORD     0x08
    523 #define     RI_HEAD     0x10
    524 #define     RI_ALPHA    0x20
    525 #define     RI_LOWER    0x40
    526 #define     RI_UPPER    0x80
    527 #define     RI_WHITE    0x100
    528 
    529 static void init_class_tab(void)
    530 {
    531  int i;
    532  static int done = false;
    533 
    534  if (done) {
    535    return;
    536  }
    537 
    538  for (i = 0; i < 256; i++) {
    539    if (i >= '0' && i <= '7') {
    540      class_tab[i] = RI_DIGIT + RI_HEX + RI_OCTAL + RI_WORD;
    541    } else if (i >= '8' && i <= '9') {
    542      class_tab[i] = RI_DIGIT + RI_HEX + RI_WORD;
    543    } else if (i >= 'a' && i <= 'f') {
    544      class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
    545    } else if (i >= 'g' && i <= 'z') {
    546      class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
    547    } else if (i >= 'A' && i <= 'F') {
    548      class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
    549    } else if (i >= 'G' && i <= 'Z') {
    550      class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
    551    } else if (i == '_') {
    552      class_tab[i] = RI_WORD + RI_HEAD;
    553    } else {
    554      class_tab[i] = 0;
    555    }
    556  }
    557  class_tab[' '] |= RI_WHITE;
    558  class_tab['\t'] |= RI_WHITE;
    559  done = true;
    560 }
    561 
    562 #define ri_digit(c)    ((c) < 0x100 && (class_tab[c] & RI_DIGIT))
    563 #define ri_hex(c)      ((c) < 0x100 && (class_tab[c] & RI_HEX))
    564 #define ri_octal(c)    ((c) < 0x100 && (class_tab[c] & RI_OCTAL))
    565 #define ri_word(c)     ((c) < 0x100 && (class_tab[c] & RI_WORD))
    566 #define ri_head(c)     ((c) < 0x100 && (class_tab[c] & RI_HEAD))
    567 #define ri_alpha(c)    ((c) < 0x100 && (class_tab[c] & RI_ALPHA))
    568 #define ri_lower(c)    ((c) < 0x100 && (class_tab[c] & RI_LOWER))
    569 #define ri_upper(c)    ((c) < 0x100 && (class_tab[c] & RI_UPPER))
    570 #define ri_white(c)    ((c) < 0x100 && (class_tab[c] & RI_WHITE))
    571 
    572 // flags for regflags
    573 #define RF_ICASE    1   // ignore case
    574 #define RF_NOICASE  2   // don't ignore case
    575 #define RF_HASNL    4   // can match a NL
    576 #define RF_ICOMBINE 8   // ignore combining characters
    577 #define RF_LOOKBH   16  // uses "\@<=" or "\@<!"
    578 
    579 // Global work variables for vim_regcomp().
    580 
    581 static char *regparse;          ///< Input-scan pointer.
    582 static int regnpar;             ///< () count.
    583 static bool wants_nfa;          ///< regex should use NFA engine
    584 static int regnzpar;            ///< \z() count.
    585 static int re_has_z;            ///< \z item detected
    586 static unsigned regflags;       ///< RF_ flags for prog
    587 static int had_eol;             ///< true when EOL found by vim_regcomp()
    588 
    589 static magic_T reg_magic;       ///< magicness of the pattern
    590 
    591 static int reg_string;          // matching with a string instead of a buffer
    592                                // line
    593 static int reg_strict;          // "[abc" is illegal
    594 
    595 // META contains all characters that may be magic, except '^' and '$'.
    596 
    597 // uncrustify:off
    598 
    599 // META[] is used often enough to justify turning it into a table.
    600 static uint8_t META_flags[] = {
    601    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    602    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    603 //                 %  &     (  )  *  +        .
    604    0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
    605 //     1  2  3  4  5  6  7  8  9        <  =  >  ?
    606    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
    607 //  @  A     C  D     F     H  I     K  L  M     O
    608    1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
    609 //  P        S     U  V  W  X     Z  [           _
    610    1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
    611 //     a     c  d     f     h  i     k  l  m  n  o
    612    0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
    613 //  p        s     u  v  w  x     z  {  |     ~
    614    1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1
    615 };
    616 
    617 // uncrustify:on
    618 
    619 static int curchr;              // currently parsed character
    620 // Previous character.  Note: prevchr is sometimes -1 when we are not at the
    621 // start, eg in /[ ^I]^ the pattern was never found even if it existed,
    622 // because ^ was taken to be magic -- webb
    623 static int prevchr;
    624 static int prevprevchr;         // previous-previous character
    625 static int nextchr;             // used for ungetchr()
    626 
    627 // arguments for reg()
    628 #define REG_NOPAREN     0       // toplevel reg()
    629 #define REG_PAREN       1       // \(\)
    630 #define REG_ZPAREN      2       // \z(\)
    631 #define REG_NPAREN      3       // \%(\)
    632 
    633 typedef struct {
    634  char *regparse;
    635  int prevchr_len;
    636  int curchr;
    637  int prevchr;
    638  int prevprevchr;
    639  int nextchr;
    640  int at_start;
    641  int prev_at_start;
    642  int regnpar;
    643 } parse_state_T;
    644 
    645 static regengine_T bt_regengine;
    646 static regengine_T nfa_regengine;
    647 
    648 #include "regexp.c.generated.h"
    649 
    650 // Return true if compiled regular expression "prog" can match a line break.
    651 int re_multiline(const regprog_T *prog)
    652  FUNC_ATTR_NONNULL_ALL
    653 {
    654  return prog->regflags & RF_HASNL;
    655 }
    656 
    657 // Check for an equivalence class name "[=a=]".  "pp" points to the '['.
    658 // Returns a character representing the class. Zero means that no item was
    659 // recognized.  Otherwise "pp" is advanced to after the item.
    660 static int get_equi_class(char **pp)
    661 {
    662  int c;
    663  int l = 1;
    664  char *p = *pp;
    665 
    666  if (p[1] == '=' && p[2] != NUL) {
    667    l = utfc_ptr2len(p + 2);
    668    if (p[l + 2] == '=' && p[l + 3] == ']') {
    669      c = utf_ptr2char(p + 2);
    670      *pp += l + 4;
    671      return c;
    672    }
    673  }
    674  return 0;
    675 }
    676 
    677 // Check for a collating element "[.a.]".  "pp" points to the '['.
    678 // Returns a character. Zero means that no item was recognized.  Otherwise
    679 // "pp" is advanced to after the item.
    680 // Currently only single characters are recognized!
    681 static int get_coll_element(char **pp)
    682 {
    683  int c;
    684  int l = 1;
    685  char *p = *pp;
    686 
    687  if (p[0] != NUL && p[1] == '.' && p[2] != NUL) {
    688    l = utfc_ptr2len(p + 2);
    689    if (p[l + 2] == '.' && p[l + 3] == ']') {
    690      c = utf_ptr2char(p + 2);
    691      *pp += l + 4;
    692      return c;
    693    }
    694  }
    695  return 0;
    696 }
    697 
    698 static int reg_cpo_lit;  // 'cpoptions' contains 'l' flag
    699 
    700 static void get_cpo_flags(void)
    701 {
    702  reg_cpo_lit = vim_strchr(p_cpo, CPO_LITERAL) != NULL;
    703 }
    704 
    705 /// Skip over a "[]" range.
    706 /// "p" must point to the character after the '['.
    707 /// The returned pointer is on the matching ']', or the terminating NUL.
    708 static char *skip_anyof(char *p)
    709 {
    710  int l;
    711 
    712  if (*p == '^') {  // Complement of range.
    713    p++;
    714  }
    715  if (*p == ']' || *p == '-') {
    716    p++;
    717  }
    718  while (*p != NUL && *p != ']') {
    719    if ((l = utfc_ptr2len(p)) > 1) {
    720      p += l;
    721    } else if (*p == '-') {
    722      p++;
    723      if (*p != ']' && *p != NUL) {
    724        MB_PTR_ADV(p);
    725      }
    726    } else if (*p == '\\'
    727               && (vim_strchr(REGEXP_INRANGE, (uint8_t)p[1]) != NULL
    728                   || (!reg_cpo_lit
    729                       && vim_strchr(REGEXP_ABBR, (uint8_t)p[1]) != NULL))) {
    730      p += 2;
    731    } else if (*p == '[') {
    732      if (get_char_class(&p) == CLASS_NONE
    733          && get_equi_class(&p) == 0
    734          && get_coll_element(&p) == 0
    735          && *p != NUL) {
    736        p++;          // It is not a class name and not NUL
    737      }
    738    } else {
    739      p++;
    740    }
    741  }
    742 
    743  return p;
    744 }
    745 
    746 /// Skip past regular expression.
    747 /// Stop at end of "startp" or where "delim" is found ('/', '?', etc).
    748 /// Take care of characters with a backslash in front of it.
    749 /// Skip strings inside [ and ].
    750 char *skip_regexp(char *startp, int delim, int magic)
    751 {
    752  return skip_regexp_ex(startp, delim, magic, NULL, NULL, NULL);
    753 }
    754 
    755 /// Call skip_regexp() and when the delimiter does not match give an error and
    756 /// return NULL.
    757 char *skip_regexp_err(char *startp, int delim, int magic)
    758 {
    759  char *p = skip_regexp(startp, delim, magic);
    760 
    761  if (*p != delim) {
    762    semsg(_(e_missing_delimiter_after_search_pattern_str), startp);
    763    return NULL;
    764  }
    765  return p;
    766 }
    767 
    768 /// skip_regexp() with extra arguments:
    769 /// When "newp" is not NULL and "dirc" is '?', make an allocated copy of the
    770 /// expression and change "\?" to "?".  If "*newp" is not NULL the expression
    771 /// is changed in-place.
    772 /// If a "\?" is changed to "?" then "dropped" is incremented, unless NULL.
    773 /// If "magic_val" is not NULL, returns the effective magicness of the pattern
    774 char *skip_regexp_ex(char *startp, int dirc, int magic, char **newp, int *dropped,
    775                     magic_T *magic_val)
    776 {
    777  magic_T mymagic;
    778  char *p = startp;
    779  size_t startplen = 0;
    780 
    781  if (magic) {
    782    mymagic = MAGIC_ON;
    783  } else {
    784    mymagic = MAGIC_OFF;
    785  }
    786  get_cpo_flags();
    787 
    788  for (; p[0] != NUL; MB_PTR_ADV(p)) {
    789    if (p[0] == dirc) {         // found end of regexp
    790      break;
    791    }
    792    if ((p[0] == '[' && mymagic >= MAGIC_ON)
    793        || (p[0] == '\\' && p[1] == '[' && mymagic <= MAGIC_OFF)) {
    794      p = skip_anyof(p + 1);
    795      if (p[0] == NUL) {
    796        break;
    797      }
    798    } else if (p[0] == '\\' && p[1] != NUL) {
    799      if (dirc == '?' && newp != NULL && p[1] == '?') {
    800        // change "\?" to "?", make a copy first.
    801        if (startplen == 0) {
    802          startplen = strlen(startp);
    803        }
    804        if (*newp == NULL) {
    805          *newp = xstrnsave(startp, startplen);
    806          p = *newp + (p - startp);
    807          startp = *newp;
    808        }
    809        if (dropped != NULL) {
    810          (*dropped)++;
    811        }
    812        memmove(p, p + 1, startplen - (size_t)((p + 1) - startp) + 1);
    813      } else {
    814        p++;            // skip next character
    815      }
    816      if (*p == 'v') {
    817        mymagic = MAGIC_ALL;
    818      } else if (*p == 'V') {
    819        mymagic = MAGIC_NONE;
    820      }
    821    }
    822  }
    823  if (magic_val != NULL) {
    824    *magic_val = mymagic;
    825  }
    826  return p;
    827 }
    828 
    829 // variables used for parsing
    830 static int prevchr_len;    // byte length of previous char
    831 static int at_start;       // True when on the first character
    832 static int prev_at_start;  // True when on the second character
    833 
    834 // Start parsing at "str".
    835 static void initchr(char *str)
    836 {
    837  regparse = str;
    838  prevchr_len = 0;
    839  curchr = prevprevchr = prevchr = nextchr = -1;
    840  at_start = true;
    841  prev_at_start = false;
    842 }
    843 
    844 // Save the current parse state, so that it can be restored and parsing
    845 // starts in the same state again.
    846 static void save_parse_state(parse_state_T *ps)
    847 {
    848  ps->regparse = regparse;
    849  ps->prevchr_len = prevchr_len;
    850  ps->curchr = curchr;
    851  ps->prevchr = prevchr;
    852  ps->prevprevchr = prevprevchr;
    853  ps->nextchr = nextchr;
    854  ps->at_start = at_start;
    855  ps->prev_at_start = prev_at_start;
    856  ps->regnpar = regnpar;
    857 }
    858 
    859 // Restore a previously saved parse state.
    860 static void restore_parse_state(parse_state_T *ps)
    861 {
    862  regparse = ps->regparse;
    863  prevchr_len = ps->prevchr_len;
    864  curchr = ps->curchr;
    865  prevchr = ps->prevchr;
    866  prevprevchr = ps->prevprevchr;
    867  nextchr = ps->nextchr;
    868  at_start = ps->at_start;
    869  prev_at_start = ps->prev_at_start;
    870  regnpar = ps->regnpar;
    871 }
    872 
    873 // Get the next character without advancing.
    874 static int peekchr(void)
    875 {
    876  static int after_slash = false;
    877 
    878  if (curchr != -1) {
    879    return curchr;
    880  }
    881 
    882  switch (curchr = (uint8_t)regparse[0]) {
    883  case '.':
    884  case '[':
    885  case '~':
    886    // magic when 'magic' is on
    887    if (reg_magic >= MAGIC_ON) {
    888      curchr = Magic(curchr);
    889    }
    890    break;
    891  case '(':
    892  case ')':
    893  case '{':
    894  case '%':
    895  case '+':
    896  case '=':
    897  case '?':
    898  case '@':
    899  case '!':
    900  case '&':
    901  case '|':
    902  case '<':
    903  case '>':
    904  case '#':           // future ext.
    905  case '"':           // future ext.
    906  case '\'':          // future ext.
    907  case ',':           // future ext.
    908  case '-':           // future ext.
    909  case ':':           // future ext.
    910  case ';':           // future ext.
    911  case '`':           // future ext.
    912  case '/':           // Can't be used in / command
    913    // magic only after "\v"
    914    if (reg_magic == MAGIC_ALL) {
    915      curchr = Magic(curchr);
    916    }
    917    break;
    918  case '*':
    919    // * is not magic as the very first character, eg "?*ptr", when
    920    // after '^', eg "/^*ptr" and when after "\(", "\|", "\&".  But
    921    // "\(\*" is not magic, thus must be magic if "after_slash"
    922    if (reg_magic >= MAGIC_ON
    923        && !at_start
    924        && !(prev_at_start && prevchr == Magic('^'))
    925        && (after_slash
    926            || (prevchr != Magic('(')
    927                && prevchr != Magic('&')
    928                && prevchr != Magic('|')))) {
    929      curchr = Magic('*');
    930    }
    931    break;
    932  case '^':
    933    // '^' is only magic as the very first character and if it's after
    934    // "\(", "\|", "\&' or "\n"
    935    if (reg_magic >= MAGIC_OFF
    936        && (at_start
    937            || reg_magic == MAGIC_ALL
    938            || prevchr == Magic('(')
    939            || prevchr == Magic('|')
    940            || prevchr == Magic('&')
    941            || prevchr == Magic('n')
    942            || (no_Magic(prevchr) == '('
    943                && prevprevchr == Magic('%')))) {
    944      curchr = Magic('^');
    945      at_start = true;
    946      prev_at_start = false;
    947    }
    948    break;
    949  case '$':
    950    // '$' is only magic as the very last char and if it's in front of
    951    // either "\|", "\)", "\&", or "\n"
    952    if (reg_magic >= MAGIC_OFF) {
    953      uint8_t *p = (uint8_t *)regparse + 1;
    954      bool is_magic_all = (reg_magic == MAGIC_ALL);
    955 
    956      // ignore \c \C \m \M \v \V and \Z after '$'
    957      while (p[0] == '\\' && (p[1] == 'c' || p[1] == 'C'
    958                              || p[1] == 'm' || p[1] == 'M'
    959                              || p[1] == 'v' || p[1] == 'V'
    960                              || p[1] == 'Z')) {
    961        if (p[1] == 'v') {
    962          is_magic_all = true;
    963        } else if (p[1] == 'm' || p[1] == 'M' || p[1] == 'V') {
    964          is_magic_all = false;
    965        }
    966        p += 2;
    967      }
    968      if (p[0] == NUL
    969          || (p[0] == '\\'
    970              && (p[1] == '|' || p[1] == '&' || p[1] == ')'
    971                  || p[1] == 'n'))
    972          || (is_magic_all
    973              && (p[0] == '|' || p[0] == '&' || p[0] == ')'))
    974          || reg_magic == MAGIC_ALL) {
    975        curchr = Magic('$');
    976      }
    977    }
    978    break;
    979  case '\\': {
    980    int c = (uint8_t)regparse[1];
    981 
    982    if (c == NUL) {
    983      curchr = '\\';  // trailing '\'
    984    } else if (c <= '~' && META_flags[c]) {
    985      // META contains everything that may be magic sometimes,
    986      // except ^ and $ ("\^" and "\$" are only magic after
    987      // "\V").  We now fetch the next character and toggle its
    988      // magicness.  Therefore, \ is so meta-magic that it is
    989      // not in META.
    990      curchr = -1;
    991      prev_at_start = at_start;
    992      at_start = false;  // be able to say "/\*ptr"
    993      regparse++;
    994      after_slash++;
    995      (void)peekchr();
    996      regparse--;
    997      after_slash--;
    998      curchr = toggle_Magic(curchr);
    999    } else if (vim_strchr(REGEXP_ABBR, c)) {
   1000      // Handle abbreviations, like "\t" for TAB -- webb
   1001      curchr = backslash_trans(c);
   1002    } else if (reg_magic == MAGIC_NONE && (c == '$' || c == '^')) {
   1003      curchr = toggle_Magic(c);
   1004    } else {
   1005      // Next character can never be (made) magic?
   1006      // Then backslashing it won't do anything.
   1007      curchr = utf_ptr2char(regparse + 1);
   1008    }
   1009    break;
   1010  }
   1011 
   1012  default:
   1013    curchr = utf_ptr2char(regparse);
   1014  }
   1015 
   1016  return curchr;
   1017 }
   1018 
   1019 // Eat one lexed character.  Do this in a way that we can undo it.
   1020 static void skipchr(void)
   1021 {
   1022  // peekchr() eats a backslash, do the same here
   1023  if (*regparse == '\\') {
   1024    prevchr_len = 1;
   1025  } else {
   1026    prevchr_len = 0;
   1027  }
   1028  if (regparse[prevchr_len] != NUL) {
   1029    // Exclude composing chars that utfc_ptr2len does include.
   1030    prevchr_len += utf_ptr2len(regparse + prevchr_len);
   1031  }
   1032  regparse += prevchr_len;
   1033  prev_at_start = at_start;
   1034  at_start = false;
   1035  prevprevchr = prevchr;
   1036  prevchr = curchr;
   1037  curchr = nextchr;         // use previously unget char, or -1
   1038  nextchr = -1;
   1039 }
   1040 
   1041 // Skip a character while keeping the value of prev_at_start for at_start.
   1042 // prevchr and prevprevchr are also kept.
   1043 static void skipchr_keepstart(void)
   1044 {
   1045  int as = prev_at_start;
   1046  int pr = prevchr;
   1047  int prpr = prevprevchr;
   1048 
   1049  skipchr();
   1050  at_start = as;
   1051  prevchr = pr;
   1052  prevprevchr = prpr;
   1053 }
   1054 
   // Lex the next character of the pattern and consume it.  A lexical
   // analyzer is needed because the meaning of a character depends on the
   // magicness in effect.
   static int getchr(void)
   {
     const int lexed = peekchr();

     skipchr();
     return lexed;
   }
   1064 
   1065 // put character back.  Works only once!
   1066 static void ungetchr(void)
   1067 {
   1068  nextchr = curchr;
   1069  curchr = prevchr;
   1070  prevchr = prevprevchr;
   1071  at_start = prev_at_start;
   1072  prev_at_start = false;
   1073 
   1074  // Backup regparse, so that it's at the same position as before the
   1075  // getchr().
   1076  regparse -= prevchr_len;
   1077 }
   1078 
   1079 // Get and return the value of the hex string at the current position.
   1080 // Return -1 if there is no valid hex number.
   1081 // The position is updated:
   1082 //     blahblah\%x20asdf
   1083 //         before-^ ^-after
   1084 // The parameter controls the maximum number of input characters. This will be
   1085 // 2 when reading a \%x20 sequence and 4 when reading a \%u20AC sequence.
   1086 static int64_t gethexchrs(int maxinputlen)
   1087 {
   1088  int64_t nr = 0;
   1089  int c;
   1090  int i;
   1091 
   1092  for (i = 0; i < maxinputlen; i++) {
   1093    c = (uint8_t)regparse[0];
   1094    if (!ascii_isxdigit(c)) {
   1095      break;
   1096    }
   1097    nr <<= 4;
   1098    nr |= hex2nr(c);
   1099    regparse++;
   1100  }
   1101 
   1102  if (i == 0) {
   1103    return -1;
   1104  }
   1105  return nr;
   1106 }
   1107 
   1108 // Get and return the value of the decimal string immediately after the
   1109 // current position. Return -1 for invalid.  Consumes all digits.
   1110 static int64_t getdecchrs(void)
   1111 {
   1112  int64_t nr = 0;
   1113  int c;
   1114  int i;
   1115 
   1116  for (i = 0;; i++) {
   1117    c = (uint8_t)regparse[0];
   1118    if (c < '0' || c > '9') {
   1119      break;
   1120    }
   1121    nr *= 10;
   1122    nr += c - '0';
   1123    regparse++;
   1124    curchr = -1;     // no longer valid
   1125  }
   1126 
   1127  if (i == 0) {
   1128    return -1;
   1129  }
   1130  return nr;
   1131 }
   1132 
   1133 // get and return the value of the octal string immediately after the current
   1134 // position. Return -1 for invalid, or 0-255 for valid. Smart enough to handle
   1135 // numbers > 377 correctly (for example, 400 is treated as 40) and doesn't
   1136 // treat 8 or 9 as recognised characters. Position is updated:
   1137 //     blahblah\%o210asdf
   1138 //         before-^  ^-after
   1139 static int64_t getoctchrs(void)
   1140 {
   1141  int64_t nr = 0;
   1142  int c;
   1143  int i;
   1144 
   1145  for (i = 0; i < 3 && nr < 040; i++) {
   1146    c = (uint8_t)regparse[0];
   1147    if (c < '0' || c > '7') {
   1148      break;
   1149    }
   1150    nr <<= 3;
   1151    nr |= hex2nr(c);
   1152    regparse++;
   1153  }
   1154 
   1155  if (i == 0) {
   1156    return -1;
   1157  }
   1158  return nr;
   1159 }
   1160 
   1161 // read_limits - Read two integers to be taken as a minimum and maximum.
   1162 // If the first character is '-', then the range is reversed.
   1163 // Should end with 'end'.  If minval is missing, zero is default, if maxval is
   1164 // missing, a very big number is the default.
   1165 static int read_limits(int *minval, int *maxval)
   1166 {
   1167  int reverse = false;
   1168  char *first_char;
   1169  int tmp;
   1170 
   1171  if (*regparse == '-') {
   1172    // Starts with '-', so reverse the range later.
   1173    regparse++;
   1174    reverse = true;
   1175  }
   1176  first_char = regparse;
   1177  *minval = getdigits_int(&regparse, false, 0);
   1178  if (*regparse == ',') {           // There is a comma.
   1179    if (ascii_isdigit(*++regparse)) {
   1180      *maxval = getdigits_int(&regparse, false, MAX_LIMIT);
   1181    } else {
   1182      *maxval = MAX_LIMIT;
   1183    }
   1184  } else if (ascii_isdigit(*first_char)) {
   1185    *maxval = *minval;              // It was \{n} or \{-n}
   1186  } else {
   1187    *maxval = MAX_LIMIT;            // It was \{} or \{-}
   1188  }
   1189  if (*regparse == '\\') {
   1190    regparse++;         // Allow either \{...} or \{...\}
   1191  }
   1192  if (*regparse != '}') {
   1193    EMSG2_RET_FAIL(_("E554: Syntax error in %s{...}"), reg_magic == MAGIC_ALL);
   1194  }
   1195 
   1196  // Reverse the range if there was a '-', or make sure it is in the right
   1197  // order otherwise.
   1198  if ((!reverse && *minval > *maxval) || (reverse && *minval < *maxval)) {
   1199    tmp = *minval;
   1200    *minval = *maxval;
   1201    *maxval = tmp;
   1202  }
   1203  skipchr();            // let's be friends with the lexer again
   1204  return OK;
   1205 }
   1206 
   1207 // vim_regexec and friends
   1208 
   1209 // Global work variables for vim_regexec().
   1210 
   1211 // Sometimes need to save a copy of a line.  Since alloc()/free() is very
   1212 // slow, we keep one allocated piece of memory and only re-allocate it when
   1213 // it's too small.  It's freed in bt_regexec_both() when finished.
   1214 static uint8_t *reg_tofree = NULL;
   1215 static unsigned reg_tofreelen;
   1216 
   1217 // Structure used to store the execution state of the regex engine.
   1218 // Which ones are set depends on whether a single-line or multi-line match is
   1219 // done:
   1220 //                      single-line             multi-line
   1221 // reg_match            &regmatch_T             NULL
   1222 // reg_mmatch           NULL                    &regmmatch_T
   1223 // reg_startp           reg_match->startp       <invalid>
   1224 // reg_endp             reg_match->endp         <invalid>
   1225 // reg_startpos         <invalid>               reg_mmatch->startpos
   1226 // reg_endpos           <invalid>               reg_mmatch->endpos
   1227 // reg_win              NULL                    window in which to search
   1228 // reg_buf              curbuf                  buffer in which to search
   1229 // reg_firstlnum        <invalid>               first line in which to search
   1230 // reg_maxline          0                       last line nr
   1231 // reg_line_lbr         false or true           false
   1232 typedef struct {
   1233  regmatch_T *reg_match;
   1234  regmmatch_T *reg_mmatch;
   1235 
   1236  uint8_t **reg_startp;
   1237  uint8_t **reg_endp;
   1238  lpos_T *reg_startpos;
   1239  lpos_T *reg_endpos;
   1240 
   1241  win_T *reg_win;
   1242  buf_T *reg_buf;
   1243  linenr_T reg_firstlnum;
   1244  linenr_T reg_maxline;
   1245  bool reg_line_lbr;  // "\n" in string is line break
   1246 
   1247  // The current match-position is remembered with these variables:
   1248  linenr_T lnum;  ///< line number, relative to first line
   1249  uint8_t *line;   ///< start of current line
   1250  uint8_t *input;  ///< current input, points into "line"
   1251 
   1252  int need_clear_subexpr;   ///< subexpressions still need to be cleared
   1253  int need_clear_zsubexpr;  ///< extmatch subexpressions still need to be
   1254                            ///< cleared
   1255 
   1256  // Internal copy of 'ignorecase'.  It is set at each call to vim_regexec().
   1257  // Normally it gets the value of "rm_ic" or "rmm_ic", but when the pattern
   1258  // contains '\c' or '\C' the value is overruled.
   1259  bool reg_ic;
   1260 
   1261  // Similar to "reg_ic", but only for 'combining' characters.  Set with \Z
   1262  // flag in the regexp.  Defaults to false, always.
   1263  bool reg_icombine;
   1264 
   1265  bool reg_nobreak;
   1266 
   1267  // Copy of "rmm_maxcol": maximum column to search for a match.  Zero when
   1268  // there is no maximum.
   1269  colnr_T reg_maxcol;
   1270 
   1271  // State for the NFA engine regexec.
   1272  int nfa_has_zend;     ///< NFA regexp \ze operator encountered.
   1273  int nfa_has_backref;  ///< NFA regexp \1 .. \9 encountered.
   1274  int nfa_nsubexpr;     ///< Number of sub expressions actually being used
   1275                        ///< during execution. 1 if only the whole match
   1276                        ///< (subexpr 0) is used.
   1277  // listid is global, so that it increases on recursive calls to
   1278  // nfa_regmatch(), which means we don't have to clear the lastlist field of
   1279  // all the states.
   1280  int nfa_listid;
   1281  int nfa_alt_listid;
   1282 
   1283  int nfa_has_zsubexpr;  ///< NFA regexp has \z( ), set zsubexpr.
   1284 } regexec_T;
   1285 
   1286 static regexec_T rex;
   1287 static bool rex_in_use = false;
   1288 
   1289 static void reg_breakcheck(void)
   1290 {
   1291  if (!rex.reg_nobreak) {
   1292    fast_breakcheck();
   1293  }
   1294 }
   1295 
   1296 // Return true if character 'c' is included in 'iskeyword' option for
   1297 // "reg_buf" buffer.
   1298 static bool reg_iswordc(int c)
   1299 {
   1300  return vim_iswordc_buf(c, rex.reg_buf);
   1301 }
   1302 
   1303 static bool can_f_submatch = false;  ///< true when submatch() can be used
   1304 
   1305 /// These pointers are used for reg_submatch().  Needed for when the
   1306 /// substitution string is an expression that contains a call to substitute()
   1307 /// and submatch().
   1308 typedef struct {
   1309  regmatch_T *sm_match;
   1310  regmmatch_T *sm_mmatch;
   1311  linenr_T sm_firstlnum;
   1312  linenr_T sm_maxline;
   1313  int sm_line_lbr;
   1314 } regsubmatch_T;
   1315 
   1316 static regsubmatch_T rsm;  ///< can only be used when can_f_submatch is true
   1317 
   1318 /// Common code for reg_getline(), reg_getline_len(), reg_getline_submatch() and
   1319 /// reg_getline_submatch_len().
   1320 ///
   1321 /// @param flags  a bitmask that controls what info is to be returned
   1322 ///               and whether or not submatch is in effect.
   1323 static void reg_getline_common(linenr_T lnum, reg_getline_flags_T flags, char **line,
   1324                               colnr_T *length)
   1325 {
   1326  bool get_line = flags & RGLF_LINE;
   1327  bool get_length = flags & RGLF_LENGTH;
   1328  linenr_T firstlnum;
   1329  linenr_T maxline;
   1330 
   1331  if (flags & RGLF_SUBMATCH) {
   1332    firstlnum = rsm.sm_firstlnum + lnum;
   1333    maxline = rsm.sm_maxline;
   1334  } else {
   1335    firstlnum = rex.reg_firstlnum + lnum;
   1336    maxline = rex.reg_maxline;
   1337  }
   1338 
   1339  // when looking behind for a match/no-match lnum is negative. but we
   1340  // can't go before line 1.
   1341  if (firstlnum < 1) {
   1342    if (get_line) {
   1343      *line = NULL;
   1344    }
   1345    if (get_length) {
   1346      *length = 0;
   1347    }
   1348 
   1349    return;
   1350  }
   1351 
   1352  if (lnum > maxline) {
   1353    // must have matched the "\n" in the last line.
   1354    if (get_line) {
   1355      *line = "";
   1356    }
   1357    if (get_length) {
   1358      *length = 0;
   1359    }
   1360 
   1361    return;
   1362  }
   1363 
   1364  if (get_line) {
   1365    *line = ml_get_buf(rex.reg_buf, firstlnum);
   1366  }
   1367  if (get_length) {
   1368    *length = ml_get_buf_len(rex.reg_buf, firstlnum);
   1369  }
   1370 }
   1371 
   1372 /// Get pointer to the line "lnum", which is relative to "reg_firstlnum".
   1373 static char *reg_getline(linenr_T lnum)
   1374 {
   1375  char *line;
   1376  reg_getline_common(lnum, RGLF_LINE, &line, NULL);
   1377  return line;
   1378 }
   1379 
   1380 /// Get length of line "lnum", which is relative to "reg_firstlnum".
   1381 static colnr_T reg_getline_len(linenr_T lnum)
   1382 {
   1383  colnr_T length;
   1384  reg_getline_common(lnum, RGLF_LENGTH, NULL, &length);
   1385  return length;
   1386 }
   1387 
   1388 static uint8_t *reg_startzp[NSUBEXP];  // Workspace to mark beginning
   1389 static uint8_t *reg_endzp[NSUBEXP];    //   and end of \z(...\) matches
   1390 static lpos_T reg_startzpos[NSUBEXP];   // idem, beginning pos
   1391 static lpos_T reg_endzpos[NSUBEXP];     // idem, end pos
   1392 
   1393 // true if using multi-line regexp.
   1394 #define REG_MULTI       (rex.reg_match == NULL)
   1395 
   1396 // Create a new extmatch and mark it as referenced once.
   1397 static reg_extmatch_T *make_extmatch(void)
   1398  FUNC_ATTR_NONNULL_RET
   1399 {
   1400  reg_extmatch_T *em = xcalloc(1, sizeof(reg_extmatch_T));
   1401  em->refcnt = 1;
   1402  return em;
   1403 }
   1404 
   1405 // Add a reference to an extmatch.
   1406 reg_extmatch_T *ref_extmatch(reg_extmatch_T *em)
   1407 {
   1408  if (em != NULL) {
   1409    em->refcnt++;
   1410  }
   1411  return em;
   1412 }
   1413 
   1414 // Remove a reference to an extmatch.  If there are no references left, free
   1415 // the info.
   1416 void unref_extmatch(reg_extmatch_T *em)
   1417 {
   1418  int i;
   1419 
   1420  if (em != NULL && --em->refcnt <= 0) {
   1421    for (i = 0; i < NSUBEXP; i++) {
   1422      xfree(em->matches[i]);
   1423    }
   1424    xfree(em);
   1425  }
   1426 }
   1427 
   1428 // Get class of previous character.
   1429 static int reg_prev_class(void)
   1430 {
   1431  if (rex.input > rex.line) {
   1432    return mb_get_class_tab((char *)rex.input - 1 -
   1433                            utf_head_off((char *)rex.line, (char *)rex.input - 1),
   1434                            rex.reg_buf->b_chartab);
   1435  }
   1436  return -1;
   1437 }
   1438 
   1439 // Return true if the current rex.input position matches the Visual area.
   1440 static bool reg_match_visual(void)
   1441 {
   1442  pos_T top, bot;
   1443  linenr_T lnum;
   1444  colnr_T col;
   1445  win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win;
   1446  int mode;
   1447  colnr_T start, end;
   1448  colnr_T start2, end2;
   1449  colnr_T curswant;
   1450 
   1451  // Check if the buffer is the current buffer and not using a string.
   1452  if (rex.reg_buf != curbuf || VIsual.lnum == 0 || !REG_MULTI) {
   1453    return false;
   1454  }
   1455 
   1456  if (VIsual_active) {
   1457    if (lt(VIsual, wp->w_cursor)) {
   1458      top = VIsual;
   1459      bot = wp->w_cursor;
   1460    } else {
   1461      top = wp->w_cursor;
   1462      bot = VIsual;
   1463    }
   1464    mode = VIsual_mode;
   1465    curswant = wp->w_curswant;
   1466  } else {
   1467    if (lt(curbuf->b_visual.vi_start, curbuf->b_visual.vi_end)) {
   1468      top = curbuf->b_visual.vi_start;
   1469      bot = curbuf->b_visual.vi_end;
   1470    } else {
   1471      top = curbuf->b_visual.vi_end;
   1472      bot = curbuf->b_visual.vi_start;
   1473    }
   1474    // a substitute command may have removed some lines
   1475    if (bot.lnum > curbuf->b_ml.ml_line_count) {
   1476      bot.lnum = curbuf->b_ml.ml_line_count;
   1477    }
   1478    mode = curbuf->b_visual.vi_mode;
   1479    curswant = curbuf->b_visual.vi_curswant;
   1480  }
   1481  lnum = rex.lnum + rex.reg_firstlnum;
   1482  if (lnum < top.lnum || lnum > bot.lnum) {
   1483    return false;
   1484  }
   1485 
   1486  col = (colnr_T)(rex.input - rex.line);
   1487  if (mode == 'v') {
   1488    if ((lnum == top.lnum && col < top.col)
   1489        || (lnum == bot.lnum && col >= bot.col + (*p_sel != 'e'))) {
   1490      return false;
   1491    }
   1492  } else if (mode == Ctrl_V) {
   1493    getvvcol(wp, &top, &start, NULL, &end);
   1494    getvvcol(wp, &bot, &start2, NULL, &end2);
   1495    if (start2 < start) {
   1496      start = start2;
   1497    }
   1498    if (end2 > end) {
   1499      end = end2;
   1500    }
   1501    if (top.col == MAXCOL || bot.col == MAXCOL || curswant == MAXCOL) {
   1502      end = MAXCOL;
   1503    }
   1504 
   1505    // getvvcol() flushes rex.line, need to get it again
   1506    rex.line = (uint8_t *)reg_getline(rex.lnum);
   1507    rex.input = rex.line + col;
   1508 
   1509    colnr_T cols = win_linetabsize(wp, rex.reg_firstlnum + rex.lnum, (char *)rex.line, col);
   1510    if (cols < start || cols > end - (*p_sel == 'e')) {
   1511      return false;
   1512    }
   1513  }
   1514  return true;
   1515 }
   1516 
   1517 // Check the regexp program for its magic number.
   1518 // Return true if it's wrong.
   1519 static int prog_magic_wrong(void)
   1520 {
   1521  regprog_T *prog;
   1522 
   1523  prog = REG_MULTI ? rex.reg_mmatch->regprog : rex.reg_match->regprog;
   1524  if (prog->engine == &nfa_regengine) {
   1525    // For NFA matcher we don't check the magic
   1526    return false;
   1527  }
   1528 
   1529  if (UCHARAT(((bt_regprog_T *)prog)->program) != REGMAGIC) {
   1530    emsg(_(e_re_corr));
   1531    return true;
   1532  }
   1533  return false;
   1534 }
   1535 
   1536 // Cleanup the subexpressions, if this wasn't done yet.
   1537 // This construction is used to clear the subexpressions only when they are
   1538 // used (to increase speed).
   1539 static void cleanup_subexpr(void)
   1540 {
   1541  if (!rex.need_clear_subexpr) {
   1542    return;
   1543  }
   1544 
   1545  if (REG_MULTI) {
   1546    // Use 0xff to set lnum to -1
   1547    memset(rex.reg_startpos, 0xff, sizeof(lpos_T) * NSUBEXP);
   1548    memset(rex.reg_endpos, 0xff, sizeof(lpos_T) * NSUBEXP);
   1549  } else {
   1550    memset(rex.reg_startp, 0, sizeof(char *) * NSUBEXP);
   1551    memset(rex.reg_endp, 0, sizeof(char *) * NSUBEXP);
   1552  }
   1553  rex.need_clear_subexpr = false;
   1554 }
   1555 
   1556 static void cleanup_zsubexpr(void)
   1557 {
   1558  if (!rex.need_clear_zsubexpr) {
   1559    return;
   1560  }
   1561 
   1562  if (REG_MULTI) {
   1563    // Use 0xff to set lnum to -1
   1564    memset(reg_startzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
   1565    memset(reg_endzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
   1566  } else {
   1567    memset(reg_startzp, 0, sizeof(char *) * NSUBEXP);
   1568    memset(reg_endzp, 0, sizeof(char *) * NSUBEXP);
   1569  }
   1570  rex.need_clear_zsubexpr = false;
   1571 }
   1572 
   1573 // Advance rex.lnum, rex.line and rex.input to the next line.
   1574 static void reg_nextline(void)
   1575 {
   1576  rex.line = (uint8_t *)reg_getline(++rex.lnum);
   1577  rex.input = rex.line;
   1578  reg_breakcheck();
   1579 }
   1580 
   1581 // Check whether a backreference matches.
   1582 // Returns RA_FAIL, RA_NOMATCH or RA_MATCH.
   1583 // If "bytelen" is not NULL, it is set to the byte length of the match in the
   1584 // last line.
   1585 // Optional: ignore case if rex.reg_ic is set.
   1586 static int match_with_backref(linenr_T start_lnum, colnr_T start_col, linenr_T end_lnum,
   1587                              colnr_T end_col, int *bytelen)
   1588 {
   1589  linenr_T clnum = start_lnum;
   1590  colnr_T ccol = start_col;
   1591  int len;
   1592  char *p;
   1593 
   1594  if (bytelen != NULL) {
   1595    *bytelen = 0;
   1596  }
   1597  while (true) {
   1598    // Since getting one line may invalidate the other, need to make copy.
   1599    // Slow!
   1600    if (rex.line != reg_tofree) {
   1601      len = (int)strlen((char *)rex.line);
   1602      if (reg_tofree == NULL || len >= (int)reg_tofreelen) {
   1603        len += 50;              // get some extra
   1604        xfree(reg_tofree);
   1605        reg_tofree = xmalloc((size_t)len);
   1606        reg_tofreelen = (unsigned)len;
   1607      }
   1608      STRCPY(reg_tofree, rex.line);
   1609      rex.input = reg_tofree + (rex.input - rex.line);
   1610      rex.line = reg_tofree;
   1611    }
   1612 
   1613    // Get the line to compare with.
   1614    p = reg_getline(clnum);
   1615    assert(p);
   1616 
   1617    if (clnum == end_lnum) {
   1618      len = end_col - ccol;
   1619    } else {
   1620      len = reg_getline_len(clnum) - ccol;
   1621    }
   1622 
   1623    if ((!rex.reg_ic && cstrncmp(p + ccol, (char *)rex.input, &len) != 0)
   1624        || (rex.reg_ic && mb_strnicmp(p + ccol, (char *)rex.input, (size_t)len) != 0)) {
   1625      return RA_NOMATCH;  // doesn't match
   1626    }
   1627    if (bytelen != NULL) {
   1628      *bytelen += len;
   1629    }
   1630    if (clnum == end_lnum) {
   1631      break;  // match and at end!
   1632    }
   1633    if (rex.lnum >= rex.reg_maxline) {
   1634      return RA_NOMATCH;  // text too short
   1635    }
   1636 
   1637    // Advance to next line.
   1638    reg_nextline();
   1639    if (bytelen != NULL) {
   1640      *bytelen = 0;
   1641    }
   1642    clnum++;
   1643    ccol = 0;
   1644    if (got_int) {
   1645      return RA_FAIL;
   1646    }
   1647  }
   1648 
   1649  // found a match!  Note that rex.line may now point to a copy of the line,
   1650  // that should not matter.
   1651  return RA_MATCH;
   1652 }
   1653 
   1654 /// Used in a place where no * or \+ can follow.
   1655 static bool re_mult_next(char *what)
   1656 {
   1657  if (re_multi_type(peekchr()) == MULTI_MULT) {
   1658    semsg(_("E888: (NFA regexp) cannot repeat %s"), what);
   1659    rc_did_emsg = true;
   1660    return false;
   1661  }
   1662  return true;
   1663 }
   1664 
   // One decomposition: up to three code points, unused slots are zero.
   typedef struct {
     int a, b, c;
   } decomp_T;

   // Decompositions for the code points 0xfb20 - 0xfb4f, indexed by
   // (code point - 0xfb20).  Read-only lookup data, hence const.
   // Entries marked UNUSED map a code point to itself.
   static const decomp_T decomp_table[0xfb4f - 0xfb20 + 1] = {
     { 0x5e2, 0, 0 },          // 0xfb20       alt ayin
     { 0x5d0, 0, 0 },          // 0xfb21       alt alef
     { 0x5d3, 0, 0 },          // 0xfb22       alt dalet
     { 0x5d4, 0, 0 },          // 0xfb23       alt he
     { 0x5db, 0, 0 },          // 0xfb24       alt kaf
     { 0x5dc, 0, 0 },          // 0xfb25       alt lamed
     { 0x5dd, 0, 0 },          // 0xfb26       alt mem-sofit
     { 0x5e8, 0, 0 },          // 0xfb27       alt resh
     { 0x5ea, 0, 0 },          // 0xfb28       alt tav
     { '+', 0, 0 },            // 0xfb29       alt plus
     { 0x5e9, 0x5c1, 0 },      // 0xfb2a       shin+shin-dot
     { 0x5e9, 0x5c2, 0 },      // 0xfb2b       shin+sin-dot
     { 0x5e9, 0x5c1, 0x5bc },  // 0xfb2c       shin+shin-dot+dagesh
     { 0x5e9, 0x5c2, 0x5bc },  // 0xfb2d       shin+sin-dot+dagesh
     { 0x5d0, 0x5b7, 0 },      // 0xfb2e       alef+patah
     { 0x5d0, 0x5b8, 0 },      // 0xfb2f       alef+qamats
     // NOTE(review): the Unicode decomposition of U+FB30 (alef with mapiq)
     // appears to be U+05D0 U+05BC; the value below is kept as found --
     // confirm whether alef+hiriq is intentional.
     { 0x5d0, 0x5b4, 0 },      // 0xfb30       alef+hiriq
     { 0x5d1, 0x5bc, 0 },      // 0xfb31       bet+dagesh
     { 0x5d2, 0x5bc, 0 },      // 0xfb32       gimel+dagesh
     { 0x5d3, 0x5bc, 0 },      // 0xfb33       dalet+dagesh
     { 0x5d4, 0x5bc, 0 },      // 0xfb34       he+dagesh
     { 0x5d5, 0x5bc, 0 },      // 0xfb35       vav+dagesh
     { 0x5d6, 0x5bc, 0 },      // 0xfb36       zayin+dagesh
     { 0xfb37, 0, 0 },         // 0xfb37 -- UNUSED
     { 0x5d8, 0x5bc, 0 },      // 0xfb38       tet+dagesh
     { 0x5d9, 0x5bc, 0 },      // 0xfb39       yud+dagesh
     { 0x5da, 0x5bc, 0 },      // 0xfb3a       kaf sofit+dagesh
     { 0x5db, 0x5bc, 0 },      // 0xfb3b       kaf+dagesh
     { 0x5dc, 0x5bc, 0 },      // 0xfb3c       lamed+dagesh
     { 0xfb3d, 0, 0 },         // 0xfb3d -- UNUSED
     { 0x5de, 0x5bc, 0 },      // 0xfb3e       mem+dagesh
     { 0xfb3f, 0, 0 },         // 0xfb3f -- UNUSED
     { 0x5e0, 0x5bc, 0 },      // 0xfb40       nun+dagesh
     { 0x5e1, 0x5bc, 0 },      // 0xfb41       samech+dagesh
     { 0xfb42, 0, 0 },         // 0xfb42 -- UNUSED
     { 0x5e3, 0x5bc, 0 },      // 0xfb43       pe sofit+dagesh
     { 0x5e4, 0x5bc, 0 },      // 0xfb44       pe+dagesh
     { 0xfb45, 0, 0 },         // 0xfb45 -- UNUSED
     { 0x5e6, 0x5bc, 0 },      // 0xfb46       tsadi+dagesh
     { 0x5e7, 0x5bc, 0 },      // 0xfb47       qof+dagesh
     { 0x5e8, 0x5bc, 0 },      // 0xfb48       resh+dagesh
     { 0x5e9, 0x5bc, 0 },      // 0xfb49       shin+dagesh
     { 0x5ea, 0x5bc, 0 },      // 0xfb4a       tav+dagesh
     { 0x5d5, 0x5b9, 0 },      // 0xfb4b       vav+holam
     { 0x5d1, 0x5bf, 0 },      // 0xfb4c       bet+rafe
     { 0x5db, 0x5bf, 0 },      // 0xfb4d       kaf+rafe
     { 0x5e4, 0x5bf, 0 },      // 0xfb4e       pe+rafe
     { 0x5d0, 0x5dc, 0 }       // 0xfb4f       alef-lamed
   };

   // Decompose the character "c" into "*c1", "*c2" and "*c3".
   // Code points 0xfb20 - 0xfb4f are looked up in decomp_table; any other
   // character yields itself in "*c1" and zero in "*c2" and "*c3".
   // All three output pointers must be valid.
   static void mb_decompose(int c, int *c1, int *c2, int *c3)
   {
     if (c < 0xfb20 || c > 0xfb4f) {
       *c1 = c;
       *c2 = 0;
       *c3 = 0;
       return;
     }

     const decomp_T *const entry = &decomp_table[c - 0xfb20];

     *c1 = entry->a;
     *c2 = entry->b;
     *c3 = entry->c;
   }
   1736 
   1737 /// Compare two strings, ignore case if rex.reg_ic set.
   1738 /// Return 0 if strings match, non-zero otherwise.
   1739 /// Correct the length "*n" when composing characters are ignored
   1740 /// or when both utf codepoints are considered equal because of
   1741 /// case-folding but have different length (e.g. 's' and 'Å¿')
   1742 static int cstrncmp(char *s1, char *s2, int *n)
   1743 {
   1744  int result;
   1745 
   1746  if (!rex.reg_ic) {
   1747    result = strncmp(s1, s2, (size_t)(*n));
   1748  } else {
   1749    char *p = s1;
   1750    int n2 = 0;
   1751    int n1 = *n;
   1752    // count the number of characters for byte-length of s1
   1753    while (n1 > 0 && *p != NUL) {
   1754      n1 -= utfc_ptr2len(s1);
   1755      MB_PTR_ADV(p);
   1756      n2++;
   1757    }
   1758    // count the number of bytes to advance the same number of chars for s2
   1759    p = s2;
   1760    while (n2-- > 0 && *p != NUL) {
   1761      MB_PTR_ADV(p);
   1762    }
   1763 
   1764    n2 = (int)(p - s2);
   1765 
   1766    result = utf_strnicmp(s1, s2, (size_t)(*n), (size_t)n2);
   1767    if (result == 0 && n2 < *n) {
   1768      *n = n2;
   1769    }
   1770  }
   1771 
   1772  // if it failed and it's utf8 and we want to combineignore:
   1773  if (result != 0 && rex.reg_icombine) {
   1774    const char *str1, *str2;
   1775    int c1, c2, c11, c12;
   1776    int junk;
   1777 
   1778    // we have to handle the strcmp ourselves, since it is necessary to
   1779    // deal with the composing characters by ignoring them:
   1780    str1 = s1;
   1781    str2 = s2;
   1782    c1 = c2 = 0;
   1783    while ((int)(str1 - s1) < *n) {
   1784      c1 = mb_ptr2char_adv(&str1);
   1785      c2 = mb_ptr2char_adv(&str2);
   1786 
   1787      // decompose the character if necessary, into 'base' characters
   1788      // because I don't care about Arabic, I will hard-code the Hebrew
   1789      // which I *do* care about!  So sue me...
   1790      if (c1 != c2 && (!rex.reg_ic || utf_fold(c1) != utf_fold(c2))) {
   1791        // decomposition necessary?
   1792        mb_decompose(c1, &c11, &junk, &junk);
   1793        mb_decompose(c2, &c12, &junk, &junk);
   1794        c1 = c11;
   1795        c2 = c12;
   1796        if (c11 != c12 && (!rex.reg_ic || utf_fold(c11) != utf_fold(c12))) {
   1797          break;
   1798        }
   1799      }
   1800    }
   1801    result = c2 - c1;
   1802    if (result == 0) {
   1803      *n = (int)(str2 - s2);
   1804    }
   1805  }
   1806 
   1807  return result;
   1808 }
   1809 
   1810 /// Wrapper around strchr which accounts for case-insensitive searches and
   1811 /// non-ASCII characters.
   1812 ///
   1813 /// This function is used a lot for simple searches, keep it fast!
   1814 ///
   1815 /// @param  s  string to search
   1816 /// @param  c  character to find in @a s
   1817 ///
   1818 /// @return  NULL if no match, otherwise pointer to the position in @a s
   1819 static inline char *cstrchr(const char *const s, const int c)
   1820  FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
   1821  FUNC_ATTR_ALWAYS_INLINE
   1822 {
   1823  if (!rex.reg_ic) {
   1824    return vim_strchr(s, c);
   1825  }
   1826 
   1827  int cc, lc;
   1828  if (c > 0x80) {
   1829    cc = utf_fold(c);
   1830    lc = cc;
   1831  } else if (ASCII_ISUPPER(c)) {
   1832    cc = TOLOWER_ASC(c);
   1833    lc = cc;
   1834  } else if (ASCII_ISLOWER(c)) {
   1835    cc = TOUPPER_ASC(c);
   1836    lc = c;
   1837  } else {
   1838    return vim_strchr(s, c);
   1839  }
   1840 
   1841  for (const char *p = s; *p != NUL; p += utfc_ptr2len(p)) {
   1842    const int uc = utf_ptr2char(p);
   1843    if (c > 0x80 || uc > 0x80) {
   1844      // Do not match an illegal byte.  E.g. 0xff matches 0xc3 0xbf, not 0xff.
   1845      // Compare with lower case of the character.
   1846      if ((uc < 0x80 || uc != (uint8_t)(*p)) && utf_fold(uc) == lc) {
   1847        return (char *)p;
   1848      }
   1849    } else if ((uint8_t)(*p) == c || (uint8_t)(*p) == cc) {
   1850      return (char *)p;
   1851    }
   1852  }
   1853 
   1854  return NULL;
   1855 }
   1856 
   1857 ////////////////////////////////////////////////////////////////
   1858 //                    regsub stuff                            //
   1859 ////////////////////////////////////////////////////////////////
   1860 
   1861 static void do_upper(int *d, int c)
   1862 {
   1863  *d = mb_toupper(c);
   1864 }
   1865 
   1866 static void do_lower(int *d, int c)
   1867 {
   1868  *d = mb_tolower(c);
   1869 }
   1870 
   1871 /// regtilde(): Replace tildes in the pattern by the old pattern.
   1872 ///
   1873 /// Short explanation of the tilde: It stands for the previous replacement
   1874 /// pattern.  If that previous pattern also contains a ~ we should go back a
   1875 /// step further...  But we insert the previous pattern into the current one
   1876 /// and remember that.
   1877 /// This still does not handle the case where "magic" changes.  So require the
   1878 /// user to keep his hands off of "magic".
   1879 ///
   1880 /// The tildes are parsed once before the first call to vim_regsub().
   1881 char *regtilde(char *source, int magic, bool preview)
   1882 {
   1883  char *newsub = source;
   1884  size_t newsublen = 0;
   1885  char tilde[3] = { '~', NUL, NUL };
   1886  size_t tildelen = 1;
   1887  bool error = false;
   1888 
   1889  if (!magic) {
   1890    tilde[0] = '\\';
   1891    tilde[1] = '~';
   1892    tilde[2] = NUL;
   1893    tildelen = 2;
   1894  }
   1895 
   1896  char *p;
   1897  for (p = newsub; *p; p++) {
   1898    if (strncmp(p, tilde, tildelen) == 0) {
   1899      size_t prefixlen = (size_t)(p - newsub);  // not including the tilde
   1900      char *postfix = p + tildelen;
   1901      size_t postfixlen;
   1902      size_t tmpsublen;
   1903 
   1904      if (newsublen == 0) {
   1905        newsublen = strlen(newsub);
   1906      }
   1907      newsublen -= tildelen;
   1908      postfixlen = newsublen - prefixlen;
   1909      tmpsublen = prefixlen + reg_prev_sublen + postfixlen;
   1910 
   1911      if (tmpsublen > 0 && reg_prev_sub != NULL) {
   1912        // Avoid making the text longer than MAXCOL, it will cause
   1913        // trouble at some point.
   1914        if (tmpsublen > MAXCOL) {
   1915          emsg(_(e_resulting_text_too_long));
   1916          error = true;
   1917          break;
   1918        }
   1919 
   1920        char *tmpsub = xmalloc(tmpsublen + 1);
   1921        // copy prefix
   1922        memmove(tmpsub, newsub, prefixlen);
   1923        // interpret tilde
   1924        memmove(tmpsub + prefixlen, reg_prev_sub, reg_prev_sublen);
   1925        // copy postfix
   1926        STRCPY(tmpsub + prefixlen + reg_prev_sublen, postfix);
   1927 
   1928        if (newsub != source) {  // allocated newsub before
   1929          xfree(newsub);
   1930        }
   1931        newsub = tmpsub;
   1932        newsublen = tmpsublen;
   1933        p = newsub + prefixlen + reg_prev_sublen;
   1934      } else {
   1935        memmove(p, postfix, postfixlen + 1);  // remove the tilde (+1 for the NUL)
   1936      }
   1937      p--;
   1938    } else {
   1939      if (*p == '\\' && p[1]) {  // skip escaped characters
   1940        p++;
   1941      }
   1942      p += utfc_ptr2len(p) - 1;
   1943    }
   1944  }
   1945 
   1946  if (error) {
   1947    if (newsub != source) {
   1948      xfree(newsub);
   1949    }
   1950    return source;
   1951  }
   1952 
   1953  // Only change reg_prev_sub when not previewing.
   1954  if (!preview) {
   1955    // Store a copy of newsub  in reg_prev_sub.  It is always allocated,
   1956    // because recursive calls may make the returned string invalid.
   1957    // Only store it if there something to store.
   1958    newsublen = (size_t)(p - newsub);
   1959    if (newsublen == 0) {
   1960      XFREE_CLEAR(reg_prev_sub);
   1961    } else {
   1962      xfree(reg_prev_sub);
   1963      reg_prev_sub = xstrnsave(newsub, newsublen);
   1964    }
   1965    reg_prev_sublen = newsublen;
   1966  }
   1967 
   1968  return newsub;
   1969 }
   1970 
   1971 /// Put the submatches in "argv[argskip]" which is a list passed into
   1972 /// call_func() by vim_regsub_both().
   1973 static int fill_submatch_list(int argc FUNC_ATTR_UNUSED, typval_T *argv, int argskip, ufunc_T *fp)
   1974  FUNC_ATTR_NONNULL_ALL
   1975 {
   1976  typval_T *listarg = argv + argskip;
   1977 
   1978  if (!fp->uf_varargs && fp->uf_args.ga_len <= argskip) {
   1979    // called function doesn't take a submatches argument
   1980    return argskip;
   1981  }
   1982 
   1983  // Relies on sl_list to be the first item in staticList10_T.
   1984  tv_list_init_static10((staticList10_T *)listarg->vval.v_list);
   1985 
   1986  // There are always 10 list items in staticList10_T.
   1987  listitem_T *li = tv_list_first(listarg->vval.v_list);
   1988  for (int i = 0; i < 10; i++) {
   1989    char *s = rsm.sm_match->startp[i];
   1990    if (s == NULL || rsm.sm_match->endp[i] == NULL) {
   1991      s = NULL;
   1992    } else {
   1993      s = xstrnsave(s, (size_t)(rsm.sm_match->endp[i] - s));
   1994    }
   1995    TV_LIST_ITEM_TV(li)->v_type = VAR_STRING;
   1996    TV_LIST_ITEM_TV(li)->vval.v_string = s;
   1997    li = TV_LIST_ITEM_NEXT(argv->vval.v_list, li);
   1998  }
   1999  return argskip + 1;
   2000 }
   2001 
   2002 static void clear_submatch_list(staticList10_T *sl)
   2003 {
   2004  TV_LIST_ITER(&sl->sl_list, li, {
   2005    xfree(TV_LIST_ITEM_TV(li)->vval.v_string);
   2006  });
   2007 }
   2008 
   2009 /// vim_regsub() - perform substitutions after a vim_regexec() or
   2010 /// vim_regexec_multi() match.
   2011 ///
   2012 /// If "flags" has REGSUB_COPY really copy into "dest[destlen]".
   2013 /// Otherwise nothing is copied, only compute the length of the result.
   2014 ///
   2015 /// If "flags" has REGSUB_MAGIC then behave like 'magic' is set.
   2016 ///
   2017 /// If "flags" has REGSUB_BACKSLASH a backslash will be removed later, need to
   2018 /// double them to keep them, and insert a backslash before a CR to avoid it
   2019 /// being replaced with a line break later.
   2020 ///
   2021 /// Note: The matched text must not change between the call of
   2022 /// vim_regexec()/vim_regexec_multi() and vim_regsub()!  It would make the back
   2023 /// references invalid!
   2024 ///
   2025 /// Returns the size of the replacement, including terminating NUL.
   2026 int vim_regsub(regmatch_T *rmp, char *source, typval_T *expr, char *dest, int destlen, int flags)
   2027 {
   2028  regexec_T rex_save;
   2029  bool rex_in_use_save = rex_in_use;
   2030 
   2031  if (rex_in_use) {
   2032    // Being called recursively, save the state.
   2033    rex_save = rex;
   2034  }
   2035  rex_in_use = true;
   2036 
   2037  rex.reg_match = rmp;
   2038  rex.reg_mmatch = NULL;
   2039  rex.reg_maxline = 0;
   2040  rex.reg_buf = curbuf;
   2041  rex.reg_line_lbr = true;
   2042  int result = vim_regsub_both(source, expr, dest, destlen, flags);
   2043 
   2044  rex_in_use = rex_in_use_save;
   2045  if (rex_in_use) {
   2046    rex = rex_save;
   2047  }
   2048 
   2049  return result;
   2050 }
   2051 
   2052 int vim_regsub_multi(regmmatch_T *rmp, linenr_T lnum, char *source, char *dest, int destlen,
   2053                     int flags)
   2054 {
   2055  regexec_T rex_save;
   2056  bool rex_in_use_save = rex_in_use;
   2057 
   2058  if (rex_in_use) {
   2059    // Being called recursively, save the state.
   2060    rex_save = rex;
   2061  }
   2062  rex_in_use = true;
   2063 
   2064  rex.reg_match = NULL;
   2065  rex.reg_mmatch = rmp;
   2066  rex.reg_buf = curbuf;  // always works on the current buffer!
   2067  rex.reg_firstlnum = lnum;
   2068  rex.reg_maxline = curbuf->b_ml.ml_line_count - lnum;
   2069  rex.reg_line_lbr = false;
   2070  int result = vim_regsub_both(source, NULL, dest, destlen, flags);
   2071 
   2072  rex_in_use = rex_in_use_save;
   2073  if (rex_in_use) {
   2074    rex = rex_save;
   2075  }
   2076 
   2077  return result;
   2078 }
   2079 
   2080 // When nesting more than a couple levels it's probably a mistake.
   2081 #define MAX_REGSUB_NESTING 4
   2082 static char *eval_result[MAX_REGSUB_NESTING] = { NULL, NULL, NULL, NULL };
   2083 
   2084 #if defined(EXITFREE)
   2085 void free_resub_eval_result(void)
   2086 {
   2087  for (int i = 0; i < MAX_REGSUB_NESTING; i++) {
   2088    XFREE_CLEAR(eval_result[i]);
   2089  }
   2090 }
   2091 #endif
   2092 
        /// Common implementation of vim_regsub() and vim_regsub_multi().
        ///
        /// The match to use is taken from the global "rex", which the callers set up
        /// before calling.
        ///
        /// Without REGSUB_COPY in "flags" only the length of the result is computed:
        /// "dst" is advanced but nothing is written through it.  With REGSUB_COPY the
        /// result is written to "dest" and every write is checked against "destlen".
        ///
        /// @param source   substitute string; when "expr" is NULL and it starts with
        ///                 "\=" the remainder is evaluated as an expression
        /// @param expr     when not NULL: a function or partial that is called to
        ///                 produce the replacement
        /// @param dest     output buffer, must not be NULL even when not copying
        /// @param destlen  size of "dest"
        /// @param flags    REGSUB_COPY, REGSUB_MAGIC and REGSUB_BACKSLASH bits
        ///
        /// @return  the length of the result plus one for the terminating NUL, or
        ///          0 when an error was detected.
   2093 static int vim_regsub_both(char *source, typval_T *expr, char *dest, int destlen, int flags)
   2094 {
   2095  char *src;
   2096  char *dst;
   2097  char *s;
   2098  int c;
   2099  int cc;
   2100  int no = -1;
   2101  fptr_T func_all = (fptr_T)NULL;
   2102  fptr_T func_one = (fptr_T)NULL;
   2103  linenr_T clnum = 0;           // init for GCC
   2104  int len = 0;                  // init for GCC
         // Depth of expression evaluations in progress; selects the eval_result[]
         // slot used by this call.
   2105  static int nesting = 0;
   2106  bool copy = flags & REGSUB_COPY;
   2107 
   2108  // Be paranoid...
   2109  if ((source == NULL && expr == NULL) || dest == NULL) {
   2110    emsg(_(e_null));
   2111    return 0;
   2112  }
   2113  if (prog_magic_wrong()) {
   2114    return 0;
   2115  }
   2116  if (nesting == MAX_REGSUB_NESTING) {
   2117    emsg(_(e_substitute_nesting_too_deep));
   2118    return 0;
   2119  }
         // Remember the slot for this call, "nesting" changes while the expression
         // is being evaluated.
   2120  int nested = nesting;
   2121  src = source;
   2122  dst = dest;
   2123 
   2124  // When the substitute part starts with "\=" evaluate it as an expression.
   2125  if (expr != NULL || (source[0] == '\\' && source[1] == '=')) {
   2126    // To make sure that the length doesn't change between checking the
   2127    // length and copying the string, and to speed up things, the
   2128    // resulting string is saved from the call with
   2129    // "flags & REGSUB_COPY" == 0 to the call with
   2130    // "flags & REGSUB_COPY" != 0.
   2131    if (copy) {
           // Copying call: hand out the saved string, but only when it fits in
           // "dest" together with its NUL.
   2132      if (eval_result[nested] != NULL) {
   2133        size_t eval_len = strlen(eval_result[nested]);
   2134        if (eval_len < (size_t)destlen) {
   2135          STRCPY(dest, eval_result[nested]);
   2136          dst += eval_len;
   2137          XFREE_CLEAR(eval_result[nested]);
   2138        }
   2139      }
   2140    } else {
   2141      const bool prev_can_f_submatch = can_f_submatch;
   2142      regsubmatch_T rsm_save;
   2143 
   2144      XFREE_CLEAR(eval_result[nested]);
   2145 
   2146      // The expression may contain substitute(), which calls us
   2147      // recursively.  Make sure submatch() gets the text from the first
   2148      // level.
   2149      if (can_f_submatch) {
   2150        rsm_save = rsm;
   2151      }
   2152      can_f_submatch = true;
   2153      rsm.sm_match = rex.reg_match;
   2154      rsm.sm_mmatch = rex.reg_mmatch;
   2155      rsm.sm_firstlnum = rex.reg_firstlnum;
   2156      rsm.sm_maxline = rex.reg_maxline;
   2157      rsm.sm_line_lbr = rex.reg_line_lbr;
   2158 
   2159      // Although unlikely, it is possible that the expression invokes a
   2160      // substitute command (it might fail, but still).  Therefore keep
   2161      // an array of eval results.
   2162      nesting++;
   2163 
   2164      if (expr != NULL) {
             // Call the function or partial with one argument: the list of
             // submatches, which fill_submatch_list() fills in on demand.
   2165        typval_T argv[2];
   2166        typval_T rettv;
   2167        staticList10_T matchList = TV_LIST_STATIC10_INIT;
   2168        rettv.v_type = VAR_STRING;
   2169        rettv.vval.v_string = NULL;
   2170        argv[0].v_type = VAR_LIST;
   2171        argv[0].vval.v_list = &matchList.sl_list;
   2172        funcexe_T funcexe = FUNCEXE_INIT;
   2173        funcexe.fe_argv_func = fill_submatch_list;
   2174        funcexe.fe_evaluate = true;
   2175        if (expr->v_type == VAR_FUNC) {
   2176          s = expr->vval.v_string;
   2177          call_func(s, -1, &rettv, 1, argv, &funcexe);
   2178        } else if (expr->v_type == VAR_PARTIAL) {
   2179          partial_T *partial = expr->vval.v_partial;
   2180 
   2181          s = partial_name(partial);
   2182          funcexe.fe_partial = partial;
   2183          call_func(s, -1, &rettv, 1, argv, &funcexe);
   2184        }
   2185        if (tv_list_len(&matchList.sl_list) > 0) {
   2186          // fill_submatch_list() was called.
   2187          clear_submatch_list(&matchList);
   2188        }
   2189        if (rettv.v_type == VAR_UNKNOWN) {
   2190          // something failed, no need to report another error
   2191          eval_result[nested] = NULL;
   2192        } else {
   2193          char buf[NUMBUFLEN];
   2194          eval_result[nested] = (char *)tv_get_string_buf_chk(&rettv, buf);
   2195          if (eval_result[nested] != NULL) {
                   // Keep an allocated copy, "rettv" is cleared below.
   2196            eval_result[nested] = xstrdup(eval_result[nested]);
   2197          }
   2198        }
   2199        tv_clear(&rettv);
   2200      } else {
             // Skip the leading "\=" and evaluate the rest.
   2201        eval_result[nested] = eval_to_string(source + 2, true, false);
   2202      }
   2203      nesting--;
   2204 
   2205      if (eval_result[nested] != NULL) {
   2206        int had_backslash = false;
   2207 
   2208        for (s = eval_result[nested]; *s != NUL; MB_PTR_ADV(s)) {
   2209          // Change NL to CR, so that it becomes a line break,
   2210          // unless called from vim_regexec_nl().
   2211          // Skip over a backslashed character.
   2212          if (*s == NL && !rsm.sm_line_lbr) {
   2213            *s = CAR;
   2214          } else if (*s == '\\' && s[1] != NUL) {
   2215            s++;
   2216            // Change NL to CR here too, so that this works:
   2217            // :s/abc\\\ndef/\="aaa\\\nbbb"/  on text:
   2218            //   abc{backslash}
   2219            //   def
   2220            // Not when called from vim_regexec_nl().
   2221            if (*s == NL && !rsm.sm_line_lbr) {
   2222              *s = CAR;
   2223            }
   2224            had_backslash = true;
   2225          }
   2226        }
   2227        if (had_backslash && (flags & REGSUB_BACKSLASH)) {
   2228          // Backslashes will be consumed, need to double them.
   2229          s = vim_strsave_escaped(eval_result[nested], "\\");
   2230          xfree(eval_result[nested]);
   2231          eval_result[nested] = s;
   2232        }
   2233 
             // Only the length is reported by this call; the text stays in
             // eval_result[nested] for the copying call.
   2234        dst += strlen(eval_result[nested]);
   2235      }
   2236 
   2237      can_f_submatch = prev_can_f_submatch;
   2238      if (can_f_submatch) {
   2239        rsm = rsm_save;
   2240      }
   2241    }
   2242  } else {
         // Ordinary substitute string: walk over it.  "no" is set to a submatch
         // number when a back reference is found, it stays negative otherwise.
   2243    while ((c = (uint8_t)(*src++)) != NUL) {
   2244      if (c == '&' && (flags & REGSUB_MAGIC)) {
   2245        no = 0;
   2246      } else if (c == '\\' && *src != NUL) {
   2247        if (*src == '&' && !(flags & REGSUB_MAGIC)) {
   2248          src++;
   2249          no = 0;
   2250        } else if ('0' <= *src && *src <= '9') {
   2251          no = *src++ - '0';
   2252        } else if (vim_strchr("uUlLeE", (uint8_t)(*src))) {
               // Case modifiers: "func_one" applies to the next character only,
               // "func_all" stays active until it is reset by \e or \E.
   2253          switch (*src++) {
   2254          case 'u':
   2255            func_one = do_upper;
   2256            continue;
   2257          case 'U':
   2258            func_all = do_upper;
   2259            continue;
   2260          case 'l':
   2261            func_one = do_lower;
   2262            continue;
   2263          case 'L':
   2264            func_all = do_lower;
   2265            continue;
   2266          case 'e':
   2267          case 'E':
   2268            func_one = func_all = (fptr_T)NULL;
   2269            continue;
   2270          }
   2271        }
   2272      }
   2273      if (no < 0) {           // Ordinary character.
   2274        if (c == K_SPECIAL && src[0] != NUL && src[1] != NUL) {
   2275          // Copy a special key as-is.
   2276          if (copy) {
   2277            if (dst + 3 > dest + destlen) {
   2278              iemsg("vim_regsub_both(): not enough space");
   2279              return 0;
   2280            }
   2281            *dst++ = (char)c;
   2282            *dst++ = *src++;
   2283            *dst++ = *src++;
   2284          } else {
   2285            dst += 3;
   2286            src += 2;
   2287          }
   2288          continue;
   2289        }
   2290 
   2291        if (c == '\\' && *src != NUL) {
   2292          // Check for abbreviations -- webb
   2293          switch (*src) {
   2294          case 'r':
   2295            c = CAR;        ++src;  break;
   2296          case 'n':
   2297            c = NL;         ++src;  break;
   2298          case 't':
   2299            c = TAB;        ++src;  break;
   2300          // Oh no!  \e already has meaning in subst pat :-(
   2301          // case 'e':   c = ESC;        ++src;  break;
   2302          case 'b':
   2303            c = Ctrl_H;     ++src;  break;
   2304 
   2305          // If "backslash" is true the backslash will be removed
   2306          // later.  Used to insert a literal CR.
   2307          default:
   2308            if (flags & REGSUB_BACKSLASH) {
   2309              if (copy) {
   2310                if (dst + 1 > dest + destlen) {
   2311                  iemsg("vim_regsub_both(): not enough space");
   2312                  return 0;
   2313                }
   2314                *dst = '\\';
   2315              }
   2316              dst++;
   2317            }
   2318            c = (uint8_t)(*src++);
   2319          }
   2320        } else {
   2321          c = utf_ptr2char(src - 1);
   2322        }
   2323 
   2324        // Write to buffer, if copy is set.
   2325        if (func_one != NULL) {
   2326          func_one(&cc, c);
   2327          func_one = NULL;
   2328        } else if (func_all != NULL) {
   2329          func_all(&cc, c);
   2330        } else {
   2331          // just copy
   2332          cc = c;
   2333        }
   2334 
             // "totlen" covers the source character including composing
             // characters, "charlen" is the size of the (case-changed) result.
   2335        int totlen = utfc_ptr2len(src - 1);
   2336        int charlen = utf_char2len(cc);
   2337 
   2338        if (copy) {
   2339          if (dst + charlen > dest + destlen) {
   2340            iemsg("vim_regsub_both(): not enough space");
   2341            return 0;
   2342          }
   2343          utf_char2bytes(cc, dst);
   2344        }
             // Leave "dst" on the last byte of the character, the final
             // "dst++" below steps past it.
   2345        dst += charlen - 1;
   2346        int clen = utf_ptr2len(src - 1);
   2347 
   2348        // If the character length is shorter than "totlen", there
   2349        // are composing characters; copy them as-is.
   2350        if (clen < totlen) {
   2351          if (copy) {
   2352            if (dst + totlen - clen > dest + destlen) {
   2353              iemsg("vim_regsub_both(): not enough space");
   2354              return 0;
   2355            }
   2356            memmove(dst + 1, src - 1 + clen, (size_t)(totlen - clen));
   2357          }
   2358          dst += totlen - clen;
   2359        }
   2360        src += totlen - 1;
   2361        dst++;
   2362      } else {
             // Back reference: find the start "s" of submatch "no" and the
             // number of bytes "len" to take from its first line.
   2363        if (REG_MULTI) {
   2364          clnum = rex.reg_mmatch->startpos[no].lnum;
   2365          if (clnum < 0 || rex.reg_mmatch->endpos[no].lnum < 0) {
   2366            s = NULL;
   2367          } else {
   2368            s = reg_getline(clnum) + rex.reg_mmatch->startpos[no].col;
   2369            if (rex.reg_mmatch->endpos[no].lnum == clnum) {
   2370              len = rex.reg_mmatch->endpos[no].col
   2371                    - rex.reg_mmatch->startpos[no].col;
   2372            } else {
   2373              len = reg_getline_len(clnum) - rex.reg_mmatch->startpos[no].col;
   2374            }
   2375          }
   2376        } else {
   2377          s = rex.reg_match->startp[no];
   2378          if (rex.reg_match->endp[no] == NULL) {
   2379            s = NULL;
   2380          } else {
   2381            len = (int)(rex.reg_match->endp[no] - s);
   2382          }
   2383        }
   2384        if (s != NULL) {
   2385          while (true) {
   2386            if (len == 0) {
                   // End of this line's part.  A multi-line match continues on
                   // the next line with a CR in between, until the end line is
                   // done.
   2387              if (REG_MULTI) {
   2388                if (rex.reg_mmatch->endpos[no].lnum == clnum) {
   2389                  break;
   2390                }
   2391                if (copy) {
   2392                  if (dst + 1 > dest + destlen) {
   2393                    iemsg("vim_regsub_both(): not enough space");
   2394                    return 0;
   2395                  }
   2396                  *dst = CAR;
   2397                }
   2398                dst++;
   2399                s = reg_getline(++clnum);
   2400                if (rex.reg_mmatch->endpos[no].lnum == clnum) {
   2401                  len = rex.reg_mmatch->endpos[no].col;
   2402                } else {
   2403                  len = reg_getline_len(clnum);
   2404                }
   2405              } else {
   2406                break;
   2407              }
   2408            } else if (*s == NUL) {  // we hit NUL.
   2409              if (copy) {
   2410                iemsg(_(e_re_damg));
   2411              }
                   // Give up; the terminating NUL is not written on this path.
   2412              goto exit;
   2413            } else {
   2414              if ((flags & REGSUB_BACKSLASH) && (*s == CAR || *s == '\\')) {
   2415                // Insert a backslash in front of a CR, otherwise
   2416                // it will be replaced by a line break.
   2417                // Number of backslashes will be halved later,
   2418                // double them here.
   2419                if (copy) {
   2420                  if (dst + 2 > dest + destlen) {
   2421                    iemsg("vim_regsub_both(): not enough space");
   2422                    return 0;
   2423                  }
   2424                  dst[0] = '\\';
   2425                  dst[1] = *s;
   2426                }
   2427                dst += 2;
   2428              } else {
   2429                c = utf_ptr2char(s);
   2430 
   2431                if (func_one != (fptr_T)NULL) {
   2432                  func_one(&cc, c);
   2433                  func_one = NULL;
   2434                } else if (func_all != (fptr_T)NULL) {
   2435                  func_all(&cc, c);
   2436                } else {  // just copy
   2437                  cc = c;
   2438                }
   2439 
   2440                {
   2441                  int l;
   2442                  int charlen;
   2443 
   2444                  // Copy composing characters separately, one
   2445                  // at a time.
   2446                  l = utf_ptr2len(s) - 1;
   2447 
   2448                  s += l;
   2449                  len -= l;
   2450                  charlen = utf_char2len(cc);
   2451                  if (copy) {
   2452                    if (dst + charlen > dest + destlen) {
   2453                      iemsg("vim_regsub_both(): not enough space");
   2454                      return 0;
   2455                    }
   2456                    utf_char2bytes(cc, dst);
   2457                  }
   2458                  dst += charlen - 1;
   2459                }
   2460                dst++;
   2461              }
   2462 
   2463              s++;
   2464              len--;
   2465            }
   2466          }
   2467        }
   2468        no = -1;
   2469      }
   2470    }
   2471  }
   2472  if (copy) {
   2473    *dst = NUL;
   2474  }
   2475 
   2476 exit:
         // Number of bytes produced so far plus one for the NUL.
   2477  return (int)((dst - dest) + 1);
   2478 }
   2479 
   2480 static char *reg_getline_submatch(linenr_T lnum)
   2481 {
   2482  char *line;
   2483  reg_getline_common(lnum, RGLF_LINE | RGLF_SUBMATCH, &line, NULL);
   2484  return line;
   2485 }
   2486 
   2487 static colnr_T reg_getline_submatch_len(linenr_T lnum)
   2488 {
   2489  colnr_T length;
   2490  reg_getline_common(lnum, RGLF_LENGTH | RGLF_SUBMATCH, NULL, &length);
   2491  return length;
   2492 }
   2493 
   2494 /// Used for the submatch() function: get the string from the n'th submatch in
   2495 /// allocated memory.
   2496 ///
   2497 /// @return  NULL when not in a ":s" command and for a non-existing submatch.
   2498 char *reg_submatch(int no)
   2499 {
   2500  char *retval = NULL;
   2501  char *s;
   2502  int round;
   2503  linenr_T lnum;
   2504 
   2505  if (!can_f_submatch || no < 0) {
   2506    return NULL;
   2507  }
   2508 
   2509  if (rsm.sm_match == NULL) {
   2510    ssize_t len;
   2511 
   2512    // First round: compute the length and allocate memory.
   2513    // Second round: copy the text.
   2514    for (round = 1; round <= 2; round++) {
   2515      lnum = rsm.sm_mmatch->startpos[no].lnum;
   2516      if (lnum < 0 || rsm.sm_mmatch->endpos[no].lnum < 0) {
   2517        return NULL;
   2518      }
   2519 
   2520      s = reg_getline_submatch(lnum);
   2521      if (s == NULL) {  // anti-crash check, cannot happen?
   2522        break;
   2523      }
   2524      s += rsm.sm_mmatch->startpos[no].col;
   2525      if (rsm.sm_mmatch->endpos[no].lnum == lnum) {
   2526        // Within one line: take form start to end col.
   2527        len = rsm.sm_mmatch->endpos[no].col - rsm.sm_mmatch->startpos[no].col;
   2528        if (round == 2) {
   2529          xmemcpyz(retval, s, (size_t)len);
   2530        }
   2531        len++;
   2532      } else {
   2533        // Multiple lines: take start line from start col, middle
   2534        // lines completely and end line up to end col.
   2535        len = reg_getline_submatch_len(lnum) - rsm.sm_mmatch->startpos[no].col;
   2536        if (round == 2) {
   2537          STRCPY(retval, s);
   2538          retval[len] = '\n';
   2539        }
   2540        len++;
   2541        lnum++;
   2542        while (lnum < rsm.sm_mmatch->endpos[no].lnum) {
   2543          s = reg_getline_submatch(lnum);
   2544          if (round == 2) {
   2545            STRCPY(retval + len, s);
   2546          }
   2547          len += reg_getline_submatch_len(lnum);
   2548          if (round == 2) {
   2549            retval[len] = '\n';
   2550          }
   2551          len++;
   2552          lnum++;
   2553        }
   2554        if (round == 2) {
   2555          strncpy(retval + len,  // NOLINT(runtime/printf)
   2556                  reg_getline_submatch(lnum),
   2557                  (size_t)rsm.sm_mmatch->endpos[no].col);
   2558        }
   2559        len += rsm.sm_mmatch->endpos[no].col;
   2560        if (round == 2) {
   2561          retval[len] = NUL;
   2562        }
   2563        len++;
   2564      }
   2565 
   2566      if (retval == NULL) {
   2567        retval = xmalloc((size_t)len);
   2568      }
   2569    }
   2570  } else {
   2571    s = rsm.sm_match->startp[no];
   2572    if (s == NULL || rsm.sm_match->endp[no] == NULL) {
   2573      retval = NULL;
   2574    } else {
   2575      retval = xstrnsave(s, (size_t)(rsm.sm_match->endp[no] - s));
   2576    }
   2577  }
   2578 
   2579  return retval;
   2580 }
   2581 
   2582 // Used for the submatch() function with the optional non-zero argument: get
   2583 // the n'th submatch as a list of strings, one item per matched line, in
   2584 // allocated memory with NULs represented as NLs.
   2585 // Returns the allocated list.  Returns NULL when not in a ":s" command
   2586 // (can_f_submatch unset), for a non-existing submatch and for any error.
        // NOTE(review): only "no < 0" is rejected here; the upper bound of "no" is
        // presumably checked by the caller before startpos[]/startp[] are indexed -- confirm.
   2587 list_T *reg_submatch_list(int no)
   2588 {
   2589  if (!can_f_submatch || no < 0) {
   2590    return NULL;
   2591  }
   2592 
   2593  linenr_T slnum;
   2594  linenr_T elnum;
   2595  list_T *list;
   2596  const char *s;
   2597 
   2598  if (rsm.sm_match == NULL) {  // no sm_match: use the line/column positions in sm_mmatch
   2599    slnum = rsm.sm_mmatch->startpos[no].lnum;
   2600    elnum = rsm.sm_mmatch->endpos[no].lnum;
   2601    if (slnum < 0 || elnum < 0) {  // a negative line number: no such submatch
   2602      return NULL;
   2603    }
   2604 
   2605    colnr_T scol = rsm.sm_mmatch->startpos[no].col;
   2606    colnr_T ecol = rsm.sm_mmatch->endpos[no].col;
   2607 
   2608    list = tv_list_alloc(elnum - slnum + 1);
   2609 
   2610    s = reg_getline_submatch(slnum) + scol;  // start of the match in its first line
   2611    if (slnum == elnum) {
   2612      tv_list_append_string(list, s, ecol - scol);  // match within one line
   2613    } else {
   2614      int max_lnum = elnum - slnum;  // offset of the last line relative to slnum
   2615      tv_list_append_string(list, s, -1);  // first line; -1 presumably means "up to the NUL"
   2616      for (int i = 1; i < max_lnum; i++) {  // lines strictly between first and last
   2617        s = reg_getline_submatch(slnum + i);
   2618        tv_list_append_string(list, s, -1);
   2619      }
   2620      s = reg_getline_submatch(elnum);
   2621      tv_list_append_string(list, s, ecol);  // last line: only the first ecol bytes
   2622    }
   2623  } else {
   2624    s = rsm.sm_match->startp[no];
   2625    if (s == NULL || rsm.sm_match->endp[no] == NULL) {  // submatch did not take part
   2626      return NULL;
   2627    }
   2628    list = tv_list_alloc(1);
   2629    tv_list_append_string(list, s, rsm.sm_match->endp[no] - s);
   2630  }
   2631 
   2632  tv_list_ref(list);  // presumably takes a reference on behalf of the caller -- confirm
   2633  return list;
   2634 }
   2635 
   2636 /// Initialize the values used for matching against multiple lines
   2637 ///
        /// @param rmp   match struct, stored in rex.reg_mmatch; its rmm_ic, rmm_maxcol
        ///              and regprog->re_flags are read here
   2638 /// @param win   window in which to search or NULL
   2639 /// @param buf   buffer in which to search
   2640 /// @param lnum  nr of line to start looking for match
   2641 static void init_regexec_multi(regmmatch_T *rmp, win_T *win, buf_T *buf, linenr_T lnum)
   2642 {
   2643  rex.reg_match = NULL;  // rex.reg_mmatch is used instead
   2644  rex.reg_mmatch = rmp;
   2645  rex.reg_buf = buf;
   2646  rex.reg_win = win;
   2647  rex.reg_firstlnum = lnum;
   2648  rex.reg_maxline = rex.reg_buf->b_ml.ml_line_count - lnum;  // buffer line count minus "lnum"
   2649  rex.reg_line_lbr = false;
   2650  rex.reg_ic = rmp->rmm_ic;
   2651  rex.reg_icombine = false;
   2652  rex.reg_nobreak = rmp->regprog->re_flags & RE_NOBREAK;  // nonzero when RE_NOBREAK is set
   2653  rex.reg_maxcol = rmp->rmm_maxcol;
   2654 }
   2655 
   2656 // regexp_bt.c {{{1
   2657 
   2658 // Backtracking regular expression implementation.
   2659 //
   2660 // NOTICE:
   2661 //
   2662 // This is NOT the original regular expression code as written by Henry
   2663 // Spencer.  This code has been modified specifically for use with the VIM
   2664 // editor, and should not be used separately from Vim.  If you want a good
   2665 // regular expression library, get the original code.  The copyright notice
   2666 // that follows is from the original.
   2667 //
   2668 // END NOTICE
   2669 //
   2670 //      Copyright (c) 1986 by University of Toronto.
   2671 //      Written by Henry Spencer.  Not derived from licensed software.
   2672 //
   2673 //      Permission is granted to anyone to use this software for any
   2674 //      purpose on any computer system, and to redistribute it freely,
   2675 //      subject to the following restrictions:
   2676 //
   2677 //      1. The author is not responsible for the consequences of use of
   2678 //              this software, no matter how awful, even if they arise
   2679 //              from defects in it.
   2680 //
   2681 //      2. The origin of this software must not be misrepresented, either
   2682 //              by explicit claim or by omission.
   2683 //
   2684 //      3. Altered versions must be plainly marked as such, and must not
   2685 //              be misrepresented as being the original software.
   2686 //
   2687 // Beware that some of this code is subtly aware of the way operator
   2688 // precedence is structured in regular expressions.  Serious changes in
   2689 // regular-expression syntax might require a total rethink.
   2690 //
   2691 // Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert
   2692 // Webb, Ciaran McCreesh and Bram Moolenaar.
   2693 // Named character class support added by Walter Briscoe (1998 Jul 01)
   2694 
   2695 // The "internal use only" fields in regexp_defs.h are present to pass info from
   2696 // compile to execute that permits the execute phase to run lots faster on
   2697 // simple cases.  They are:
   2698 //
   2699 // regstart     char that must begin a match; NUL if none obvious; Can be a
   2700 //              multi-byte character.
   2701 // reganch      is the match anchored (at beginning-of-line only)?
   2702 // regmust      string (pointer into program) that match must include, or NULL
   2703 // regmlen      length of regmust string
   2704 // regflags     RF_ values or'ed together
   2705 //
   2706 // Regstart and reganch permit very fast decisions on suitable starting points
   2707 // for a match, cutting down the work a lot.  Regmust permits fast rejection
   2708 // of lines that cannot possibly match.  The regmust tests are costly enough
   2709 // that vim_regcomp() supplies a regmust only if the r.e. contains something
   2710 // potentially expensive (at present, the only such thing detected is * or +
   2711 // at the start of the r.e., which can involve a lot of backup).  Regmlen is
   2712 // supplied because the test in vim_regexec() needs it and vim_regcomp() is
   2713 // computing it anyway.
   2714 
   2715 // Structure for regexp "program".  This is essentially a linear encoding
   2716 // of a nondeterministic finite-state machine (aka syntax charts or
   2717 // "railroad normal form" in parsing technology).  Each node is an opcode
   2718 // plus a "next" pointer, possibly plus an operand.  "Next" pointers of
   2719 // all nodes except BRANCH and BRACE_COMPLEX implement concatenation; a "next"
   2720 // pointer with a BRANCH on both ends of it is connecting two alternatives.
   2721 // (Here we have one of the subtle syntax dependencies: an individual BRANCH
   2722 // (as opposed to a collection of them) is never concatenated with anything
   2723 // because of operator precedence).  The "next" pointer of a BRACE_COMPLEX
   2724 // node points to the node after the stuff to be repeated.
   2725 // The operand of some types of node is a literal string; for others, it is a
   2726 // node leading into a sub-FSM.  In particular, the operand of a BRANCH node
   2727 // is the first node of the branch.
   2728 // (NB this is *not* a tree structure: the tail of the branch connects to the
   2729 // thing following the set of BRANCHes.)
   2730 //
   2731 // pattern      is coded like:
   2732 //
   2733 //                        +-----------------+
   2734 //                        |                 V
   2735 // <aa>\|<bb>   BRANCH <aa> BRANCH <bb> --> END
   2736 //                   |      ^    |          ^
   2737 //                   +------+    +----------+
   2738 //
   2739 //
   2740 //                     +------------------+
   2741 //                     V                  |
   2742 // <aa>*        BRANCH BRANCH <aa> --> BACK BRANCH --> NOTHING --> END
   2743 //                   |      |               ^                      ^
   2744 //                   |      +---------------+                      |
   2745 //                   +---------------------------------------------+
   2746 //
   2747 //
   2748 //                     +----------------------+
   2749 //                     V                      |
   2750 // <aa>\+       BRANCH <aa> --> BRANCH --> BACK  BRANCH --> NOTHING --> END
   2751 //                   |               |           ^                      ^
   2752 //                   |               +-----------+                      |
   2753 //                   +--------------------------------------------------+
   2754 //
   2755 //
   2756 //                                      +-------------------------+
   2757 //                                      V                         |
   2758 // <aa>\{}      BRANCH BRACE_LIMITS --> BRACE_COMPLEX <aa> --> BACK  END
   2759 //                   |                              |                ^
   2760 //                   |                              +----------------+
   2761 //                   +-----------------------------------------------+
   2762 //
   2763 //
   2764 // <aa>\@!<bb>  BRANCH NOMATCH <aa> --> END  <bb> --> END
   2765 //                   |       |                ^       ^
   2766 //                   |       +----------------+       |
   2767 //                   +--------------------------------+
   2768 //
   2769 //                                                    +---------+
   2770 //                                                    |         V
   2771 // \z[abc]      BRANCH BRANCH  a  BRANCH  b  BRANCH  c  BRANCH  NOTHING --> END
   2772 //                   |      |          |          |     ^                   ^
   2773 //                   |      |          |          +-----+                   |
   2774 //                   |      |          +----------------+                   |
   2775 //                   |      +---------------------------+                   |
   2776 //                   +------------------------------------------------------+
   2777 //
   2778 // They all start with a BRANCH for "\|" alternatives, even when there is only
   2779 // one alternative.
   2780 
   2781 // The opcodes are:
   2782 
   2783 // definition   number             opnd?    meaning
        // "opnd?" names what follows the node: "node" is a sub-program, "str" a
        // literal string, "nr" a number stored as four bytes (see OPERAND_MIN());
        // an empty column presumably means the node has no operand.
   2784 #define END             0       //      End of program or NOMATCH operand.
   2785 #define BOL             1       //      Match "" at beginning of line.
   2786 #define EOL             2       //      Match "" at end of line.
   2787 #define BRANCH          3       // node Match this alternative, or the
   2788                                //      next...
   2789 #define BACK            4       //      Match "", "next" ptr points backward.
   2790 #define EXACTLY         5       // str  Match this string.
   2791 #define NOTHING         6       //      Match empty string.
   2792 #define STAR            7       // node Match this (simple) thing 0 or more
   2793                                //      times.
   2794 #define PLUS            8       // node Match this (simple) thing 1 or more
   2795                                //      times.
   2796 #define MATCH           9       // node Match the operand zero-width.
   2797 #define NOMATCH         10      // node Check for no match with operand.
   2798 #define BEHIND          11      // node Look behind for a match with operand.
   2799 #define NOBEHIND        12      // node Look behind for no match with operand.
   2800 #define SUBPAT          13      // node Match the operand here.
   2801 #define BRACE_SIMPLE    14      // node Match this (simple) thing between m and
   2802                                //      n times (\{m,n\}).
   2803 #define BOW             15      //      Match "" after [^a-zA-Z0-9_]
   2804 #define EOW             16      //      Match "" at    [^a-zA-Z0-9_]
   2805 #define BRACE_LIMITS    17      // nr nr  Define the min & max for BRACE_SIMPLE
   2806                                //      and BRACE_COMPLEX.
   2807 #define NEWL            18      //      Match line-break.
   2808 #define BHPOS           19      //      End position for BEHIND or NOBEHIND.
   2809 
   2810 // character classes: 20-48 normal, 50-78 include a line-break
        // Adding ADD_NL to a class opcode in 20-48 gives the variant of that class
        // that also matches a line break; WITH_NL() tests for that second range.
        // FIRST_NL and LAST_NL are parenthesized so that they remain single values
        // when used inside a larger expression (e.g. "op - FIRST_NL").
   2811 #define ADD_NL          30
   2812 #define FIRST_NL        (ANY + ADD_NL)
   2813 #define ANY             20      //      Match any one character.
   2814 #define ANYOF           21      // str  Match any character in this string.
   2815 #define ANYBUT          22      // str  Match any character not in this
   2816                                //      string.
   2817 #define IDENT           23      //      Match identifier char
   2818 #define SIDENT          24      //      Match identifier char but no digit
   2819 #define KWORD           25      //      Match keyword char
   2820 #define SKWORD          26      //      Match word char but no digit
   2821 #define FNAME           27      //      Match file name char
   2822 #define SFNAME          28      //      Match file name char but no digit
   2823 #define PRINT           29      //      Match printable char
   2824 #define SPRINT          30      //      Match printable char but no digit
   2825 #define WHITE           31      //      Match whitespace char
   2826 #define NWHITE          32      //      Match non-whitespace char
   2827 #define DIGIT           33      //      Match digit char
   2828 #define NDIGIT          34      //      Match non-digit char
   2829 #define HEX             35      //      Match hex char
   2830 #define NHEX            36      //      Match non-hex char
   2831 #define OCTAL           37      //      Match octal char
   2832 #define NOCTAL          38      //      Match non-octal char
   2833 #define WORD            39      //      Match word char
   2834 #define NWORD           40      //      Match non-word char
   2835 #define HEAD            41      //      Match head char
   2836 #define NHEAD           42      //      Match non-head char
   2837 #define ALPHA           43      //      Match alpha char
   2838 #define NALPHA          44      //      Match non-alpha char
   2839 #define LOWER           45      //      Match lowercase char
   2840 #define NLOWER          46      //      Match non-lowercase char
   2841 #define UPPER           47      //      Match uppercase char
   2842 #define NUPPER          48      //      Match non-uppercase char
   2843 #define LAST_NL         (NUPPER + ADD_NL)
   2844 #define WITH_NL(op)     ((op) >= FIRST_NL && (op) <= LAST_NL)
   2845 
        // A "-NN" after the number gives the last value of the range reserved for
        // that opcode family, e.g. MOPEN + 0 (80) up to MOPEN + 9 (89).
   2846 #define MOPEN           80   // -89 Mark this point in input as start of
   2847                             //     \( … \) subexpr.  MOPEN + 0 marks start of
   2848                             //     match.
   2849 #define MCLOSE          90   // -99 Analogous to MOPEN.  MCLOSE + 0 marks
   2850                             //     end of match.
   2851 #define BACKREF         100  // -109 node Match same string again \1-\9.
   2852 
   2853 #define ZOPEN          110  // -119 Mark this point in input as start of
   2854                            //  \z( … \) subexpr.
   2855 #define ZCLOSE         120  // -129 Analogous to ZOPEN.
   2856 #define ZREF           130  // -139 node Match external submatch \z1-\z9
   2857 
   2858 #define BRACE_COMPLEX   140  // -149 node Match nodes between m & n times
   2859 
   2860 #define NOPEN           150     // Mark this point in input as start of
   2861                                // \%( subexpr.
   2862 #define NCLOSE          151     // Analogous to NOPEN.
   2863 
   2864 #define MULTIBYTECODE   200     // mbc  Match one multi-byte character
   2865 #define RE_BOF          201     //      Match "" at beginning of file.
   2866 #define RE_EOF          202     //      Match "" at end of file.
   2867 #define CURSOR          203     //      Match location of cursor.
   2868 
   2869 #define RE_LNUM         204     // nr cmp  Match line number
   2870 #define RE_COL          205     // nr cmp  Match column number
   2871 #define RE_VCOL         206     // nr cmp  Match virtual column number
   2872 
   2873 #define RE_MARK         207     // mark cmp  Match mark position
   2874 #define RE_VISUAL       208     //      Match Visual area
   2875 #define RE_COMPOSING    209     //      Match any composing characters
   2876 
   2877 // Flags to be passed up and down.
        // Each flag is a separate bit, so several can be or'ed into one int
        // (presumably the value behind the "flagp" argument of reg() -- confirm).
   2878 #define HASWIDTH        0x1     // Known never to match null string.
   2879 #define SIMPLE          0x2     // Simple enough to be STAR/PLUS operand.
   2880 #define SPSTART         0x4     // Starts with * or +.
   2881 #define HASNL           0x8     // Contains some \n.
   2882 #define HASLOOKBH       0x10    // Contains "\@<=" or "\@<!".
   2883 #define WORST           0       // Worst case: none of the flags above is set.
   2884 
        // State of the regexp compiler in this file; regcomp_start() resets
        // num_complex_braces, had_endbrace, regsize and reg_toolong.
   2885 static int prevchr_len;         ///< byte length of previous char
   2886 static int num_complex_braces;  ///< Complex \{...} count
   2887 static uint8_t *regcode;         ///< Code-emit pointer, or JUST_CALC_SIZE
   2888 static int64_t regsize;            ///< Code size, counted while regcode == JUST_CALC_SIZE
   2889 static int reg_toolong;         ///< true when offset out of range
   2890 static uint8_t had_endbrace[NSUBEXP];  ///< flags, true if end of () found
   2891 static int64_t brace_min[10];        ///< Minimums for complex brace repeats
   2892 static int64_t brace_max[10];        ///< Maximums for complex brace repeats
   2893 static int brace_count[10];       ///< Current counts for complex brace repeats
   2894 static int one_exactly = false;   ///< only do one char for EXACTLY
   2895 
   2896 // When making changes to classchars also change nfa_classcodes.
        // classchars and classcodes both have 27 entries in the same order:
        // classcodes[i] is the opcode of the class written with classchars[i]
        // ('.' -> ANY, 'i' -> IDENT, ... 'U' -> NUPPER).
   2897 static uint8_t *classchars = (uint8_t *)".iIkKfFpPsSdDxXoOwWhHaAlLuU";
   2898 static int classcodes[] = {
   2899  ANY, IDENT, SIDENT, KWORD, SKWORD,
   2900  FNAME, SFNAME, PRINT, SPRINT,
   2901  WHITE, NWHITE, DIGIT, NDIGIT,
   2902  HEX, NHEX, OCTAL, NOCTAL,
   2903  WORD, NWORD, HEAD, NHEAD,
   2904  ALPHA, NALPHA, LOWER, NLOWER,
   2905  UPPER, NUPPER
   2906 };
   2907 
   2908 // When regcode is set to this value, code is not emitted and size is computed
   2909 // instead: regc() and regmbc() then only add to regsize and never write.
   2910 #define JUST_CALC_SIZE  ((uint8_t *)-1)
   2911 
   2912 // Used for STAR, PLUS and BRACE_SIMPLE matching.
   2913 typedef struct regstar_S {
   2914  int nextb;            // next byte
   2915  int nextb_ic;         // next byte reverse case
   2916  int64_t count;        // NOTE(review): presumably the current repeat count -- confirm
   2917  int64_t minval;       // NOTE(review): presumably the minimum repeat count -- confirm
   2918  int64_t maxval;       // NOTE(review): presumably the maximum repeat count -- confirm
   2919 } regstar_T;
   2920 
   2921 // Used to store input position when a BACK was encountered, so that we know if
   2922 // we made any progress since the last time.
   2923 typedef struct backpos_S {
   2924  uint8_t *bp_scan;         // "scan" where BACK was encountered
   2925  regsave_T bp_pos;           // last input position
   2926 } backpos_T;
   2927 
   2928 // "regstack" and "backpos" are used by regmatch().  They are kept over calls
   2929 // to avoid invoking malloc() and free() often.
   2930 // "regstack" is a stack with regitem_T items, sometimes preceded by regstar_T
   2931 // or regbehind_T.
   2932 // "backpos" is a table with backpos_T items for BACK
   2933 static garray_T regstack = GA_EMPTY_INIT_VALUE;
   2934 static garray_T backpos = GA_EMPTY_INIT_VALUE;
   2935 
        // NOTE(review): presumably the position saved for BEHIND/NOBEHIND matching -- confirm.
   2936 static regsave_T behind_pos;
   2937 
   2938 // Both for regstack and backpos tables we use the following strategy of
   2939 // allocation (to reduce malloc/free calls):
   2940 // - Initial size is fairly small.
   2941 // - When needed, the tables are grown bigger (8 times at first, double after
   2942 //   that).
   2943 // - After executing the match we free the memory only if the array has grown.
   2944 //   Thus the memory is kept allocated when it's at the initial size.
   2945 // This makes it fast while not keeping a lot of memory allocated.
   2946 // A three times speed increase was observed when using many simple patterns.
   2947 #define REGSTACK_INITIAL        2048  // initial size for "regstack"
   2948 #define BACKPOS_INITIAL         64    // initial size for "backpos"
   2949 
   2950 // Opcode notes:
   2951 //
   2952 // BRANCH       The set of branches constituting a single choice are hooked
   2953 //              together with their "next" pointers, since precedence prevents
   2954 //              anything being concatenated to any individual branch.  The
   2955 //              "next" pointer of the last BRANCH in a choice points to the
   2956 //              thing following the whole choice.  This is also where the
   2957 //              final "next" pointer of each individual branch points; each
   2958 //              branch starts with the operand node of a BRANCH node.
   2959 //
   2960 // BACK         Normal "next" pointers all implicitly point forward; BACK
   2961 //              exists to make loop structures possible.
   2962 //
   2963 // STAR,PLUS    '=', and complex '*' and '+', are implemented as circular
   2964 //              BRANCH structures using BACK.  Simple cases (one character
   2965 //              per match) are implemented with STAR and PLUS for speed
   2966 //              and to minimize recursive plunges.
   2967 //
   2968 // BRACE_LIMITS This is always followed by a BRACE_SIMPLE or BRACE_COMPLEX
   2969 //              node, and defines the min and max limits to be used for that
   2970 //              node.
   2971 //
   2972 // MOPEN,MCLOSE ...are numbered at compile time.
   2973 // ZOPEN,ZCLOSE ...ditto
   2974 //
   2975 //
   2976 //
   2977 // A node is one char of opcode followed by two chars of "next" pointer.
   2978 // "Next" pointers are stored as two 8-bit bytes, high order first.  The
   2979 // value is a positive offset from the opcode of the node containing it.
   2980 // An operand, if any, simply follows the node.  (Note that much of the
   2981 // code generation knows about this implicit relationship.)
   2982 //
   2983 // Using two bytes for the "next" pointer is vast overkill for most things,
   2984 // but allows patterns to get big without disasters.
   2985 #define OP(p)           ((int)(*(p)))  // opcode: byte p[0]
   2986 #define NEXT(p)         (((*((p) + 1) & 0377) << 8) + (*((p) + 2) & 0377))  // p[1] high, p[2] low
   2987 #define OPERAND(p)      ((p) + 3)  // first byte after the three-byte node header
   2988 // Obtain an operand that was stored as four bytes, MSB first (bytes p[3]..p[6]).
   2989 #define OPERAND_MIN(p)  (((int64_t)(p)[3] << 24) + ((int64_t)(p)[4] << 16) \
   2990                         + ((int64_t)(p)[5] << 8) + (int64_t)(p)[6])
   2991 // Obtain a second operand stored as four bytes (bytes p[7]..p[10]).
   2992 #define OPERAND_MAX(p)  OPERAND_MIN((p) + 4)
   2993 // Obtain a second single-byte operand stored after a four bytes operand (byte
        // p[7]; the same offset where OPERAND_MAX() starts, so a node has one or the other).
   2994 #define OPERAND_CMP(p)  (p)[7]
   2995 
        // Prototypes for static functions of the backtracking engine.  regdump() is
        // only declared when BT_REGEXP_DUMP is defined; regprop() and regnarrate
        // only when REGEXP_DEBUG is defined.
   2996 static uint8_t *reg(int paren, int *flagp);
   2997 
   2998 #ifdef BT_REGEXP_DUMP
   2999 static void     regdump(uint8_t *, bt_regprog_T *);
   3000 #endif
   3001 
   3002 #ifdef REGEXP_DEBUG
   3003 static uint8_t *regprop(uint8_t *);
   3004 
   3005 static int regnarrate = 0;  // NOTE(review): presumably enables debug narration -- confirm
   3006 #endif
   3007 
   3008 // Setup to parse the regexp.  Used once to get the length and once to do it.
        // "expr" is the pattern text, handed to initchr().
        // "re_flags" holds RE_ bits: RE_MAGIC selects MAGIC_ON instead of MAGIC_OFF,
        // RE_STRING and RE_STRICT are stored in reg_string and reg_strict.
        // Also resets the counters, flags and regsize used while compiling.
   3009 static void regcomp_start(uint8_t *expr, int re_flags)                        // see vim_regcomp()
   3010 {
   3011  initchr((char *)expr);
   3012  if (re_flags & RE_MAGIC) {
   3013    reg_magic = MAGIC_ON;
   3014  } else {
   3015    reg_magic = MAGIC_OFF;
   3016  }
   3017  reg_string = (re_flags & RE_STRING);
   3018  reg_strict = (re_flags & RE_STRICT);
   3019  get_cpo_flags();
   3020 
   3021  num_complex_braces = 0;
   3022  regnpar = 1;  // presumably starts at 1 because number 0 is the whole match (see MOPEN)
   3023  CLEAR_FIELD(had_endbrace);
   3024  regnzpar = 1;
   3025  re_has_z = 0;
   3026  regsize = 0L;  // size is counted again from zero
   3027  reg_toolong = false;
   3028  regflags = 0;
   3029  had_eol = false;
   3030 }
   3031 
   3032 // Return true if MULTIBYTECODE should be used instead of EXACTLY for
   3033 // character "c".
        // That is the case when "c" takes more than one byte and either
        // re_multi_type(peekchr()) is not NOT_MULTI or "c" is a composing character.
   3034 static bool use_multibytecode(int c)
   3035 {
   3036  return utf_char2len(c) > 1
   3037         && (re_multi_type(peekchr()) != NOT_MULTI
   3038             || utf_iscomposing_legacy(c));
   3039 }
   3040 
   3041 // Emit (if appropriate) a byte of code: store the low byte of "b" at regcode
        // and advance it, or only count one byte in regsize when just sizing.
   3042 static void regc(int b)
   3043 {
   3044  if (regcode == JUST_CALC_SIZE) {
   3045    regsize++;  // sizing pass: nothing is written
   3046  } else {
   3047    *regcode++ = (uint8_t)b;
   3048  }
   3049 }
   3050 
   3051 // Emit (if appropriate) a multi-byte character of code: write the bytes of
        // character "c" at regcode, or only add its byte length to regsize when just sizing.
   3052 static void regmbc(int c)
   3053 {
   3054  if (regcode == JUST_CALC_SIZE) {
   3055    regsize += utf_char2len(c);  // sizing pass: nothing is written
   3056  } else {
   3057    regcode += utf_char2bytes(c, (char *)regcode);  // advance by the returned count
   3058  }
   3059 }
   3060 
   3061 // Produce the bytes for equivalence class "c".
   3062 // Currently only handles latin1, latin9 and utf-8.
   3063 // NOTE: When changing this function, also change nfa_emit_equi_class()
   3064 static void reg_equi_class(int c)
   3065 {
   3066  {
   3067    switch (c) {
   3068    // Do not use '\300' style, it results in a negative number.
   3069    case 'A':
   3070    case 0xc0:
   3071    case 0xc1:
   3072    case 0xc2:
   3073    case 0xc3:
   3074    case 0xc4:
   3075    case 0xc5:
   3076    case 0x100:
   3077    case 0x102:
   3078    case 0x104:
   3079    case 0x1cd:
   3080    case 0x1de:
   3081    case 0x1e0:
   3082    case 0x1fa:
   3083    case 0x202:
   3084    case 0x226:
   3085    case 0x23a:
   3086    case 0x1e00:
   3087    case 0x1ea0:
   3088    case 0x1ea2:
   3089    case 0x1ea4:
   3090    case 0x1ea6:
   3091    case 0x1ea8:
   3092    case 0x1eaa:
   3093    case 0x1eac:
   3094    case 0x1eae:
   3095    case 0x1eb0:
   3096    case 0x1eb2:
   3097    case 0x1eb4:
   3098    case 0x1eb6:
   3099      regmbc('A'); regmbc(0xc0); regmbc(0xc1); regmbc(0xc2);
   3100      regmbc(0xc3); regmbc(0xc4); regmbc(0xc5);
   3101      regmbc(0x100); regmbc(0x102); regmbc(0x104);
   3102      regmbc(0x1cd); regmbc(0x1de); regmbc(0x1e0);
   3103      regmbc(0x1fa); regmbc(0x202); regmbc(0x226);
   3104      regmbc(0x23a); regmbc(0x1e00); regmbc(0x1ea0);
   3105      regmbc(0x1ea2); regmbc(0x1ea4); regmbc(0x1ea6);
   3106      regmbc(0x1ea8); regmbc(0x1eaa); regmbc(0x1eac);
   3107      regmbc(0x1eae); regmbc(0x1eb0); regmbc(0x1eb2);
   3108      regmbc(0x1eb4); regmbc(0x1eb6);
   3109      return;
   3110    case 'B':
   3111    case 0x181:
   3112    case 0x243:
   3113    case 0x1e02:
   3114    case 0x1e04:
   3115    case 0x1e06:
   3116      regmbc('B');
   3117      regmbc(0x181); regmbc(0x243); regmbc(0x1e02);
   3118      regmbc(0x1e04); regmbc(0x1e06);
   3119      return;
   3120    case 'C':
   3121    case 0xc7:
   3122    case 0x106:
   3123    case 0x108:
   3124    case 0x10a:
   3125    case 0x10c:
   3126    case 0x187:
   3127    case 0x23b:
   3128    case 0x1e08:
   3129    case 0xa792:
   3130      regmbc('C'); regmbc(0xc7);
   3131      regmbc(0x106); regmbc(0x108); regmbc(0x10a);
   3132      regmbc(0x10c); regmbc(0x187); regmbc(0x23b);
   3133      regmbc(0x1e08); regmbc(0xa792);
   3134      return;
   3135    case 'D':
   3136    case 0x10e:
   3137    case 0x110:
   3138    case 0x18a:
   3139    case 0x1e0a:
   3140    case 0x1e0c:
   3141    case 0x1e0e:
   3142    case 0x1e10:
   3143    case 0x1e12:
   3144      regmbc('D'); regmbc(0x10e); regmbc(0x110);
   3145      regmbc(0x18a); regmbc(0x1e0a); regmbc(0x1e0c);
   3146      regmbc(0x1e0e); regmbc(0x1e10); regmbc(0x1e12);
   3147      return;
   3148    case 'E':
   3149    case 0xc8:
   3150    case 0xc9:
   3151    case 0xca:
   3152    case 0xcb:
   3153    case 0x112:
   3154    case 0x114:
   3155    case 0x116:
   3156    case 0x118:
   3157    case 0x11a:
   3158    case 0x204:
   3159    case 0x206:
   3160    case 0x228:
   3161    case 0x246:
   3162    case 0x1e14:
   3163    case 0x1e16:
   3164    case 0x1e18:
   3165    case 0x1e1a:
   3166    case 0x1e1c:
   3167    case 0x1eb8:
   3168    case 0x1eba:
   3169    case 0x1ebc:
   3170    case 0x1ebe:
   3171    case 0x1ec0:
   3172    case 0x1ec2:
   3173    case 0x1ec4:
   3174    case 0x1ec6:
   3175      regmbc('E'); regmbc(0xc8); regmbc(0xc9);
   3176      regmbc(0xca); regmbc(0xcb); regmbc(0x112);
   3177      regmbc(0x114); regmbc(0x116); regmbc(0x118);
   3178      regmbc(0x11a); regmbc(0x204); regmbc(0x206);
   3179      regmbc(0x228); regmbc(0x246); regmbc(0x1e14);
   3180      regmbc(0x1e16); regmbc(0x1e18); regmbc(0x1e1a);
   3181      regmbc(0x1e1c); regmbc(0x1eb8); regmbc(0x1eba);
   3182      regmbc(0x1ebc); regmbc(0x1ebe); regmbc(0x1ec0);
   3183      regmbc(0x1ec2); regmbc(0x1ec4); regmbc(0x1ec6);
   3184      return;
   3185    case 'F':
   3186    case 0x191:
   3187    case 0x1e1e:
   3188    case 0xa798:
   3189      regmbc('F'); regmbc(0x191); regmbc(0x1e1e);
   3190      regmbc(0xa798);
   3191      return;
   3192    case 'G':
   3193    case 0x11c:
   3194    case 0x11e:
   3195    case 0x120:
   3196    case 0x122:
   3197    case 0x193:
   3198    case 0x1e4:
   3199    case 0x1e6:
   3200    case 0x1f4:
   3201    case 0x1e20:
   3202    case 0xa7a0:
   3203      regmbc('G'); regmbc(0x11c); regmbc(0x11e);
   3204      regmbc(0x120); regmbc(0x122); regmbc(0x193);
   3205      regmbc(0x1e4); regmbc(0x1e6); regmbc(0x1f4);
   3206      regmbc(0x1e20); regmbc(0xa7a0);
   3207      return;
   3208    case 'H':
   3209    case 0x124:
   3210    case 0x126:
   3211    case 0x21e:
   3212    case 0x1e22:
   3213    case 0x1e24:
   3214    case 0x1e26:
   3215    case 0x1e28:
   3216    case 0x1e2a:
   3217    case 0x2c67:
   3218      regmbc('H'); regmbc(0x124); regmbc(0x126);
   3219      regmbc(0x21e); regmbc(0x1e22); regmbc(0x1e24);
   3220      regmbc(0x1e26); regmbc(0x1e28); regmbc(0x1e2a);
   3221      regmbc(0x2c67);
   3222      return;
   3223    case 'I':
   3224    case 0xcc:
   3225    case 0xcd:
   3226    case 0xce:
   3227    case 0xcf:
   3228    case 0x128:
   3229    case 0x12a:
   3230    case 0x12c:
   3231    case 0x12e:
   3232    case 0x130:
   3233    case 0x197:
   3234    case 0x1cf:
   3235    case 0x208:
   3236    case 0x20a:
   3237    case 0x1e2c:
   3238    case 0x1e2e:
   3239    case 0x1ec8:
   3240    case 0x1eca:
   3241      regmbc('I'); regmbc(0xcc); regmbc(0xcd);
   3242      regmbc(0xce); regmbc(0xcf); regmbc(0x128);
   3243      regmbc(0x12a); regmbc(0x12c); regmbc(0x12e);
   3244      regmbc(0x130); regmbc(0x197); regmbc(0x1cf);
   3245      regmbc(0x208); regmbc(0x20a); regmbc(0x1e2c);
   3246      regmbc(0x1e2e); regmbc(0x1ec8); regmbc(0x1eca);
   3247      return;
   3248    case 'J':
   3249    case 0x134:
   3250    case 0x248:
   3251      regmbc('J'); regmbc(0x134); regmbc(0x248);
   3252      return;
   3253    case 'K':
   3254    case 0x136:
   3255    case 0x198:
   3256    case 0x1e8:
   3257    case 0x1e30:
   3258    case 0x1e32:
   3259    case 0x1e34:
   3260    case 0x2c69:
   3261    case 0xa740:
   3262      regmbc('K'); regmbc(0x136); regmbc(0x198);
   3263      regmbc(0x1e8); regmbc(0x1e30); regmbc(0x1e32);
   3264      regmbc(0x1e34); regmbc(0x2c69); regmbc(0xa740);
   3265      return;
   3266    case 'L':
   3267    case 0x139:
   3268    case 0x13b:
   3269    case 0x13d:
   3270    case 0x13f:
   3271    case 0x141:
   3272    case 0x23d:
   3273    case 0x1e36:
   3274    case 0x1e38:
   3275    case 0x1e3a:
   3276    case 0x1e3c:
   3277    case 0x2c60:
   3278      regmbc('L'); regmbc(0x139); regmbc(0x13b);
   3279      regmbc(0x13d); regmbc(0x13f); regmbc(0x141);
   3280      regmbc(0x23d); regmbc(0x1e36); regmbc(0x1e38);
   3281      regmbc(0x1e3a); regmbc(0x1e3c); regmbc(0x2c60);
   3282      return;
   3283    case 'M':
   3284    case 0x1e3e:
   3285    case 0x1e40:
   3286    case 0x1e42:
   3287      regmbc('M'); regmbc(0x1e3e); regmbc(0x1e40);
   3288      regmbc(0x1e42);
   3289      return;
   3290    case 'N':
   3291    case 0xd1:
   3292    case 0x143:
   3293    case 0x145:
   3294    case 0x147:
   3295    case 0x1f8:
   3296    case 0x1e44:
   3297    case 0x1e46:
   3298    case 0x1e48:
   3299    case 0x1e4a:
   3300    case 0xa7a4:
   3301      regmbc('N'); regmbc(0xd1);
   3302      regmbc(0x143); regmbc(0x145); regmbc(0x147);
   3303      regmbc(0x1f8); regmbc(0x1e44); regmbc(0x1e46);
   3304      regmbc(0x1e48); regmbc(0x1e4a); regmbc(0xa7a4);
   3305      return;
   3306    case 'O':
   3307    case 0xd2:
   3308    case 0xd3:
   3309    case 0xd4:
   3310    case 0xd5:
   3311    case 0xd6:
   3312    case 0xd8:
   3313    case 0x14c:
   3314    case 0x14e:
   3315    case 0x150:
   3316    case 0x19f:
   3317    case 0x1a0:
   3318    case 0x1d1:
   3319    case 0x1ea:
   3320    case 0x1ec:
   3321    case 0x1fe:
   3322    case 0x20c:
   3323    case 0x20e:
   3324    case 0x22a:
   3325    case 0x22c:
   3326    case 0x22e:
   3327    case 0x230:
   3328    case 0x1e4c:
   3329    case 0x1e4e:
   3330    case 0x1e50:
   3331    case 0x1e52:
   3332    case 0x1ecc:
   3333    case 0x1ece:
   3334    case 0x1ed0:
   3335    case 0x1ed2:
   3336    case 0x1ed4:
   3337    case 0x1ed6:
   3338    case 0x1ed8:
   3339    case 0x1eda:
   3340    case 0x1edc:
   3341    case 0x1ede:
   3342    case 0x1ee0:
   3343    case 0x1ee2:
   3344      regmbc('O'); regmbc(0xd2); regmbc(0xd3); regmbc(0xd4);
   3345      regmbc(0xd5); regmbc(0xd6); regmbc(0xd8);
   3346      regmbc(0x14c); regmbc(0x14e); regmbc(0x150);
   3347      regmbc(0x19f); regmbc(0x1a0); regmbc(0x1d1);
   3348      regmbc(0x1ea); regmbc(0x1ec); regmbc(0x1fe);
   3349      regmbc(0x20c); regmbc(0x20e); regmbc(0x22a);
   3350      regmbc(0x22c); regmbc(0x22e); regmbc(0x230);
   3351      regmbc(0x1e4c); regmbc(0x1e4e); regmbc(0x1e50);
   3352      regmbc(0x1e52); regmbc(0x1ecc); regmbc(0x1ece);
   3353      regmbc(0x1ed0); regmbc(0x1ed2); regmbc(0x1ed4);
   3354      regmbc(0x1ed6); regmbc(0x1ed8); regmbc(0x1eda);
   3355      regmbc(0x1edc); regmbc(0x1ede); regmbc(0x1ee0);
   3356      regmbc(0x1ee2);
   3357      return;
   3358    case 'P':
   3359    case 0x1a4:
   3360    case 0x1e54:
   3361    case 0x1e56:
   3362    case 0x2c63:
   3363      regmbc('P'); regmbc(0x1a4); regmbc(0x1e54);
   3364      regmbc(0x1e56); regmbc(0x2c63);
   3365      return;
   3366    case 'Q':
   3367    case 0x24a:
   3368      regmbc('Q'); regmbc(0x24a);
   3369      return;
   3370    case 'R':
   3371    case 0x154:
   3372    case 0x156:
   3373    case 0x158:
   3374    case 0x210:
   3375    case 0x212:
   3376    case 0x24c:
   3377    case 0x1e58:
   3378    case 0x1e5a:
   3379    case 0x1e5c:
   3380    case 0x1e5e:
   3381    case 0x2c64:
   3382    case 0xa7a6:
   3383      regmbc('R'); regmbc(0x154); regmbc(0x156);
   3384      regmbc(0x210); regmbc(0x212); regmbc(0x158);
   3385      regmbc(0x24c); regmbc(0x1e58); regmbc(0x1e5a);
   3386      regmbc(0x1e5c); regmbc(0x1e5e); regmbc(0x2c64);
   3387      regmbc(0xa7a6);
   3388      return;
   3389    case 'S':
   3390    case 0x15a:
   3391    case 0x15c:
   3392    case 0x15e:
   3393    case 0x160:
   3394    case 0x218:
   3395    case 0x1e60:
   3396    case 0x1e62:
   3397    case 0x1e64:
   3398    case 0x1e66:
   3399    case 0x1e68:
   3400    case 0x2c7e:
   3401    case 0xa7a8:
   3402      regmbc('S'); regmbc(0x15a); regmbc(0x15c);
   3403      regmbc(0x15e); regmbc(0x160); regmbc(0x218);
   3404      regmbc(0x1e60); regmbc(0x1e62); regmbc(0x1e64);
   3405      regmbc(0x1e66); regmbc(0x1e68); regmbc(0x2c7e);
   3406      regmbc(0xa7a8);
   3407      return;
   3408    case 'T':
   3409    case 0x162:
   3410    case 0x164:
   3411    case 0x166:
   3412    case 0x1ac:
   3413    case 0x1ae:
   3414    case 0x21a:
   3415    case 0x23e:
   3416    case 0x1e6a:
   3417    case 0x1e6c:
   3418    case 0x1e6e:
   3419    case 0x1e70:
   3420      regmbc('T'); regmbc(0x162); regmbc(0x164);
   3421      regmbc(0x166); regmbc(0x1ac); regmbc(0x23e);
   3422      regmbc(0x1ae); regmbc(0x21a); regmbc(0x1e6a);
   3423      regmbc(0x1e6c); regmbc(0x1e6e); regmbc(0x1e70);
   3424      return;
   3425    case 'U':
   3426    case 0xd9:
   3427    case 0xda:
   3428    case 0xdb:
   3429    case 0xdc:
   3430    case 0x168:
   3431    case 0x16a:
   3432    case 0x16c:
   3433    case 0x16e:
   3434    case 0x170:
   3435    case 0x172:
   3436    case 0x1af:
   3437    case 0x1d3:
   3438    case 0x1d5:
   3439    case 0x1d7:
   3440    case 0x1d9:
   3441    case 0x1db:
   3442    case 0x214:
   3443    case 0x216:
   3444    case 0x244:
   3445    case 0x1e72:
   3446    case 0x1e74:
   3447    case 0x1e76:
   3448    case 0x1e78:
   3449    case 0x1e7a:
   3450    case 0x1ee4:
   3451    case 0x1ee6:
   3452    case 0x1ee8:
   3453    case 0x1eea:
   3454    case 0x1eec:
   3455    case 0x1eee:
   3456    case 0x1ef0:
   3457      regmbc('U'); regmbc(0xd9); regmbc(0xda);
   3458      regmbc(0xdb); regmbc(0xdc); regmbc(0x168);
   3459      regmbc(0x16a); regmbc(0x16c); regmbc(0x16e);
   3460      regmbc(0x170); regmbc(0x172); regmbc(0x1af);
   3461      regmbc(0x1d3); regmbc(0x1d5); regmbc(0x1d7);
   3462      regmbc(0x1d9); regmbc(0x1db); regmbc(0x214);
   3463      regmbc(0x216); regmbc(0x244); regmbc(0x1e72);
   3464      regmbc(0x1e74); regmbc(0x1e76); regmbc(0x1e78);
   3465      regmbc(0x1e7a); regmbc(0x1ee4); regmbc(0x1ee6);
   3466      regmbc(0x1ee8); regmbc(0x1eea); regmbc(0x1eec);
   3467      regmbc(0x1eee); regmbc(0x1ef0);
   3468      return;
   3469    case 'V':
   3470    case 0x1b2:
   3471    case 0x1e7c:
   3472    case 0x1e7e:
   3473      regmbc('V'); regmbc(0x1b2); regmbc(0x1e7c);
   3474      regmbc(0x1e7e);
   3475      return;
   3476    case 'W':
   3477    case 0x174:
   3478    case 0x1e80:
   3479    case 0x1e82:
   3480    case 0x1e84:
   3481    case 0x1e86:
   3482    case 0x1e88:
   3483      regmbc('W'); regmbc(0x174); regmbc(0x1e80);
   3484      regmbc(0x1e82); regmbc(0x1e84); regmbc(0x1e86);
   3485      regmbc(0x1e88);
   3486      return;
   3487    case 'X':
   3488    case 0x1e8a:
   3489    case 0x1e8c:
   3490      regmbc('X'); regmbc(0x1e8a); regmbc(0x1e8c);
   3491      return;
   3492    case 'Y':
   3493    case 0xdd:
   3494    case 0x176:
   3495    case 0x178:
   3496    case 0x1b3:
   3497    case 0x232:
   3498    case 0x24e:
   3499    case 0x1e8e:
   3500    case 0x1ef2:
   3501    case 0x1ef6:
   3502    case 0x1ef4:
   3503    case 0x1ef8:
   3504      regmbc('Y'); regmbc(0xdd); regmbc(0x176);
   3505      regmbc(0x178); regmbc(0x1b3); regmbc(0x232);
   3506      regmbc(0x24e); regmbc(0x1e8e); regmbc(0x1ef2);
   3507      regmbc(0x1ef4); regmbc(0x1ef6); regmbc(0x1ef8);
   3508      return;
   3509    case 'Z':
   3510    case 0x179:
   3511    case 0x17b:
   3512    case 0x17d:
   3513    case 0x1b5:
   3514    case 0x1e90:
   3515    case 0x1e92:
   3516    case 0x1e94:
   3517    case 0x2c6b:
   3518      regmbc('Z'); regmbc(0x179); regmbc(0x17b);
   3519      regmbc(0x17d); regmbc(0x1b5); regmbc(0x1e90);
   3520      regmbc(0x1e92); regmbc(0x1e94); regmbc(0x2c6b);
   3521      return;
   3522    case 'a':
   3523    case 0xe0:
   3524    case 0xe1:
   3525    case 0xe2:
   3526    case 0xe3:
   3527    case 0xe4:
   3528    case 0xe5:
   3529    case 0x101:
   3530    case 0x103:
   3531    case 0x105:
   3532    case 0x1ce:
   3533    case 0x1df:
   3534    case 0x1e1:
   3535    case 0x1fb:
   3536    case 0x201:
   3537    case 0x203:
   3538    case 0x227:
   3539    case 0x1d8f:
   3540    case 0x1e01:
   3541    case 0x1e9a:
   3542    case 0x1ea1:
   3543    case 0x1ea3:
   3544    case 0x1ea5:
   3545    case 0x1ea7:
   3546    case 0x1ea9:
   3547    case 0x1eab:
   3548    case 0x1ead:
   3549    case 0x1eaf:
   3550    case 0x1eb1:
   3551    case 0x1eb3:
   3552    case 0x1eb5:
   3553    case 0x1eb7:
   3554    case 0x2c65:
   3555      regmbc('a'); regmbc(0xe0); regmbc(0xe1);
   3556      regmbc(0xe2); regmbc(0xe3); regmbc(0xe4);
   3557      regmbc(0xe5); regmbc(0x101); regmbc(0x103);
   3558      regmbc(0x105); regmbc(0x1ce); regmbc(0x1df);
   3559      regmbc(0x1e1); regmbc(0x1fb); regmbc(0x201);
   3560      regmbc(0x203); regmbc(0x227); regmbc(0x1d8f);
   3561      regmbc(0x1e01); regmbc(0x1e9a); regmbc(0x1ea1);
   3562      regmbc(0x1ea3); regmbc(0x1ea5); regmbc(0x1ea7);
   3563      regmbc(0x1ea9); regmbc(0x1eab); regmbc(0x1ead);
   3564      regmbc(0x1eaf); regmbc(0x1eb1); regmbc(0x1eb3);
   3565      regmbc(0x1eb5); regmbc(0x1eb7); regmbc(0x2c65);
   3566      return;
   3567    case 'b':
   3568    case 0x180:
   3569    case 0x253:
   3570    case 0x1d6c:
   3571    case 0x1d80:
   3572    case 0x1e03:
   3573    case 0x1e05:
   3574    case 0x1e07:
   3575      regmbc('b');
   3576      regmbc(0x180); regmbc(0x253); regmbc(0x1d6c);
   3577      regmbc(0x1d80); regmbc(0x1e03); regmbc(0x1e05);
   3578      regmbc(0x1e07);
   3579      return;
   3580    case 'c':
   3581    case 0xe7:
   3582    case 0x107:
   3583    case 0x109:
   3584    case 0x10b:
   3585    case 0x10d:
   3586    case 0x188:
   3587    case 0x23c:
   3588    case 0x1e09:
   3589    case 0xa793:
   3590    case 0xa794:
   3591      regmbc('c'); regmbc(0xe7); regmbc(0x107);
   3592      regmbc(0x109); regmbc(0x10b); regmbc(0x10d);
   3593      regmbc(0x188); regmbc(0x23c); regmbc(0x1e09);
   3594      regmbc(0xa793); regmbc(0xa794);
   3595      return;
   3596    case 'd':
   3597    case 0x10f:
   3598    case 0x111:
   3599    case 0x257:
   3600    case 0x1d6d:
   3601    case 0x1d81:
   3602    case 0x1d91:
   3603    case 0x1e0b:
   3604    case 0x1e0d:
   3605    case 0x1e0f:
   3606    case 0x1e11:
   3607    case 0x1e13:
   3608      regmbc('d'); regmbc(0x10f); regmbc(0x111);
   3609      regmbc(0x257); regmbc(0x1d6d); regmbc(0x1d81);
   3610      regmbc(0x1d91); regmbc(0x1e0b); regmbc(0x1e0d);
   3611      regmbc(0x1e0f); regmbc(0x1e11); regmbc(0x1e13);
   3612      return;
   3613    case 'e':
   3614    case 0xe8:
   3615    case 0xe9:
   3616    case 0xea:
   3617    case 0xeb:
   3618    case 0x113:
   3619    case 0x115:
   3620    case 0x117:
   3621    case 0x119:
   3622    case 0x11b:
   3623    case 0x205:
   3624    case 0x207:
   3625    case 0x229:
   3626    case 0x247:
   3627    case 0x1d92:
   3628    case 0x1e15:
   3629    case 0x1e17:
   3630    case 0x1e19:
   3631    case 0x1e1b:
   3632    case 0x1eb9:
   3633    case 0x1ebb:
   3634    case 0x1e1d:
   3635    case 0x1ebd:
   3636    case 0x1ebf:
   3637    case 0x1ec1:
   3638    case 0x1ec3:
   3639    case 0x1ec5:
   3640    case 0x1ec7:
   3641      regmbc('e'); regmbc(0xe8); regmbc(0xe9);
   3642      regmbc(0xea); regmbc(0xeb); regmbc(0x113);
   3643      regmbc(0x115); regmbc(0x117); regmbc(0x119);
   3644      regmbc(0x11b); regmbc(0x205); regmbc(0x207);
   3645      regmbc(0x229); regmbc(0x247); regmbc(0x1d92);
   3646      regmbc(0x1e15); regmbc(0x1e17); regmbc(0x1e19);
   3647      regmbc(0x1e1b); regmbc(0x1e1d); regmbc(0x1eb9);
   3648      regmbc(0x1ebb); regmbc(0x1ebd); regmbc(0x1ebf);
   3649      regmbc(0x1ec1); regmbc(0x1ec3); regmbc(0x1ec5);
   3650      regmbc(0x1ec7);
   3651      return;
   3652    case 'f':
   3653    case 0x192:
   3654    case 0x1d6e:
   3655    case 0x1d82:
   3656    case 0x1e1f:
   3657    case 0xa799:
   3658      regmbc('f'); regmbc(0x192); regmbc(0x1d6e);
   3659      regmbc(0x1d82); regmbc(0x1e1f); regmbc(0xa799);
   3660      return;
   3661    case 'g':
   3662    case 0x11d:
   3663    case 0x11f:
   3664    case 0x121:
   3665    case 0x123:
   3666    case 0x1e5:
   3667    case 0x1e7:
   3668    case 0x260:
   3669    case 0x1f5:
   3670    case 0x1d83:
   3671    case 0x1e21:
   3672    case 0xa7a1:
   3673      regmbc('g'); regmbc(0x11d); regmbc(0x11f);
   3674      regmbc(0x121); regmbc(0x123); regmbc(0x1e5);
   3675      regmbc(0x1e7); regmbc(0x1f5); regmbc(0x260);
   3676      regmbc(0x1d83); regmbc(0x1e21); regmbc(0xa7a1);
   3677      return;
   3678    case 'h':
   3679    case 0x125:
   3680    case 0x127:
   3681    case 0x21f:
   3682    case 0x1e23:
   3683    case 0x1e25:
   3684    case 0x1e27:
   3685    case 0x1e29:
   3686    case 0x1e2b:
   3687    case 0x1e96:
   3688    case 0x2c68:
   3689    case 0xa795:
   3690      regmbc('h'); regmbc(0x125); regmbc(0x127);
   3691      regmbc(0x21f); regmbc(0x1e23); regmbc(0x1e25);
   3692      regmbc(0x1e27); regmbc(0x1e29); regmbc(0x1e2b);
   3693      regmbc(0x1e96); regmbc(0x2c68); regmbc(0xa795);
   3694      return;
   3695    case 'i':
   3696    case 0xec:
   3697    case 0xed:
   3698    case 0xee:
   3699    case 0xef:
   3700    case 0x129:
   3701    case 0x12b:
   3702    case 0x12d:
   3703    case 0x12f:
   3704    case 0x1d0:
   3705    case 0x209:
   3706    case 0x20b:
   3707    case 0x268:
   3708    case 0x1d96:
   3709    case 0x1e2d:
   3710    case 0x1e2f:
   3711    case 0x1ec9:
   3712    case 0x1ecb:
   3713      regmbc('i'); regmbc(0xec); regmbc(0xed);
   3714      regmbc(0xee); regmbc(0xef); regmbc(0x129);
   3715      regmbc(0x12b); regmbc(0x12d); regmbc(0x12f);
   3716      regmbc(0x1d0); regmbc(0x209); regmbc(0x20b);
   3717      regmbc(0x268); regmbc(0x1d96); regmbc(0x1e2d);
   3718      regmbc(0x1e2f); regmbc(0x1ec9); regmbc(0x1ecb);
   3719      return;
   3720    case 'j':
   3721    case 0x135:
   3722    case 0x1f0:
   3723    case 0x249:
   3724      regmbc('j'); regmbc(0x135); regmbc(0x1f0);
   3725      regmbc(0x249);
   3726      return;
   3727    case 'k':
   3728    case 0x137:
   3729    case 0x199:
   3730    case 0x1e9:
   3731    case 0x1d84:
   3732    case 0x1e31:
   3733    case 0x1e33:
   3734    case 0x1e35:
   3735    case 0x2c6a:
   3736    case 0xa741:
   3737      regmbc('k'); regmbc(0x137); regmbc(0x199);
   3738      regmbc(0x1e9); regmbc(0x1d84); regmbc(0x1e31);
   3739      regmbc(0x1e33); regmbc(0x1e35); regmbc(0x2c6a);
   3740      regmbc(0xa741);
   3741      return;
   3742    case 'l':
   3743    case 0x13a:
   3744    case 0x13c:
   3745    case 0x13e:
   3746    case 0x140:
   3747    case 0x142:
   3748    case 0x19a:
   3749    case 0x1e37:
   3750    case 0x1e39:
   3751    case 0x1e3b:
   3752    case 0x1e3d:
   3753    case 0x2c61:
   3754      regmbc('l'); regmbc(0x13a); regmbc(0x13c);
   3755      regmbc(0x13e); regmbc(0x140); regmbc(0x142);
   3756      regmbc(0x19a); regmbc(0x1e37); regmbc(0x1e39);
   3757      regmbc(0x1e3b); regmbc(0x1e3d); regmbc(0x2c61);
   3758      return;
   3759    case 'm':
   3760    case 0x1d6f:
   3761    case 0x1e3f:
   3762    case 0x1e41:
   3763    case 0x1e43:
   3764      regmbc('m'); regmbc(0x1d6f); regmbc(0x1e3f);
   3765      regmbc(0x1e41); regmbc(0x1e43);
   3766      return;
   3767    case 'n':
   3768    case 0xf1:
   3769    case 0x144:
   3770    case 0x146:
   3771    case 0x148:
   3772    case 0x149:
   3773    case 0x1f9:
   3774    case 0x1d70:
   3775    case 0x1d87:
   3776    case 0x1e45:
   3777    case 0x1e47:
   3778    case 0x1e49:
   3779    case 0x1e4b:
   3780    case 0xa7a5:
   3781      regmbc('n'); regmbc(0xf1); regmbc(0x144);
   3782      regmbc(0x146); regmbc(0x148); regmbc(0x149);
   3783      regmbc(0x1f9); regmbc(0x1d70); regmbc(0x1d87);
   3784      regmbc(0x1e45); regmbc(0x1e47); regmbc(0x1e49);
   3785      regmbc(0x1e4b); regmbc(0xa7a5);
   3786      return;
   3787    case 'o':
   3788    case 0xf2:
   3789    case 0xf3:
   3790    case 0xf4:
   3791    case 0xf5:
   3792    case 0xf6:
   3793    case 0xf8:
   3794    case 0x14d:
   3795    case 0x14f:
   3796    case 0x151:
   3797    case 0x1a1:
   3798    case 0x1d2:
   3799    case 0x1eb:
   3800    case 0x1ed:
   3801    case 0x1ff:
   3802    case 0x20d:
   3803    case 0x20f:
   3804    case 0x22b:
   3805    case 0x22d:
   3806    case 0x22f:
   3807    case 0x231:
   3808    case 0x275:
   3809    case 0x1e4d:
   3810    case 0x1e4f:
   3811    case 0x1e51:
   3812    case 0x1e53:
   3813    case 0x1ecd:
   3814    case 0x1ecf:
   3815    case 0x1ed1:
   3816    case 0x1ed3:
   3817    case 0x1ed5:
   3818    case 0x1ed7:
   3819    case 0x1ed9:
   3820    case 0x1edb:
   3821    case 0x1edd:
   3822    case 0x1edf:
   3823    case 0x1ee1:
   3824    case 0x1ee3:
   3825      regmbc('o'); regmbc(0xf2); regmbc(0xf3);
   3826      regmbc(0xf4); regmbc(0xf5); regmbc(0xf6);
   3827      regmbc(0xf8); regmbc(0x14d); regmbc(0x14f);
   3828      regmbc(0x151); regmbc(0x1a1); regmbc(0x1d2);
   3829      regmbc(0x1eb); regmbc(0x1ed); regmbc(0x1ff);
   3830      regmbc(0x20d); regmbc(0x20f); regmbc(0x22b);
   3831      regmbc(0x22d); regmbc(0x22f); regmbc(0x231);
   3832      regmbc(0x275); regmbc(0x1e4d); regmbc(0x1e4f);
   3833      regmbc(0x1e51); regmbc(0x1e53); regmbc(0x1ecd);
   3834      regmbc(0x1ecf); regmbc(0x1ed1); regmbc(0x1ed3);
   3835      regmbc(0x1ed5); regmbc(0x1ed7); regmbc(0x1ed9);
   3836      regmbc(0x1edb); regmbc(0x1edd); regmbc(0x1edf);
   3837      regmbc(0x1ee1); regmbc(0x1ee3);
   3838      return;
   3839    case 'p':
   3840    case 0x1a5:
   3841    case 0x1d71:
   3842    case 0x1d88:
   3843    case 0x1d7d:
   3844    case 0x1e55:
   3845    case 0x1e57:
   3846      regmbc('p'); regmbc(0x1a5); regmbc(0x1d71);
   3847      regmbc(0x1d7d); regmbc(0x1d88); regmbc(0x1e55);
   3848      regmbc(0x1e57);
   3849      return;
   3850    case 'q':
   3851    case 0x24b:
   3852    case 0x2a0:
   3853      regmbc('q'); regmbc(0x24b); regmbc(0x2a0);
   3854      return;
   3855    case 'r':
   3856    case 0x155:
   3857    case 0x157:
   3858    case 0x159:
   3859    case 0x211:
   3860    case 0x213:
   3861    case 0x24d:
   3862    case 0x27d:
   3863    case 0x1d72:
   3864    case 0x1d73:
   3865    case 0x1d89:
   3866    case 0x1e59:
   3867    case 0x1e5b:
   3868    case 0x1e5d:
   3869    case 0x1e5f:
   3870    case 0xa7a7:
   3871      regmbc('r'); regmbc(0x155); regmbc(0x157);
   3872      regmbc(0x159); regmbc(0x211); regmbc(0x213);
   3873      regmbc(0x24d); regmbc(0x1d72); regmbc(0x1d73);
   3874      regmbc(0x1d89); regmbc(0x1e59); regmbc(0x27d);
   3875      regmbc(0x1e5b); regmbc(0x1e5d); regmbc(0x1e5f);
   3876      regmbc(0xa7a7);
   3877      return;
   3878    case 's':
   3879    case 0x15b:
   3880    case 0x15d:
   3881    case 0x15f:
   3882    case 0x161:
   3883    case 0x1e61:
   3884    case 0x219:
   3885    case 0x23f:
   3886    case 0x1d74:
   3887    case 0x1d8a:
   3888    case 0x1e63:
   3889    case 0x1e65:
   3890    case 0x1e67:
   3891    case 0x1e69:
   3892    case 0xa7a9:
   3893      regmbc('s'); regmbc(0x15b); regmbc(0x15d);
   3894      regmbc(0x15f); regmbc(0x161); regmbc(0x23f);
   3895      regmbc(0x219); regmbc(0x1d74); regmbc(0x1d8a);
   3896      regmbc(0x1e61); regmbc(0x1e63); regmbc(0x1e65);
   3897      regmbc(0x1e67); regmbc(0x1e69); regmbc(0xa7a9);
   3898      return;
   3899    case 't':
   3900    case 0x163:
   3901    case 0x165:
   3902    case 0x167:
   3903    case 0x1ab:
   3904    case 0x1ad:
   3905    case 0x21b:
   3906    case 0x288:
   3907    case 0x1d75:
   3908    case 0x1e6b:
   3909    case 0x1e6d:
   3910    case 0x1e6f:
   3911    case 0x1e71:
   3912    case 0x1e97:
   3913    case 0x2c66:
   3914      regmbc('t'); regmbc(0x163); regmbc(0x165);
   3915      regmbc(0x167); regmbc(0x1ab); regmbc(0x21b);
   3916      regmbc(0x1ad); regmbc(0x288); regmbc(0x1d75);
   3917      regmbc(0x1e6b); regmbc(0x1e6d); regmbc(0x1e6f);
   3918      regmbc(0x1e71); regmbc(0x1e97); regmbc(0x2c66);
   3919      return;
   3920    case 'u':
   3921    case 0xf9:
   3922    case 0xfa:
   3923    case 0xfb:
   3924    case 0xfc:
   3925    case 0x169:
   3926    case 0x16b:
   3927    case 0x16d:
   3928    case 0x16f:
   3929    case 0x171:
   3930    case 0x173:
   3931    case 0x1b0:
   3932    case 0x1d4:
   3933    case 0x1d6:
   3934    case 0x1d8:
   3935    case 0x1da:
   3936    case 0x1dc:
   3937    case 0x215:
   3938    case 0x217:
   3939    case 0x289:
   3940    case 0x1e73:
   3941    case 0x1d7e:
   3942    case 0x1d99:
   3943    case 0x1e75:
   3944    case 0x1e77:
   3945    case 0x1e79:
   3946    case 0x1e7b:
   3947    case 0x1ee5:
   3948    case 0x1ee7:
   3949    case 0x1ee9:
   3950    case 0x1eeb:
   3951    case 0x1eed:
   3952    case 0x1eef:
   3953    case 0x1ef1:
   3954      regmbc('u'); regmbc(0xf9); regmbc(0xfa);
   3955      regmbc(0xfb); regmbc(0xfc); regmbc(0x169);
   3956      regmbc(0x16b); regmbc(0x16d); regmbc(0x16f);
   3957      regmbc(0x171); regmbc(0x173); regmbc(0x1d6);
   3958      regmbc(0x1d8); regmbc(0x1da); regmbc(0x1dc);
   3959      regmbc(0x215); regmbc(0x217); regmbc(0x1b0);
   3960      regmbc(0x1d4); regmbc(0x289); regmbc(0x1d7e);
   3961      regmbc(0x1d99); regmbc(0x1e73); regmbc(0x1e75);
   3962      regmbc(0x1e77); regmbc(0x1e79); regmbc(0x1e7b);
   3963      regmbc(0x1ee5); regmbc(0x1ee7); regmbc(0x1ee9);
   3964      regmbc(0x1eeb); regmbc(0x1eed); regmbc(0x1eef);
   3965      regmbc(0x1ef1);
   3966      return;
   3967    case 'v':
   3968    case 0x28b:
   3969    case 0x1d8c:
   3970    case 0x1e7d:
   3971    case 0x1e7f:
   3972      regmbc('v'); regmbc(0x28b); regmbc(0x1d8c);
   3973      regmbc(0x1e7d); regmbc(0x1e7f);
   3974      return;
   3975    case 'w':
   3976    case 0x175:
   3977    case 0x1e81:
   3978    case 0x1e83:
   3979    case 0x1e85:
   3980    case 0x1e87:
   3981    case 0x1e89:
   3982    case 0x1e98:
   3983      regmbc('w'); regmbc(0x175); regmbc(0x1e81);
   3984      regmbc(0x1e83); regmbc(0x1e85); regmbc(0x1e87);
   3985      regmbc(0x1e89); regmbc(0x1e98);
   3986      return;
   3987    case 'x':
   3988    case 0x1e8b:
   3989    case 0x1e8d:
   3990      regmbc('x'); regmbc(0x1e8b); regmbc(0x1e8d);
   3991      return;
   3992    case 'y':
   3993    case 0xfd:
   3994    case 0xff:
   3995    case 0x177:
   3996    case 0x1b4:
   3997    case 0x233:
   3998    case 0x24f:
   3999    case 0x1e8f:
   4000    case 0x1e99:
   4001    case 0x1ef3:
   4002    case 0x1ef5:
   4003    case 0x1ef7:
   4004    case 0x1ef9:
   4005      regmbc('y'); regmbc(0xfd); regmbc(0xff);
   4006      regmbc(0x177); regmbc(0x1b4); regmbc(0x233);
   4007      regmbc(0x24f); regmbc(0x1e8f); regmbc(0x1e99);
   4008      regmbc(0x1ef3); regmbc(0x1ef5); regmbc(0x1ef7);
   4009      regmbc(0x1ef9);
   4010      return;
   4011    case 'z':
   4012    case 0x17a:
   4013    case 0x17c:
   4014    case 0x17e:
   4015    case 0x1b6:
   4016    case 0x1d76:
   4017    case 0x1d8e:
   4018    case 0x1e91:
   4019    case 0x1e93:
   4020    case 0x1e95:
   4021    case 0x2c6c:
   4022      regmbc('z'); regmbc(0x17a); regmbc(0x17c);
   4023      regmbc(0x17e); regmbc(0x1b6); regmbc(0x1d76);
   4024      regmbc(0x1d8e); regmbc(0x1e91); regmbc(0x1e93);
   4025      regmbc(0x1e95); regmbc(0x2c6c);
   4026      return;
   4027    }
   4028  }
   4029  regmbc(c);
   4030 }
   4031 
   4032 // Emit a node.
   4033 // Return pointer to generated code.
   4034 static uint8_t *regnode(int op)
   4035 {
   4036  uint8_t *ret;
   4037 
   4038  ret = regcode;
   4039  if (ret == JUST_CALC_SIZE) {
   4040    regsize += 3;
   4041  } else {
   4042    *regcode++ = (uint8_t)op;
   4043    *regcode++ = NUL;                   // Null "next" pointer.
   4044    *regcode++ = NUL;
   4045  }
   4046  return ret;
   4047 }
   4048 
   4049 // Write a four bytes number at "p" and return pointer to the next char.
   4050 static uint8_t *re_put_uint32(uint8_t *p, uint32_t val)
   4051 {
   4052  *p++ = (uint8_t)((val >> 24) & 0377);
   4053  *p++ = (uint8_t)((val >> 16) & 0377);
   4054  *p++ = (uint8_t)((val >> 8) & 0377);
   4055  *p++ = (uint8_t)(val & 0377);
   4056  return p;
   4057 }
   4058 
   4059 // regnext - dig the "next" pointer out of a node
   4060 // Returns NULL when calculating size, when there is no next item and when
   4061 // there is an error.
   4062 static uint8_t *regnext(uint8_t *p)
   4063  FUNC_ATTR_NONNULL_ALL
   4064 {
   4065  int offset;
   4066 
   4067  if (p == JUST_CALC_SIZE || reg_toolong) {
   4068    return NULL;
   4069  }
   4070 
   4071  offset = NEXT(p);
   4072  if (offset == 0) {
   4073    return NULL;
   4074  }
   4075 
   4076  if (OP(p) == BACK) {
   4077    return p - offset;
   4078  } else {
   4079    return p + offset;
   4080  }
   4081 }
   4082 
   4083 // Set the next-pointer at the end of a node chain.
   4084 static void regtail(uint8_t *p, const uint8_t *val)
   4085 {
   4086  int offset;
   4087 
   4088  if (p == JUST_CALC_SIZE) {
   4089    return;
   4090  }
   4091 
   4092  // Find last node.
   4093  uint8_t *scan = p;
   4094  while (true) {
   4095    uint8_t *temp = regnext(scan);
   4096    if (temp == NULL) {
   4097      break;
   4098    }
   4099    scan = temp;
   4100  }
   4101 
   4102  if (OP(scan) == BACK) {
   4103    offset = (int)(scan - val);
   4104  } else {
   4105    offset = (int)(val - scan);
   4106  }
   4107  // When the offset uses more than 16 bits it can no longer fit in the two
   4108  // bytes available.  Use a global flag to avoid having to check return
   4109  // values in too many places.
   4110  if (offset > 0xffff) {
   4111    reg_toolong = true;
   4112  } else {
   4113    *(scan + 1) = (uint8_t)(((unsigned)offset >> 8) & 0377);
   4114    *(scan + 2) = (uint8_t)(offset & 0377);
   4115  }
   4116 }
   4117 
   4118 // Like regtail, on item after a BRANCH; nop if none.
   4119 static void regoptail(uint8_t *p, uint8_t *val)
   4120 {
   4121  // When op is neither BRANCH nor BRACE_COMPLEX0-9, it is "operandless"
   4122  if (p == NULL || p == JUST_CALC_SIZE
   4123      || (OP(p) != BRANCH
   4124          && (OP(p) < BRACE_COMPLEX || OP(p) > BRACE_COMPLEX + 9))) {
   4125    return;
   4126  }
   4127  regtail(OPERAND(p), val);
   4128 }
   4129 
   4130 // Insert an operator in front of already-emitted operand
   4131 //
   4132 // Means relocating the operand.
   4133 static void reginsert(int op, uint8_t *opnd)
   4134 {
   4135  uint8_t *src;
   4136  uint8_t *dst;
   4137  uint8_t *place;
   4138 
   4139  if (regcode == JUST_CALC_SIZE) {
   4140    regsize += 3;
   4141    return;
   4142  }
   4143  src = regcode;
   4144  regcode += 3;
   4145  dst = regcode;
   4146  while (src > opnd) {
   4147    *--dst = *--src;
   4148  }
   4149 
   4150  place = opnd;                 // Op node, where operand used to be.
   4151  *place++ = (uint8_t)op;
   4152  *place++ = NUL;
   4153  *place = NUL;
   4154 }
   4155 
   4156 // Insert an operator in front of already-emitted operand.
   4157 // Add a number to the operator.
   4158 static void reginsert_nr(int op, int64_t val, uint8_t *opnd)
   4159 {
   4160  uint8_t *src;
   4161  uint8_t *dst;
   4162  uint8_t *place;
   4163 
   4164  if (regcode == JUST_CALC_SIZE) {
   4165    regsize += 7;
   4166    return;
   4167  }
   4168  src = regcode;
   4169  regcode += 7;
   4170  dst = regcode;
   4171  while (src > opnd) {
   4172    *--dst = *--src;
   4173  }
   4174 
   4175  place = opnd;                 // Op node, where operand used to be.
   4176  *place++ = (uint8_t)op;
   4177  *place++ = NUL;
   4178  *place++ = NUL;
   4179  assert(val >= 0 && (uintmax_t)val <= UINT32_MAX);
   4180  re_put_uint32(place, (uint32_t)val);
   4181 }
   4182 
   4183 // Insert an operator in front of already-emitted operand.
   4184 // The operator has the given limit values as operands.  Also set next pointer.
   4185 //
   4186 // Means relocating the operand.
   4187 static void reginsert_limits(int op, int64_t minval, int64_t maxval, uint8_t *opnd)
   4188 {
   4189  uint8_t *src;
   4190  uint8_t *dst;
   4191  uint8_t *place;
   4192 
   4193  if (regcode == JUST_CALC_SIZE) {
   4194    regsize += 11;
   4195    return;
   4196  }
   4197  src = regcode;
   4198  regcode += 11;
   4199  dst = regcode;
   4200  while (src > opnd) {
   4201    *--dst = *--src;
   4202  }
   4203 
   4204  place = opnd;                 // Op node, where operand used to be.
   4205  *place++ = (uint8_t)op;
   4206  *place++ = NUL;
   4207  *place++ = NUL;
   4208  assert(minval >= 0 && (uintmax_t)minval <= UINT32_MAX);
   4209  place = re_put_uint32(place, (uint32_t)minval);
   4210  assert(maxval >= 0 && (uintmax_t)maxval <= UINT32_MAX);
   4211  place = re_put_uint32(place, (uint32_t)maxval);
   4212  regtail(opnd, place);
   4213 }
   4214 
   4215 /// Return true if the back reference is legal. We must have seen the close
   4216 /// brace.
   4217 /// TODO(vim): Should also check that we don't refer to something repeated
   4218 /// (+*=): what instance of the repetition should we match?
   4219 static int seen_endbrace(int refnum)
   4220 {
   4221  if (!had_endbrace[refnum]) {
   4222    uint8_t *p;
   4223 
   4224    // Trick: check if "@<=" or "@<!" follows, in which case
   4225    // the \1 can appear before the referenced match.
   4226    for (p = (uint8_t *)regparse; *p != NUL; p++) {
   4227      if (p[0] == '@' && p[1] == '<' && (p[2] == '!' || p[2] == '=')) {
   4228        break;
   4229      }
   4230    }
   4231 
   4232    if (*p == NUL) {
   4233      emsg(_("E65: Illegal back reference"));
   4234      rc_did_emsg = true;
   4235      return false;
   4236    }
   4237  }
   4238  return true;
   4239 }
   4240 
   4241 // Parse the lowest level.
   4242 //
   4243 // Optimization:  gobbles an entire sequence of ordinary characters so that
   4244 // it can turn them into a single node, which is smaller to store and
   4245 // faster to run.  Don't do this when one_exactly is set.
   4246 static uint8_t *regatom(int *flagp)
   4247 {
   4248  uint8_t *ret;
   4249  int flags;
   4250  int c;
   4251  uint8_t *p;
   4252  int extra = 0;
   4253  int save_prev_at_start = prev_at_start;
   4254 
   4255  *flagp = WORST;               // Tentatively.
   4256 
   4257  c = getchr();
   4258  switch (c) {
   4259  case Magic('^'):
   4260    ret = regnode(BOL);
   4261    break;
   4262 
   4263  case Magic('$'):
   4264    ret = regnode(EOL);
   4265    had_eol = true;
   4266    break;
   4267 
   4268  case Magic('<'):
   4269    ret = regnode(BOW);
   4270    break;
   4271 
   4272  case Magic('>'):
   4273    ret = regnode(EOW);
   4274    break;
   4275 
   4276  case Magic('_'):
   4277    c = no_Magic(getchr());
   4278    if (c == '^') {             // "\_^" is start-of-line
   4279      ret = regnode(BOL);
   4280      break;
   4281    }
   4282    if (c == '$') {             // "\_$" is end-of-line
   4283      ret = regnode(EOL);
   4284      had_eol = true;
   4285      break;
   4286    }
   4287 
   4288    extra = ADD_NL;
   4289    *flagp |= HASNL;
   4290 
   4291    // "\_[" is character range plus newline
   4292    if (c == '[') {
   4293      goto collection;
   4294    }
   4295 
   4296    // "\_x" is character class plus newline
   4297    FALLTHROUGH;
   4298 
   4299  // Character classes.
   4300  case Magic('.'):
   4301  case Magic('i'):
   4302  case Magic('I'):
   4303  case Magic('k'):
   4304  case Magic('K'):
   4305  case Magic('f'):
   4306  case Magic('F'):
   4307  case Magic('p'):
   4308  case Magic('P'):
   4309  case Magic('s'):
   4310  case Magic('S'):
   4311  case Magic('d'):
   4312  case Magic('D'):
   4313  case Magic('x'):
   4314  case Magic('X'):
   4315  case Magic('o'):
   4316  case Magic('O'):
   4317  case Magic('w'):
   4318  case Magic('W'):
   4319  case Magic('h'):
   4320  case Magic('H'):
   4321  case Magic('a'):
   4322  case Magic('A'):
   4323  case Magic('l'):
   4324  case Magic('L'):
   4325  case Magic('u'):
   4326  case Magic('U'):
   4327    p = (uint8_t *)vim_strchr((char *)classchars, no_Magic(c));
   4328    if (p == NULL) {
   4329      EMSG_RET_NULL(_(e_invalid_use_of_underscore));
   4330    }
   4331    // When '.' is followed by a composing char ignore the dot, so that
   4332    // the composing char is matched here.
   4333    if (c == Magic('.') && utf_iscomposing_legacy(peekchr())) {
   4334      c = getchr();
   4335      goto do_multibyte;
   4336    }
   4337    ret = regnode(classcodes[p - classchars] + extra);
   4338    *flagp |= HASWIDTH | SIMPLE;
   4339    break;
   4340 
   4341  case Magic('n'):
   4342    if (reg_string) {
   4343      // In a string "\n" matches a newline character.
   4344      ret = regnode(EXACTLY);
   4345      regc(NL);
   4346      regc(NUL);
   4347      *flagp |= HASWIDTH | SIMPLE;
   4348    } else {
   4349      // In buffer text "\n" matches the end of a line.
   4350      ret = regnode(NEWL);
   4351      *flagp |= HASWIDTH | HASNL;
   4352    }
   4353    break;
   4354 
   4355  case Magic('('):
   4356    if (one_exactly) {
   4357      EMSG_ONE_RET_NULL;
   4358    }
   4359    ret = reg(REG_PAREN, &flags);
   4360    if (ret == NULL) {
   4361      return NULL;
   4362    }
   4363    *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
   4364    break;
   4365 
   4366  case NUL:
   4367  case Magic('|'):
   4368  case Magic('&'):
   4369  case Magic(')'):
   4370    if (one_exactly) {
   4371      EMSG_ONE_RET_NULL;
   4372    }
   4373    // Supposed to be caught earlier.
   4374    IEMSG_RET_NULL(_(e_internal_error_in_regexp));
   4375  // NOTREACHED
   4376 
   4377  case Magic('='):
   4378  case Magic('?'):
   4379  case Magic('+'):
   4380  case Magic('@'):
   4381  case Magic('{'):
   4382  case Magic('*'):
   4383    c = no_Magic(c);
   4384    EMSG3_RET_NULL(_("E64: %s%c follows nothing"),
   4385                   (c == '*' ? reg_magic >= MAGIC_ON : reg_magic == MAGIC_ALL), c);
   4386  // NOTREACHED
   4387 
   4388  case Magic('~'):              // previous substitute pattern
   4389    if (reg_prev_sub != NULL) {
   4390      uint8_t *lp;
   4391 
   4392      ret = regnode(EXACTLY);
   4393      lp = (uint8_t *)reg_prev_sub;
   4394      while (*lp != NUL) {
   4395        regc(*lp++);
   4396      }
   4397      regc(NUL);
   4398      if (*reg_prev_sub != NUL) {
   4399        *flagp |= HASWIDTH;
   4400        if ((lp - (uint8_t *)reg_prev_sub) == 1) {
   4401          *flagp |= SIMPLE;
   4402        }
   4403      }
   4404    } else {
   4405      EMSG_RET_NULL(_(e_nopresub));
   4406    }
   4407    break;
   4408 
   4409  case Magic('1'):
   4410  case Magic('2'):
   4411  case Magic('3'):
   4412  case Magic('4'):
   4413  case Magic('5'):
   4414  case Magic('6'):
   4415  case Magic('7'):
   4416  case Magic('8'):
   4417  case Magic('9'): {
   4418    int refnum;
   4419 
   4420    refnum = c - Magic('0');
   4421    if (!seen_endbrace(refnum)) {
   4422      return NULL;
   4423    }
   4424    ret = regnode(BACKREF + refnum);
   4425  }
   4426  break;
   4427 
   4428  case Magic('z'):
   4429    c = no_Magic(getchr());
   4430    switch (c) {
   4431    case '(':
   4432      if ((reg_do_extmatch & REX_SET) == 0) {
   4433        EMSG_RET_NULL(_(e_z_not_allowed));
   4434      }
   4435      if (one_exactly) {
   4436        EMSG_ONE_RET_NULL;
   4437      }
   4438      ret = reg(REG_ZPAREN, &flags);
   4439      if (ret == NULL) {
   4440        return NULL;
   4441      }
   4442      *flagp |= flags & (HASWIDTH|SPSTART|HASNL|HASLOOKBH);
   4443      re_has_z = REX_SET;
   4444      break;
   4445 
   4446    case '1':
   4447    case '2':
   4448    case '3':
   4449    case '4':
   4450    case '5':
   4451    case '6':
   4452    case '7':
   4453    case '8':
   4454    case '9':
   4455      if ((reg_do_extmatch & REX_USE) == 0) {
   4456        EMSG_RET_NULL(_(e_z1_not_allowed));
   4457      }
   4458      ret = regnode(ZREF + c - '0');
   4459      re_has_z = REX_USE;
   4460      break;
   4461 
   4462    case 's':
   4463      ret = regnode(MOPEN + 0);
   4464      if (!re_mult_next("\\zs")) {
   4465        return NULL;
   4466      }
   4467      break;
   4468 
   4469    case 'e':
   4470      ret = regnode(MCLOSE + 0);
   4471      if (!re_mult_next("\\ze")) {
   4472        return NULL;
   4473      }
   4474      break;
   4475 
   4476    default:
   4477      EMSG_RET_NULL(_("E68: Invalid character after \\z"));
   4478    }
   4479    break;
   4480 
   4481  case Magic('%'):
   4482    c = no_Magic(getchr());
   4483    switch (c) {
   4484    // () without a back reference
   4485    case '(':
   4486      if (one_exactly) {
   4487        EMSG_ONE_RET_NULL;
   4488      }
   4489      ret = reg(REG_NPAREN, &flags);
   4490      if (ret == NULL) {
   4491        return NULL;
   4492      }
   4493      *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
   4494      break;
   4495 
   4496    // Catch \%^ and \%$ regardless of where they appear in the
   4497    // pattern -- regardless of whether or not it makes sense.
   4498    case '^':
   4499      ret = regnode(RE_BOF);
   4500      break;
   4501 
   4502    case '$':
   4503      ret = regnode(RE_EOF);
   4504      break;
   4505 
   4506    case '#':
   4507      if (regparse[0] == '=' && regparse[1] >= 48 && regparse[1] <= 50) {
   4508        // misplaced \%#=1
   4509        semsg(_(e_atom_engine_must_be_at_start_of_pattern), regparse[1]);
   4510        return FAIL;
   4511      }
   4512      ret = regnode(CURSOR);
   4513      break;
   4514 
   4515    case 'V':
   4516      ret = regnode(RE_VISUAL);
   4517      break;
   4518 
   4519    case 'C':
   4520      ret = regnode(RE_COMPOSING);
   4521      break;
   4522 
   4523    // \%[abc]: Emit as a list of branches, all ending at the last
   4524    // branch which matches nothing.
   4525    case '[':
   4526      if (one_exactly) {                        // doesn't nest
   4527        EMSG_ONE_RET_NULL;
   4528      }
   4529      {
   4530        uint8_t *lastbranch;
   4531        uint8_t *lastnode = NULL;
   4532        uint8_t *br;
   4533 
   4534        ret = NULL;
   4535        while ((c = getchr()) != ']') {
   4536          if (c == NUL) {
   4537            EMSG2_RET_NULL(_(e_missing_sb),
   4538                           reg_magic == MAGIC_ALL);
   4539          }
   4540          br = regnode(BRANCH);
   4541          if (ret == NULL) {
   4542            ret = br;
   4543          } else {
   4544            regtail(lastnode, br);
   4545            if (reg_toolong) {
   4546              return NULL;
   4547            }
   4548          }
   4549 
   4550          ungetchr();
   4551          one_exactly = true;
   4552          lastnode = regatom(flagp);
   4553          one_exactly = false;
   4554          if (lastnode == NULL) {
   4555            return NULL;
   4556          }
   4557        }
   4558        if (ret == NULL) {
   4559          EMSG2_RET_NULL(_(e_empty_sb),
   4560                         reg_magic == MAGIC_ALL);
   4561        }
   4562        lastbranch = regnode(BRANCH);
   4563        br = regnode(NOTHING);
   4564        if (ret != JUST_CALC_SIZE) {
   4565          regtail(lastnode, br);
   4566          regtail(lastbranch, br);
   4567          // connect all branches to the NOTHING
   4568          // branch at the end
   4569          for (br = ret; br != lastnode;) {
   4570            if (OP(br) == BRANCH) {
   4571              regtail(br, lastbranch);
   4572              if (reg_toolong) {
   4573                return NULL;
   4574              }
   4575              br = OPERAND(br);
   4576            } else {
   4577              br = regnext(br);
   4578            }
   4579          }
   4580        }
   4581        *flagp &= ~(HASWIDTH | SIMPLE);
   4582        break;
   4583      }
   4584 
   4585    case 'd':               // %d123 decimal
   4586    case 'o':               // %o123 octal
   4587    case 'x':               // %xab hex 2
   4588    case 'u':               // %uabcd hex 4
   4589    case 'U':               // %U1234abcd hex 8
   4590    {
   4591      int64_t i;
   4592 
   4593      switch (c) {
   4594      case 'd':
   4595        i = getdecchrs(); break;
   4596      case 'o':
   4597        i = getoctchrs(); break;
   4598      case 'x':
   4599        i = gethexchrs(2); break;
   4600      case 'u':
   4601        i = gethexchrs(4); break;
   4602      case 'U':
   4603        i = gethexchrs(8); break;
   4604      default:
   4605        i = -1; break;
   4606      }
   4607 
   4608      if (i < 0 || i > INT_MAX) {
   4609        EMSG2_RET_NULL(_("E678: Invalid character after %s%%[dxouU]"),
   4610                       reg_magic == MAGIC_ALL);
   4611      }
   4612      if (use_multibytecode((int)i)) {
   4613        ret = regnode(MULTIBYTECODE);
   4614      } else {
   4615        ret = regnode(EXACTLY);
   4616      }
   4617      if (i == 0) {
   4618        regc(0x0a);
   4619      } else {
   4620        regmbc((int)i);
   4621      }
   4622      regc(NUL);
   4623      *flagp |= HASWIDTH;
   4624      break;
   4625    }
   4626 
   4627    default:
   4628      if (ascii_isdigit(c) || c == '<' || c == '>' || c == '\'' || c == '.') {
   4629        uint32_t n = 0;
   4630        int cmp;
   4631        bool cur = false;
   4632        bool got_digit = false;
   4633 
   4634        cmp = c;
   4635        if (cmp == '<' || cmp == '>') {
   4636          c = getchr();
   4637        }
   4638        if (no_Magic(c) == '.') {
   4639          cur = true;
   4640          c = getchr();
   4641        }
   4642        while (ascii_isdigit(c)) {
   4643          got_digit = true;
   4644          n = n * 10 + (uint32_t)(c - '0');
   4645          c = getchr();
   4646        }
   4647        if (no_Magic(c) == '\'' && n == 0) {
   4648          // "\%'m", "\%<'m" and "\%>'m": Mark
   4649          c = getchr();
   4650          ret = regnode(RE_MARK);
   4651          if (ret == JUST_CALC_SIZE) {
   4652            regsize += 2;
   4653          } else {
   4654            *regcode++ = (uint8_t)c;
   4655            *regcode++ = (uint8_t)cmp;
   4656          }
   4657          break;
   4658        } else if ((c == 'l' || c == 'c' || c == 'v') && (cur || got_digit)) {
   4659          if (cur && n) {
   4660            semsg(_(e_regexp_number_after_dot_pos_search_chr), no_Magic(c));
   4661            rc_did_emsg = true;
   4662            return NULL;
   4663          }
   4664          if (c == 'l') {
   4665            if (cur) {
   4666              n = (uint32_t)curwin->w_cursor.lnum;
   4667            }
   4668            ret = regnode(RE_LNUM);
   4669            if (save_prev_at_start) {
   4670              at_start = true;
   4671            }
   4672          } else if (c == 'c') {
   4673            if (cur) {
   4674              n = (uint32_t)curwin->w_cursor.col;
   4675              n++;
   4676            }
   4677            ret = regnode(RE_COL);
   4678          } else {
   4679            if (cur) {
   4680              colnr_T vcol = 0;
   4681              getvvcol(curwin, &curwin->w_cursor, NULL, NULL, &vcol);
   4682              n = (uint32_t)(++vcol);
   4683            }
   4684            ret = regnode(RE_VCOL);
   4685          }
   4686          if (ret == JUST_CALC_SIZE) {
   4687            regsize += 5;
   4688          } else {
   4689            // put the number and the optional
   4690            // comparator after the opcode
   4691            regcode = re_put_uint32(regcode, n);
   4692            *regcode++ = (uint8_t)cmp;
   4693          }
   4694          break;
   4695        }
   4696      }
   4697 
   4698      EMSG2_RET_NULL(_("E71: Invalid character after %s%%"),
   4699                     reg_magic == MAGIC_ALL);
   4700    }
   4701    break;
   4702 
   4703  case Magic('['):
   4704 collection:
   4705    {
   4706      uint8_t *lp;
   4707 
   4708      // If there is no matching ']', we assume the '[' is a normal
   4709      // character.  This makes 'incsearch' and ":help [" work.
   4710      lp = (uint8_t *)skip_anyof(regparse);
   4711      if (*lp == ']') {         // there is a matching ']'
   4712        int startc = -1;                // > 0 when next '-' is a range
   4713        int endc;
   4714 
   4715        // In a character class, different parsing rules apply.
   4716        // Not even \ is special anymore, nothing is.
   4717        if (*regparse == '^') {             // Complement of range.
   4718          ret = regnode(ANYBUT + extra);
   4719          regparse++;
   4720        } else {
   4721          ret = regnode(ANYOF + extra);
   4722        }
   4723 
   4724        // At the start ']' and '-' mean the literal character.
   4725        if (*regparse == ']' || *regparse == '-') {
   4726          startc = (uint8_t)(*regparse);
   4727          regc(*regparse++);
   4728        }
   4729 
   4730        while (*regparse != NUL && *regparse != ']') {
   4731          if (*regparse == '-') {
   4732            regparse++;
   4733            // The '-' is not used for a range at the end and
   4734            // after or before a '\n'.
   4735            if (*regparse == ']' || *regparse == NUL
   4736                || startc == -1
   4737                || (regparse[0] == '\\' && regparse[1] == 'n')) {
   4738              regc('-');
   4739              startc = '-';                     // [--x] is a range
   4740            } else {
   4741              // Also accept "a-[.z.]"
   4742              endc = 0;
   4743              if (*regparse == '[') {
   4744                endc = get_coll_element(&regparse);
   4745              }
   4746              if (endc == 0) {
   4747                endc = mb_ptr2char_adv((const char **)&regparse);
   4748              }
   4749 
   4750              // Handle \o40, \x20 and \u20AC style sequences
   4751              if (endc == '\\' && !reg_cpo_lit) {
   4752                endc = coll_get_char();
   4753              }
   4754 
   4755              if (startc > endc) {
   4756                EMSG_RET_NULL(_(e_reverse_range));
   4757              }
   4758              if (utf_char2len(startc) > 1
   4759                  || utf_char2len(endc) > 1) {
   4760                // Limit to a range of 256 chars
   4761                if (endc > startc + 256) {
   4762                  EMSG_RET_NULL(_(e_large_class));
   4763                }
   4764                while (++startc <= endc) {
   4765                  regmbc(startc);
   4766                }
   4767              } else {
   4768                while (++startc <= endc) {
   4769                  regc(startc);
   4770                }
   4771              }
   4772              startc = -1;
   4773            }
   4774          }
   4775          // Only "\]", "\^", "\]" and "\\" are special in Vi.  Vim
   4776          // accepts "\t", "\e", etc., but only when the 'l' flag in
   4777          // 'cpoptions' is not included.
   4778          else if (*regparse == '\\'
   4779                   && (vim_strchr(REGEXP_INRANGE, (uint8_t)regparse[1]) != NULL
   4780                       || (!reg_cpo_lit
   4781                           && vim_strchr(REGEXP_ABBR,
   4782                                         (uint8_t)regparse[1]) != NULL))) {
   4783            regparse++;
   4784            if (*regparse == 'n') {
   4785              // '\n' in range: also match NL
   4786              if (ret != JUST_CALC_SIZE) {
   4787                // Using \n inside [^] does not change what
   4788                // matches. "[^\n]" is the same as ".".
   4789                if (*ret == ANYOF) {
   4790                  *ret = ANYOF + ADD_NL;
   4791                  *flagp |= HASNL;
   4792                }
   4793                // else: must have had a \n already
   4794              }
   4795              regparse++;
   4796              startc = -1;
   4797            } else if (*regparse == 'd'
   4798                       || *regparse == 'o'
   4799                       || *regparse == 'x'
   4800                       || *regparse == 'u'
   4801                       || *regparse == 'U') {
   4802              startc = coll_get_char();
   4803              // max UTF-8 Codepoint is U+10FFFF,
   4804              // but allow values until INT_MAX
   4805              if (startc == INT_MAX) {
   4806                EMSG_RET_NULL(_(e_unicode_val_too_large));
   4807              }
   4808              if (startc == 0) {
   4809                regc(0x0a);
   4810              } else {
   4811                regmbc(startc);
   4812              }
   4813            } else {
   4814              startc = backslash_trans(*regparse++);
   4815              regc(startc);
   4816            }
   4817          } else if (*regparse == '[') {
   4818            int c_class;
   4819            int cu;
   4820 
   4821            c_class = get_char_class(&regparse);
   4822            startc = -1;
   4823            // Characters assumed to be 8 bits!
   4824            switch (c_class) {
   4825            case CLASS_NONE:
   4826              c_class = get_equi_class(&regparse);
   4827              if (c_class != 0) {
   4828                // produce equivalence class
   4829                reg_equi_class(c_class);
   4830              } else if ((c_class = get_coll_element(&regparse)) != 0) {
   4831                // produce a collating element
   4832                regmbc(c_class);
   4833              } else {
   4834                // literal '[', allow [[-x] as a range
   4835                startc = (uint8_t)(*regparse++);
   4836                regc(startc);
   4837              }
   4838              break;
   4839            case CLASS_ALNUM:
   4840              for (cu = 1; cu < 128; cu++) {
   4841                if (isalnum(cu)) {
   4842                  regmbc(cu);
   4843                }
   4844              }
   4845              break;
   4846            case CLASS_ALPHA:
   4847              for (cu = 1; cu < 128; cu++) {
   4848                if (isalpha(cu)) {
   4849                  regmbc(cu);
   4850                }
   4851              }
   4852              break;
   4853            case CLASS_BLANK:
   4854              regc(' ');
   4855              regc('\t');
   4856              break;
   4857            case CLASS_CNTRL:
   4858              for (cu = 1; cu <= 127; cu++) {
   4859                if (iscntrl(cu)) {
   4860                  regmbc(cu);
   4861                }
   4862              }
   4863              break;
   4864            case CLASS_DIGIT:
   4865              for (cu = 1; cu <= 127; cu++) {
   4866                if (ascii_isdigit(cu)) {
   4867                  regmbc(cu);
   4868                }
   4869              }
   4870              break;
   4871            case CLASS_GRAPH:
   4872              for (cu = 1; cu <= 127; cu++) {
   4873                if (isgraph(cu)) {
   4874                  regmbc(cu);
   4875                }
   4876              }
   4877              break;
   4878            case CLASS_LOWER:
   4879              for (cu = 1; cu <= 255; cu++) {
   4880                if (mb_islower(cu) && cu != 170 && cu != 186) {
   4881                  regmbc(cu);
   4882                }
   4883              }
   4884              break;
   4885            case CLASS_PRINT:
   4886              for (cu = 1; cu <= 255; cu++) {
   4887                if (vim_isprintc(cu)) {
   4888                  regmbc(cu);
   4889                }
   4890              }
   4891              break;
   4892            case CLASS_PUNCT:
   4893              for (cu = 1; cu < 128; cu++) {
   4894                if (ispunct(cu)) {
   4895                  regmbc(cu);
   4896                }
   4897              }
   4898              break;
   4899            case CLASS_SPACE:
   4900              for (cu = 9; cu <= 13; cu++) {
   4901                regc(cu);
   4902              }
   4903              regc(' ');
   4904              break;
   4905            case CLASS_UPPER:
   4906              for (cu = 1; cu <= 255; cu++) {
   4907                if (mb_isupper(cu)) {
   4908                  regmbc(cu);
   4909                }
   4910              }
   4911              break;
   4912            case CLASS_XDIGIT:
   4913              for (cu = 1; cu <= 255; cu++) {
   4914                if (ascii_isxdigit(cu)) {
   4915                  regmbc(cu);
   4916                }
   4917              }
   4918              break;
   4919            case CLASS_TAB:
   4920              regc('\t');
   4921              break;
   4922            case CLASS_RETURN:
   4923              regc('\r');
   4924              break;
   4925            case CLASS_BACKSPACE:
   4926              regc('\b');
   4927              break;
   4928            case CLASS_ESCAPE:
   4929              regc(ESC);
   4930              break;
   4931            case CLASS_IDENT:
   4932              for (cu = 1; cu <= 255; cu++) {
   4933                if (vim_isIDc(cu)) {
   4934                  regmbc(cu);
   4935                }
   4936              }
   4937              break;
   4938            case CLASS_KEYWORD:
   4939              for (cu = 1; cu <= 255; cu++) {
   4940                if (reg_iswordc(cu)) {
   4941                  regmbc(cu);
   4942                }
   4943              }
   4944              break;
   4945            case CLASS_FNAME:
   4946              for (cu = 1; cu <= 255; cu++) {
   4947                if (vim_isfilec(cu)) {
   4948                  regmbc(cu);
   4949                }
   4950              }
   4951              break;
   4952            }
   4953          } else {
   4954            // produce a multibyte character, including any
   4955            // following composing characters.
   4956            startc = utf_ptr2char(regparse);
   4957            int len = utfc_ptr2len(regparse);
   4958            if (utf_char2len(startc) != len) {
   4959              // composing chars
   4960              startc = -1;
   4961            }
   4962            while (--len >= 0) {
   4963              regc(*regparse++);
   4964            }
   4965          }
   4966        }
   4967        regc(NUL);
   4968        prevchr_len = 1;                // last char was the ']'
   4969        if (*regparse != ']') {
   4970          EMSG_RET_NULL(_(e_toomsbra));                 // Cannot happen?
   4971        }
   4972        skipchr();                  // let's be friends with the lexer again
   4973        *flagp |= HASWIDTH | SIMPLE;
   4974        break;
   4975      } else if (reg_strict) {
   4976        EMSG2_RET_NULL(_(e_missingbracket), reg_magic > MAGIC_OFF);
   4977      }
   4978    }
   4979    FALLTHROUGH;
   4980 
   4981  default: {
   4982    int len;
   4983 
   4984    // A multi-byte character is handled as a separate atom if it's
   4985    // before a multi and when it's a composing char.
   4986    if (use_multibytecode(c)) {
   4987 do_multibyte:
   4988      ret = regnode(MULTIBYTECODE);
   4989      regmbc(c);
   4990      *flagp |= HASWIDTH | SIMPLE;
   4991      break;
   4992    }
   4993 
   4994    ret = regnode(EXACTLY);
   4995 
   4996    // Append characters as long as:
   4997    // - there is no following multi, we then need the character in
   4998    //   front of it as a single character operand
   4999    // - not running into a Magic character
   5000    // - "one_exactly" is not set
   5001    // But always emit at least one character.  Might be a Multi,
   5002    // e.g., a "[" without matching "]".
   5003    for (len = 0; c != NUL && (len == 0
   5004                               || (re_multi_type(peekchr()) == NOT_MULTI
   5005                                   && !one_exactly
   5006                                   && !is_Magic(c))); len++) {
   5007      c = no_Magic(c);
   5008      {
   5009        regmbc(c);
   5010        {
   5011          int l;
   5012 
   5013          // Need to get composing character too.
   5014          GraphemeState state = GRAPHEME_STATE_INIT;
   5015          while (true) {
   5016            l = utf_ptr2len(regparse);
   5017            if (!utf_composinglike(regparse, regparse + l, &state)) {
   5018              break;
   5019            }
   5020            regmbc(utf_ptr2char(regparse));
   5021            skipchr();
   5022          }
   5023        }
   5024      }
   5025      c = getchr();
   5026    }
   5027    ungetchr();
   5028 
   5029    regc(NUL);
   5030    *flagp |= HASWIDTH;
   5031    if (len == 1) {
   5032      *flagp |= SIMPLE;
   5033    }
   5034  }
   5035  break;
   5036  }
   5037 
   5038  return ret;
   5039 }
   5040 
   5041 // Parse something followed by possible [*+=].
   5042 //
   5043 // Note that the branching code sequences used for = and the general cases
   5044 // of * and + are somewhat optimized:  they use the same NOTHING node as
   5045 // both the endmarker for their branch list and the body of the last branch.
   5046 // It might seem that this node could be dispensed with entirely, but the
   5047 // endmarker role is not redundant.
   5048 static uint8_t *regpiece(int *flagp)
   5049 {
   5050  uint8_t *ret;
   5051  int op;
   5052  uint8_t *next;
   5053  int flags;
   5054  int minval;
   5055  int maxval;
   5056 
   5057  ret = regatom(&flags);
   5058  if (ret == NULL) {
   5059    return NULL;
   5060  }
   5061 
   5062  op = peekchr();
   5063  if (re_multi_type(op) == NOT_MULTI) {
   5064    *flagp = flags;
   5065    return ret;
   5066  }
   5067  // default flags
   5068  *flagp = (WORST | SPSTART | (flags & (HASNL | HASLOOKBH)));
   5069 
   5070  skipchr();
   5071  switch (op) {
   5072  case Magic('*'):
   5073    if (flags & SIMPLE) {
   5074      reginsert(STAR, ret);
   5075    } else {
   5076      // Emit x* as (x&|), where & means "self".
   5077      reginsert(BRANCH, ret);           // Either x
   5078      regoptail(ret, regnode(BACK));            // and loop
   5079      regoptail(ret, ret);              // back
   5080      regtail(ret, regnode(BRANCH));            // or
   5081      regtail(ret, regnode(NOTHING));           // null.
   5082    }
   5083    break;
   5084 
   5085  case Magic('+'):
   5086    if (flags & SIMPLE) {
   5087      reginsert(PLUS, ret);
   5088    } else {
   5089      // Emit x+ as x(&|), where & means "self".
   5090      next = regnode(BRANCH);           // Either
   5091      regtail(ret, next);
   5092      regtail(regnode(BACK), ret);              // loop back
   5093      regtail(next, regnode(BRANCH));           // or
   5094      regtail(ret, regnode(NOTHING));           // null.
   5095    }
   5096    *flagp = (WORST | HASWIDTH | (flags & (HASNL | HASLOOKBH)));
   5097    break;
   5098 
   5099  case Magic('@'): {
   5100    int lop = END;
   5101    int64_t nr = getdecchrs();
   5102 
   5103    switch (no_Magic(getchr())) {
   5104    case '=':
   5105      lop = MATCH; break;                                 // \@=
   5106    case '!':
   5107      lop = NOMATCH; break;                               // \@!
   5108    case '>':
   5109      lop = SUBPAT; break;                                // \@>
   5110    case '<':
   5111      switch (no_Magic(getchr())) {
   5112      case '=':
   5113        lop = BEHIND; break;                               // \@<=
   5114      case '!':
   5115        lop = NOBEHIND; break;                             // \@<!
   5116      }
   5117    }
   5118    if (lop == END) {
   5119      EMSG2_RET_NULL(_(e_invalid_character_after_str_at),
   5120                     reg_magic == MAGIC_ALL);
   5121    }
   5122    // Look behind must match with behind_pos.
   5123    if (lop == BEHIND || lop == NOBEHIND) {
   5124      regtail(ret, regnode(BHPOS));
   5125      *flagp |= HASLOOKBH;
   5126    }
   5127    regtail(ret, regnode(END));             // operand ends
   5128    if (lop == BEHIND || lop == NOBEHIND) {
   5129      if (nr < 0) {
   5130        nr = 0;                 // no limit is same as zero limit
   5131      }
   5132      reginsert_nr(lop, (uint32_t)nr, ret);
   5133    } else {
   5134      reginsert(lop, ret);
   5135    }
   5136    break;
   5137  }
   5138 
   5139  case Magic('?'):
   5140  case Magic('='):
   5141    // Emit x= as (x|)
   5142    reginsert(BRANCH, ret);                     // Either x
   5143    regtail(ret, regnode(BRANCH));              // or
   5144    next = regnode(NOTHING);                    // null.
   5145    regtail(ret, next);
   5146    regoptail(ret, next);
   5147    break;
   5148 
   5149  case Magic('{'):
   5150    if (!read_limits(&minval, &maxval)) {
   5151      return NULL;
   5152    }
   5153    if (flags & SIMPLE) {
   5154      reginsert(BRACE_SIMPLE, ret);
   5155      reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
   5156    } else {
   5157      if (num_complex_braces >= 10) {
   5158        EMSG2_RET_NULL(_("E60: Too many complex %s{...}s"),
   5159                       reg_magic == MAGIC_ALL);
   5160      }
   5161      reginsert(BRACE_COMPLEX + num_complex_braces, ret);
   5162      regoptail(ret, regnode(BACK));
   5163      regoptail(ret, ret);
   5164      reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
   5165      num_complex_braces++;
   5166    }
   5167    if (minval > 0 && maxval > 0) {
   5168      *flagp = (HASWIDTH | (flags & (HASNL | HASLOOKBH)));
   5169    }
   5170    break;
   5171  }
   5172  if (re_multi_type(peekchr()) != NOT_MULTI) {
   5173    // Can't have a multi follow a multi.
   5174    if (peekchr() == Magic('*')) {
   5175      EMSG2_RET_NULL(_("E61: Nested %s*"), reg_magic >= MAGIC_ON);
   5176    }
   5177    EMSG3_RET_NULL(_("E62: Nested %s%c"), reg_magic == MAGIC_ALL, no_Magic(peekchr()));
   5178  }
   5179 
   5180  return ret;
   5181 }
   5182 
   5183 // Parse one alternative of an | or & operator.
   5184 // Implements the concatenation operator.
   5185 static uint8_t *regconcat(int *flagp)
   5186 {
   5187  uint8_t *first = NULL;
   5188  uint8_t *chain = NULL;
   5189  uint8_t *latest;
   5190  int flags;
   5191  int cont = true;
   5192 
   5193  *flagp = WORST;               // Tentatively.
   5194 
   5195  while (cont) {
   5196    switch (peekchr()) {
   5197    case NUL:
   5198    case Magic('|'):
   5199    case Magic('&'):
   5200    case Magic(')'):
   5201      cont = false;
   5202      break;
   5203    case Magic('Z'):
   5204      regflags |= RF_ICOMBINE;
   5205      skipchr_keepstart();
   5206      break;
   5207    case Magic('c'):
   5208      regflags |= RF_ICASE;
   5209      skipchr_keepstart();
   5210      break;
   5211    case Magic('C'):
   5212      regflags |= RF_NOICASE;
   5213      skipchr_keepstart();
   5214      break;
   5215    case Magic('v'):
   5216      reg_magic = MAGIC_ALL;
   5217      skipchr_keepstart();
   5218      curchr = -1;
   5219      break;
   5220    case Magic('m'):
   5221      reg_magic = MAGIC_ON;
   5222      skipchr_keepstart();
   5223      curchr = -1;
   5224      break;
   5225    case Magic('M'):
   5226      reg_magic = MAGIC_OFF;
   5227      skipchr_keepstart();
   5228      curchr = -1;
   5229      break;
   5230    case Magic('V'):
   5231      reg_magic = MAGIC_NONE;
   5232      skipchr_keepstart();
   5233      curchr = -1;
   5234      break;
   5235    default:
   5236      latest = regpiece(&flags);
   5237      if (latest == NULL || reg_toolong) {
   5238        return NULL;
   5239      }
   5240      *flagp |= flags & (HASWIDTH | HASNL | HASLOOKBH);
   5241      if (chain == NULL) {                      // First piece.
   5242        *flagp |= flags & SPSTART;
   5243      } else {
   5244        regtail(chain, latest);
   5245      }
   5246      chain = latest;
   5247      if (first == NULL) {
   5248        first = latest;
   5249      }
   5250      break;
   5251    }
   5252  }
   5253  if (first == NULL) {          // Loop ran zero times.
   5254    first = regnode(NOTHING);
   5255  }
   5256  return first;
   5257 }
   5258 
   5259 // Parse one alternative of an | operator.
   5260 // Implements the & operator.
   5261 static uint8_t *regbranch(int *flagp)
   5262 {
   5263  uint8_t *ret;
   5264  uint8_t *chain = NULL;
   5265  uint8_t *latest;
   5266  int flags;
   5267 
   5268  *flagp = WORST | HASNL;               // Tentatively.
   5269 
   5270  ret = regnode(BRANCH);
   5271  while (true) {
   5272    latest = regconcat(&flags);
   5273    if (latest == NULL) {
   5274      return NULL;
   5275    }
   5276    // If one of the branches has width, the whole thing has.  If one of
   5277    // the branches anchors at start-of-line, the whole thing does.
   5278    // If one of the branches uses look-behind, the whole thing does.
   5279    *flagp |= flags & (HASWIDTH | SPSTART | HASLOOKBH);
   5280    // If one of the branches doesn't match a line-break, the whole thing
   5281    // doesn't.
   5282    *flagp &= ~HASNL | (flags & HASNL);
   5283    if (chain != NULL) {
   5284      regtail(chain, latest);
   5285    }
   5286    if (peekchr() != Magic('&')) {
   5287      break;
   5288    }
   5289    skipchr();
   5290    regtail(latest, regnode(END));     // operand ends
   5291    if (reg_toolong) {
   5292      break;
   5293    }
   5294    reginsert(MATCH, latest);
   5295    chain = latest;
   5296  }
   5297 
   5298  return ret;
   5299 }
   5300 
   5301 /// Parse regular expression, i.e. main body or parenthesized thing.
   5302 ///
   5303 /// Caller must absorb opening parenthesis.
   5304 ///
   5305 /// Combining parenthesis handling with the base level of regular expression
   5306 /// is a trifle forced, but the need to tie the tails of the branches to what
   5307 /// follows makes it hard to avoid.
   5308 ///
   5309 /// @param paren  REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN
   5310 static uint8_t *reg(int paren, int *flagp)
   5311 {
   5312  uint8_t *ret;
   5313  uint8_t *br;
   5314  uint8_t *ender;
   5315  int parno = 0;
   5316  int flags;
   5317 
   5318  *flagp = HASWIDTH;            // Tentatively.
   5319 
   5320  if (paren == REG_ZPAREN) {
   5321    // Make a ZOPEN node.
   5322    if (regnzpar >= NSUBEXP) {
   5323      EMSG_RET_NULL(_("E50: Too many \\z("));
   5324    }
   5325    parno = regnzpar;
   5326    regnzpar++;
   5327    ret = regnode(ZOPEN + parno);
   5328  } else if (paren == REG_PAREN) {
   5329    // Make a MOPEN node.
   5330    if (regnpar >= NSUBEXP) {
   5331      EMSG2_RET_NULL(_("E51: Too many %s("), reg_magic == MAGIC_ALL);
   5332    }
   5333    parno = regnpar;
   5334    regnpar++;
   5335    ret = regnode(MOPEN + parno);
   5336  } else if (paren == REG_NPAREN) {
   5337    // Make a NOPEN node.
   5338    ret = regnode(NOPEN);
   5339  } else {
   5340    ret = NULL;
   5341  }
   5342 
   5343  // Pick up the branches, linking them together.
   5344  br = regbranch(&flags);
   5345  if (br == NULL) {
   5346    return NULL;
   5347  }
   5348  if (ret != NULL) {
   5349    regtail(ret, br);           // [MZ]OPEN -> first.
   5350  } else {
   5351    ret = br;
   5352  }
   5353  // If one of the branches can be zero-width, the whole thing can.
   5354  // If one of the branches has * at start or matches a line-break, the
   5355  // whole thing can.
   5356  if (!(flags & HASWIDTH)) {
   5357    *flagp &= ~HASWIDTH;
   5358  }
   5359  *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
   5360  while (peekchr() == Magic('|')) {
   5361    skipchr();
   5362    br = regbranch(&flags);
   5363    if (br == NULL || reg_toolong) {
   5364      return NULL;
   5365    }
   5366    regtail(ret, br);           // BRANCH -> BRANCH.
   5367    if (!(flags & HASWIDTH)) {
   5368      *flagp &= ~HASWIDTH;
   5369    }
   5370    *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
   5371  }
   5372 
   5373  // Make a closing node, and hook it on the end.
   5374  ender = regnode(paren == REG_ZPAREN ? ZCLOSE + parno
   5375                                      : paren == REG_PAREN ? MCLOSE + parno
   5376                                                           : paren == REG_NPAREN ? NCLOSE : END);
   5377  regtail(ret, ender);
   5378 
   5379  // Hook the tails of the branches to the closing node.
   5380  for (br = ret; br != NULL; br = regnext(br)) {
   5381    regoptail(br, ender);
   5382  }
   5383 
   5384  // Check for proper termination.
   5385  if (paren != REG_NOPAREN && getchr() != Magic(')')) {
   5386    if (paren == REG_ZPAREN) {
   5387      EMSG_RET_NULL(_("E52: Unmatched \\z("));
   5388    } else if (paren == REG_NPAREN) {
   5389      EMSG2_RET_NULL(_(e_unmatchedpp), reg_magic == MAGIC_ALL);
   5390    } else {
   5391      EMSG2_RET_NULL(_(e_unmatchedp), reg_magic == MAGIC_ALL);
   5392    }
   5393  } else if (paren == REG_NOPAREN && peekchr() != NUL) {
   5394    if (curchr == Magic(')')) {
   5395      EMSG2_RET_NULL(_(e_unmatchedpar), reg_magic == MAGIC_ALL);
   5396    } else {
   5397      EMSG_RET_NULL(_(e_trailing));             // "Can't happen".
   5398    }
   5399    // NOTREACHED
   5400  }
   5401  // Here we set the flag allowing back references to this set of
   5402  // parentheses.
   5403  if (paren == REG_PAREN) {
   5404    had_endbrace[parno] = true;  // have seen the close paren
   5405  }
   5406  return ret;
   5407 }
   5408 
   5409 // bt_regcomp() - compile a regular expression into internal code for the
   5410 // traditional back track matcher.
   5411 // Returns the program in allocated space.  Returns NULL for an error.
   5412 //
   5413 // We can't allocate space until we know how big the compiled form will be,
   5414 // but we can't compile it (and thus know how big it is) until we've got a
   5415 // place to put the code.  So we cheat:  we compile it twice, once with code
   5416 // generation turned off and size counting turned on, and once "for real".
   5417 // This also means that we don't allocate space until we are sure that the
   5418 // thing really will compile successfully, and we never have to move the
   5419 // code and thus invalidate pointers into it.  (Note that it has to be in
   5420 // one piece because free() must be able to free it all.)
   5421 //
   5422 // Whether upper/lower case is to be ignored is decided when executing the
   5423 // program, it does not matter here.
   5424 //
   5425 // Beware that the optimization-preparation code in here knows about some
   5426 // of the structure of the compiled regexp.
   5427 // "re_flags": RE_MAGIC and/or RE_STRING.
   5428 static regprog_T *bt_regcomp(uint8_t *expr, int re_flags)
   5429 {
   5430  uint8_t *scan;
   5431  uint8_t *longest;
   5432  int len;
   5433  int flags;
   5434 
   5435  if (expr == NULL) {
   5436    IEMSG_RET_NULL(_(e_null));
   5437  }
   5438 
   5439  init_class_tab();
   5440 
   5441  // First pass: determine size, legality.
   5442  regcomp_start(expr, re_flags);
   5443  regcode = JUST_CALC_SIZE;
   5444  regc(REGMAGIC);
   5445  if (reg(REG_NOPAREN, &flags) == NULL) {
   5446    return NULL;
   5447  }
   5448 
   5449  // Allocate space.
   5450  bt_regprog_T *r = xmalloc(offsetof(bt_regprog_T, program) + (size_t)regsize);
   5451  r->re_in_use = false;
   5452 
   5453  // Second pass: emit code.
   5454  regcomp_start(expr, re_flags);
   5455  regcode = r->program;
   5456  regc(REGMAGIC);
   5457  if (reg(REG_NOPAREN, &flags) == NULL || reg_toolong) {
   5458    xfree(r);
   5459    if (reg_toolong) {
   5460      EMSG_RET_NULL(_("E339: Pattern too long"));
   5461    }
   5462    return NULL;
   5463  }
   5464 
   5465  // Dig out information for optimizations.
   5466  r->regstart = NUL;            // Worst-case defaults.
   5467  r->reganch = 0;
   5468  r->regmust = NULL;
   5469  r->regmlen = 0;
   5470  r->regflags = regflags;
   5471  if (flags & HASNL) {
   5472    r->regflags |= RF_HASNL;
   5473  }
   5474  if (flags & HASLOOKBH) {
   5475    r->regflags |= RF_LOOKBH;
   5476  }
   5477  // Remember whether this pattern has any \z specials in it.
   5478  r->reghasz = (uint8_t)re_has_z;
   5479  scan = &r->program[1];  // First BRANCH.
   5480  if (OP(regnext(scan)) == END) {   // Only one top-level choice.
   5481    scan = OPERAND(scan);
   5482 
   5483    // Starting-point info.
   5484    if (OP(scan) == BOL || OP(scan) == RE_BOF) {
   5485      r->reganch++;
   5486      scan = regnext(scan);
   5487    }
   5488 
   5489    if (OP(scan) == EXACTLY) {
   5490      r->regstart = utf_ptr2char((char *)OPERAND(scan));
   5491    } else if (OP(scan) == BOW
   5492               || OP(scan) == EOW
   5493               || OP(scan) == NOTHING
   5494               || OP(scan) == MOPEN + 0 || OP(scan) == NOPEN
   5495               || OP(scan) == MCLOSE + 0 || OP(scan) == NCLOSE) {
   5496      uint8_t *regnext_scan = regnext(scan);
   5497      if (OP(regnext_scan) == EXACTLY) {
   5498        r->regstart = utf_ptr2char((char *)OPERAND(regnext_scan));
   5499      }
   5500    }
   5501 
   5502    // If there's something expensive in the r.e., find the longest
   5503    // literal string that must appear and make it the regmust.  Resolve
   5504    // ties in favor of later strings, since the regstart check works
   5505    // with the beginning of the r.e. and avoiding duplication
   5506    // strengthens checking.  Not a strong reason, but sufficient in the
   5507    // absence of others.
   5508 
   5509    // When the r.e. starts with BOW, it is faster to look for a regmust
   5510    // first. Used a lot for "#" and "*" commands. (Added by mool).
   5511    if ((flags & SPSTART || OP(scan) == BOW || OP(scan) == EOW)
   5512        && !(flags & HASNL)) {
   5513      longest = NULL;
   5514      len = 0;
   5515      for (; scan != NULL; scan = regnext(scan)) {
   5516        if (OP(scan) == EXACTLY) {
   5517          size_t scanlen = strlen((char *)OPERAND(scan));
   5518          if (scanlen >= (size_t)len) {
   5519            longest = OPERAND(scan);
   5520            len = (int)scanlen;
   5521          }
   5522        }
   5523      }
   5524      r->regmust = longest;
   5525      r->regmlen = len;
   5526    }
   5527  }
   5528 #ifdef BT_REGEXP_DUMP
   5529  regdump(expr, r);
   5530 #endif
   5531  r->engine = &bt_regengine;
   5532  return (regprog_T *)r;
   5533 }
   5534 
   5535 // Check if during the previous call to vim_regcomp the EOL item "$" has been
   5536 // found.  This is messy, but it works fine.
   5537 int vim_regcomp_had_eol(void)
   5538 {
   5539  return had_eol;
   5540 }
   5541 
   5542 // Get a number after a backslash that is inside [].
   5543 // When nothing is recognized return a backslash.
   5544 static int coll_get_char(void)
   5545 {
   5546  int64_t nr = -1;
   5547 
   5548  switch (*regparse++) {
   5549  case 'd':
   5550    nr = getdecchrs(); break;
   5551  case 'o':
   5552    nr = getoctchrs(); break;
   5553  case 'x':
   5554    nr = gethexchrs(2); break;
   5555  case 'u':
   5556    nr = gethexchrs(4); break;
   5557  case 'U':
   5558    nr = gethexchrs(8); break;
   5559  }
   5560  if (nr < 0) {
   5561    // If getting the number fails be backwards compatible: the character
   5562    // is a backslash.
   5563    regparse--;
   5564    nr = '\\';
   5565  }
   5566  if (nr > INT_MAX) {
   5567    nr = INT_MAX;
   5568  }
   5569  return (int)nr;
   5570 }
   5571 
   5572 // Free a compiled regexp program, returned by bt_regcomp().
   5573 static void bt_regfree(regprog_T *prog)
   5574 {
   5575  xfree(prog);
   5576 }
   5577 
   5578 #define ADVANCE_REGINPUT() MB_PTR_ADV(rex.input)
   5579 
   5580 // The arguments from BRACE_LIMITS are stored here.  They are actually local
   5581 // to regmatch(), but they are here to reduce the amount of stack space used
   5582 // (it can be called recursively many times).
   5583 static int64_t bl_minval;
   5584 static int64_t bl_maxval;
   5585 
   5586 // Save the input line and position in a regsave_T.
   5587 static void reg_save(regsave_T *save, garray_T *gap)
   5588  FUNC_ATTR_NONNULL_ALL
   5589 {
   5590  if (REG_MULTI) {
   5591    save->rs_u.pos.col = (colnr_T)(rex.input - rex.line);
   5592    save->rs_u.pos.lnum = rex.lnum;
   5593  } else {
   5594    save->rs_u.ptr = rex.input;
   5595  }
   5596  save->rs_len = gap->ga_len;
   5597 }
   5598 
   5599 // Restore the input line and position from a regsave_T.
   5600 static void reg_restore(regsave_T *save, garray_T *gap)
   5601  FUNC_ATTR_NONNULL_ALL
   5602 {
   5603  if (REG_MULTI) {
   5604    if (rex.lnum != save->rs_u.pos.lnum) {
   5605      // only call reg_getline() when the line number changed to save
   5606      // a bit of time
   5607      rex.lnum = save->rs_u.pos.lnum;
   5608      rex.line = (uint8_t *)reg_getline(rex.lnum);
   5609    }
   5610    rex.input = rex.line + save->rs_u.pos.col;
   5611  } else {
   5612    rex.input = save->rs_u.ptr;
   5613  }
   5614  gap->ga_len = save->rs_len;
   5615 }
   5616 
   5617 // Return true if current position is equal to saved position.
   5618 static bool reg_save_equal(const regsave_T *save)
   5619  FUNC_ATTR_NONNULL_ALL
   5620 {
   5621  if (REG_MULTI) {
   5622    return rex.lnum == save->rs_u.pos.lnum
   5623           && rex.input == rex.line + save->rs_u.pos.col;
   5624  }
   5625  return rex.input == save->rs_u.ptr;
   5626 }
   5627 
   5628 // Save the sub-expressions before attempting a match.
   5629 #define save_se(savep, posp, pp) \
   5630  REG_MULTI ? save_se_multi((savep), (posp)) : save_se_one((savep), (pp))
   5631 
   5632 // After a failed match restore the sub-expressions.
   5633 #define restore_se(savep, posp, pp) { \
   5634  if (REG_MULTI) \
   5635  *(posp) = (savep)->se_u.pos; \
   5636  else \
   5637  *(pp) = (savep)->se_u.ptr; }
   5638 
   5639 // Tentatively set the sub-expression start to the current position (after
   5640 // calling regmatch() they will have changed).  Need to save the existing
   5641 // values for when there is no match.
   5642 // Use se_save() to use pointer (save_se_multi()) or position (save_se_one()),
   5643 // depending on REG_MULTI.
   5644 static void save_se_multi(save_se_T *savep, lpos_T *posp)
   5645 {
   5646  savep->se_u.pos = *posp;
   5647  posp->lnum = rex.lnum;
   5648  posp->col = (colnr_T)(rex.input - rex.line);
   5649 }
   5650 
   5651 static void save_se_one(save_se_T *savep, uint8_t **pp)
   5652 {
   5653  savep->se_u.ptr = *pp;
   5654  *pp = rex.input;
   5655 }
   5656 
   5657 /// regrepeat - repeatedly match something simple, return how many.
   5658 /// Advances rex.input (and rex.lnum) to just after the matched chars.
   5659 ///
        /// The matching works on a local copy of rex.input ("scan"), which is
        /// written back to rex.input just before returning.
        ///
        /// @param p         node of the item to match: OP(p) selects the kind of
        ///                  item, OPERAND(p) is its operand (used by EXACTLY,
        ///                  MULTIBYTECODE, ANYOF and ANYBUT)
   5660 /// @param maxcount  maximum number of matches allowed
        ///
        /// @return  the number of matches found, never more than "maxcount"
   5661 static int regrepeat(uint8_t *p, int64_t maxcount)
   5662 {
   5663  int64_t count = 0;
   5664  uint8_t *opnd;
   5665  int mask;  // class_tab bit that is tested in the "do_class" loop
   5666  int testval = 0;  // stays 0 for the negated and the digit-excluding variants
   5667 
   5668  uint8_t *scan = rex.input;  // Make local copy of rex.input for speed.
   5669  opnd = OPERAND(p);
   5670  switch (OP(p)) {
   5671  case ANY:
   5672  case ANY + ADD_NL:
   5673    while (count < maxcount) {
   5674      // Matching anything means we continue until end-of-line (or
   5675      // end-of-file for ANY + ADD_NL), only limited by maxcount.
   5676      while (*scan != NUL && count < maxcount) {
   5677        count++;
   5678        MB_PTR_ADV(scan);
   5679      }
   5680      if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
   5681          || rex.reg_line_lbr || count == maxcount) {
   5682        break;
   5683      }
   5684      count++;  // count the line-break
   5685      reg_nextline();
   5686      scan = rex.input;
   5687      if (got_int) {
   5688        break;
   5689      }
   5690    }
   5691    break;
   5692 
   5693  case IDENT:
   5694  case IDENT + ADD_NL:
   5695    testval = 1;  // IDENT also accepts digits, SIDENT (testval 0) does not
   5696    FALLTHROUGH;
   5697  case SIDENT:
   5698  case SIDENT + ADD_NL:
   5699    while (count < maxcount) {
   5700      if (vim_isIDc(utf_ptr2char((char *)scan)) && (testval || !ascii_isdigit(*scan))) {
   5701        MB_PTR_ADV(scan);
   5702      } else if (*scan == NUL) {
               // At end of line: only the ADD_NL variant may go on with the next
               // line, and only when matching in multi-line mode.
   5703        if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
   5704            || rex.reg_line_lbr) {
   5705          break;
   5706        }
   5707        reg_nextline();
   5708        scan = rex.input;
   5709        if (got_int) {
   5710          break;
   5711        }
   5712      } else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p))) {
   5713        scan++;
   5714      } else {
   5715        break;
   5716      }
   5717      count++;
   5718    }
   5719    break;
   5720 
   5721  case KWORD:
   5722  case KWORD + ADD_NL:
   5723    testval = 1;  // KWORD also accepts digits, SKWORD (testval 0) does not
   5724    FALLTHROUGH;
   5725  case SKWORD:
   5726  case SKWORD + ADD_NL:
   5727    while (count < maxcount) {
   5728      if (vim_iswordp_buf((char *)scan, rex.reg_buf)
   5729          && (testval || !ascii_isdigit(*scan))) {
   5730        MB_PTR_ADV(scan);
   5731      } else if (*scan == NUL) {
   5732        if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
   5733            || rex.reg_line_lbr) {
   5734          break;
   5735        }
   5736        reg_nextline();
   5737        scan = rex.input;
   5738        if (got_int) {
   5739          break;
   5740        }
   5741      } else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p))) {
   5742        scan++;
   5743      } else {
   5744        break;
   5745      }
   5746      count++;
   5747    }
   5748    break;
   5749 
   5750  case FNAME:
   5751  case FNAME + ADD_NL:
   5752    testval = 1;  // FNAME also accepts digits, SFNAME (testval 0) does not
   5753    FALLTHROUGH;
   5754  case SFNAME:
   5755  case SFNAME + ADD_NL:
   5756    while (count < maxcount) {
   5757      if (vim_isfilec(utf_ptr2char((char *)scan)) && (testval || !ascii_isdigit(*scan))) {
   5758        MB_PTR_ADV(scan);
   5759      } else if (*scan == NUL) {
   5760        if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
   5761            || rex.reg_line_lbr) {
   5762          break;
   5763        }
   5764        reg_nextline();
   5765        scan = rex.input;
   5766        if (got_int) {
   5767          break;
   5768        }
   5769      } else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p))) {
   5770        scan++;
   5771      } else {
   5772        break;
   5773      }
   5774      count++;
   5775    }
   5776    break;
   5777 
   5778  case PRINT:
   5779  case PRINT + ADD_NL:
   5780    testval = 1;  // PRINT also accepts digits, SPRINT (testval 0) does not
   5781    FALLTHROUGH;
   5782  case SPRINT:
   5783  case SPRINT + ADD_NL:
   5784    while (count < maxcount) {
   5785      if (*scan == NUL) {
   5786        if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
   5787            || rex.reg_line_lbr) {
   5788          break;
   5789        }
   5790        reg_nextline();
   5791        scan = rex.input;
   5792        if (got_int) {
   5793          break;
   5794        }
   5795      } else if (vim_isprintc(utf_ptr2char((char *)scan)) == 1
   5796                 && (testval || !ascii_isdigit(*scan))) {
   5797        MB_PTR_ADV(scan);
   5798      } else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p))) {
   5799        scan++;
   5800      } else {
   5801        break;
   5802      }
   5803      count++;
   5804    }
   5805    break;
   5806 
   5807  case WHITE:
   5808  case WHITE + ADD_NL:
   5809    testval = mask = RI_WHITE;
           // Shared loop for all classes that are looked up in class_tab[].  The
           // cases below set "mask" and jump here.  A single-byte character
           // matches when (class_tab[c] & mask) == testval: the positive classes
           // set "testval" to the mask, the negated ones leave it at zero.
   5810 do_class:
   5811    while (count < maxcount) {
   5812      int l;
   5813      if (*scan == NUL) {
   5814        if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
   5815            || rex.reg_line_lbr) {
   5816          break;
   5817        }
   5818        reg_nextline();
   5819        scan = rex.input;
   5820        if (got_int) {
   5821          break;
   5822        }
   5823      } else if ((l = utfc_ptr2len((char *)scan)) > 1) {
               // A multi-byte character never matches a positive class and
               // always matches a negated one.
   5824        if (testval != 0) {
   5825          break;
   5826        }
   5827        scan += l;
   5828      } else if ((class_tab[*scan] & mask) == testval) {
   5829        scan++;
   5830      } else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p))) {
   5831        scan++;
   5832      } else {
   5833        break;
   5834      }
   5835      count++;
   5836    }
   5837    break;
   5838 
   5839  case NWHITE:
   5840  case NWHITE + ADD_NL:
   5841    mask = RI_WHITE;
   5842    goto do_class;
   5843  case DIGIT:
   5844  case DIGIT + ADD_NL:
   5845    testval = mask = RI_DIGIT;
   5846    goto do_class;
   5847  case NDIGIT:
   5848  case NDIGIT + ADD_NL:
   5849    mask = RI_DIGIT;
   5850    goto do_class;
   5851  case HEX:
   5852  case HEX + ADD_NL:
   5853    testval = mask = RI_HEX;
   5854    goto do_class;
   5855  case NHEX:
   5856  case NHEX + ADD_NL:
   5857    mask = RI_HEX;
   5858    goto do_class;
   5859  case OCTAL:
   5860  case OCTAL + ADD_NL:
   5861    testval = mask = RI_OCTAL;
   5862    goto do_class;
   5863  case NOCTAL:
   5864  case NOCTAL + ADD_NL:
   5865    mask = RI_OCTAL;
   5866    goto do_class;
   5867  case WORD:
   5868  case WORD + ADD_NL:
   5869    testval = mask = RI_WORD;
   5870    goto do_class;
   5871  case NWORD:
   5872  case NWORD + ADD_NL:
   5873    mask = RI_WORD;
   5874    goto do_class;
   5875  case HEAD:
   5876  case HEAD + ADD_NL:
   5877    testval = mask = RI_HEAD;
   5878    goto do_class;
   5879  case NHEAD:
   5880  case NHEAD + ADD_NL:
   5881    mask = RI_HEAD;
   5882    goto do_class;
   5883  case ALPHA:
   5884  case ALPHA + ADD_NL:
   5885    testval = mask = RI_ALPHA;
   5886    goto do_class;
   5887  case NALPHA:
   5888  case NALPHA + ADD_NL:
   5889    mask = RI_ALPHA;
   5890    goto do_class;
   5891  case LOWER:
   5892  case LOWER + ADD_NL:
   5893    testval = mask = RI_LOWER;
   5894    goto do_class;
   5895  case NLOWER:
   5896  case NLOWER + ADD_NL:
   5897    mask = RI_LOWER;
   5898    goto do_class;
   5899  case UPPER:
   5900  case UPPER + ADD_NL:
   5901    testval = mask = RI_UPPER;
   5902    goto do_class;
   5903  case NUPPER:
   5904  case NUPPER + ADD_NL:
   5905    mask = RI_UPPER;
   5906    goto do_class;
   5907 
   5908  case EXACTLY: {
   5909    int cu, cl;
   5910 
   5911    // This doesn't do a multi-byte character, because a MULTIBYTECODE
   5912    // would have been used for it.  It does handle single-byte
   5913    // characters, such as latin1.
           // Only the first byte of the operand is compared.
   5914    if (rex.reg_ic) {
   5915      cu = mb_toupper(*opnd);
   5916      cl = mb_tolower(*opnd);
   5917      while (count < maxcount && (*scan == cu || *scan == cl)) {
   5918        count++;
   5919        scan++;
   5920      }
   5921    } else {
   5922      cu = *opnd;
   5923      while (count < maxcount && *scan == cu) {
   5924        count++;
   5925        scan++;
   5926      }
   5927    }
   5928    break;
   5929  }
   5930 
   5931  case MULTIBYTECODE: {
   5932    int i, len, cf = 0;
   5933 
   5934    // Safety check (just in case 'encoding' was changed since
   5935    // compiling the program).
   5936    if ((len = utfc_ptr2len((char *)opnd)) > 1) {
   5937      if (rex.reg_ic) {
   5938        cf = utf_fold(utf_ptr2char((char *)opnd));
   5939      }
   5940      while (count < maxcount && utfc_ptr2len((char *)scan) >= len) {
   5941        for (i = 0; i < len; i++) {
   5942          if (opnd[i] != scan[i]) {
   5943            break;
   5944          }
   5945        }
               // When the bytes differ it is only a match when ignoring case and
               // the folded characters are equal.
   5946        if (i < len && (!rex.reg_ic
   5947                        || utf_fold(utf_ptr2char((char *)scan)) != cf)) {
   5948          break;
   5949        }
   5950        scan += len;
   5951        count++;
   5952      }
   5953    }
   5954  }
   5955  break;
   5956 
   5957  case ANYOF:
   5958  case ANYOF + ADD_NL:
   5959    testval = 1;  // ANYOF: stop when cstrchr() returns NULL; ANYBUT: when it does not
   5960    FALLTHROUGH;
   5961 
   5962  case ANYBUT:
   5963  case ANYBUT + ADD_NL:
   5964    while (count < maxcount) {
   5965      int len;
   5966      if (*scan == NUL) {
   5967        if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
   5968            || rex.reg_line_lbr) {
   5969          break;
   5970        }
   5971        reg_nextline();
   5972        scan = rex.input;
   5973        if (got_int) {
   5974          break;
   5975        }
   5976      } else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p))) {
   5977        scan++;
   5978      } else if ((len = utfc_ptr2len((char *)scan)) > 1) {
   5979        if ((cstrchr((char *)opnd, utf_ptr2char((char *)scan)) == NULL) == testval) {
   5980          break;
   5981        }
   5982        scan += len;
   5983      } else {
   5984        if ((cstrchr((char *)opnd, *scan) == NULL) == testval) {
   5985          break;
   5986        }
   5987        scan++;
   5988      }
   5989      count++;
   5990    }
   5991    break;
   5992 
   5993  case NEWL:
           // A line break is either the end of a line that is followed by
           // another line (REG_MULTI without rex.reg_line_lbr) or a '\n' byte
           // when rex.reg_line_lbr is set.
   5994    while (count < maxcount
   5995           && ((*scan == NUL && rex.lnum <= rex.reg_maxline && !rex.reg_line_lbr
   5996                && REG_MULTI) || (*scan == '\n' && rex.reg_line_lbr))) {
   5997      count++;
   5998      if (rex.reg_line_lbr) {
   5999        ADVANCE_REGINPUT();
   6000      } else {
   6001        reg_nextline();
   6002      }
   6003      scan = rex.input;
   6004      if (got_int) {
   6005        break;
   6006      }
   6007    }
   6008    break;
   6009 
   6010  default:  // Oh dear.  Called inappropriately.
   6011    iemsg(_(e_re_corr));
   6012 #ifdef REGEXP_DEBUG
   6013    printf("Called regrepeat with op code %d\n", OP(p));
   6014 #endif
   6015    break;
   6016  }
   6017 
   6018  rex.input = scan;  // write the local position back
   6019 
   6020  return (int)count;
   6021 }
   6022 
   6023 // Push an item onto the regstack.
   6024 // Returns pointer to new item.  Returns NULL when out of memory.
   6025 static regitem_T *regstack_push(regstate_T state, uint8_t *scan)
   6026 {
   6027  regitem_T *rp;
   6028 
   6029  if ((int64_t)((unsigned)regstack.ga_len >> 10) >= p_mmp) {
   6030    emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
   6031    return NULL;
   6032  }
   6033  ga_grow(&regstack, sizeof(regitem_T));
   6034 
   6035  rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len);
   6036  rp->rs_state = state;
   6037  rp->rs_scan = scan;
   6038 
   6039  regstack.ga_len += (int)sizeof(regitem_T);
   6040  return rp;
   6041 }
   6042 
   6043 // Pop an item from the regstack.
   6044 static void regstack_pop(uint8_t **scan)
   6045 {
   6046  regitem_T *rp;
   6047 
   6048  rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len) - 1;
   6049  *scan = rp->rs_scan;
   6050 
   6051  regstack.ga_len -= (int)sizeof(regitem_T);
   6052 }
   6053 
   6054 // Save the current subexpr to "bp", so that they can be restored
   6055 // later by restore_subexpr().
   6056 static void save_subexpr(regbehind_T *bp)
   6057  FUNC_ATTR_NONNULL_ALL
   6058 {
   6059  // When "rex.need_clear_subexpr" is set we don't need to save the values, only
   6060  // remember that this flag needs to be set again when restoring.
   6061  bp->save_need_clear_subexpr = rex.need_clear_subexpr;
   6062  if (rex.need_clear_subexpr) {
   6063    return;
   6064  }
   6065 
   6066  for (int i = 0; i < NSUBEXP; i++) {
   6067    if (REG_MULTI) {
   6068      bp->save_start[i].se_u.pos = rex.reg_startpos[i];
   6069      bp->save_end[i].se_u.pos = rex.reg_endpos[i];
   6070    } else {
   6071      bp->save_start[i].se_u.ptr = rex.reg_startp[i];
   6072      bp->save_end[i].se_u.ptr = rex.reg_endp[i];
   6073    }
   6074  }
   6075 }
   6076 
   6077 // Restore the subexpr from "bp".
   6078 static void restore_subexpr(regbehind_T *bp)
   6079  FUNC_ATTR_NONNULL_ALL
   6080 {
   6081  // Only need to restore saved values when they are not to be cleared.
   6082  rex.need_clear_subexpr = bp->save_need_clear_subexpr;
   6083  if (rex.need_clear_subexpr) {
   6084    return;
   6085  }
   6086 
   6087  for (int i = 0; i < NSUBEXP; i++) {
   6088    if (REG_MULTI) {
   6089      rex.reg_startpos[i] = bp->save_start[i].se_u.pos;
   6090      rex.reg_endpos[i] = bp->save_end[i].se_u.pos;
   6091    } else {
   6092      rex.reg_startp[i] = bp->save_start[i].se_u.ptr;
   6093      rex.reg_endp[i] = bp->save_end[i].se_u.ptr;
   6094    }
   6095  }
   6096 }
   6097 /// Main matching routine
   6098 ///
   6099 /// Conceptually the strategy is simple: Check to see whether the current node
   6100 /// matches, push an item onto the regstack and loop to see whether the rest
   6101 /// matches, and then act accordingly.  In practice we make some effort to
   6102 /// avoid using the regstack, in particular by going through "ordinary" nodes
   6103 /// (that don't need to know whether the rest of the match failed) by a nested
   6104 /// loop.
   6105 ///
   6106 /// @param scan       Current node.
   6107 /// @param tm         timeout limit or NULL
   6108 /// @param timed_out  flag set on timeout or NULL
   6109 ///
   6110 /// @return - true when there is a match.  Leaves rex.input and rex.lnum
   6111 ///         just after the last matched character.
   6112 ///         - false when there is no match.  Leaves rex.input and rex.lnum in an
   6113 ///         undefined state!
   6114 static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out)
   6115 {
   6116  uint8_t *next;          // Next node.
   6117  int op;
   6118  int c;
   6119  regitem_T *rp;
   6120  int no;
   6121  int status;                   // one of the RA_ values:
   6122  int tm_count = 0;
   6123 
   6124  // Make "regstack" and "backpos" empty.  They are allocated and freed in
   6125  // bt_regexec_both() to reduce malloc()/free() calls.
   6126  regstack.ga_len = 0;
   6127  backpos.ga_len = 0;
   6128 
   6129  // Repeat until "regstack" is empty.
   6130  while (true) {
   6131    // Some patterns may take a long time to match, e.g., "\([a-z]\+\)\+Q".
   6132    // Allow interrupting them with CTRL-C.
   6133    reg_breakcheck();
   6134 
   6135 #ifdef REGEXP_DEBUG
   6136    if (scan != NULL && regnarrate) {
   6137      fprintf(stderr, "%s", (char *)regprop(scan));
   6138      fprintf(stderr, "%s", "(\n");
   6139    }
   6140 #endif
   6141 
   6142    // Repeat for items that can be matched sequentially, without using the
   6143    // regstack.
   6144    while (true) {
   6145      if (got_int || scan == NULL) {
   6146        status = RA_FAIL;
   6147        break;
   6148      }
   6149      // Check for timeout once in a 100 times to avoid overhead.
   6150      if (tm != NULL && ++tm_count == 100) {
   6151        tm_count = 0;
   6152        if (profile_passed_limit(*tm)) {
   6153          if (timed_out != NULL) {
   6154            *timed_out = true;
   6155          }
   6156          status = RA_FAIL;
   6157          break;
   6158        }
   6159      }
   6160      status = RA_CONT;
   6161 
   6162 #ifdef REGEXP_DEBUG
   6163      if (regnarrate) {
   6164        fprintf(stderr, "%s", (char *)regprop(scan));
   6165        fprintf(stderr, "%s", "...\n");
   6166        if (re_extmatch_in != NULL) {
   6167          int i;
   6168 
   6169          fprintf(stderr, _("External submatches:\n"));
   6170          for (i = 0; i < NSUBEXP; i++) {
   6171            fprintf(stderr, "%s", "    \"");
   6172            if (re_extmatch_in->matches[i] != NULL) {
   6173              fprintf(stderr, "%s", (char *)re_extmatch_in->matches[i]);
   6174            }
   6175            fprintf(stderr, "%s", "\"\n");
   6176          }
   6177        }
   6178      }
   6179 #endif
   6180      next = regnext(scan);
   6181 
   6182      op = OP(scan);
   6183      // Check for character class with NL added.
   6184      if (!rex.reg_line_lbr && WITH_NL(op) && REG_MULTI
   6185          && *rex.input == NUL && rex.lnum <= rex.reg_maxline) {
   6186        reg_nextline();
   6187      } else if (rex.reg_line_lbr && WITH_NL(op) && *rex.input == '\n') {
   6188        ADVANCE_REGINPUT();
   6189      } else {
   6190        if (WITH_NL(op)) {
   6191          op -= ADD_NL;
   6192        }
   6193        c = utf_ptr2char((char *)rex.input);
   6194        switch (op) {
   6195        case BOL:
   6196          if (rex.input != rex.line) {
   6197            status = RA_NOMATCH;
   6198          }
   6199          break;
   6200 
   6201        case EOL:
   6202          if (c != NUL) {
   6203            status = RA_NOMATCH;
   6204          }
   6205          break;
   6206 
   6207        case RE_BOF:
   6208          // We're not at the beginning of the file when below the first
   6209          // line where we started, not at the start of the line or we
   6210          // didn't start at the first line of the buffer.
   6211          if (rex.lnum != 0 || rex.input != rex.line
   6212              || (REG_MULTI && rex.reg_firstlnum > 1)) {
   6213            status = RA_NOMATCH;
   6214          }
   6215          break;
   6216 
   6217        case RE_EOF:
   6218          if (rex.lnum != rex.reg_maxline || c != NUL) {
   6219            status = RA_NOMATCH;
   6220          }
   6221          break;
   6222 
   6223        case CURSOR:
   6224          // Check if the buffer is in a window and compare the
   6225          // rex.reg_win->w_cursor position to the match position.
   6226          if (rex.reg_win == NULL
   6227              || (rex.lnum + rex.reg_firstlnum != rex.reg_win->w_cursor.lnum)
   6228              || ((colnr_T)(rex.input - rex.line) !=
   6229                  rex.reg_win->w_cursor.col)) {
   6230            status = RA_NOMATCH;
   6231          }
   6232          break;
   6233 
   6234        case RE_MARK:
   6235          // Compare the mark position to the match position.
   6236        {
   6237          int mark = OPERAND(scan)[0];
   6238          int cmp = OPERAND(scan)[1];
   6239          pos_T *pos;
   6240          size_t col = REG_MULTI ? (size_t)(rex.input - rex.line) : 0;
   6241          fmark_T *fm = mark_get(rex.reg_buf, curwin, NULL, kMarkBufLocal, mark);
   6242 
   6243          // Line may have been freed, get it again.
   6244          if (REG_MULTI) {
   6245            rex.line = (uint8_t *)reg_getline(rex.lnum);
   6246            rex.input = rex.line + col;
   6247          }
   6248 
   6249          if (fm == NULL                    // mark doesn't exist
   6250              || fm->mark.lnum <= 0) {           // mark isn't set in reg_buf
   6251            status = RA_NOMATCH;
   6252          } else {
   6253            pos = &fm->mark;
   6254            const colnr_T pos_col = pos->lnum == rex.lnum + rex.reg_firstlnum
   6255                                    && pos->col == MAXCOL
   6256                                    ? reg_getline_len(pos->lnum - rex.reg_firstlnum)
   6257                                    : pos->col;
   6258 
   6259            if (pos->lnum == rex.lnum + rex.reg_firstlnum
   6260                ? (pos_col == (colnr_T)(rex.input - rex.line)
   6261                   ? (cmp == '<' || cmp == '>')
   6262                   : (pos_col < (colnr_T)(rex.input - rex.line)
   6263                      ? cmp != '>'
   6264                      : cmp != '<'))
   6265                : (pos->lnum < rex.lnum + rex.reg_firstlnum
   6266                   ? cmp != '>'
   6267                   : cmp != '<')) {
   6268              status = RA_NOMATCH;
   6269            }
   6270          }
   6271        }
   6272        break;
   6273 
   6274        case RE_VISUAL:
   6275          if (!reg_match_visual()) {
   6276            status = RA_NOMATCH;
   6277          }
   6278          break;
   6279 
   6280        case RE_LNUM:
   6281          assert(rex.lnum + rex.reg_firstlnum >= 0
   6282                 && (uintmax_t)(rex.lnum + rex.reg_firstlnum) <= UINT32_MAX);
   6283          if (!REG_MULTI
   6284              || !re_num_cmp((uint32_t)(rex.lnum + rex.reg_firstlnum), scan)) {
   6285            status = RA_NOMATCH;
   6286          }
   6287          break;
   6288 
   6289        case RE_COL:
   6290          assert(rex.input - rex.line + 1 >= 0
   6291                 && (uintmax_t)(rex.input - rex.line + 1) <= UINT32_MAX);
   6292          if (!re_num_cmp((uint32_t)(rex.input - rex.line + 1), scan)) {
   6293            status = RA_NOMATCH;
   6294          }
   6295          break;
   6296 
   6297        case RE_VCOL: {
   6298          win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win;
   6299          linenr_T lnum = REG_MULTI ? rex.reg_firstlnum + rex.lnum : 1;
   6300          if (REG_MULTI && (lnum <= 0 || lnum > wp->w_buffer->b_ml.ml_line_count)) {
   6301            lnum = 1;
   6302          }
   6303          int vcol = win_linetabsize(wp, lnum, (char *)rex.line,
   6304                                     (colnr_T)(rex.input - rex.line));
   6305          if (!re_num_cmp((uint32_t)vcol + 1, scan)) {
   6306            status = RA_NOMATCH;
   6307          }
   6308          break;
   6309        }
   6310        break;
   6311 
   6312        case BOW:  // \<word; rex.input points to w
   6313          if (c == NUL) {  // Can't match at end of line
   6314            status = RA_NOMATCH;
   6315          } else {
   6316            // Get class of current and previous char (if it exists).
   6317            const int this_class =
   6318              mb_get_class_tab((char *)rex.input, rex.reg_buf->b_chartab);
   6319            if (this_class <= 1) {
   6320              status = RA_NOMATCH;  // Not on a word at all.
   6321            } else if (reg_prev_class() == this_class) {
   6322              status = RA_NOMATCH;  // Previous char is in same word.
   6323            }
   6324          }
   6325          break;
   6326 
   6327        case EOW:  // word\>; rex.input points after d
   6328          if (rex.input == rex.line) {  // Can't match at start of line
   6329            status = RA_NOMATCH;
   6330          } else {
   6331            int this_class, prev_class;
   6332 
   6333            // Get class of current and previous char (if it exists).
   6334            this_class = mb_get_class_tab((char *)rex.input, rex.reg_buf->b_chartab);
   6335            prev_class = reg_prev_class();
   6336            if (this_class == prev_class
   6337                || prev_class == 0 || prev_class == 1) {
   6338              status = RA_NOMATCH;
   6339            }
   6340          }
   6341          break;  // Matched with EOW
   6342 
   6343        case ANY:
   6344          // ANY does not match new lines.
   6345          if (c == NUL) {
   6346            status = RA_NOMATCH;
   6347          } else {
   6348            ADVANCE_REGINPUT();
   6349          }
   6350          break;
   6351 
   6352        case IDENT:
   6353          if (!vim_isIDc(c)) {
   6354            status = RA_NOMATCH;
   6355          } else {
   6356            ADVANCE_REGINPUT();
   6357          }
   6358          break;
   6359 
   6360        case SIDENT:
   6361          if (ascii_isdigit(*rex.input) || !vim_isIDc(c)) {
   6362            status = RA_NOMATCH;
   6363          } else {
   6364            ADVANCE_REGINPUT();
   6365          }
   6366          break;
   6367 
   6368        case KWORD:
   6369          if (!vim_iswordp_buf((char *)rex.input, rex.reg_buf)) {
   6370            status = RA_NOMATCH;
   6371          } else {
   6372            ADVANCE_REGINPUT();
   6373          }
   6374          break;
   6375 
   6376        case SKWORD:
   6377          if (ascii_isdigit(*rex.input)
   6378              || !vim_iswordp_buf((char *)rex.input, rex.reg_buf)) {
   6379            status = RA_NOMATCH;
   6380          } else {
   6381            ADVANCE_REGINPUT();
   6382          }
   6383          break;
   6384 
   6385        case FNAME:
   6386          if (!vim_isfilec(c)) {
   6387            status = RA_NOMATCH;
   6388          } else {
   6389            ADVANCE_REGINPUT();
   6390          }
   6391          break;
   6392 
   6393        case SFNAME:
   6394          if (ascii_isdigit(*rex.input) || !vim_isfilec(c)) {
   6395            status = RA_NOMATCH;
   6396          } else {
   6397            ADVANCE_REGINPUT();
   6398          }
   6399          break;
   6400 
   6401        case PRINT:
   6402          if (!vim_isprintc(utf_ptr2char((char *)rex.input))) {
   6403            status = RA_NOMATCH;
   6404          } else {
   6405            ADVANCE_REGINPUT();
   6406          }
   6407          break;
   6408 
   6409        case SPRINT:
   6410          if (ascii_isdigit(*rex.input) || !vim_isprintc(utf_ptr2char((char *)rex.input))) {
   6411            status = RA_NOMATCH;
   6412          } else {
   6413            ADVANCE_REGINPUT();
   6414          }
   6415          break;
   6416 
   6417        case WHITE:
   6418          if (!ascii_iswhite(c)) {
   6419            status = RA_NOMATCH;
   6420          } else {
   6421            ADVANCE_REGINPUT();
   6422          }
   6423          break;
   6424 
   6425        case NWHITE:
   6426          if (c == NUL || ascii_iswhite(c)) {
   6427            status = RA_NOMATCH;
   6428          } else {
   6429            ADVANCE_REGINPUT();
   6430          }
   6431          break;
   6432 
   6433        case DIGIT:
   6434          if (!ri_digit(c)) {
   6435            status = RA_NOMATCH;
   6436          } else {
   6437            ADVANCE_REGINPUT();
   6438          }
   6439          break;
   6440 
   6441        case NDIGIT:
   6442          if (c == NUL || ri_digit(c)) {
   6443            status = RA_NOMATCH;
   6444          } else {
   6445            ADVANCE_REGINPUT();
   6446          }
   6447          break;
   6448 
   6449        case HEX:
   6450          if (!ri_hex(c)) {
   6451            status = RA_NOMATCH;
   6452          } else {
   6453            ADVANCE_REGINPUT();
   6454          }
   6455          break;
   6456 
   6457        case NHEX:
   6458          if (c == NUL || ri_hex(c)) {
   6459            status = RA_NOMATCH;
   6460          } else {
   6461            ADVANCE_REGINPUT();
   6462          }
   6463          break;
   6464 
   6465        case OCTAL:
   6466          if (!ri_octal(c)) {
   6467            status = RA_NOMATCH;
   6468          } else {
   6469            ADVANCE_REGINPUT();
   6470          }
   6471          break;
   6472 
   6473        case NOCTAL:
   6474          if (c == NUL || ri_octal(c)) {
   6475            status = RA_NOMATCH;
   6476          } else {
   6477            ADVANCE_REGINPUT();
   6478          }
   6479          break;
   6480 
   6481        case WORD:
   6482          if (!ri_word(c)) {
   6483            status = RA_NOMATCH;
   6484          } else {
   6485            ADVANCE_REGINPUT();
   6486          }
   6487          break;
   6488 
   6489        case NWORD:
   6490          if (c == NUL || ri_word(c)) {
   6491            status = RA_NOMATCH;
   6492          } else {
   6493            ADVANCE_REGINPUT();
   6494          }
   6495          break;
   6496 
   6497        case HEAD:
   6498          if (!ri_head(c)) {
   6499            status = RA_NOMATCH;
   6500          } else {
   6501            ADVANCE_REGINPUT();
   6502          }
   6503          break;
   6504 
   6505        case NHEAD:
   6506          if (c == NUL || ri_head(c)) {
   6507            status = RA_NOMATCH;
   6508          } else {
   6509            ADVANCE_REGINPUT();
   6510          }
   6511          break;
   6512 
   6513        case ALPHA:
   6514          if (!ri_alpha(c)) {
   6515            status = RA_NOMATCH;
   6516          } else {
   6517            ADVANCE_REGINPUT();
   6518          }
   6519          break;
   6520 
   6521        case NALPHA:
   6522          if (c == NUL || ri_alpha(c)) {
   6523            status = RA_NOMATCH;
   6524          } else {
   6525            ADVANCE_REGINPUT();
   6526          }
   6527          break;
   6528 
   6529        case LOWER:
   6530          if (!ri_lower(c)) {
   6531            status = RA_NOMATCH;
   6532          } else {
   6533            ADVANCE_REGINPUT();
   6534          }
   6535          break;
   6536 
   6537        case NLOWER:
   6538          if (c == NUL || ri_lower(c)) {
   6539            status = RA_NOMATCH;
   6540          } else {
   6541            ADVANCE_REGINPUT();
   6542          }
   6543          break;
   6544 
   6545        case UPPER:
   6546          if (!ri_upper(c)) {
   6547            status = RA_NOMATCH;
   6548          } else {
   6549            ADVANCE_REGINPUT();
   6550          }
   6551          break;
   6552 
   6553        case NUPPER:
   6554          if (c == NUL || ri_upper(c)) {
   6555            status = RA_NOMATCH;
   6556          } else {
   6557            ADVANCE_REGINPUT();
   6558          }
   6559          break;
   6560 
   6561        case EXACTLY: {
   6562          int len;
   6563          uint8_t *opnd;
   6564 
   6565          opnd = OPERAND(scan);
   6566          // Inline the first byte, for speed.
   6567          if (*opnd != *rex.input
   6568              && (!rex.reg_ic)) {
   6569            status = RA_NOMATCH;
   6570          } else if (*opnd == NUL) {
   6571            // match empty string always works; happens when "~" is
   6572            // empty.
   6573          } else {
   6574            if (opnd[1] == NUL && !rex.reg_ic) {
   6575              len = 1;  // matched a single byte above
   6576            } else {
   6577              // Need to match first byte again for multi-byte.
   6578              len = (int)strlen((char *)opnd);
   6579              if (cstrncmp((char *)opnd, (char *)rex.input, &len) != 0) {
   6580                status = RA_NOMATCH;
   6581              }
   6582            }
   6583            // Check for following composing character, unless %C
   6584            // follows (skips over all composing chars).
   6585            if (status != RA_NOMATCH
   6586                && utf_composinglike((char *)rex.input, (char *)rex.input + len, NULL)
   6587                && !rex.reg_icombine
   6588                && OP(next) != RE_COMPOSING) {
   6589              // raaron: This code makes a composing character get
   6590              // ignored, which is the correct behavior (sometimes)
   6591              // for voweled Hebrew texts.
   6592              status = RA_NOMATCH;
   6593            }
   6594            if (status != RA_NOMATCH) {
   6595              rex.input += len;
   6596            }
   6597          }
   6598        }
   6599        break;
   6600 
   6601        case ANYOF:
   6602        case ANYBUT: {
   6603          uint8_t *q = OPERAND(scan);
   6604 
   6605          if (c == NUL) {
   6606            status = RA_NOMATCH;
   6607          } else if ((cstrchr((char *)q, c) == NULL) == (op == ANYOF)) {
   6608            status = RA_NOMATCH;
   6609          } else {  // Check following combining characters
   6610            int len = utfc_ptr2len((char *)q) - utf_ptr2len((char *)q);
   6611 
   6612            rex.input += utf_ptr2len((char *)rex.input);
   6613            q += utf_ptr2len((char *)q);
   6614 
   6615            if (len == 0) {
   6616              break;
   6617            }
   6618 
   6619            for (int i = 0; i < len; i++) {
   6620              if (q[i] != rex.input[i]) {
   6621                status = RA_NOMATCH;
   6622                break;
   6623              }
   6624            }
   6625            rex.input += len;
   6626          }
   6627          break;
   6628        }
   6629 
   6630        case MULTIBYTECODE: {
   6631          int i, len;
   6632 
   6633          const uint8_t *opnd = OPERAND(scan);
   6634          // Safety check (just in case 'encoding' was changed since
   6635          // compiling the program).
   6636          if ((len = utfc_ptr2len((char *)opnd)) < 2) {
   6637            status = RA_NOMATCH;
   6638            break;
   6639          }
   6640          const int opndc = utf_ptr2char((char *)opnd);
   6641          if (utf_iscomposing_legacy(opndc)) {
   6642            // When only a composing char is given match at any
   6643            // position where that composing char appears.
   6644            status = RA_NOMATCH;
   6645            for (i = 0; rex.input[i] != NUL;
   6646                 i += utf_ptr2len((char *)rex.input + i)) {
   6647              const int inpc = utf_ptr2char((char *)rex.input + i);
   6648              if (!utf_iscomposing_legacy(inpc)) {
   6649                if (i > 0) {
   6650                  break;
   6651                }
   6652              } else if (opndc == inpc) {
   6653                // Include all following composing chars.
   6654                len = i + utfc_ptr2len((char *)rex.input + i);
   6655                status = RA_MATCH;
   6656                break;
   6657              }
   6658            }
   6659          } else {
   6660            if (cstrncmp((char *)opnd, (char *)rex.input, &len) != 0) {
   6661              status = RA_NOMATCH;
   6662              break;
   6663            }
   6664          }
   6665          rex.input += len;
   6666        }
   6667        break;
   6668 
   6669        case RE_COMPOSING:
   6670          // Skip composing characters.
   6671          while (utf_iscomposing_legacy(utf_ptr2char((char *)rex.input))) {
   6672            rex.input += utf_ptr2len((char *)rex.input);
   6673          }
   6674          break;
   6675 
   6676        case NOTHING:
   6677          break;
   6678 
   6679        case BACK: {
   6680          int i;
   6681 
   6682          // When we run into BACK we need to check if we don't keep
   6683          // looping without matching any input.  The second and later
   6684          // times a BACK is encountered it fails if the input is still
   6685          // at the same position as the previous time.
   6686          // The positions are stored in "backpos" and found by the
   6687          // current value of "scan", the position in the RE program.
   6688          backpos_T *bp = (backpos_T *)backpos.ga_data;
   6689          for (i = 0; i < backpos.ga_len; i++) {
   6690            if (bp[i].bp_scan == scan) {
   6691              break;
   6692            }
   6693          }
   6694          if (i == backpos.ga_len) {
   6695            backpos_T *p = GA_APPEND_VIA_PTR(backpos_T, &backpos);
   6696            p->bp_scan = scan;
   6697          } else if (reg_save_equal(&bp[i].bp_pos)) {
   6698            // Still at same position as last time, fail.
   6699            status = RA_NOMATCH;
   6700          }
   6701 
   6702          assert(status != RA_FAIL);
   6703          if (status != RA_NOMATCH) {
   6704            reg_save(&bp[i].bp_pos, &backpos);
   6705          }
   6706        }
   6707        break;
   6708 
   6709        case MOPEN + 0:     // Match start: \zs
   6710        case MOPEN + 1:     // \(
   6711        case MOPEN + 2:
   6712        case MOPEN + 3:
   6713        case MOPEN + 4:
   6714        case MOPEN + 5:
   6715        case MOPEN + 6:
   6716        case MOPEN + 7:
   6717        case MOPEN + 8:
   6718        case MOPEN + 9:
   6719          no = op - MOPEN;
   6720          cleanup_subexpr();
   6721          rp = regstack_push(RS_MOPEN, scan);
   6722          if (rp == NULL) {
   6723            status = RA_FAIL;
   6724          } else {
   6725            rp->rs_no = (int16_t)no;
   6726            save_se(&rp->rs_un.sesave, &rex.reg_startpos[no],
   6727                    &rex.reg_startp[no]);
   6728            // We simply continue and handle the result when done.
   6729          }
   6730          break;
   6731 
   6732        case NOPEN:         // \%(
   6733        case NCLOSE:        // \) after \%(
   6734          if (regstack_push(RS_NOPEN, scan) == NULL) {
   6735            status = RA_FAIL;
   6736          }
   6737          // We simply continue and handle the result when done.
   6738          break;
   6739 
   6740        case ZOPEN + 1:
   6741        case ZOPEN + 2:
   6742        case ZOPEN + 3:
   6743        case ZOPEN + 4:
   6744        case ZOPEN + 5:
   6745        case ZOPEN + 6:
   6746        case ZOPEN + 7:
   6747        case ZOPEN + 8:
   6748        case ZOPEN + 9:
   6749          no = op - ZOPEN;
   6750          cleanup_zsubexpr();
   6751          rp = regstack_push(RS_ZOPEN, scan);
   6752          if (rp == NULL) {
   6753            status = RA_FAIL;
   6754          } else {
   6755            rp->rs_no = (int16_t)no;
   6756            save_se(&rp->rs_un.sesave, &reg_startzpos[no],
   6757                    &reg_startzp[no]);
   6758            // We simply continue and handle the result when done.
   6759          }
   6760          break;
   6761 
   6762        case MCLOSE + 0:    // Match end: \ze
   6763        case MCLOSE + 1:    // \)
   6764        case MCLOSE + 2:
   6765        case MCLOSE + 3:
   6766        case MCLOSE + 4:
   6767        case MCLOSE + 5:
   6768        case MCLOSE + 6:
   6769        case MCLOSE + 7:
   6770        case MCLOSE + 8:
   6771        case MCLOSE + 9:
   6772          no = op - MCLOSE;
   6773          cleanup_subexpr();
   6774          rp = regstack_push(RS_MCLOSE, scan);
   6775          if (rp == NULL) {
   6776            status = RA_FAIL;
   6777          } else {
   6778            rp->rs_no = (int16_t)no;
   6779            save_se(&rp->rs_un.sesave, &rex.reg_endpos[no], &rex.reg_endp[no]);
   6780            // We simply continue and handle the result when done.
   6781          }
   6782          break;
   6783 
   6784        case ZCLOSE + 1:    // \) after \z(
   6785        case ZCLOSE + 2:
   6786        case ZCLOSE + 3:
   6787        case ZCLOSE + 4:
   6788        case ZCLOSE + 5:
   6789        case ZCLOSE + 6:
   6790        case ZCLOSE + 7:
   6791        case ZCLOSE + 8:
   6792        case ZCLOSE + 9:
   6793          no = op - ZCLOSE;
   6794          cleanup_zsubexpr();
   6795          rp = regstack_push(RS_ZCLOSE, scan);
   6796          if (rp == NULL) {
   6797            status = RA_FAIL;
   6798          } else {
   6799            rp->rs_no = (int16_t)no;
   6800            save_se(&rp->rs_un.sesave, &reg_endzpos[no],
   6801                    &reg_endzp[no]);
   6802            // We simply continue and handle the result when done.
   6803          }
   6804          break;
   6805 
   6806        case BACKREF + 1:
   6807        case BACKREF + 2:
   6808        case BACKREF + 3:
   6809        case BACKREF + 4:
   6810        case BACKREF + 5:
   6811        case BACKREF + 6:
   6812        case BACKREF + 7:
   6813        case BACKREF + 8:
   6814        case BACKREF + 9: {
   6815          int len;
   6816 
   6817          no = op - BACKREF;
   6818          cleanup_subexpr();
   6819          if (!REG_MULTI) {  // Single-line regexp
   6820            if (rex.reg_startp[no] == NULL || rex.reg_endp[no] == NULL) {
   6821              // Backref was not set: Match an empty string.
   6822              len = 0;
   6823            } else {
   6824              // Compare current input with back-ref in the same line.
   6825              len = (int)(rex.reg_endp[no] - rex.reg_startp[no]);
   6826              if (cstrncmp((char *)rex.reg_startp[no], (char *)rex.input, &len) != 0) {
   6827                status = RA_NOMATCH;
   6828              }
   6829            }
   6830          } else {  // Multi-line regexp
   6831            if (rex.reg_startpos[no].lnum < 0 || rex.reg_endpos[no].lnum < 0) {
   6832              // Backref was not set: Match an empty string.
   6833              len = 0;
   6834            } else {
   6835              if (rex.reg_startpos[no].lnum == rex.lnum
   6836                  && rex.reg_endpos[no].lnum == rex.lnum) {
   6837                // Compare back-ref within the current line.
   6838                len = rex.reg_endpos[no].col - rex.reg_startpos[no].col;
   6839                if (cstrncmp((char *)rex.line + rex.reg_startpos[no].col,
   6840                             (char *)rex.input, &len) != 0) {
   6841                  status = RA_NOMATCH;
   6842                }
   6843              } else {
   6844                // Messy situation: Need to compare between two lines.
   6845                int r = match_with_backref(rex.reg_startpos[no].lnum,
   6846                                           rex.reg_startpos[no].col,
   6847                                           rex.reg_endpos[no].lnum,
   6848                                           rex.reg_endpos[no].col,
   6849                                           &len);
   6850                if (r != RA_MATCH) {
   6851                  status = r;
   6852                }
   6853              }
   6854            }
   6855          }
   6856 
   6857          // Matched the backref, skip over it.
   6858          rex.input += len;
   6859        }
   6860        break;
   6861 
   6862        case ZREF + 1:
   6863        case ZREF + 2:
   6864        case ZREF + 3:
   6865        case ZREF + 4:
   6866        case ZREF + 5:
   6867        case ZREF + 6:
   6868        case ZREF + 7:
   6869        case ZREF + 8:
   6870        case ZREF + 9:
   6871          cleanup_zsubexpr();
   6872          no = op - ZREF;
   6873          if (re_extmatch_in != NULL
   6874              && re_extmatch_in->matches[no] != NULL) {
   6875            int len = (int)strlen((char *)re_extmatch_in->matches[no]);
   6876            if (cstrncmp((char *)re_extmatch_in->matches[no], (char *)rex.input, &len) != 0) {
   6877              status = RA_NOMATCH;
   6878            } else {
   6879              rex.input += len;
   6880            }
   6881          } else {
   6882            // Backref was not set: Match an empty string.
   6883          }
   6884          break;
   6885 
   6886        case BRANCH:
   6887          if (OP(next) != BRANCH) {     // No choice.
   6888            next = OPERAND(scan);               // Avoid recursion.
   6889          } else {
   6890            rp = regstack_push(RS_BRANCH, scan);
   6891            if (rp == NULL) {
   6892              status = RA_FAIL;
   6893            } else {
   6894              status = RA_BREAK;                // rest is below
   6895            }
   6896          }
   6897          break;
   6898 
   6899        case BRACE_LIMITS:
   6900          if (OP(next) == BRACE_SIMPLE) {
   6901            bl_minval = OPERAND_MIN(scan);
   6902            bl_maxval = OPERAND_MAX(scan);
   6903          } else if (OP(next) >= BRACE_COMPLEX
   6904                     && OP(next) < BRACE_COMPLEX + 10) {
   6905            no = OP(next) - BRACE_COMPLEX;
   6906            brace_min[no] = OPERAND_MIN(scan);
   6907            brace_max[no] = OPERAND_MAX(scan);
   6908            brace_count[no] = 0;
   6909          } else {
   6910            internal_error("BRACE_LIMITS");
   6911            status = RA_FAIL;
   6912          }
   6913          break;
   6914 
   6915        case BRACE_COMPLEX + 0:
   6916        case BRACE_COMPLEX + 1:
   6917        case BRACE_COMPLEX + 2:
   6918        case BRACE_COMPLEX + 3:
   6919        case BRACE_COMPLEX + 4:
   6920        case BRACE_COMPLEX + 5:
   6921        case BRACE_COMPLEX + 6:
   6922        case BRACE_COMPLEX + 7:
   6923        case BRACE_COMPLEX + 8:
   6924        case BRACE_COMPLEX + 9:
   6925          no = op - BRACE_COMPLEX;
   6926          brace_count[no]++;
   6927 
   6928          // If not matched enough times yet, try one more
   6929          if (brace_count[no] <= (brace_min[no] <= brace_max[no]
   6930                                  ? brace_min[no] : brace_max[no])) {
   6931            rp = regstack_push(RS_BRCPLX_MORE, scan);
   6932            if (rp == NULL) {
   6933              status = RA_FAIL;
   6934            } else {
   6935              rp->rs_no = (int16_t)no;
   6936              reg_save(&rp->rs_un.regsave, &backpos);
   6937              next = OPERAND(scan);
   6938              // We continue and handle the result when done.
   6939            }
   6940            break;
   6941          }
   6942 
   6943          // If matched enough times, may try matching some more
   6944          if (brace_min[no] <= brace_max[no]) {
   6945            // Range is the normal way around, use longest match
   6946            if (brace_count[no] <= brace_max[no]) {
   6947              rp = regstack_push(RS_BRCPLX_LONG, scan);
   6948              if (rp == NULL) {
   6949                status = RA_FAIL;
   6950              } else {
   6951                rp->rs_no = (int16_t)no;
   6952                reg_save(&rp->rs_un.regsave, &backpos);
   6953                next = OPERAND(scan);
   6954                // We continue and handle the result when done.
   6955              }
   6956            }
   6957          } else {
   6958            // Range is backwards, use shortest match first
   6959            if (brace_count[no] <= brace_min[no]) {
   6960              rp = regstack_push(RS_BRCPLX_SHORT, scan);
   6961              if (rp == NULL) {
   6962                status = RA_FAIL;
   6963              } else {
   6964                reg_save(&rp->rs_un.regsave, &backpos);
   6965                // We continue and handle the result when done.
   6966              }
   6967            }
   6968          }
   6969          break;
   6970 
   6971        case BRACE_SIMPLE:
   6972        case STAR:
   6973        case PLUS: {
   6974          regstar_T rst;
   6975 
   6976          // Lookahead to avoid useless match attempts when we know
   6977          // what character comes next.
   6978          if (OP(next) == EXACTLY) {
   6979            rst.nextb = *OPERAND(next);
   6980            if (rex.reg_ic) {
   6981              if (mb_isupper(rst.nextb)) {
   6982                rst.nextb_ic = mb_tolower(rst.nextb);
   6983              } else {
   6984                rst.nextb_ic = mb_toupper(rst.nextb);
   6985              }
   6986            } else {
   6987              rst.nextb_ic = rst.nextb;
   6988            }
   6989          } else {
   6990            rst.nextb = NUL;
   6991            rst.nextb_ic = NUL;
   6992          }
   6993          if (op != BRACE_SIMPLE) {
   6994            rst.minval = (op == STAR) ? 0 : 1;
   6995            rst.maxval = MAX_LIMIT;
   6996          } else {
   6997            rst.minval = bl_minval;
   6998            rst.maxval = bl_maxval;
   6999          }
   7000 
   7001          // When maxval > minval, try matching as much as possible, up
   7002          // to maxval.  When maxval < minval, try matching at least the
   7003          // minimal number (since the range is backwards, that's also
   7004          // maxval!).
   7005          rst.count = regrepeat(OPERAND(scan), rst.maxval);
   7006          if (got_int) {
   7007            status = RA_FAIL;
   7008            break;
   7009          }
   7010          if (rst.minval <= rst.maxval
   7011              ? rst.count >= rst.minval : rst.count >= rst.maxval) {
   7012            // It could match.  Prepare for trying to match what
   7013            // follows.  The code is below.  Parameters are stored in
   7014            // a regstar_T on the regstack.
   7015            if ((int64_t)((unsigned)regstack.ga_len >> 10) >= p_mmp) {
   7016              emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
   7017              status = RA_FAIL;
   7018            } else {
   7019              ga_grow(&regstack, sizeof(regstar_T));
   7020              regstack.ga_len += (int)sizeof(regstar_T);
   7021              rp = regstack_push(rst.minval <= rst.maxval ? RS_STAR_LONG : RS_STAR_SHORT, scan);
   7022              if (rp == NULL) {
   7023                status = RA_FAIL;
   7024              } else {
   7025                *(((regstar_T *)rp) - 1) = rst;
   7026                status = RA_BREAK;                  // skip the restore bits
   7027              }
   7028            }
   7029          } else {
   7030            status = RA_NOMATCH;
   7031          }
   7032        }
   7033        break;
   7034 
   7035        case NOMATCH:
   7036        case MATCH:
   7037        case SUBPAT:
   7038          rp = regstack_push(RS_NOMATCH, scan);
   7039          if (rp == NULL) {
   7040            status = RA_FAIL;
   7041          } else {
   7042            rp->rs_no = (int16_t)op;
   7043            reg_save(&rp->rs_un.regsave, &backpos);
   7044            next = OPERAND(scan);
   7045            // We continue and handle the result when done.
   7046          }
   7047          break;
   7048 
   7049        case BEHIND:
   7050        case NOBEHIND:
   7051          // Need a bit of room to store extra positions.
   7052          if ((int64_t)((unsigned)regstack.ga_len >> 10) >= p_mmp) {
   7053            emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
   7054            status = RA_FAIL;
   7055          } else {
   7056            ga_grow(&regstack, sizeof(regbehind_T));
   7057            regstack.ga_len += (int)sizeof(regbehind_T);
   7058            rp = regstack_push(RS_BEHIND1, scan);
   7059            if (rp == NULL) {
   7060              status = RA_FAIL;
   7061            } else {
   7062              // Need to save the subexpr to be able to restore them
   7063              // when there is a match but we don't use it.
   7064              save_subexpr(((regbehind_T *)rp) - 1);
   7065 
   7066              rp->rs_no = (int16_t)op;
   7067              reg_save(&rp->rs_un.regsave, &backpos);
   7068              // First try if what follows matches.  If it does then we
   7069              // check the behind match by looping.
   7070            }
   7071          }
   7072          break;
   7073 
   7074        case BHPOS:
   7075          if (REG_MULTI) {
   7076            if (behind_pos.rs_u.pos.col != (colnr_T)(rex.input - rex.line)
   7077                || behind_pos.rs_u.pos.lnum != rex.lnum) {
   7078              status = RA_NOMATCH;
   7079            }
   7080          } else if (behind_pos.rs_u.ptr != rex.input) {
   7081            status = RA_NOMATCH;
   7082          }
   7083          break;
   7084 
   7085        case NEWL:
   7086          if ((c != NUL || !REG_MULTI || rex.lnum > rex.reg_maxline
   7087               || rex.reg_line_lbr) && (c != '\n' || !rex.reg_line_lbr)) {
   7088            status = RA_NOMATCH;
   7089          } else if (rex.reg_line_lbr) {
   7090            ADVANCE_REGINPUT();
   7091          } else {
   7092            reg_nextline();
   7093          }
   7094          break;
   7095 
   7096        case END:
   7097          status = RA_MATCH;    // Success!
   7098          break;
   7099 
   7100        default:
   7101          iemsg(_(e_re_corr));
   7102 #ifdef REGEXP_DEBUG
   7103          printf("Illegal op code %d\n", op);
   7104 #endif
   7105          status = RA_FAIL;
   7106          break;
   7107        }
   7108      }
   7109 
   7110      // If we can't continue sequentially, break the inner loop.
   7111      if (status != RA_CONT) {
   7112        break;
   7113      }
   7114 
   7115      // Continue in inner loop, advance to next item.
   7116      scan = next;
   7117    }  // end of inner loop
   7118 
   7119    // If there is something on the regstack execute the code for the state.
   7120    // If the state is popped then loop and use the older state.
   7121    while (!GA_EMPTY(&regstack) && status != RA_FAIL) {
   7122      rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len) - 1;
   7123      switch (rp->rs_state) {
   7124      case RS_NOPEN:
   7125        // Result is passed on as-is, simply pop the state.
   7126        regstack_pop(&scan);
   7127        break;
   7128 
   7129      case RS_MOPEN:
   7130        // Pop the state.  Restore pointers when there is no match.
   7131        if (status == RA_NOMATCH) {
   7132          restore_se(&rp->rs_un.sesave, &rex.reg_startpos[rp->rs_no],
   7133                     &rex.reg_startp[rp->rs_no]);
   7134        }
   7135        regstack_pop(&scan);
   7136        break;
   7137 
   7138      case RS_ZOPEN:
   7139        // Pop the state.  Restore pointers when there is no match.
   7140        if (status == RA_NOMATCH) {
   7141          restore_se(&rp->rs_un.sesave, &reg_startzpos[rp->rs_no],
   7142                     &reg_startzp[rp->rs_no]);
   7143        }
   7144        regstack_pop(&scan);
   7145        break;
   7146 
   7147      case RS_MCLOSE:
   7148        // Pop the state.  Restore pointers when there is no match.
   7149        if (status == RA_NOMATCH) {
   7150          restore_se(&rp->rs_un.sesave, &rex.reg_endpos[rp->rs_no],
   7151                     &rex.reg_endp[rp->rs_no]);
   7152        }
   7153        regstack_pop(&scan);
   7154        break;
   7155 
   7156      case RS_ZCLOSE:
   7157        // Pop the state.  Restore pointers when there is no match.
   7158        if (status == RA_NOMATCH) {
   7159          restore_se(&rp->rs_un.sesave, &reg_endzpos[rp->rs_no],
   7160                     &reg_endzp[rp->rs_no]);
   7161        }
   7162        regstack_pop(&scan);
   7163        break;
   7164 
   7165      case RS_BRANCH:
   7166        if (status == RA_MATCH) {
   7167          // this branch matched, use it
   7168          regstack_pop(&scan);
   7169        } else {
   7170          if (status != RA_BREAK) {
   7171            // After a non-matching branch: try next one.
   7172            reg_restore(&rp->rs_un.regsave, &backpos);
   7173            scan = rp->rs_scan;
   7174          }
   7175          if (scan == NULL || OP(scan) != BRANCH) {
   7176            // no more branches, didn't find a match
   7177            status = RA_NOMATCH;
   7178            regstack_pop(&scan);
   7179          } else {
   7180            // Prepare to try a branch.
   7181            rp->rs_scan = regnext(scan);
   7182            reg_save(&rp->rs_un.regsave, &backpos);
   7183            scan = OPERAND(scan);
   7184          }
   7185        }
   7186        break;
   7187 
   7188      case RS_BRCPLX_MORE:
   7189        // Pop the state.  Restore pointers when there is no match.
   7190        if (status == RA_NOMATCH) {
   7191          reg_restore(&rp->rs_un.regsave, &backpos);
   7192          brace_count[rp->rs_no]--;             // decrement match count
   7193        }
   7194        regstack_pop(&scan);
   7195        break;
   7196 
   7197      case RS_BRCPLX_LONG:
   7198        // Pop the state.  Restore pointers when there is no match.
   7199        if (status == RA_NOMATCH) {
   7200          // There was no match, but we did find enough matches.
   7201          reg_restore(&rp->rs_un.regsave, &backpos);
   7202          brace_count[rp->rs_no]--;
   7203          // continue with the items after "\{}"
   7204          status = RA_CONT;
   7205        }
   7206        regstack_pop(&scan);
   7207        if (status == RA_CONT) {
   7208          scan = regnext(scan);
   7209        }
   7210        break;
   7211 
   7212      case RS_BRCPLX_SHORT:
   7213        // Pop the state.  Restore pointers when there is no match.
   7214        if (status == RA_NOMATCH) {
   7215          // There was no match, try to match one more item.
   7216          reg_restore(&rp->rs_un.regsave, &backpos);
   7217        }
   7218        regstack_pop(&scan);
   7219        if (status == RA_NOMATCH) {
   7220          scan = OPERAND(scan);
   7221          status = RA_CONT;
   7222        }
   7223        break;
   7224 
   7225      case RS_NOMATCH:
   7226        // Pop the state.  If the operand matches for NOMATCH or
   7227        // doesn't match for MATCH/SUBPAT, we fail.  Otherwise backup,
   7228        // except for SUBPAT, and continue with the next item.
   7229        if (status == (rp->rs_no == NOMATCH ? RA_MATCH : RA_NOMATCH)) {
   7230          status = RA_NOMATCH;
   7231        } else {
   7232          status = RA_CONT;
   7233          if (rp->rs_no != SUBPAT) {            // zero-width
   7234            reg_restore(&rp->rs_un.regsave, &backpos);
   7235          }
   7236        }
   7237        regstack_pop(&scan);
   7238        if (status == RA_CONT) {
   7239          scan = regnext(scan);
   7240        }
   7241        break;
   7242 
   7243      case RS_BEHIND1:
   7244        if (status == RA_NOMATCH) {
   7245          regstack_pop(&scan);
   7246          regstack.ga_len -= (int)sizeof(regbehind_T);
   7247        } else {
   7248          // The stuff after BEHIND/NOBEHIND matches.  Now try if
   7249          // the behind part does (not) match before the current
   7250          // position in the input.  This must be done at every
   7251          // position in the input and checking if the match ends at
   7252          // the current position.
   7253 
   7254          // save the position after the found match for next
   7255          reg_save(&(((regbehind_T *)rp) - 1)->save_after, &backpos);
   7256 
   7257          // Start looking for a match with operand at the current
   7258          // position.  Go back one character until we find the
   7259          // result, hitting the start of the line or the previous
   7260          // line (for multi-line matching).
   7261          // Set behind_pos to where the match should end, BHPOS
   7262          // will match it.  Save the current value.
   7263          (((regbehind_T *)rp) - 1)->save_behind = behind_pos;
   7264          behind_pos = rp->rs_un.regsave;
   7265 
   7266          rp->rs_state = RS_BEHIND2;
   7267 
   7268          reg_restore(&rp->rs_un.regsave, &backpos);
   7269          scan = OPERAND(rp->rs_scan) + 4;
   7270        }
   7271        break;
   7272 
   7273      case RS_BEHIND2:
   7274        // Looping for BEHIND / NOBEHIND match.
   7275        if (status == RA_MATCH && reg_save_equal(&behind_pos)) {
   7276          // found a match that ends where "next" started
   7277          behind_pos = (((regbehind_T *)rp) - 1)->save_behind;
   7278          if (rp->rs_no == BEHIND) {
   7279            reg_restore(&(((regbehind_T *)rp) - 1)->save_after,
   7280                        &backpos);
   7281          } else {
   7282            // But we didn't want a match.  Need to restore the
   7283            // subexpr, because what follows matched, so they have
   7284            // been set.
   7285            status = RA_NOMATCH;
   7286            restore_subexpr(((regbehind_T *)rp) - 1);
   7287          }
   7288          regstack_pop(&scan);
   7289          regstack.ga_len -= (int)sizeof(regbehind_T);
   7290        } else {
   7291          int64_t limit;
   7292 
   7293          // No match or a match that doesn't end where we want it: Go
   7294          // back one character.  May go to previous line once.
   7295          no = OK;
   7296          limit = OPERAND_MIN(rp->rs_scan);
   7297          if (REG_MULTI) {
   7298            if (limit > 0
   7299                && ((rp->rs_un.regsave.rs_u.pos.lnum
   7300                     < behind_pos.rs_u.pos.lnum
   7301                     ? (colnr_T)strlen((char *)rex.line)
   7302                     : behind_pos.rs_u.pos.col)
   7303                    - rp->rs_un.regsave.rs_u.pos.col >= limit)) {
   7304              no = FAIL;
   7305            } else if (rp->rs_un.regsave.rs_u.pos.col == 0) {
   7306              if (rp->rs_un.regsave.rs_u.pos.lnum
   7307                  < behind_pos.rs_u.pos.lnum
   7308                  || reg_getline(--rp->rs_un.regsave.rs_u.pos.lnum)
   7309                  == NULL) {
   7310                no = FAIL;
   7311              } else {
   7312                reg_restore(&rp->rs_un.regsave, &backpos);
   7313                rp->rs_un.regsave.rs_u.pos.col =
   7314                  (colnr_T)strlen((char *)rex.line);
   7315              }
   7316            } else {
   7317              const uint8_t *const line =
   7318                (uint8_t *)reg_getline(rp->rs_un.regsave.rs_u.pos.lnum);
   7319 
   7320              rp->rs_un.regsave.rs_u.pos.col -=
   7321                utf_head_off((char *)line,
   7322                             (char *)line + rp->rs_un.regsave.rs_u.pos.col - 1)
   7323                + 1;
   7324            }
   7325          } else {
   7326            if (rp->rs_un.regsave.rs_u.ptr == rex.line) {
   7327              no = FAIL;
   7328            } else {
   7329              MB_PTR_BACK(rex.line, rp->rs_un.regsave.rs_u.ptr);
   7330              if (limit > 0
   7331                  && (behind_pos.rs_u.ptr - rp->rs_un.regsave.rs_u.ptr) > (ptrdiff_t)limit) {
   7332                no = FAIL;
   7333              }
   7334            }
   7335          }
   7336          if (no == OK) {
   7337            // Advanced, prepare for finding match again.
   7338            reg_restore(&rp->rs_un.regsave, &backpos);
   7339            scan = OPERAND(rp->rs_scan) + 4;
   7340            if (status == RA_MATCH) {
   7341              // We did match, so subexpr may have been changed,
   7342              // need to restore them for the next try.
   7343              status = RA_NOMATCH;
   7344              restore_subexpr(((regbehind_T *)rp) - 1);
   7345            }
   7346          } else {
   7347            // Can't advance.  For NOBEHIND that's a match.
   7348            behind_pos = (((regbehind_T *)rp) - 1)->save_behind;
   7349            if (rp->rs_no == NOBEHIND) {
   7350              reg_restore(&(((regbehind_T *)rp) - 1)->save_after,
   7351                          &backpos);
   7352              status = RA_MATCH;
   7353            } else {
   7354              // We do want a proper match.  Need to restore the
   7355              // subexpr if we had a match, because they may have
   7356              // been set.
   7357              if (status == RA_MATCH) {
   7358                status = RA_NOMATCH;
   7359                restore_subexpr(((regbehind_T *)rp) - 1);
   7360              }
   7361            }
   7362            regstack_pop(&scan);
   7363            regstack.ga_len -= (int)sizeof(regbehind_T);
   7364          }
   7365        }
   7366        break;
   7367 
   7368      case RS_STAR_LONG:
   7369      case RS_STAR_SHORT: {
   7370        regstar_T *rst = ((regstar_T *)rp) - 1;
   7371 
   7372        if (status == RA_MATCH) {
   7373          regstack_pop(&scan);
   7374          regstack.ga_len -= (int)sizeof(regstar_T);
   7375          break;
   7376        }
   7377 
   7378        // Tried once already, restore input pointers.
   7379        if (status != RA_BREAK) {
   7380          reg_restore(&rp->rs_un.regsave, &backpos);
   7381        }
   7382 
   7383        // Repeat until we found a position where it could match.
   7384        while (true) {
   7385          if (status != RA_BREAK) {
   7386            // Tried first position already, advance.
   7387            if (rp->rs_state == RS_STAR_LONG) {
   7388              // Trying for longest match, but couldn't or
   7389              // didn't match -- back up one char.
   7390              if (--rst->count < rst->minval) {
   7391                break;
   7392              }
   7393              if (rex.input == rex.line) {
   7394                // backup to last char of previous line
   7395                if (rex.lnum == 0) {
   7396                  status = RA_NOMATCH;
   7397                  break;
   7398                }
   7399                rex.lnum--;
   7400                rex.line = (uint8_t *)reg_getline(rex.lnum);
   7401                // Just in case regrepeat() didn't count right.
   7402                if (rex.line == NULL) {
   7403                  break;
   7404                }
   7405                rex.input = rex.line + reg_getline_len(rex.lnum);
   7406                reg_breakcheck();
   7407              } else {
   7408                MB_PTR_BACK(rex.line, rex.input);
   7409              }
   7410            } else {
   7411              // Range is backwards, use shortest match first.
   7412              // Careful: maxval and minval are exchanged!
   7413              // Couldn't or didn't match: try advancing one
   7414              // char.
   7415              if (rst->count == rst->minval
   7416                  || regrepeat(OPERAND(rp->rs_scan), 1L) == 0) {
   7417                break;
   7418              }
   7419              rst->count++;
   7420            }
   7421            if (got_int) {
   7422              break;
   7423            }
   7424          } else {
   7425            status = RA_NOMATCH;
   7426          }
   7427 
   7428          // If it could match, try it.
   7429          if (rst->nextb == NUL || *rex.input == rst->nextb
   7430              || *rex.input == rst->nextb_ic) {
   7431            reg_save(&rp->rs_un.regsave, &backpos);
   7432            scan = regnext(rp->rs_scan);
   7433            status = RA_CONT;
   7434            break;
   7435          }
   7436        }
   7437        if (status != RA_CONT) {
   7438          // Failed.
   7439          regstack_pop(&scan);
   7440          regstack.ga_len -= (int)sizeof(regstar_T);
   7441          status = RA_NOMATCH;
   7442        }
   7443      }
   7444      break;
   7445      }
   7446 
   7447      // If we want to continue the inner loop or didn't pop a state
   7448      // continue matching loop
   7449      if (status == RA_CONT || rp == (regitem_T *)
   7450          ((char *)regstack.ga_data + regstack.ga_len) - 1) {
   7451        break;
   7452      }
   7453    }
   7454 
   7455    // May need to continue with the inner loop, starting at "scan".
   7456    if (status == RA_CONT) {
   7457      continue;
   7458    }
   7459 
   7460    // If the regstack is empty or something failed we are done.
   7461    if (GA_EMPTY(&regstack) || status == RA_FAIL) {
   7462      if (scan == NULL) {
   7463        // We get here only if there's trouble -- normally "case END" is
   7464        // the terminating point.
   7465        iemsg(_(e_re_corr));
   7466 #ifdef REGEXP_DEBUG
   7467        printf("Premature EOL\n");
   7468 #endif
   7469      }
   7470      return status == RA_MATCH;
   7471    }
   7472  }  // End of loop until the regstack is empty.
   7473 
   7474  // NOTREACHED
   7475 }
   7476 
   7477 /// Try match of "prog" with at rex.line["col"].
   7478 ///
   7479 /// @param tm         timeout limit or NULL
   7480 /// @param timed_out  flag set on timeout or NULL
   7481 ///
   7482 /// @return  0 for failure, or number of lines contained in the match.
   7483 static int regtry(bt_regprog_T *prog, colnr_T col, proftime_T *tm, int *timed_out)
   7484 {
   7485  rex.input = rex.line + col;
   7486  rex.need_clear_subexpr = true;
   7487  // Clear the external match subpointers if necessaey.
   7488  rex.need_clear_zsubexpr = (prog->reghasz == REX_SET);
   7489 
   7490  if (regmatch(&prog->program[1], tm, timed_out) == 0) {
   7491    return 0;
   7492  }
   7493 
   7494  cleanup_subexpr();
   7495  if (REG_MULTI) {
   7496    if (rex.reg_startpos[0].lnum < 0) {
   7497      rex.reg_startpos[0].lnum = 0;
   7498      rex.reg_startpos[0].col = col;
   7499    }
   7500    if (rex.reg_endpos[0].lnum < 0) {
   7501      rex.reg_endpos[0].lnum = rex.lnum;
   7502      rex.reg_endpos[0].col = (int)(rex.input - rex.line);
   7503    } else {
   7504      // Use line number of "\ze".
   7505      rex.lnum = rex.reg_endpos[0].lnum;
   7506    }
   7507  } else {
   7508    if (rex.reg_startp[0] == NULL) {
   7509      rex.reg_startp[0] = rex.line + col;
   7510    }
   7511    if (rex.reg_endp[0] == NULL) {
   7512      rex.reg_endp[0] = rex.input;
   7513    }
   7514  }
   7515  // Package any found \z(...\) matches for export. Default is none.
   7516  unref_extmatch(re_extmatch_out);
   7517  re_extmatch_out = NULL;
   7518 
   7519  if (prog->reghasz == REX_SET) {
   7520    int i;
   7521 
   7522    cleanup_zsubexpr();
   7523    re_extmatch_out = make_extmatch();
   7524    for (i = 0; i < NSUBEXP; i++) {
   7525      if (REG_MULTI) {
   7526        // Only accept single line matches.
   7527        if (reg_startzpos[i].lnum >= 0
   7528            && reg_endzpos[i].lnum == reg_startzpos[i].lnum
   7529            && reg_endzpos[i].col >= reg_startzpos[i].col) {
   7530          re_extmatch_out->matches[i] =
   7531            (uint8_t *)xstrnsave(reg_getline(reg_startzpos[i].lnum) + reg_startzpos[i].col,
   7532                                 (size_t)(reg_endzpos[i].col - reg_startzpos[i].col));
   7533        }
   7534      } else {
   7535        if (reg_startzp[i] != NULL && reg_endzp[i] != NULL) {
   7536          re_extmatch_out->matches[i] =
   7537            (uint8_t *)xstrnsave((char *)reg_startzp[i], (size_t)(reg_endzp[i] - reg_startzp[i]));
   7538        }
   7539      }
   7540    }
   7541  }
   7542  return 1 + rex.lnum;
   7543 }
   7544 
   7545 /// Match a regexp against a string ("line" points to the string) or multiple
   7546 /// lines (if "line" is NULL, use reg_getline()).
   7547 ///
   7548 /// @param startcol   column to start looking for match
   7549 /// @param tm         timeout limit or NULL
   7550 /// @param timed_out  flag set on timeout or NULL
   7551 ///
   7552 /// @return  0 for failure, or number of lines contained in the match.
   7553 static int bt_regexec_both(uint8_t *line, colnr_T startcol, proftime_T *tm, int *timed_out)
   7554 {
   7555  bt_regprog_T *prog;
   7556  uint8_t *s;
   7557  colnr_T col = startcol;
   7558  int retval = 0;
   7559 
   7560  // Create "regstack" and "backpos" if they are not allocated yet.
   7561  // We allocate *_INITIAL amount of bytes first and then set the grow size
   7562  // to much bigger value to avoid many malloc calls in case of deep regular
   7563  // expressions.
   7564  if (regstack.ga_data == NULL) {
   7565    // Use an item size of 1 byte, since we push different things
   7566    // onto the regstack.
   7567    ga_init(&regstack, 1, REGSTACK_INITIAL);
   7568    ga_grow(&regstack, REGSTACK_INITIAL);
   7569    ga_set_growsize(&regstack, REGSTACK_INITIAL * 8);
   7570  }
   7571 
   7572  if (backpos.ga_data == NULL) {
   7573    ga_init(&backpos, sizeof(backpos_T), BACKPOS_INITIAL);
   7574    ga_grow(&backpos, BACKPOS_INITIAL);
   7575    ga_set_growsize(&backpos, BACKPOS_INITIAL * 8);
   7576  }
   7577 
   7578  if (REG_MULTI) {
   7579    prog = (bt_regprog_T *)rex.reg_mmatch->regprog;
   7580    line = (uint8_t *)reg_getline(0);
   7581    rex.reg_startpos = rex.reg_mmatch->startpos;
   7582    rex.reg_endpos = rex.reg_mmatch->endpos;
   7583  } else {
   7584    prog = (bt_regprog_T *)rex.reg_match->regprog;
   7585    rex.reg_startp = (uint8_t **)rex.reg_match->startp;
   7586    rex.reg_endp = (uint8_t **)rex.reg_match->endp;
   7587  }
   7588 
   7589  // Be paranoid...
   7590  if (prog == NULL || line == NULL) {
   7591    iemsg(_(e_null));
   7592    goto theend;
   7593  }
   7594 
   7595  // Check validity of program.
   7596  if (prog_magic_wrong()) {
   7597    goto theend;
   7598  }
   7599 
   7600  // If the start column is past the maximum column: no need to try.
   7601  if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol) {
   7602    goto theend;
   7603  }
   7604 
   7605  // If pattern contains "\c" or "\C": overrule value of rex.reg_ic
   7606  if (prog->regflags & RF_ICASE) {
   7607    rex.reg_ic = true;
   7608  } else if (prog->regflags & RF_NOICASE) {
   7609    rex.reg_ic = false;
   7610  }
   7611 
   7612  // If pattern contains "\Z" overrule value of rex.reg_icombine
   7613  if (prog->regflags & RF_ICOMBINE) {
   7614    rex.reg_icombine = true;
   7615  }
   7616 
   7617  // If there is a "must appear" string, look for it.
   7618  if (prog->regmust != NULL) {
   7619    int c = utf_ptr2char((char *)prog->regmust);
   7620    s = line + col;
   7621 
   7622    // This is used very often, esp. for ":global".  Use two versions of
   7623    // the loop to avoid overhead of conditions.
   7624    if (!rex.reg_ic) {
   7625      while ((s = (uint8_t *)vim_strchr((char *)s, c)) != NULL) {
   7626        if (cstrncmp((char *)s, (char *)prog->regmust, &prog->regmlen) == 0) {
   7627          break;  // Found it.
   7628        }
   7629        MB_PTR_ADV(s);
   7630      }
   7631    } else {
   7632      while ((s = (uint8_t *)cstrchr((char *)s, c)) != NULL) {
   7633        if (cstrncmp((char *)s, (char *)prog->regmust, &prog->regmlen) == 0) {
   7634          break;  // Found it.
   7635        }
   7636        MB_PTR_ADV(s);
   7637      }
   7638    }
   7639    if (s == NULL) {  // Not present.
   7640      goto theend;
   7641    }
   7642  }
   7643 
   7644  rex.line = line;
   7645  rex.lnum = 0;
   7646  reg_toolong = false;
   7647 
   7648  // Simplest case: Anchored match need be tried only once.
   7649  if (prog->reganch) {
   7650    int c = utf_ptr2char((char *)rex.line + col);
   7651    if (prog->regstart == NUL
   7652        || prog->regstart == c
   7653        || (rex.reg_ic
   7654            && (utf_fold(prog->regstart) == utf_fold(c)
   7655                || (c < 255 && prog->regstart < 255
   7656                    && mb_tolower(prog->regstart) == mb_tolower(c))))) {
   7657      retval = regtry(prog, col, tm, timed_out);
   7658    } else {
   7659      retval = 0;
   7660    }
   7661  } else {
   7662    int tm_count = 0;
   7663    // Messy cases:  unanchored match.
   7664    while (!got_int) {
   7665      if (prog->regstart != NUL) {
   7666        // Skip until the char we know it must start with.
   7667        s = (uint8_t *)cstrchr((char *)rex.line + col, prog->regstart);
   7668        if (s == NULL) {
   7669          retval = 0;
   7670          break;
   7671        }
   7672        col = (int)(s - rex.line);
   7673      }
   7674 
   7675      // Check for maximum column to try.
   7676      if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol) {
   7677        retval = 0;
   7678        break;
   7679      }
   7680 
   7681      retval = regtry(prog, col, tm, timed_out);
   7682      if (retval > 0) {
   7683        break;
   7684      }
   7685 
   7686      // if not currently on the first line, get it again
   7687      if (rex.lnum != 0) {
   7688        rex.lnum = 0;
   7689        rex.line = (uint8_t *)reg_getline(0);
   7690      }
   7691      if (rex.line[col] == NUL) {
   7692        break;
   7693      }
   7694      col += utfc_ptr2len((char *)rex.line + col);
   7695      // Check for timeout once in a twenty times to avoid overhead.
   7696      if (tm != NULL && ++tm_count == 20) {
   7697        tm_count = 0;
   7698        if (profile_passed_limit(*tm)) {
   7699          if (timed_out != NULL) {
   7700            *timed_out = true;
   7701          }
   7702          break;
   7703        }
   7704      }
   7705    }
   7706  }
   7707 
   7708 theend:
   7709  // Free "reg_tofree" when it's a bit big.
   7710  // Free regstack and backpos if they are bigger than their initial size.
   7711  if (reg_tofreelen > 400) {
   7712    XFREE_CLEAR(reg_tofree);
   7713  }
   7714  if (regstack.ga_maxlen > REGSTACK_INITIAL) {
   7715    ga_clear(&regstack);
   7716  }
   7717  if (backpos.ga_maxlen > BACKPOS_INITIAL) {
   7718    ga_clear(&backpos);
   7719  }
   7720 
   7721  if (retval > 0) {
   7722    // Make sure the end is never before the start.  Can happen when \zs
   7723    // and \ze are used.
   7724    if (REG_MULTI) {
   7725      const lpos_T *const start = &rex.reg_mmatch->startpos[0];
   7726      const lpos_T *const end = &rex.reg_mmatch->endpos[0];
   7727 
   7728      if (end->lnum < start->lnum
   7729          || (end->lnum == start->lnum && end->col < start->col)) {
   7730        rex.reg_mmatch->endpos[0] = rex.reg_mmatch->startpos[0];
   7731      }
   7732 
   7733      // startpos[0] may be set by "\zs", also return the column where
   7734      // the whole pattern matched.
   7735      rex.reg_mmatch->rmm_matchcol = col;
   7736    } else {
   7737      if (rex.reg_match->endp[0] < rex.reg_match->startp[0]) {
   7738        rex.reg_match->endp[0] = rex.reg_match->startp[0];
   7739      }
   7740 
   7741      // startpos[0] may be set by "\zs", also return the column where
   7742      // the whole pattern matched.
   7743      rex.reg_match->rm_matchcol = col;
   7744    }
   7745  }
   7746 
   7747  return retval;
   7748 }
   7749 
   7750 /// Match a regexp against a string.
   7751 /// "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
   7752 /// Uses curbuf for line count and 'iskeyword'.
   7753 /// If "line_lbr" is true, consider a "\n" in "line" to be a line break.
   7754 ///
   7755 /// @param line  string to match against
   7756 /// @param col   column to start looking for match
   7757 ///
   7758 /// @return  0 for failure, number of lines contained in the match otherwise.
   7759 static int bt_regexec_nl(regmatch_T *rmp, uint8_t *line, colnr_T col, bool line_lbr)
   7760 {
   7761  rex.reg_match = rmp;
   7762  rex.reg_mmatch = NULL;
   7763  rex.reg_maxline = 0;
   7764  rex.reg_line_lbr = line_lbr;
   7765  rex.reg_buf = curbuf;
   7766  rex.reg_win = NULL;
   7767  rex.reg_ic = rmp->rm_ic;
   7768  rex.reg_icombine = false;
   7769  rex.reg_nobreak = rmp->regprog->re_flags & RE_NOBREAK;
   7770  rex.reg_maxcol = 0;
   7771 
   7772  int64_t r = bt_regexec_both(line, col, NULL, NULL);
   7773  assert(r <= INT_MAX);
   7774  return (int)r;
   7775 }
   7776 
   7777 /// Matches a regexp against multiple lines.
   7778 /// "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
   7779 /// Uses curbuf for line count and 'iskeyword'.
   7780 ///
   7781 /// @param win Window in which to search or NULL
   7782 /// @param buf Buffer in which to search
   7783 /// @param lnum Number of line to start looking for match
   7784 /// @param col Column to start looking for match
   7785 /// @param tm Timeout limit or NULL
   7786 ///
   7787 /// @return zero if there is no match and number of lines contained in the match
   7788 ///         otherwise.
   7789 static int bt_regexec_multi(regmmatch_T *rmp, win_T *win, buf_T *buf, linenr_T lnum, colnr_T col,
   7790                            proftime_T *tm, int *timed_out)
   7791 {
   7792  init_regexec_multi(rmp, win, buf, lnum);
   7793  return bt_regexec_both(NULL, col, tm, timed_out);
   7794 }
   7795 
   // Compare "val" with the number in the operand of RE_LNUM, RE_COL or RE_VCOL.
   // The comparator stored in the operand selects the test: '>' checks that
   // "val" is larger, '<' that it is smaller, anything else checks for equality.
   // Returns non-zero when the comparison holds.
   static int re_num_cmp(uint32_t val, const uint8_t *scan)
   {
     const uint32_t wanted = (uint32_t)OPERAND_MIN(scan);

     switch (OPERAND_CMP(scan)) {
     case '>':
       return val > wanted;
     case '<':
       return val < wanted;
     default:
       return val == wanted;
     }
   }
   7809 
   7810 #ifdef BT_REGEXP_DUMP
   7811 
   7812 // regdump - dump a regexp onto stdout in vaguely comprehensible form
   7813 static void regdump(uint8_t *pattern, bt_regprog_T *r)
   7814 {
   7815  uint8_t *s;
   7816  int op = EXACTLY;             // Arbitrary non-END op.
   7817  uint8_t *next;
   7818  uint8_t *end = NULL;
   7819  FILE *f;
   7820 
   7821 # ifdef BT_REGEXP_LOG
   7822  f = fopen("bt_regexp_log.log", "a");
   7823 # else
   7824  f = stdout;
   7825 # endif
   7826  if (f == NULL) {
   7827    return;
   7828  }
   7829  fprintf(f, "-------------------------------------\n\r\nregcomp(%s):\r\n",
   7830          pattern);
   7831 
   7832  s = &r->program[1];
   7833  // Loop until we find the END that isn't before a referred next (an END
   7834  // can also appear in a NOMATCH operand).
   7835  while (op != END || s <= end) {
   7836    op = OP(s);
   7837    fprintf(f, "%2d%s", (int)(s - r->program), regprop(s));     // Where, what.
   7838    next = regnext(s);
   7839    if (next == NULL) {         // Next ptr.
   7840      fprintf(f, "(0)");
   7841    } else {
   7842      fprintf(f, "(%d)", (int)((s - r->program) + (next - s)));
   7843    }
   7844    if (end < next) {
   7845      end = next;
   7846    }
   7847    if (op == BRACE_LIMITS) {
   7848      // Two ints
   7849      fprintf(f, " minval %" PRId64 ", maxval %" PRId64,
   7850              (int64_t)OPERAND_MIN(s), (int64_t)OPERAND_MAX(s));
   7851      s += 8;
   7852    } else if (op == BEHIND || op == NOBEHIND) {
   7853      // one int
   7854      fprintf(f, " count %" PRId64, (int64_t)OPERAND_MIN(s));
   7855      s += 4;
   7856    } else if (op == RE_LNUM || op == RE_COL || op == RE_VCOL) {
   7857      // one int plus comparator
   7858      fprintf(f, " count %" PRId64, (int64_t)OPERAND_MIN(s));
   7859      s += 5;
   7860    }
   7861    s += 3;
   7862    if (op == ANYOF || op == ANYOF + ADD_NL
   7863        || op == ANYBUT || op == ANYBUT + ADD_NL
   7864        || op == EXACTLY) {
   7865      // Literal string, where present.
   7866      fprintf(f, "\nxxxxxxxxx\n");
   7867      while (*s != NUL) {
   7868        fprintf(f, "%c", *s++);
   7869      }
   7870      fprintf(f, "\nxxxxxxxxx\n");
   7871      s++;
   7872    }
   7873    fprintf(f, "\r\n");
   7874  }
   7875 
   7876  // Header fields of interest.
   7877  if (r->regstart != NUL) {
   7878    fprintf(f, "start `%s' 0x%x; ", r->regstart < 256
   7879            ? (char *)transchar(r->regstart)
   7880            : "multibyte", r->regstart);
   7881  }
   7882  if (r->reganch) {
   7883    fprintf(f, "anchored; ");
   7884  }
   7885  if (r->regmust != NULL) {
   7886    fprintf(f, "must have \"%s\"", r->regmust);
   7887  }
   7888  fprintf(f, "\r\n");
   7889 
   7890 # ifdef BT_REGEXP_LOG
   7891  fclose(f);
   7892 # endif
   7893 }
   7894 #endif      // BT_REGEXP_DUMP
   7895 
   7896 #ifdef REGEXP_DEBUG
   7897 
   7898 // regprop - printable representation of opcode
   7899 static uint8_t *regprop(uint8_t *op)
   7900 {
   7901  char *p;
   7902  static char buf[50];
   7903  static size_t buflen = 0;
   7904 
   7905  STRCPY(buf, ":");
   7906  buflen = 1;
   7907 
   7908  switch ((int)OP(op)) {
   7909  case BOL:
   7910    p = "BOL";
   7911    break;
   7912  case EOL:
   7913    p = "EOL";
   7914    break;
   7915  case RE_BOF:
   7916    p = "BOF";
   7917    break;
   7918  case RE_EOF:
   7919    p = "EOF";
   7920    break;
   7921  case CURSOR:
   7922    p = "CURSOR";
   7923    break;
   7924  case RE_VISUAL:
   7925    p = "RE_VISUAL";
   7926    break;
   7927  case RE_LNUM:
   7928    p = "RE_LNUM";
   7929    break;
   7930  case RE_MARK:
   7931    p = "RE_MARK";
   7932    break;
   7933  case RE_COL:
   7934    p = "RE_COL";
   7935    break;
   7936  case RE_VCOL:
   7937    p = "RE_VCOL";
   7938    break;
   7939  case BOW:
   7940    p = "BOW";
   7941    break;
   7942  case EOW:
   7943    p = "EOW";
   7944    break;
   7945  case ANY:
   7946    p = "ANY";
   7947    break;
   7948  case ANY + ADD_NL:
   7949    p = "ANY+NL";
   7950    break;
   7951  case ANYOF:
   7952    p = "ANYOF";
   7953    break;
   7954  case ANYOF + ADD_NL:
   7955    p = "ANYOF+NL";
   7956    break;
   7957  case ANYBUT:
   7958    p = "ANYBUT";
   7959    break;
   7960  case ANYBUT + ADD_NL:
   7961    p = "ANYBUT+NL";
   7962    break;
   7963  case IDENT:
   7964    p = "IDENT";
   7965    break;
   7966  case IDENT + ADD_NL:
   7967    p = "IDENT+NL";
   7968    break;
   7969  case SIDENT:
   7970    p = "SIDENT";
   7971    break;
   7972  case SIDENT + ADD_NL:
   7973    p = "SIDENT+NL";
   7974    break;
   7975  case KWORD:
   7976    p = "KWORD";
   7977    break;
   7978  case KWORD + ADD_NL:
   7979    p = "KWORD+NL";
   7980    break;
   7981  case SKWORD:
   7982    p = "SKWORD";
   7983    break;
   7984  case SKWORD + ADD_NL:
   7985    p = "SKWORD+NL";
   7986    break;
   7987  case FNAME:
   7988    p = "FNAME";
   7989    break;
   7990  case FNAME + ADD_NL:
   7991    p = "FNAME+NL";
   7992    break;
   7993  case SFNAME:
   7994    p = "SFNAME";
   7995    break;
   7996  case SFNAME + ADD_NL:
   7997    p = "SFNAME+NL";
   7998    break;
   7999  case PRINT:
   8000    p = "PRINT";
   8001    break;
   8002  case PRINT + ADD_NL:
   8003    p = "PRINT+NL";
   8004    break;
   8005  case SPRINT:
   8006    p = "SPRINT";
   8007    break;
   8008  case SPRINT + ADD_NL:
   8009    p = "SPRINT+NL";
   8010    break;
   8011  case WHITE:
   8012    p = "WHITE";
   8013    break;
   8014  case WHITE + ADD_NL:
   8015    p = "WHITE+NL";
   8016    break;
   8017  case NWHITE:
   8018    p = "NWHITE";
   8019    break;
   8020  case NWHITE + ADD_NL:
   8021    p = "NWHITE+NL";
   8022    break;
   8023  case DIGIT:
   8024    p = "DIGIT";
   8025    break;
   8026  case DIGIT + ADD_NL:
   8027    p = "DIGIT+NL";
   8028    break;
   8029  case NDIGIT:
   8030    p = "NDIGIT";
   8031    break;
   8032  case NDIGIT + ADD_NL:
   8033    p = "NDIGIT+NL";
   8034    break;
   8035  case HEX:
   8036    p = "HEX";
   8037    break;
   8038  case HEX + ADD_NL:
   8039    p = "HEX+NL";
   8040    break;
   8041  case NHEX:
   8042    p = "NHEX";
   8043    break;
   8044  case NHEX + ADD_NL:
   8045    p = "NHEX+NL";
   8046    break;
   8047  case OCTAL:
   8048    p = "OCTAL";
   8049    break;
   8050  case OCTAL + ADD_NL:
   8051    p = "OCTAL+NL";
   8052    break;
   8053  case NOCTAL:
   8054    p = "NOCTAL";
   8055    break;
   8056  case NOCTAL + ADD_NL:
   8057    p = "NOCTAL+NL";
   8058    break;
   8059  case WORD:
   8060    p = "WORD";
   8061    break;
   8062  case WORD + ADD_NL:
   8063    p = "WORD+NL";
   8064    break;
   8065  case NWORD:
   8066    p = "NWORD";
   8067    break;
   8068  case NWORD + ADD_NL:
   8069    p = "NWORD+NL";
   8070    break;
   8071  case HEAD:
   8072    p = "HEAD";
   8073    break;
   8074  case HEAD + ADD_NL:
   8075    p = "HEAD+NL";
   8076    break;
   8077  case NHEAD:
   8078    p = "NHEAD";
   8079    break;
   8080  case NHEAD + ADD_NL:
   8081    p = "NHEAD+NL";
   8082    break;
   8083  case ALPHA:
   8084    p = "ALPHA";
   8085    break;
   8086  case ALPHA + ADD_NL:
   8087    p = "ALPHA+NL";
   8088    break;
   8089  case NALPHA:
   8090    p = "NALPHA";
   8091    break;
   8092  case NALPHA + ADD_NL:
   8093    p = "NALPHA+NL";
   8094    break;
   8095  case LOWER:
   8096    p = "LOWER";
   8097    break;
   8098  case LOWER + ADD_NL:
   8099    p = "LOWER+NL";
   8100    break;
   8101  case NLOWER:
   8102    p = "NLOWER";
   8103    break;
   8104  case NLOWER + ADD_NL:
   8105    p = "NLOWER+NL";
   8106    break;
   8107  case UPPER:
   8108    p = "UPPER";
   8109    break;
   8110  case UPPER + ADD_NL:
   8111    p = "UPPER+NL";
   8112    break;
   8113  case NUPPER:
   8114    p = "NUPPER";
   8115    break;
   8116  case NUPPER + ADD_NL:
   8117    p = "NUPPER+NL";
   8118    break;
   8119  case BRANCH:
   8120    p = "BRANCH";
   8121    break;
   8122  case EXACTLY:
   8123    p = "EXACTLY";
   8124    break;
   8125  case NOTHING:
   8126    p = "NOTHING";
   8127    break;
   8128  case BACK:
   8129    p = "BACK";
   8130    break;
   8131  case END:
   8132    p = "END";
   8133    break;
   8134  case MOPEN + 0:
   8135    p = "MATCH START";
   8136    break;
   8137  case MOPEN + 1:
   8138  case MOPEN + 2:
   8139  case MOPEN + 3:
   8140  case MOPEN + 4:
   8141  case MOPEN + 5:
   8142  case MOPEN + 6:
   8143  case MOPEN + 7:
   8144  case MOPEN + 8:
   8145  case MOPEN + 9:
   8146    buflen += (size_t)snprintf(buf + buflen, sizeof(buf) - buflen,
   8147                               "MOPEN%d", OP(op) - MOPEN);
   8148    p = NULL;
   8149    break;
   8150  case MCLOSE + 0:
   8151    p = "MATCH END";
   8152    break;
   8153  case MCLOSE + 1:
   8154  case MCLOSE + 2:
   8155  case MCLOSE + 3:
   8156  case MCLOSE + 4:
   8157  case MCLOSE + 5:
   8158  case MCLOSE + 6:
   8159  case MCLOSE + 7:
   8160  case MCLOSE + 8:
   8161  case MCLOSE + 9:
   8162    buflen += (size_t)snprintf(buf + buflen, sizeof(buf) - buflen,
   8163                               "MCLOSE%d", OP(op) - MCLOSE);
   8164    p = NULL;
   8165    break;
   8166  case BACKREF + 1:
   8167  case BACKREF + 2:
   8168  case BACKREF + 3:
   8169  case BACKREF + 4:
   8170  case BACKREF + 5:
   8171  case BACKREF + 6:
   8172  case BACKREF + 7:
   8173  case BACKREF + 8:
   8174  case BACKREF + 9:
   8175    buflen += (size_t)snprintf(buf + buflen, sizeof(buf) - buflen,
   8176                               "BACKREF%d", OP(op) - BACKREF);
   8177    p = NULL;
   8178    break;
   8179  case NOPEN:
   8180    p = "NOPEN";
   8181    break;
   8182  case NCLOSE:
   8183    p = "NCLOSE";
   8184    break;
   8185  case ZOPEN + 1:
   8186  case ZOPEN + 2:
   8187  case ZOPEN + 3:
   8188  case ZOPEN + 4:
   8189  case ZOPEN + 5:
   8190  case ZOPEN + 6:
   8191  case ZOPEN + 7:
   8192  case ZOPEN + 8:
   8193  case ZOPEN + 9:
   8194    buflen += (size_t)snprintf(buf + buflen, sizeof(buf) - buflen,
   8195                               "ZOPEN%d", OP(op) - ZOPEN);
   8196    p = NULL;
   8197    break;
   8198  case ZCLOSE + 1:
   8199  case ZCLOSE + 2:
   8200  case ZCLOSE + 3:
   8201  case ZCLOSE + 4:
   8202  case ZCLOSE + 5:
   8203  case ZCLOSE + 6:
   8204  case ZCLOSE + 7:
   8205  case ZCLOSE + 8:
   8206  case ZCLOSE + 9:
   8207    buflen += (size_t)snprintf(buf + buflen, sizeof(buf) - buflen,
   8208                               "ZCLOSE%d", OP(op) - ZCLOSE);
   8209    p = NULL;
   8210    break;
   8211  case ZREF + 1:
   8212  case ZREF + 2:
   8213  case ZREF + 3:
   8214  case ZREF + 4:
   8215  case ZREF + 5:
   8216  case ZREF + 6:
   8217  case ZREF + 7:
   8218  case ZREF + 8:
   8219  case ZREF + 9:
   8220    buflen += (size_t)snprintf(buf + buflen, sizeof(buf) - buflen,
   8221                               "ZREF%d", OP(op) - ZREF);
   8222    p = NULL;
   8223    break;
   8224  case STAR:
   8225    p = "STAR";
   8226    break;
   8227  case PLUS:
   8228    p = "PLUS";
   8229    break;
   8230  case NOMATCH:
   8231    p = "NOMATCH";
   8232    break;
   8233  case MATCH:
   8234    p = "MATCH";
   8235    break;
   8236  case BEHIND:
   8237    p = "BEHIND";
   8238    break;
   8239  case NOBEHIND:
   8240    p = "NOBEHIND";
   8241    break;
   8242  case SUBPAT:
   8243    p = "SUBPAT";
   8244    break;
   8245  case BRACE_LIMITS:
   8246    p = "BRACE_LIMITS";
   8247    break;
   8248  case BRACE_SIMPLE:
   8249    p = "BRACE_SIMPLE";
   8250    break;
   8251  case BRACE_COMPLEX + 0:
   8252  case BRACE_COMPLEX + 1:
   8253  case BRACE_COMPLEX + 2:
   8254  case BRACE_COMPLEX + 3:
   8255  case BRACE_COMPLEX + 4:
   8256  case BRACE_COMPLEX + 5:
   8257  case BRACE_COMPLEX + 6:
   8258  case BRACE_COMPLEX + 7:
   8259  case BRACE_COMPLEX + 8:
   8260  case BRACE_COMPLEX + 9:
   8261    buflen += (size_t)snprintf(buf + buflen, sizeof(buf) - buflen,
   8262                               "BRACE_COMPLEX%d", OP(op) - BRACE_COMPLEX);
   8263    p = NULL;
   8264    break;
   8265  case MULTIBYTECODE:
   8266    p = "MULTIBYTECODE";
   8267    break;
   8268  case NEWL:
   8269    p = "NEWL";
   8270    break;
   8271  default:
   8272    buflen += (size_t)snprintf(buf + buflen, sizeof(buf) - buflen,
   8273                               "corrupt %d", OP(op));
   8274    p = NULL;
   8275    break;
   8276  }
   8277  if (p != NULL) {
   8278    STRCPY(buf + buflen, p);
   8279  }
   8280  return (uint8_t *)buf;
   8281 }
   8282 #endif      // REGEXP_DEBUG
   8283 
   8284 // }}}1
   8285 
   8286 // regexp_nfa.c {{{1
   8287 // NFA regular expression implementation.
   8288 
   8289 // Logging of NFA engine.
   8290 //
   8291 // The NFA engine can write four log files:
   8292 // - Error log: Contains NFA engine's fatal errors.
   8293 // - Dump log: Contains compiled NFA state machine's information.
   8294 // - Run log: Contains information of matching procedure.
   8295 // - Debug log: Contains detailed information of matching procedure. Can be
   8296 //   disabled by undefining NFA_REGEXP_DEBUG_LOG.
   8297 // The first one can also be used without debug mode.
   8298 // The last three are enabled when compiled as debug mode and individually
   8299 // disabled by commenting them out.
   8300 // The log files can get quite big!
   8301 // To disable all of this when compiling Vim for debugging, undefine REGEXP_DEBUG in
   8302 // regexp.c
   8303 #ifdef REGEXP_DEBUG
   8304 # define NFA_REGEXP_ERROR_LOG   "nfa_regexp_error.log"
   8305 # define NFA_REGEXP_DUMP_LOG    "nfa_regexp_dump.log"
   8306 # define NFA_REGEXP_RUN_LOG     "nfa_regexp_run.log"
   8307 # define NFA_REGEXP_DEBUG_LOG   "nfa_regexp_debug.log"
   8308 #endif
   8309 
   8310 // Added to NFA_ANY - NFA_NUPPER_IC to include a NL.
   8311 #define NFA_ADD_NL              31
   8312 
   8313 enum {
   8314  NFA_SPLIT = -1024,
   8315  NFA_MATCH,
   8316  NFA_EMPTY,                        // matches 0-length
   8317 
   8318  NFA_START_COLL,                   // [abc] start
   8319  NFA_END_COLL,                     // [abc] end
   8320  NFA_START_NEG_COLL,               // [^abc] start
   8321  NFA_END_NEG_COLL,                 // [^abc] end (postfix only)
   8322  NFA_RANGE,                        // range of the two previous items
   8323                                    // (postfix only)
   8324  NFA_RANGE_MIN,                    // low end of a range
   8325  NFA_RANGE_MAX,                    // high end of a range
   8326 
   8327  NFA_CONCAT,                       // concatenate two previous items (postfix
   8328                                    // only)
   8329  NFA_OR,                           // \| (postfix only)
   8330  NFA_STAR,                         // greedy * (postfix only)
   8331  NFA_STAR_NONGREEDY,               // non-greedy * (postfix only)
   8332  NFA_QUEST,                        // greedy \? (postfix only)
   8333  NFA_QUEST_NONGREEDY,              // non-greedy \? (postfix only)
   8334 
   8335  NFA_BOL,                          // ^    Begin line
   8336  NFA_EOL,                          // $    End line
   8337  NFA_BOW,                          // \<   Begin word
   8338  NFA_EOW,                          // \>   End word
   8339  NFA_BOF,                          // \%^  Begin file
   8340  NFA_EOF,                          // \%$  End file
   8341  NFA_NEWL,
   8342  NFA_ZSTART,                       // Used for \zs
   8343  NFA_ZEND,                         // Used for \ze
   8344  NFA_NOPEN,                        // Start of subexpression marked with \%(
   8345  NFA_NCLOSE,                       // End of subexpr. marked with \%( ... \)
   8346  NFA_START_INVISIBLE,
   8347  NFA_START_INVISIBLE_FIRST,
   8348  NFA_START_INVISIBLE_NEG,
   8349  NFA_START_INVISIBLE_NEG_FIRST,
   8350  NFA_START_INVISIBLE_BEFORE,
   8351  NFA_START_INVISIBLE_BEFORE_FIRST,
   8352  NFA_START_INVISIBLE_BEFORE_NEG,
   8353  NFA_START_INVISIBLE_BEFORE_NEG_FIRST,
   8354  NFA_START_PATTERN,
   8355  NFA_END_INVISIBLE,
   8356  NFA_END_INVISIBLE_NEG,
   8357  NFA_END_PATTERN,
   8358  NFA_COMPOSING,                    // Next nodes in NFA are part of the
   8359                                    // composing multibyte char
   8360  NFA_END_COMPOSING,                // End of a composing char in the NFA
   8361  NFA_ANY_COMPOSING,                // \%C: Any composing characters.
   8362  NFA_OPT_CHARS,                    // \%[abc]
   8363 
   8364  // The following are used only in the postfix form, not in the NFA
   8365  NFA_PREV_ATOM_NO_WIDTH,           // Used for \@=
   8366  NFA_PREV_ATOM_NO_WIDTH_NEG,       // Used for \@!
   8367  NFA_PREV_ATOM_JUST_BEFORE,        // Used for \@<=
   8368  NFA_PREV_ATOM_JUST_BEFORE_NEG,    // Used for \@<!
   8369  NFA_PREV_ATOM_LIKE_PATTERN,       // Used for \@>
   8370 
   8371  NFA_BACKREF1,                     // \1
   8372  NFA_BACKREF2,                     // \2
   8373  NFA_BACKREF3,                     // \3
   8374  NFA_BACKREF4,                     // \4
   8375  NFA_BACKREF5,                     // \5
   8376  NFA_BACKREF6,                     // \6
   8377  NFA_BACKREF7,                     // \7
   8378  NFA_BACKREF8,                     // \8
   8379  NFA_BACKREF9,                     // \9
   8380  NFA_ZREF1,                        // \z1
   8381  NFA_ZREF2,                        // \z2
   8382  NFA_ZREF3,                        // \z3
   8383  NFA_ZREF4,                        // \z4
   8384  NFA_ZREF5,                        // \z5
   8385  NFA_ZREF6,                        // \z6
   8386  NFA_ZREF7,                        // \z7
   8387  NFA_ZREF8,                        // \z8
   8388  NFA_ZREF9,                        // \z9
   8389  NFA_SKIP,                         // Skip characters
   8390 
   8391  NFA_MOPEN,
   8392  NFA_MOPEN1,
   8393  NFA_MOPEN2,
   8394  NFA_MOPEN3,
   8395  NFA_MOPEN4,
   8396  NFA_MOPEN5,
   8397  NFA_MOPEN6,
   8398  NFA_MOPEN7,
   8399  NFA_MOPEN8,
   8400  NFA_MOPEN9,
   8401 
   8402  NFA_MCLOSE,
   8403  NFA_MCLOSE1,
   8404  NFA_MCLOSE2,
   8405  NFA_MCLOSE3,
   8406  NFA_MCLOSE4,
   8407  NFA_MCLOSE5,
   8408  NFA_MCLOSE6,
   8409  NFA_MCLOSE7,
   8410  NFA_MCLOSE8,
   8411  NFA_MCLOSE9,
   8412 
   8413  NFA_ZOPEN,
   8414  NFA_ZOPEN1,
   8415  NFA_ZOPEN2,
   8416  NFA_ZOPEN3,
   8417  NFA_ZOPEN4,
   8418  NFA_ZOPEN5,
   8419  NFA_ZOPEN6,
   8420  NFA_ZOPEN7,
   8421  NFA_ZOPEN8,
   8422  NFA_ZOPEN9,
   8423 
   8424  NFA_ZCLOSE,
   8425  NFA_ZCLOSE1,
   8426  NFA_ZCLOSE2,
   8427  NFA_ZCLOSE3,
   8428  NFA_ZCLOSE4,
   8429  NFA_ZCLOSE5,
   8430  NFA_ZCLOSE6,
   8431  NFA_ZCLOSE7,
   8432  NFA_ZCLOSE8,
   8433  NFA_ZCLOSE9,
   8434 
   8435  // NFA_FIRST_NL
   8436  NFA_ANY,              //      Match any one character.
   8437  NFA_IDENT,            //      Match identifier char
   8438  NFA_SIDENT,           //      Match identifier char but no digit
   8439  NFA_KWORD,            //      Match keyword char
   8440  NFA_SKWORD,           //      Match word char but no digit
   8441  NFA_FNAME,            //      Match file name char
   8442  NFA_SFNAME,           //      Match file name char but no digit
   8443  NFA_PRINT,            //      Match printable char
   8444  NFA_SPRINT,           //      Match printable char but no digit
   8445  NFA_WHITE,            //      Match whitespace char
   8446  NFA_NWHITE,           //      Match non-whitespace char
   8447  NFA_DIGIT,            //      Match digit char
   8448  NFA_NDIGIT,           //      Match non-digit char
   8449  NFA_HEX,              //      Match hex char
   8450  NFA_NHEX,             //      Match non-hex char
   8451  NFA_OCTAL,            //      Match octal char
   8452  NFA_NOCTAL,           //      Match non-octal char
   8453  NFA_WORD,             //      Match word char
   8454  NFA_NWORD,            //      Match non-word char
   8455  NFA_HEAD,             //      Match head char
   8456  NFA_NHEAD,            //      Match non-head char
   8457  NFA_ALPHA,            //      Match alpha char
   8458  NFA_NALPHA,           //      Match non-alpha char
   8459  NFA_LOWER,            //      Match lowercase char
   8460  NFA_NLOWER,           //      Match non-lowercase char
   8461  NFA_UPPER,            //      Match uppercase char
   8462  NFA_NUPPER,           //      Match non-uppercase char
   8463  NFA_LOWER_IC,         //      Match [a-z]
   8464  NFA_NLOWER_IC,        //      Match [^a-z]
   8465  NFA_UPPER_IC,         //      Match [A-Z]
   8466  NFA_NUPPER_IC,        //      Match [^A-Z]
   8467 
   8468  NFA_FIRST_NL = NFA_ANY + NFA_ADD_NL,
   8469  NFA_LAST_NL = NFA_NUPPER_IC + NFA_ADD_NL,
   8470 
   8471  NFA_CURSOR,           //      Match cursor pos
   8472  NFA_LNUM,             //      Match line number
   8473  NFA_LNUM_GT,          //      Match > line number
   8474  NFA_LNUM_LT,          //      Match < line number
   8475  NFA_COL,              //      Match cursor column
   8476  NFA_COL_GT,           //      Match > cursor column
   8477  NFA_COL_LT,           //      Match < cursor column
   8478  NFA_VCOL,             //      Match cursor virtual column
   8479  NFA_VCOL_GT,          //      Match > cursor virtual column
   8480  NFA_VCOL_LT,          //      Match < cursor virtual column
   8481  NFA_MARK,             //      Match mark
   8482  NFA_MARK_GT,          //      Match > mark
   8483  NFA_MARK_LT,          //      Match < mark
   8484  NFA_VISUAL,           //      Match Visual area
   8485 
   8486  // Character classes [:alnum:] etc
   8487  NFA_CLASS_ALNUM,
   8488  NFA_CLASS_ALPHA,
   8489  NFA_CLASS_BLANK,
   8490  NFA_CLASS_CNTRL,
   8491  NFA_CLASS_DIGIT,
   8492  NFA_CLASS_GRAPH,
   8493  NFA_CLASS_LOWER,
   8494  NFA_CLASS_PRINT,
   8495  NFA_CLASS_PUNCT,
   8496  NFA_CLASS_SPACE,
   8497  NFA_CLASS_UPPER,
   8498  NFA_CLASS_XDIGIT,
   8499  NFA_CLASS_TAB,
   8500  NFA_CLASS_RETURN,
   8501  NFA_CLASS_BACKSPACE,
   8502  NFA_CLASS_ESCAPE,
   8503  NFA_CLASS_IDENT,
   8504  NFA_CLASS_KEYWORD,
   8505  NFA_CLASS_FNAME,
   8506 };
   8507 
   8508 // Keep in sync with classchars.
   8509 static int nfa_classcodes[] = {
   8510  NFA_ANY, NFA_IDENT, NFA_SIDENT, NFA_KWORD, NFA_SKWORD,
   8511  NFA_FNAME, NFA_SFNAME, NFA_PRINT, NFA_SPRINT,
   8512  NFA_WHITE, NFA_NWHITE, NFA_DIGIT, NFA_NDIGIT,
   8513  NFA_HEX, NFA_NHEX, NFA_OCTAL, NFA_NOCTAL,
   8514  NFA_WORD, NFA_NWORD, NFA_HEAD, NFA_NHEAD,
   8515  NFA_ALPHA, NFA_NALPHA, NFA_LOWER, NFA_NLOWER,
   8516  NFA_UPPER, NFA_NUPPER
   8517 };
   8518 
   8519 static const char e_nul_found[] = N_("E865: (NFA) Regexp end encountered prematurely");
   8520 static const char e_misplaced[] = N_("E866: (NFA regexp) Misplaced %c");
   8521 static const char e_ill_char_class[] = N_("E877: (NFA regexp) Invalid character class: %" PRId64);
   8522 static const char e_value_too_large[] = N_("E951: \\% value too large");
   8523 
   8524 // Variables only used in nfa_regcomp() and descendants.
   8525 static int nfa_re_flags;  ///< re_flags passed to nfa_regcomp().
   8526 static int *post_start;   ///< holds the postfix form of r.e.
   8527 static int *post_end;
   8528 static int *post_ptr;
   8529 
   8530 // Set when the pattern should use the NFA engine.
   8531 // E.g. [[:upper:]] only allows 8bit characters for BT engine,
   8532 // while NFA engine handles multibyte characters correctly.
   8533 static bool wants_nfa;
   8534 
   8535 static int nstate;  ///< Number of states in the NFA. Also used when executing.
   8536 static int istate;  ///< Index in the state vector, used in alloc_state()
   8537 
   8538 // If not NULL match must end at this position
   8539 static save_se_T *nfa_endp = NULL;
   8540 
   8541 // 0 for first call to nfa_regmatch(), 1 for recursive call.
   8542 static int nfa_ll_index = 0;
   8543 
   8544 // Helper functions used when doing re2post() ... regatom() parsing
   8545 #define EMIT(c) \
   8546  do { \
   8547    if (post_ptr >= post_end) { \
   8548      realloc_post_list(); \
   8549    } \
   8550    *post_ptr++ = c; \
   8551  } while (0)
   8552 
   8553 /// Initialize internal variables before NFA compilation.
   8554 ///
   8555 /// @param re_flags  @see vim_regcomp()
   8556 static void nfa_regcomp_start(uint8_t *expr, int re_flags)
   8557 {
   8558  size_t postfix_size;
   8559  size_t nstate_max;
   8560 
   8561  nstate = 0;
   8562  istate = 0;
   8563  // A reasonable estimation for maximum size
   8564  nstate_max = (strlen((char *)expr) + 1) * 25;
   8565 
   8566  // Some items blow up in size, such as [A-z].  Add more space for that.
   8567  // When it is still not enough realloc_post_list() will be used.
   8568  nstate_max += 1000;
   8569 
   8570  // Size for postfix representation of expr.
   8571  postfix_size = sizeof(int) * nstate_max;
   8572 
   8573  post_start = (int *)xmalloc(postfix_size);
   8574  post_ptr = post_start;
   8575  post_end = post_start + nstate_max;
   8576  wants_nfa = false;
   8577  rex.nfa_has_zend = false;
   8578  rex.nfa_has_backref = false;
   8579 
   8580  // shared with BT engine
   8581  regcomp_start(expr, re_flags);
   8582 }
   8583 
   8584 // Figure out if the NFA state list starts with an anchor, must match at start
   8585 // of the line.
   8586 static int nfa_get_reganch(nfa_state_T *start, int depth)
   8587 {
   8588  nfa_state_T *p = start;
   8589 
   8590  if (depth > 4) {
   8591    return 0;
   8592  }
   8593 
   8594  while (p != NULL) {
   8595    switch (p->c) {
   8596    case NFA_BOL:
   8597    case NFA_BOF:
   8598      return 1;           // yes!
   8599 
   8600    case NFA_ZSTART:
   8601    case NFA_ZEND:
   8602    case NFA_CURSOR:
   8603    case NFA_VISUAL:
   8604 
   8605    case NFA_MOPEN:
   8606    case NFA_MOPEN1:
   8607    case NFA_MOPEN2:
   8608    case NFA_MOPEN3:
   8609    case NFA_MOPEN4:
   8610    case NFA_MOPEN5:
   8611    case NFA_MOPEN6:
   8612    case NFA_MOPEN7:
   8613    case NFA_MOPEN8:
   8614    case NFA_MOPEN9:
   8615    case NFA_NOPEN:
   8616    case NFA_ZOPEN:
   8617    case NFA_ZOPEN1:
   8618    case NFA_ZOPEN2:
   8619    case NFA_ZOPEN3:
   8620    case NFA_ZOPEN4:
   8621    case NFA_ZOPEN5:
   8622    case NFA_ZOPEN6:
   8623    case NFA_ZOPEN7:
   8624    case NFA_ZOPEN8:
   8625    case NFA_ZOPEN9:
   8626      p = p->out;
   8627      break;
   8628 
   8629    case NFA_SPLIT:
   8630      return nfa_get_reganch(p->out, depth + 1)
   8631             && nfa_get_reganch(p->out1, depth + 1);
   8632 
   8633    default:
   8634      return 0;           // noooo
   8635    }
   8636  }
   8637  return 0;
   8638 }
   8639 
   8640 // Figure out if the NFA state list starts with a character which must match
   8641 // at start of the match.
   8642 static int nfa_get_regstart(nfa_state_T *start, int depth)
   8643 {
   8644  nfa_state_T *p = start;
   8645 
   8646  if (depth > 4) {
   8647    return 0;
   8648  }
   8649 
   8650  while (p != NULL) {
   8651    switch (p->c) {
   8652    // all kinds of zero-width matches
   8653    case NFA_BOL:
   8654    case NFA_BOF:
   8655    case NFA_BOW:
   8656    case NFA_EOW:
   8657    case NFA_ZSTART:
   8658    case NFA_ZEND:
   8659    case NFA_CURSOR:
   8660    case NFA_VISUAL:
   8661    case NFA_LNUM:
   8662    case NFA_LNUM_GT:
   8663    case NFA_LNUM_LT:
   8664    case NFA_COL:
   8665    case NFA_COL_GT:
   8666    case NFA_COL_LT:
   8667    case NFA_VCOL:
   8668    case NFA_VCOL_GT:
   8669    case NFA_VCOL_LT:
   8670    case NFA_MARK:
   8671    case NFA_MARK_GT:
   8672    case NFA_MARK_LT:
   8673 
   8674    case NFA_MOPEN:
   8675    case NFA_MOPEN1:
   8676    case NFA_MOPEN2:
   8677    case NFA_MOPEN3:
   8678    case NFA_MOPEN4:
   8679    case NFA_MOPEN5:
   8680    case NFA_MOPEN6:
   8681    case NFA_MOPEN7:
   8682    case NFA_MOPEN8:
   8683    case NFA_MOPEN9:
   8684    case NFA_NOPEN:
   8685    case NFA_ZOPEN:
   8686    case NFA_ZOPEN1:
   8687    case NFA_ZOPEN2:
   8688    case NFA_ZOPEN3:
   8689    case NFA_ZOPEN4:
   8690    case NFA_ZOPEN5:
   8691    case NFA_ZOPEN6:
   8692    case NFA_ZOPEN7:
   8693    case NFA_ZOPEN8:
   8694    case NFA_ZOPEN9:
   8695      p = p->out;
   8696      break;
   8697 
   8698    case NFA_SPLIT: {
   8699      int c1 = nfa_get_regstart(p->out, depth + 1);
   8700      int c2 = nfa_get_regstart(p->out1, depth + 1);
   8701 
   8702      if (c1 == c2) {
   8703        return c1;             // yes!
   8704      }
   8705      return 0;
   8706    }
   8707 
   8708    default:
   8709      if (p->c > 0) {
   8710        return p->c;             // yes!
   8711      }
   8712      return 0;
   8713    }
   8714  }
   8715  return 0;
   8716 }
   8717 
   8718 // Figure out if the NFA state list contains just literal text and nothing
   8719 // else.  If so return a string in allocated memory with what must match after
   8720 // regstart.  Otherwise return NULL.
   8721 static uint8_t *nfa_get_match_text(nfa_state_T *start)
   8722 {
   8723  nfa_state_T *p = start;
   8724  int len = 0;
   8725  uint8_t *ret;
   8726  uint8_t *s;
   8727 
   8728  if (p->c != NFA_MOPEN) {
   8729    return NULL;     // just in case
   8730  }
   8731  p = p->out;
   8732  while (p->c > 0) {
   8733    len += utf_char2len(p->c);
   8734    p = p->out;
   8735  }
   8736  if (p->c != NFA_MCLOSE || p->out->c != NFA_MATCH) {
   8737    return NULL;
   8738  }
   8739 
   8740  ret = xmalloc((size_t)len);
   8741  p = start->out->out;     // skip first char, it goes into regstart
   8742  s = ret;
   8743  while (p->c > 0) {
   8744    s += utf_char2bytes(p->c, (char *)s);
   8745    p = p->out;
   8746  }
   8747  *s = NUL;
   8748 
   8749  return ret;
   8750 }
   8751 
   8752 // Allocate more space for post_start.  Called when
   8753 // running above the estimated number of states.
   8754 static void realloc_post_list(void)
   8755 {
   8756  // For weird patterns the number of states can be very high. Increasing by
   8757  // 50% seems a reasonable compromise between memory use and speed.
   8758  const size_t new_max = (size_t)(post_end - post_start) * 3 / 2;
   8759  int *new_start = xrealloc(post_start, new_max * sizeof(int));
   8760  post_ptr = new_start + (post_ptr - post_start);
   8761  post_end = new_start + new_max;
   8762  post_start = new_start;
   8763 }
   8764 
   8765 // Search between "start" and "end" and try to recognize a
   8766 // character class in expanded form. For example [0-9].
   8767 // On success, return the id the character class to be emitted.
   8768 // On failure, return 0 (=FAIL)
   8769 // Start points to the first char of the range, while end should point
   8770 // to the closing brace.
   8771 // Keep in mind that 'ignorecase' applies at execution time, thus [a-z] may
   8772 // need to be interpreted as [a-zA-Z].
   8773 static int nfa_recognize_char_class(uint8_t *start, const uint8_t *end, int extra_newl)
   8774 {
   8775 #define CLASS_not            0x80
   8776 #define CLASS_af             0x40
   8777 #define CLASS_AF             0x20
   8778 #define CLASS_az             0x10
   8779 #define CLASS_AZ             0x08
   8780 #define CLASS_o7             0x04
   8781 #define CLASS_o9             0x02
   8782 #define CLASS_underscore     0x01
   8783 
   8784  uint8_t *p;
   8785  int config = 0;
   8786 
   8787  bool newl = extra_newl == true;
   8788 
   8789  if (*end != ']') {
   8790    return FAIL;
   8791  }
   8792  p = start;
   8793  if (*p == '^') {
   8794    config |= CLASS_not;
   8795    p++;
   8796  }
   8797 
   8798  while (p < end) {
   8799    if (p + 2 < end && *(p + 1) == '-') {
   8800      switch (*p) {
   8801      case '0':
   8802        if (*(p + 2) == '9') {
   8803          config |= CLASS_o9;
   8804          break;
   8805        } else if (*(p + 2) == '7') {
   8806          config |= CLASS_o7;
   8807          break;
   8808        }
   8809        return FAIL;
   8810      case 'a':
   8811        if (*(p + 2) == 'z') {
   8812          config |= CLASS_az;
   8813          break;
   8814        } else if (*(p + 2) == 'f') {
   8815          config |= CLASS_af;
   8816          break;
   8817        }
   8818        return FAIL;
   8819      case 'A':
   8820        if (*(p + 2) == 'Z') {
   8821          config |= CLASS_AZ;
   8822          break;
   8823        } else if (*(p + 2) == 'F') {
   8824          config |= CLASS_AF;
   8825          break;
   8826        }
   8827        return FAIL;
   8828      default:
   8829        return FAIL;
   8830      }
   8831      p += 3;
   8832    } else if (p + 1 < end && *p == '\\' && *(p + 1) == 'n') {
   8833      newl = true;
   8834      p += 2;
   8835    } else if (*p == '_') {
   8836      config |= CLASS_underscore;
   8837      p++;
   8838    } else if (*p == '\n') {
   8839      newl = true;
   8840      p++;
   8841    } else {
   8842      return FAIL;
   8843    }
   8844  }   // while (p < end)
   8845 
   8846  if (p != end) {
   8847    return FAIL;
   8848  }
   8849 
   8850  if (newl == true) {
   8851    extra_newl = NFA_ADD_NL;
   8852  }
   8853 
   8854  switch (config) {
   8855  case CLASS_o9:
   8856    return extra_newl + NFA_DIGIT;
   8857  case CLASS_not |  CLASS_o9:
   8858    return extra_newl + NFA_NDIGIT;
   8859  case CLASS_af | CLASS_AF | CLASS_o9:
   8860    return extra_newl + NFA_HEX;
   8861  case CLASS_not | CLASS_af | CLASS_AF | CLASS_o9:
   8862    return extra_newl + NFA_NHEX;
   8863  case CLASS_o7:
   8864    return extra_newl + NFA_OCTAL;
   8865  case CLASS_not | CLASS_o7:
   8866    return extra_newl + NFA_NOCTAL;
   8867  case CLASS_az | CLASS_AZ | CLASS_o9 | CLASS_underscore:
   8868    return extra_newl + NFA_WORD;
   8869  case CLASS_not | CLASS_az | CLASS_AZ | CLASS_o9 | CLASS_underscore:
   8870    return extra_newl + NFA_NWORD;
   8871  case CLASS_az | CLASS_AZ | CLASS_underscore:
   8872    return extra_newl + NFA_HEAD;
   8873  case CLASS_not | CLASS_az | CLASS_AZ | CLASS_underscore:
   8874    return extra_newl + NFA_NHEAD;
   8875  case CLASS_az | CLASS_AZ:
   8876    return extra_newl + NFA_ALPHA;
   8877  case CLASS_not | CLASS_az | CLASS_AZ:
   8878    return extra_newl + NFA_NALPHA;
   8879  case CLASS_az:
   8880    return extra_newl + NFA_LOWER_IC;
   8881  case CLASS_not | CLASS_az:
   8882    return extra_newl + NFA_NLOWER_IC;
   8883  case CLASS_AZ:
   8884    return extra_newl + NFA_UPPER_IC;
   8885  case CLASS_not | CLASS_AZ:
   8886    return extra_newl + NFA_NUPPER_IC;
   8887  }
   8888  return FAIL;
   8889 }
   8890 
   8891 // Produce the bytes for equivalence class "c".
   8892 // Currently only handles latin1, latin9 and utf-8.
   8893 // Emits bytes in postfix notation: 'a,b,NFA_OR,c,NFA_OR' is
   8894 // equivalent to 'a OR b OR c'
   8895 //
   8896 // NOTE! When changing this function, also update reg_equi_class()
   8897 static void nfa_emit_equi_class(int c)
   8898 {
   8899 #define EMIT2(c)   EMIT(c); EMIT(NFA_CONCAT);
   8900 
   8901  {
   8902 #define A_grave 0xc0
   8903 #define A_acute 0xc1
   8904 #define A_circumflex 0xc2
   8905 #define A_virguilla 0xc3
   8906 #define A_diaeresis 0xc4
   8907 #define A_ring 0xc5
   8908 #define C_cedilla 0xc7
   8909 #define E_grave 0xc8
   8910 #define E_acute 0xc9
   8911 #define E_circumflex 0xca
   8912 #define E_diaeresis 0xcb
   8913 #define I_grave 0xcc
   8914 #define I_acute 0xcd
   8915 #define I_circumflex 0xce
   8916 #define I_diaeresis 0xcf
   8917 #define N_virguilla 0xd1
   8918 #define O_grave 0xd2
   8919 #define O_acute 0xd3
   8920 #define O_circumflex 0xd4
   8921 #define O_virguilla 0xd5
   8922 #define O_diaeresis 0xd6
   8923 #define O_slash 0xd8
   8924 #define U_grave 0xd9
   8925 #define U_acute 0xda
   8926 #define U_circumflex 0xdb
   8927 #define U_diaeresis 0xdc
   8928 #define Y_acute 0xdd
   8929 #define a_grave 0xe0
   8930 #define a_acute 0xe1
   8931 #define a_circumflex 0xe2
   8932 #define a_virguilla 0xe3
   8933 #define a_diaeresis 0xe4
   8934 #define a_ring 0xe5
   8935 #define c_cedilla 0xe7
   8936 #define e_grave 0xe8
   8937 #define e_acute 0xe9
   8938 #define e_circumflex 0xea
   8939 #define e_diaeresis 0xeb
   8940 #define i_grave 0xec
   8941 #define i_acute 0xed
   8942 #define i_circumflex 0xee
   8943 #define i_diaeresis 0xef
   8944 #define n_virguilla 0xf1
   8945 #define o_grave 0xf2
   8946 #define o_acute 0xf3
   8947 #define o_circumflex 0xf4
   8948 #define o_virguilla 0xf5
   8949 #define o_diaeresis 0xf6
   8950 #define o_slash 0xf8
   8951 #define u_grave 0xf9
   8952 #define u_acute 0xfa
   8953 #define u_circumflex 0xfb
   8954 #define u_diaeresis 0xfc
   8955 #define y_acute 0xfd
   8956 #define y_diaeresis 0xff
   8957    switch (c) {
   8958    case 'A':
   8959    case A_grave:
   8960    case A_acute:
   8961    case A_circumflex:
   8962    case A_virguilla:
   8963    case A_diaeresis:
   8964    case A_ring:
   8965    case 0x100:
   8966    case 0x102:
   8967    case 0x104:
   8968    case 0x1cd:
   8969    case 0x1de:
   8970    case 0x1e0:
   8971    case 0x1fa:
   8972    case 0x200:
   8973    case 0x202:
   8974    case 0x226:
   8975    case 0x23a:
   8976    case 0x1e00:
   8977    case 0x1ea0:
   8978    case 0x1ea2:
   8979    case 0x1ea4:
   8980    case 0x1ea6:
   8981    case 0x1ea8:
   8982    case 0x1eaa:
   8983    case 0x1eac:
   8984    case 0x1eae:
   8985    case 0x1eb0:
   8986    case 0x1eb2:
   8987    case 0x1eb4:
   8988    case 0x1eb6:
   8989      EMIT2('A') EMIT2(A_grave) EMIT2(A_acute)
   8990      EMIT2(A_circumflex) EMIT2(A_virguilla)
   8991      EMIT2(A_diaeresis) EMIT2(A_ring)
   8992      EMIT2(0x100) EMIT2(0x102) EMIT2(0x104)
   8993      EMIT2(0x1cd) EMIT2(0x1de) EMIT2(0x1e0)
   8994      EMIT2(0x1fa) EMIT2(0x200) EMIT2(0x202)
   8995      EMIT2(0x226) EMIT2(0x23a) EMIT2(0x1e00)
   8996      EMIT2(0x1ea0) EMIT2(0x1ea2) EMIT2(0x1ea4)
   8997      EMIT2(0x1ea6) EMIT2(0x1ea8) EMIT2(0x1eaa)
   8998      EMIT2(0x1eac) EMIT2(0x1eae) EMIT2(0x1eb0)
   8999      EMIT2(0x1eb2) EMIT2(0x1eb6) EMIT2(0x1eb4)
   9000      return;
   9001 
   9002    case 'B':
   9003    case 0x181:
   9004    case 0x243:
   9005    case 0x1e02:
   9006    case 0x1e04:
   9007    case 0x1e06:
   9008      EMIT2('B')
   9009      EMIT2(0x181) EMIT2(0x243) EMIT2(0x1e02)
   9010      EMIT2(0x1e04) EMIT2(0x1e06)
   9011      return;
   9012 
   9013    case 'C':
   9014    case C_cedilla:
   9015    case 0x106:
   9016    case 0x108:
   9017    case 0x10a:
   9018    case 0x10c:
   9019    case 0x187:
   9020    case 0x23b:
   9021    case 0x1e08:
   9022    case 0xa792:
   9023      EMIT2('C') EMIT2(C_cedilla)
   9024      EMIT2(0x106) EMIT2(0x108) EMIT2(0x10a)
   9025      EMIT2(0x10c) EMIT2(0x187) EMIT2(0x23b)
   9026      EMIT2(0x1e08) EMIT2(0xa792)
   9027      return;
   9028 
   9029    case 'D':
   9030    case 0x10e:
   9031    case 0x110:
   9032    case 0x18a:
   9033    case 0x1e0a:
   9034    case 0x1e0c:
   9035    case 0x1e0e:
   9036    case 0x1e10:
   9037    case 0x1e12:
   9038      EMIT2('D') EMIT2(0x10e) EMIT2(0x110) EMIT2(0x18a)
   9039      EMIT2(0x1e0a) EMIT2(0x1e0c) EMIT2(0x1e0e)
   9040      EMIT2(0x1e10) EMIT2(0x1e12)
   9041      return;
   9042 
   9043    case 'E':
   9044    case E_grave:
   9045    case E_acute:
   9046    case E_circumflex:
   9047    case E_diaeresis:
   9048    case 0x112:
   9049    case 0x114:
   9050    case 0x116:
   9051    case 0x118:
   9052    case 0x11a:
   9053    case 0x204:
   9054    case 0x206:
   9055    case 0x228:
   9056    case 0x246:
   9057    case 0x1e14:
   9058    case 0x1e16:
   9059    case 0x1e18:
   9060    case 0x1e1a:
   9061    case 0x1e1c:
   9062    case 0x1eb8:
   9063    case 0x1eba:
   9064    case 0x1ebc:
   9065    case 0x1ebe:
   9066    case 0x1ec0:
   9067    case 0x1ec2:
   9068    case 0x1ec4:
   9069    case 0x1ec6:
   9070      EMIT2('E') EMIT2(E_grave) EMIT2(E_acute)
   9071      EMIT2(E_circumflex) EMIT2(E_diaeresis)
   9072      EMIT2(0x112) EMIT2(0x114) EMIT2(0x116)
   9073      EMIT2(0x118) EMIT2(0x11a) EMIT2(0x204)
   9074      EMIT2(0x206) EMIT2(0x228) EMIT2(0x246)
   9075      EMIT2(0x1e14) EMIT2(0x1e16) EMIT2(0x1e18)
   9076      EMIT2(0x1e1a) EMIT2(0x1e1c) EMIT2(0x1eb8)
   9077      EMIT2(0x1eba) EMIT2(0x1ebc) EMIT2(0x1ebe)
   9078      EMIT2(0x1ec0) EMIT2(0x1ec2) EMIT2(0x1ec4)
   9079      EMIT2(0x1ec6)
   9080      return;
   9081 
   9082    case 'F':
   9083    case 0x191:
   9084    case 0x1e1e:
   9085    case 0xa798:
   9086      EMIT2('F') EMIT2(0x191) EMIT2(0x1e1e) EMIT2(0xa798)
   9087      return;
   9088 
   9089    case 'G':
   9090    case 0x11c:
   9091    case 0x11e:
   9092    case 0x120:
   9093    case 0x122:
   9094    case 0x193:
   9095    case 0x1e4:
   9096    case 0x1e6:
   9097    case 0x1f4:
   9098    case 0x1e20:
   9099    case 0xa7a0:
   9100      EMIT2('G') EMIT2(0x11c) EMIT2(0x11e) EMIT2(0x120)
   9101      EMIT2(0x122) EMIT2(0x193) EMIT2(0x1e4)
   9102      EMIT2(0x1e6) EMIT2(0x1f4) EMIT2(0x1e20)
   9103      EMIT2(0xa7a0)
   9104      return;
   9105 
   9106    case 'H':
   9107    case 0x124:
   9108    case 0x126:
   9109    case 0x21e:
   9110    case 0x1e22:
   9111    case 0x1e24:
   9112    case 0x1e26:
   9113    case 0x1e28:
   9114    case 0x1e2a:
   9115    case 0x2c67:
   9116      EMIT2('H') EMIT2(0x124) EMIT2(0x126) EMIT2(0x21e)
   9117      EMIT2(0x1e22) EMIT2(0x1e24) EMIT2(0x1e26)
   9118      EMIT2(0x1e28) EMIT2(0x1e2a) EMIT2(0x2c67)
   9119      return;
   9120 
   9121    case 'I':
   9122    case I_grave:
   9123    case I_acute:
   9124    case I_circumflex:
   9125    case I_diaeresis:
   9126    case 0x128:
   9127    case 0x12a:
   9128    case 0x12c:
   9129    case 0x12e:
   9130    case 0x130:
   9131    case 0x197:
   9132    case 0x1cf:
   9133    case 0x208:
   9134    case 0x20a:
   9135    case 0x1e2c:
   9136    case 0x1e2e:
   9137    case 0x1ec8:
   9138    case 0x1eca:
   9139      EMIT2('I') EMIT2(I_grave) EMIT2(I_acute)
   9140      EMIT2(I_circumflex) EMIT2(I_diaeresis)
   9141      EMIT2(0x128) EMIT2(0x12a) EMIT2(0x12c)
   9142      EMIT2(0x12e) EMIT2(0x130) EMIT2(0x197)
   9143      EMIT2(0x1cf) EMIT2(0x208) EMIT2(0x20a)
   9144      EMIT2(0x1e2c) EMIT2(0x1e2e) EMIT2(0x1ec8)
   9145      EMIT2(0x1eca)
   9146      return;
   9147 
   9148    case 'J':
   9149    case 0x134:
   9150    case 0x248:
   9151      EMIT2('J') EMIT2(0x134) EMIT2(0x248)
   9152      return;
   9153 
   9154    case 'K':
   9155    case 0x136:
   9156    case 0x198:
   9157    case 0x1e8:
   9158    case 0x1e30:
   9159    case 0x1e32:
   9160    case 0x1e34:
   9161    case 0x2c69:
   9162    case 0xa740:
   9163      EMIT2('K') EMIT2(0x136) EMIT2(0x198) EMIT2(0x1e8)
   9164      EMIT2(0x1e30) EMIT2(0x1e32) EMIT2(0x1e34)
   9165      EMIT2(0x2c69) EMIT2(0xa740)
   9166      return;
   9167 
   9168    case 'L':
   9169    case 0x139:
   9170    case 0x13b:
   9171    case 0x13d:
   9172    case 0x13f:
   9173    case 0x141:
   9174    case 0x23d:
   9175    case 0x1e36:
   9176    case 0x1e38:
   9177    case 0x1e3a:
   9178    case 0x1e3c:
   9179    case 0x2c60:
   9180      EMIT2('L') EMIT2(0x139) EMIT2(0x13b)
   9181      EMIT2(0x13d) EMIT2(0x13f) EMIT2(0x141)
   9182      EMIT2(0x23d) EMIT2(0x1e36) EMIT2(0x1e38)
   9183      EMIT2(0x1e3a) EMIT2(0x1e3c) EMIT2(0x2c60)
   9184      return;
   9185 
   9186    case 'M':
   9187    case 0x1e3e:
   9188    case 0x1e40:
   9189    case 0x1e42:
   9190      EMIT2('M') EMIT2(0x1e3e) EMIT2(0x1e40)
   9191      EMIT2(0x1e42)
   9192      return;
   9193 
   9194    case 'N':
   9195    case N_virguilla:
   9196    case 0x143:
   9197    case 0x145:
   9198    case 0x147:
   9199    case 0x1f8:
   9200    case 0x1e44:
   9201    case 0x1e46:
   9202    case 0x1e48:
   9203    case 0x1e4a:
   9204    case 0xa7a4:
   9205      EMIT2('N') EMIT2(N_virguilla)
   9206      EMIT2(0x143) EMIT2(0x145) EMIT2(0x147)
   9207      EMIT2(0x1f8) EMIT2(0x1e44) EMIT2(0x1e46)
   9208      EMIT2(0x1e48) EMIT2(0x1e4a) EMIT2(0xa7a4)
   9209      return;
   9210 
   9211    case 'O':
   9212    case O_grave:
   9213    case O_acute:
   9214    case O_circumflex:
   9215    case O_virguilla:
   9216    case O_diaeresis:
   9217    case O_slash:
   9218    case 0x14c:
   9219    case 0x14e:
   9220    case 0x150:
   9221    case 0x19f:
   9222    case 0x1a0:
   9223    case 0x1d1:
   9224    case 0x1ea:
   9225    case 0x1ec:
   9226    case 0x1fe:
   9227    case 0x20c:
   9228    case 0x20e:
   9229    case 0x22a:
   9230    case 0x22c:
   9231    case 0x22e:
   9232    case 0x230:
   9233    case 0x1e4c:
   9234    case 0x1e4e:
   9235    case 0x1e50:
   9236    case 0x1e52:
   9237    case 0x1ecc:
   9238    case 0x1ece:
   9239    case 0x1ed0:
   9240    case 0x1ed2:
   9241    case 0x1ed4:
   9242    case 0x1ed6:
   9243    case 0x1ed8:
   9244    case 0x1eda:
   9245    case 0x1edc:
   9246    case 0x1ede:
   9247    case 0x1ee0:
   9248    case 0x1ee2:
   9249      EMIT2('O') EMIT2(O_grave) EMIT2(O_acute)
   9250      EMIT2(O_circumflex) EMIT2(O_virguilla)
   9251      EMIT2(O_diaeresis) EMIT2(O_slash)
   9252      EMIT2(0x14c) EMIT2(0x14e) EMIT2(0x150)
   9253      EMIT2(0x19f) EMIT2(0x1a0) EMIT2(0x1d1)
   9254      EMIT2(0x1ea) EMIT2(0x1ec) EMIT2(0x1fe)
   9255      EMIT2(0x20c) EMIT2(0x20e) EMIT2(0x22a)
   9256      EMIT2(0x22c) EMIT2(0x22e) EMIT2(0x230)
   9257      EMIT2(0x1e4c) EMIT2(0x1e4e) EMIT2(0x1e50)
   9258      EMIT2(0x1e52) EMIT2(0x1ecc) EMIT2(0x1ece)
   9259      EMIT2(0x1ed0) EMIT2(0x1ed2) EMIT2(0x1ed4)
   9260      EMIT2(0x1ed6) EMIT2(0x1ed8) EMIT2(0x1eda)
   9261      EMIT2(0x1edc) EMIT2(0x1ede) EMIT2(0x1ee0)
   9262      EMIT2(0x1ee2)
   9263      return;
   9264 
   9265    case 'P':
   9266    case 0x1a4:
   9267    case 0x1e54:
   9268    case 0x1e56:
   9269    case 0x2c63:
   9270      EMIT2('P') EMIT2(0x1a4) EMIT2(0x1e54) EMIT2(0x1e56)
   9271      EMIT2(0x2c63)
   9272      return;
   9273 
   9274    case 'Q':
   9275    case 0x24a:
   9276      EMIT2('Q') EMIT2(0x24a)
   9277      return;
   9278 
   9279    case 'R':
   9280    case 0x154:
   9281    case 0x156:
   9282    case 0x158:
   9283    case 0x210:
   9284    case 0x212:
   9285    case 0x24c:
   9286    case 0x1e58:
   9287    case 0x1e5a:
   9288    case 0x1e5c:
   9289    case 0x1e5e:
   9290    case 0x2c64:
   9291    case 0xa7a6:
   9292      EMIT2('R') EMIT2(0x154) EMIT2(0x156) EMIT2(0x158)
   9293      EMIT2(0x210) EMIT2(0x212) EMIT2(0x24c) EMIT2(0x1e58)
   9294      EMIT2(0x1e5a) EMIT2(0x1e5c) EMIT2(0x1e5e) EMIT2(0x2c64)
   9295      EMIT2(0xa7a6)
   9296      return;
   9297 
   9298    case 'S':
   9299    case 0x15a:
   9300    case 0x15c:
   9301    case 0x15e:
   9302    case 0x160:
   9303    case 0x218:
   9304    case 0x1e60:
   9305    case 0x1e62:
   9306    case 0x1e64:
   9307    case 0x1e66:
   9308    case 0x1e68:
   9309    case 0x2c7e:
   9310    case 0xa7a8:
   9311      EMIT2('S') EMIT2(0x15a) EMIT2(0x15c) EMIT2(0x15e)
   9312      EMIT2(0x160) EMIT2(0x218) EMIT2(0x1e60) EMIT2(0x1e62)
   9313      EMIT2(0x1e64) EMIT2(0x1e66) EMIT2(0x1e68) EMIT2(0x2c7e)
   9314      EMIT2(0xa7a8)
   9315      return;
   9316 
   9317    case 'T':
   9318    case 0x162:
   9319    case 0x164:
   9320    case 0x166:
   9321    case 0x1ac:
   9322    case 0x1ae:
   9323    case 0x21a:
   9324    case 0x23e:
   9325    case 0x1e6a:
   9326    case 0x1e6c:
   9327    case 0x1e6e:
   9328    case 0x1e70:
   9329      EMIT2('T') EMIT2(0x162) EMIT2(0x164) EMIT2(0x166)
   9330      EMIT2(0x1ac) EMIT2(0x1ae) EMIT2(0x23e) EMIT2(0x21a)
   9331      EMIT2(0x1e6a) EMIT2(0x1e6c) EMIT2(0x1e6e) EMIT2(0x1e70)
   9332      return;
   9333 
   9334    case 'U':
   9335    case U_grave:
   9336    case U_acute:
   9337    case U_diaeresis:
   9338    case U_circumflex:
   9339    case 0x168:
   9340    case 0x16a:
   9341    case 0x16c:
   9342    case 0x16e:
   9343    case 0x170:
   9344    case 0x172:
   9345    case 0x1af:
   9346    case 0x1d3:
   9347    case 0x1d5:
   9348    case 0x1d7:
   9349    case 0x1d9:
   9350    case 0x1db:
   9351    case 0x214:
   9352    case 0x216:
   9353    case 0x244:
   9354    case 0x1e72:
   9355    case 0x1e74:
   9356    case 0x1e76:
   9357    case 0x1e78:
   9358    case 0x1e7a:
   9359    case 0x1ee4:
   9360    case 0x1ee6:
   9361    case 0x1ee8:
   9362    case 0x1eea:
   9363    case 0x1eec:
   9364    case 0x1eee:
   9365    case 0x1ef0:
   9366      EMIT2('U') EMIT2(U_grave) EMIT2(U_acute)
   9367      EMIT2(U_diaeresis) EMIT2(U_circumflex)
   9368      EMIT2(0x168) EMIT2(0x16a)
   9369      EMIT2(0x16c) EMIT2(0x16e) EMIT2(0x170)
   9370      EMIT2(0x172) EMIT2(0x1af) EMIT2(0x1d3)
   9371      EMIT2(0x1d5) EMIT2(0x1d7) EMIT2(0x1d9)
   9372      EMIT2(0x1db) EMIT2(0x214) EMIT2(0x216)
   9373      EMIT2(0x244) EMIT2(0x1e72) EMIT2(0x1e74)
   9374      EMIT2(0x1e76) EMIT2(0x1e78) EMIT2(0x1e7a)
   9375      EMIT2(0x1ee4) EMIT2(0x1ee6) EMIT2(0x1ee8)
   9376      EMIT2(0x1eea) EMIT2(0x1eec) EMIT2(0x1eee)
   9377      EMIT2(0x1ef0)
   9378      return;
   9379 
   9380    case 'V':
   9381    case 0x1b2:
   9382    case 0x1e7c:
   9383    case 0x1e7e:
   9384      EMIT2('V') EMIT2(0x1b2) EMIT2(0x1e7c) EMIT2(0x1e7e)
   9385      return;
   9386 
   9387    case 'W':
   9388    case 0x174:
   9389    case 0x1e80:
   9390    case 0x1e82:
   9391    case 0x1e84:
   9392    case 0x1e86:
   9393    case 0x1e88:
   9394      EMIT2('W') EMIT2(0x174) EMIT2(0x1e80) EMIT2(0x1e82)
   9395      EMIT2(0x1e84) EMIT2(0x1e86) EMIT2(0x1e88)
   9396      return;
   9397 
   9398    case 'X':
   9399    case 0x1e8a:
   9400    case 0x1e8c:
   9401      EMIT2('X') EMIT2(0x1e8a) EMIT2(0x1e8c)
   9402      return;
   9403 
   9404    case 'Y':
   9405    case Y_acute:
   9406    case 0x176:
   9407    case 0x178:
   9408    case 0x1b3:
   9409    case 0x232:
   9410    case 0x24e:
   9411    case 0x1e8e:
   9412    case 0x1ef2:
   9413    case 0x1ef4:
   9414    case 0x1ef6:
   9415    case 0x1ef8:
   9416      EMIT2('Y') EMIT2(Y_acute)
   9417      EMIT2(0x176) EMIT2(0x178) EMIT2(0x1b3)
   9418      EMIT2(0x232) EMIT2(0x24e) EMIT2(0x1e8e)
   9419      EMIT2(0x1ef2) EMIT2(0x1ef4) EMIT2(0x1ef6)
   9420      EMIT2(0x1ef8)
   9421      return;
   9422 
   9423    case 'Z':
   9424    case 0x179:
   9425    case 0x17b:
   9426    case 0x17d:
   9427    case 0x1b5:
   9428    case 0x1e90:
   9429    case 0x1e92:
   9430    case 0x1e94:
   9431    case 0x2c6b:
   9432      EMIT2('Z') EMIT2(0x179) EMIT2(0x17b) EMIT2(0x17d)
   9433      EMIT2(0x1b5) EMIT2(0x1e90) EMIT2(0x1e92)
   9434      EMIT2(0x1e94) EMIT2(0x2c6b)
   9435      return;
   9436 
   9437    case 'a':
   9438    case a_grave:
   9439    case a_acute:
   9440    case a_circumflex:
   9441    case a_virguilla:
   9442    case a_diaeresis:
   9443    case a_ring:
   9444    case 0x101:
   9445    case 0x103:
   9446    case 0x105:
   9447    case 0x1ce:
   9448    case 0x1df:
   9449    case 0x1e1:
   9450    case 0x1fb:
   9451    case 0x201:
   9452    case 0x203:
   9453    case 0x227:
   9454    case 0x1d8f:
   9455    case 0x1e01:
   9456    case 0x1e9a:
   9457    case 0x1ea1:
   9458    case 0x1ea3:
   9459    case 0x1ea5:
   9460    case 0x1ea7:
   9461    case 0x1ea9:
   9462    case 0x1eab:
   9463    case 0x1ead:
   9464    case 0x1eaf:
   9465    case 0x1eb1:
   9466    case 0x1eb3:
   9467    case 0x1eb5:
   9468    case 0x1eb7:
   9469    case 0x2c65:
   9470      EMIT2('a') EMIT2(a_grave) EMIT2(a_acute)
   9471      EMIT2(a_circumflex) EMIT2(a_virguilla)
   9472      EMIT2(a_diaeresis) EMIT2(a_ring)
   9473      EMIT2(0x101) EMIT2(0x103) EMIT2(0x105)
   9474      EMIT2(0x1ce) EMIT2(0x1df) EMIT2(0x1e1)
   9475      EMIT2(0x1fb) EMIT2(0x201) EMIT2(0x203)
   9476      EMIT2(0x227) EMIT2(0x1d8f) EMIT2(0x1e01)
   9477      EMIT2(0x1e9a) EMIT2(0x1ea1) EMIT2(0x1ea3)
   9478      EMIT2(0x1ea5) EMIT2(0x1ea7) EMIT2(0x1ea9)
   9479      EMIT2(0x1eab) EMIT2(0x1ead) EMIT2(0x1eaf)
   9480      EMIT2(0x1eb1) EMIT2(0x1eb3) EMIT2(0x1eb5)
   9481      EMIT2(0x1eb7) EMIT2(0x2c65)
   9482      return;
   9483 
   9484    case 'b':
   9485    case 0x180:
   9486    case 0x253:
   9487    case 0x1d6c:
   9488    case 0x1d80:
   9489    case 0x1e03:
   9490    case 0x1e05:
   9491    case 0x1e07:
   9492      EMIT2('b') EMIT2(0x180) EMIT2(0x253) EMIT2(0x1d6c)
   9493      EMIT2(0x1d80) EMIT2(0x1e03) EMIT2(0x1e05) EMIT2(0x1e07)
   9494      return;
   9495 
   9496    case 'c':
   9497    case c_cedilla:
   9498    case 0x107:
   9499    case 0x109:
   9500    case 0x10b:
   9501    case 0x10d:
   9502    case 0x188:
   9503    case 0x23c:
   9504    case 0x1e09:
   9505    case 0xa793:
   9506    case 0xa794:
   9507      EMIT2('c') EMIT2(c_cedilla)
   9508      EMIT2(0x107) EMIT2(0x109) EMIT2(0x10b)
   9509      EMIT2(0x10d) EMIT2(0x188) EMIT2(0x23c)
   9510      EMIT2(0x1e09) EMIT2(0xa793) EMIT2(0xa794)
   9511      return;
   9512 
   9513    case 'd':
   9514    case 0x10f:
   9515    case 0x111:
   9516    case 0x257:
   9517    case 0x1d6d:
   9518    case 0x1d81:
   9519    case 0x1d91:
   9520    case 0x1e0b:
   9521    case 0x1e0d:
   9522    case 0x1e0f:
   9523    case 0x1e11:
   9524    case 0x1e13:
   9525      EMIT2('d') EMIT2(0x10f) EMIT2(0x111)
   9526      EMIT2(0x257) EMIT2(0x1d6d) EMIT2(0x1d81)
   9527      EMIT2(0x1d91) EMIT2(0x1e0b) EMIT2(0x1e0d)
   9528      EMIT2(0x1e0f) EMIT2(0x1e11) EMIT2(0x1e13)
   9529      return;
   9530 
   9531    case 'e':
   9532    case e_grave:
   9533    case e_acute:
   9534    case e_circumflex:
   9535    case e_diaeresis:
   9536    case 0x113:
   9537    case 0x115:
   9538    case 0x117:
   9539    case 0x119:
   9540    case 0x11b:
   9541    case 0x205:
   9542    case 0x207:
   9543    case 0x229:
   9544    case 0x247:
   9545    case 0x1d92:
   9546    case 0x1e15:
   9547    case 0x1e17:
   9548    case 0x1e19:
   9549    case 0x1e1b:
   9550    case 0x1e1d:
   9551    case 0x1eb9:
   9552    case 0x1ebb:
   9553    case 0x1ebd:
   9554    case 0x1ebf:
   9555    case 0x1ec1:
   9556    case 0x1ec3:
   9557    case 0x1ec5:
   9558    case 0x1ec7:
   9559      EMIT2('e') EMIT2(e_grave) EMIT2(e_acute)
   9560      EMIT2(e_circumflex) EMIT2(e_diaeresis)
   9561      EMIT2(0x113) EMIT2(0x115)
   9562      EMIT2(0x117) EMIT2(0x119) EMIT2(0x11b)
   9563      EMIT2(0x205) EMIT2(0x207) EMIT2(0x229)
   9564      EMIT2(0x247) EMIT2(0x1d92) EMIT2(0x1e15)
   9565      EMIT2(0x1e17) EMIT2(0x1e19) EMIT2(0x1e1b)
   9566      EMIT2(0x1e1d) EMIT2(0x1eb9) EMIT2(0x1ebb)
   9567      EMIT2(0x1ebd) EMIT2(0x1ebf) EMIT2(0x1ec1)
   9568      EMIT2(0x1ec3) EMIT2(0x1ec5) EMIT2(0x1ec7)
   9569      return;
   9570 
   9571    case 'f':
   9572    case 0x192:
   9573    case 0x1d6e:
   9574    case 0x1d82:
   9575    case 0x1e1f:
   9576    case 0xa799:
   9577      EMIT2('f') EMIT2(0x192) EMIT2(0x1d6e) EMIT2(0x1d82)
   9578      EMIT2(0x1e1f) EMIT2(0xa799)
   9579      return;
   9580 
   9581    case 'g':
   9582    case 0x11d:
   9583    case 0x11f:
   9584    case 0x121:
   9585    case 0x123:
   9586    case 0x1e5:
   9587    case 0x1e7:
   9588    case 0x1f5:
   9589    case 0x260:
   9590    case 0x1d83:
   9591    case 0x1e21:
   9592    case 0xa7a1:
   9593      EMIT2('g') EMIT2(0x11d) EMIT2(0x11f) EMIT2(0x121)
   9594      EMIT2(0x123) EMIT2(0x1e5) EMIT2(0x1e7)
   9595      EMIT2(0x1f5) EMIT2(0x260) EMIT2(0x1d83)
   9596      EMIT2(0x1e21) EMIT2(0xa7a1)
   9597      return;
   9598 
   9599    case 'h':
   9600    case 0x125:
   9601    case 0x127:
   9602    case 0x21f:
   9603    case 0x1e23:
   9604    case 0x1e25:
   9605    case 0x1e27:
   9606    case 0x1e29:
   9607    case 0x1e2b:
   9608    case 0x1e96:
   9609    case 0x2c68:
   9610    case 0xa795:
   9611      EMIT2('h') EMIT2(0x125) EMIT2(0x127) EMIT2(0x21f)
   9612      EMIT2(0x1e23) EMIT2(0x1e25) EMIT2(0x1e27)
   9613      EMIT2(0x1e29) EMIT2(0x1e2b) EMIT2(0x1e96)
   9614      EMIT2(0x2c68) EMIT2(0xa795)
   9615      return;
   9616 
   9617    case 'i':
   9618    case i_grave:
   9619    case i_acute:
   9620    case i_circumflex:
   9621    case i_diaeresis:
   9622    case 0x129:
   9623    case 0x12b:
   9624    case 0x12d:
   9625    case 0x12f:
   9626    case 0x1d0:
   9627    case 0x209:
   9628    case 0x20b:
   9629    case 0x268:
   9630    case 0x1d96:
   9631    case 0x1e2d:
   9632    case 0x1e2f:
   9633    case 0x1ec9:
   9634    case 0x1ecb:
   9635      EMIT2('i') EMIT2(i_grave) EMIT2(i_acute)
   9636      EMIT2(i_circumflex) EMIT2(i_diaeresis)
   9637      EMIT2(0x129) EMIT2(0x12b) EMIT2(0x12d)
   9638      EMIT2(0x12f) EMIT2(0x1d0) EMIT2(0x209)
   9639      EMIT2(0x20b) EMIT2(0x268) EMIT2(0x1d96)
   9640      EMIT2(0x1e2d) EMIT2(0x1e2f) EMIT2(0x1ec9)
   9641      EMIT2(0x1ecb) EMIT2(0x1ecb)
   9642      return;
   9643 
   9644    case 'j':
   9645    case 0x135:
   9646    case 0x1f0:
   9647    case 0x249:
   9648      EMIT2('j') EMIT2(0x135) EMIT2(0x1f0) EMIT2(0x249)
   9649      return;
   9650 
   9651    case 'k':
   9652    case 0x137:
   9653    case 0x199:
   9654    case 0x1e9:
   9655    case 0x1d84:
   9656    case 0x1e31:
   9657    case 0x1e33:
   9658    case 0x1e35:
   9659    case 0x2c6a:
   9660    case 0xa741:
   9661      EMIT2('k') EMIT2(0x137) EMIT2(0x199) EMIT2(0x1e9)
   9662      EMIT2(0x1d84) EMIT2(0x1e31) EMIT2(0x1e33)
   9663      EMIT2(0x1e35) EMIT2(0x2c6a) EMIT2(0xa741)
   9664      return;
   9665 
   9666    case 'l':
   9667    case 0x13a:
   9668    case 0x13c:
   9669    case 0x13e:
   9670    case 0x140:
   9671    case 0x142:
   9672    case 0x19a:
   9673    case 0x1e37:
   9674    case 0x1e39:
   9675    case 0x1e3b:
   9676    case 0x1e3d:
   9677    case 0x2c61:
   9678      EMIT2('l') EMIT2(0x13a) EMIT2(0x13c)
   9679      EMIT2(0x13e) EMIT2(0x140) EMIT2(0x142)
   9680      EMIT2(0x19a) EMIT2(0x1e37) EMIT2(0x1e39)
   9681      EMIT2(0x1e3b) EMIT2(0x1e3d) EMIT2(0x2c61)
   9682      return;
   9683 
   9684    case 'm':
   9685    case 0x1d6f:
   9686    case 0x1e3f:
   9687    case 0x1e41:
   9688    case 0x1e43:
   9689      EMIT2('m') EMIT2(0x1d6f) EMIT2(0x1e3f)
   9690      EMIT2(0x1e41) EMIT2(0x1e43)
   9691      return;
   9692 
   9693    case 'n':
   9694    case n_virguilla:
   9695    case 0x144:
   9696    case 0x146:
   9697    case 0x148:
   9698    case 0x149:
   9699    case 0x1f9:
   9700    case 0x1d70:
   9701    case 0x1d87:
   9702    case 0x1e45:
   9703    case 0x1e47:
   9704    case 0x1e49:
   9705    case 0x1e4b:
   9706    case 0xa7a5:
   9707      EMIT2('n') EMIT2(n_virguilla)
   9708      EMIT2(0x144) EMIT2(0x146) EMIT2(0x148)
   9709      EMIT2(0x149) EMIT2(0x1f9) EMIT2(0x1d70)
   9710      EMIT2(0x1d87) EMIT2(0x1e45) EMIT2(0x1e47)
   9711      EMIT2(0x1e49) EMIT2(0x1e4b) EMIT2(0xa7a5)
   9712      return;
   9713 
   9714    case 'o':
   9715    case o_grave:
   9716    case o_acute:
   9717    case o_circumflex:
   9718    case o_virguilla:
   9719    case o_diaeresis:
   9720    case o_slash:
   9721    case 0x14d:
   9722    case 0x14f:
   9723    case 0x151:
   9724    case 0x1a1:
   9725    case 0x1d2:
   9726    case 0x1eb:
   9727    case 0x1ed:
   9728    case 0x1ff:
   9729    case 0x20d:
   9730    case 0x20f:
   9731    case 0x22b:
   9732    case 0x22d:
   9733    case 0x22f:
   9734    case 0x231:
   9735    case 0x275:
   9736    case 0x1e4d:
   9737    case 0x1e4f:
   9738    case 0x1e51:
   9739    case 0x1e53:
   9740    case 0x1ecd:
   9741    case 0x1ecf:
   9742    case 0x1ed1:
   9743    case 0x1ed3:
   9744    case 0x1ed5:
   9745    case 0x1ed7:
   9746    case 0x1ed9:
   9747    case 0x1edb:
   9748    case 0x1edd:
   9749    case 0x1edf:
   9750    case 0x1ee1:
   9751    case 0x1ee3:
   9752      EMIT2('o') EMIT2(o_grave) EMIT2(o_acute)
   9753      EMIT2(o_circumflex) EMIT2(o_virguilla)
   9754      EMIT2(o_diaeresis) EMIT2(o_slash)
   9755      EMIT2(0x14d) EMIT2(0x14f) EMIT2(0x151)
   9756      EMIT2(0x1a1) EMIT2(0x1d2) EMIT2(0x1eb)
   9757      EMIT2(0x1ed) EMIT2(0x1ff) EMIT2(0x20d)
   9758      EMIT2(0x20f) EMIT2(0x22b) EMIT2(0x22d)
   9759      EMIT2(0x22f) EMIT2(0x231) EMIT2(0x275)
   9760      EMIT2(0x1e4d) EMIT2(0x1e4f) EMIT2(0x1e51)
   9761      EMIT2(0x1e53) EMIT2(0x1ecd) EMIT2(0x1ecf)
   9762      EMIT2(0x1ed1) EMIT2(0x1ed3) EMIT2(0x1ed5)
   9763      EMIT2(0x1ed7) EMIT2(0x1ed9) EMIT2(0x1edb)
   9764      EMIT2(0x1edd) EMIT2(0x1edf) EMIT2(0x1ee1)
   9765      EMIT2(0x1ee3)
   9766      return;
   9767 
   9768    case 'p':
   9769    case 0x1a5:
   9770    case 0x1d71:
   9771    case 0x1d7d:
   9772    case 0x1d88:
   9773    case 0x1e55:
   9774    case 0x1e57:
   9775      EMIT2('p') EMIT2(0x1a5) EMIT2(0x1d71) EMIT2(0x1d7d)
   9776      EMIT2(0x1d88) EMIT2(0x1e55) EMIT2(0x1e57)
   9777      return;
   9778 
   9779    case 'q':
   9780    case 0x24b:
   9781    case 0x2a0:
   9782      EMIT2('q') EMIT2(0x24b) EMIT2(0x2a0)
   9783      return;
   9784 
   9785    case 'r':
   9786    case 0x155:
   9787    case 0x157:
   9788    case 0x159:
   9789    case 0x211:
   9790    case 0x213:
   9791    case 0x24d:
   9792    case 0x27d:
   9793    case 0x1d72:
   9794    case 0x1d73:
   9795    case 0x1d89:
   9796    case 0x1e59:
   9797    case 0x1e5b:
   9798    case 0x1e5d:
   9799    case 0x1e5f:
   9800    case 0xa7a7:
   9801      EMIT2('r') EMIT2(0x155) EMIT2(0x157) EMIT2(0x159)
   9802      EMIT2(0x211) EMIT2(0x213) EMIT2(0x24d) EMIT2(0x27d)
   9803      EMIT2(0x1d72) EMIT2(0x1d73) EMIT2(0x1d89) EMIT2(0x1e59)
   9804      EMIT2(0x1e5b) EMIT2(0x1e5d) EMIT2(0x1e5f) EMIT2(0xa7a7)
   9805      return;
   9806 
   9807    case 's':
   9808    case 0x15b:
   9809    case 0x15d:
   9810    case 0x15f:
   9811    case 0x161:
   9812    case 0x219:
   9813    case 0x23f:
   9814    case 0x1d74:
   9815    case 0x1d8a:
   9816    case 0x1e61:
   9817    case 0x1e63:
   9818    case 0x1e65:
   9819    case 0x1e67:
   9820    case 0x1e69:
   9821    case 0xa7a9:
   9822      EMIT2('s') EMIT2(0x15b) EMIT2(0x15d) EMIT2(0x15f)
   9823      EMIT2(0x161) EMIT2(0x219) EMIT2(0x23f) EMIT2(0x1d74)
   9824      EMIT2(0x1d8a) EMIT2(0x1e61) EMIT2(0x1e63) EMIT2(0x1e65)
   9825      EMIT2(0x1e67) EMIT2(0x1e69) EMIT2(0xa7a9)
   9826      return;
   9827 
   9828    case 't':
   9829    case 0x163:
   9830    case 0x165:
   9831    case 0x167:
   9832    case 0x1ab:
   9833    case 0x1ad:
   9834    case 0x21b:
   9835    case 0x288:
   9836    case 0x1d75:
   9837    case 0x1e6b:
   9838    case 0x1e6d:
   9839    case 0x1e6f:
   9840    case 0x1e71:
   9841    case 0x1e97:
   9842    case 0x2c66:
   9843      EMIT2('t') EMIT2(0x163) EMIT2(0x165) EMIT2(0x167)
   9844      EMIT2(0x1ab) EMIT2(0x1ad) EMIT2(0x21b) EMIT2(0x288)
   9845      EMIT2(0x1d75) EMIT2(0x1e6b) EMIT2(0x1e6d) EMIT2(0x1e6f)
   9846      EMIT2(0x1e71) EMIT2(0x1e97) EMIT2(0x2c66)
   9847      return;
   9848 
   9849    case 'u':
   9850    case u_grave:
   9851    case u_acute:
   9852    case u_circumflex:
   9853    case u_diaeresis:
   9854    case 0x169:
   9855    case 0x16b:
   9856    case 0x16d:
   9857    case 0x16f:
   9858    case 0x171:
   9859    case 0x173:
   9860    case 0x1b0:
   9861    case 0x1d4:
   9862    case 0x1d6:
   9863    case 0x1d8:
   9864    case 0x1da:
   9865    case 0x1dc:
   9866    case 0x215:
   9867    case 0x217:
   9868    case 0x289:
   9869    case 0x1d7e:
   9870    case 0x1d99:
   9871    case 0x1e73:
   9872    case 0x1e75:
   9873    case 0x1e77:
   9874    case 0x1e79:
   9875    case 0x1e7b:
   9876    case 0x1ee5:
   9877    case 0x1ee7:
   9878    case 0x1ee9:
   9879    case 0x1eeb:
   9880    case 0x1eed:
   9881    case 0x1eef:
   9882    case 0x1ef1:
   9883      EMIT2('u') EMIT2(u_grave) EMIT2(u_acute)
   9884      EMIT2(u_circumflex) EMIT2(u_diaeresis)
   9885      EMIT2(0x169) EMIT2(0x16b)
   9886      EMIT2(0x16d) EMIT2(0x16f) EMIT2(0x171)
   9887      EMIT2(0x173) EMIT2(0x1d6) EMIT2(0x1d8)
   9888      EMIT2(0x215) EMIT2(0x217) EMIT2(0x1b0)
   9889      EMIT2(0x1d4) EMIT2(0x1da) EMIT2(0x1dc)
   9890      EMIT2(0x289) EMIT2(0x1e73) EMIT2(0x1d7e)
   9891      EMIT2(0x1d99) EMIT2(0x1e75) EMIT2(0x1e77)
   9892      EMIT2(0x1e79) EMIT2(0x1e7b) EMIT2(0x1ee5)
   9893      EMIT2(0x1ee7) EMIT2(0x1ee9) EMIT2(0x1eeb)
   9894      EMIT2(0x1eed) EMIT2(0x1eef) EMIT2(0x1ef1)
   9895      return;
   9896 
   9897    case 'v':
   9898    case 0x28b:
   9899    case 0x1d8c:
   9900    case 0x1e7d:
   9901    case 0x1e7f:
   9902      EMIT2('v') EMIT2(0x28b) EMIT2(0x1d8c) EMIT2(0x1e7d)
   9903      EMIT2(0x1e7f)
   9904      return;
   9905 
   9906    case 'w':
   9907    case 0x175:
   9908    case 0x1e81:
   9909    case 0x1e83:
   9910    case 0x1e85:
   9911    case 0x1e87:
   9912    case 0x1e89:
   9913    case 0x1e98:
   9914      EMIT2('w') EMIT2(0x175) EMIT2(0x1e81) EMIT2(0x1e83)
   9915      EMIT2(0x1e85) EMIT2(0x1e87) EMIT2(0x1e89) EMIT2(0x1e98)
   9916      return;
   9917 
   9918    case 'x':
   9919    case 0x1e8b:
   9920    case 0x1e8d:
   9921      EMIT2('x') EMIT2(0x1e8b) EMIT2(0x1e8d)
   9922      return;
   9923 
   9924    case 'y':
   9925    case y_acute:
   9926    case y_diaeresis:
   9927    case 0x177:
   9928    case 0x1b4:
   9929    case 0x233:
   9930    case 0x24f:
   9931    case 0x1e8f:
   9932    case 0x1e99:
   9933    case 0x1ef3:
   9934    case 0x1ef5:
   9935    case 0x1ef7:
   9936    case 0x1ef9:
   9937      EMIT2('y') EMIT2(y_acute) EMIT2(y_diaeresis)
   9938      EMIT2(0x177) EMIT2(0x1b4) EMIT2(0x233) EMIT2(0x24f)
   9939      EMIT2(0x1e8f) EMIT2(0x1e99) EMIT2(0x1ef3)
   9940      EMIT2(0x1ef5) EMIT2(0x1ef7) EMIT2(0x1ef9)
   9941      return;
   9942 
   9943    case 'z':
   9944    case 0x17a:
   9945    case 0x17c:
   9946    case 0x17e:
   9947    case 0x1b6:
   9948    case 0x1d76:
   9949    case 0x1d8e:
   9950    case 0x1e91:
   9951    case 0x1e93:
   9952    case 0x1e95:
   9953    case 0x2c6c:
   9954      EMIT2('z') EMIT2(0x17a) EMIT2(0x17c) EMIT2(0x17e)
   9955      EMIT2(0x1b6) EMIT2(0x1d76) EMIT2(0x1d8e) EMIT2(0x1e91)
   9956      EMIT2(0x1e93) EMIT2(0x1e95) EMIT2(0x2c6c)
   9957      return;
   9958 
   9959      // default: character itself
   9960    }
   9961  }
   9962 
   9963  EMIT2(c);
   9964 #undef EMIT2
   9965 }
   9966 
   9967 // Code to parse regular expression.
   9968 //
   9969 // We try to reuse parsing functions in regexp.c to
   9970 // minimize surprise and keep the syntax consistent.
   9971 
   9972 // Parse the lowest level.
   9973 //
   9974 // An atom can be one of a long list of items.  Many atoms match one character
   9975 // in the text.  It is often an ordinary character or a character class.
   9976 // Braces can be used to make a pattern into an atom.  The "\z(\)" construct
   9977 // is only for syntax highlighting.
   9978 //
   9979 // atom    ::=     ordinary-atom
   9980 //     or  \( pattern \)
   9981 //     or  \%( pattern \)
   9982 //     or  \z( pattern \)
        //
        // The atom is read with getchr() and its items are appended with EMIT();
        // the items are in postfix order (operands first, then NFA_CONCAT, NFA_OR,
        // NFA_RANGE, ...).
        //
        // Returns OK when an atom was emitted.  Returns FAIL after an error was
        // reported here or when a nested nfa_reg() / nfa_regatom() call failed.
   9983 static int nfa_regatom(void)
   9984 {
   9985  int c;  // pattern character currently being handled
   9986  int charclass;
   9987  int equiclass;
   9988  int collclass;
   9989  int got_coll_char;  // inside []: startc was produced by coll_get_char()
   9990  uint8_t *p;
   9991  uint8_t *endp;
   9992  uint8_t *old_regparse = (uint8_t *)regparse;  // parse position before this atom was read
   9993  int extra = 0;  // NFA_ADD_NL when a line break alternative must be added
   9994  int emit_range;  // inside []: the previous item was the '-' of a range
   9995  int negated;  // inside []: the collection started with '^'
   9996  int startc = -1;  // inside []: char that may start a range, -1 when there is none
   9997  int save_prev_at_start = prev_at_start;  // sampled before the getchr() below
   9998 
   9999  c = getchr();
  10000  switch (c) {
  10001  case NUL:
  10002    EMSG_RET_FAIL(_(e_nul_found));
  10003 
  10004  case Magic('^'):
  10005    EMIT(NFA_BOL);
  10006    break;
  10007 
  10008  case Magic('$'):
  10009    EMIT(NFA_EOL);
  10010    had_eol = true;
  10011    break;
  10012 
  10013  case Magic('<'):
  10014    EMIT(NFA_BOW);
  10015    break;
  10016 
  10017  case Magic('>'):
  10018    EMIT(NFA_EOW);
  10019    break;
  10020 
  10021  case Magic('_'):
           // "\_" prefix: the character that follows decides what is meant.
  10022    c = no_Magic(getchr());
  10023    if (c == NUL) {
  10024      EMSG_RET_FAIL(_(e_nul_found));
  10025    }
  10026 
  10027    if (c == '^') {             // "\_^" is start-of-line
  10028      EMIT(NFA_BOL);
  10029      break;
  10030    }
  10031    if (c == '$') {             // "\_$" is end-of-line
  10032      EMIT(NFA_EOL);
  10033      had_eol = true;
  10034      break;
  10035    }
  10036 
           // Everything below for "\_" also has to match a line break.
  10037    extra = NFA_ADD_NL;
  10038 
  10039    // "\_[" is collection plus newline
  10040    if (c == '[') {
  10041      goto collection;
  10042    }
  10043 
  10044    // "\_x" is character class plus newline
  10045    FALLTHROUGH;
  10046 
  10047  // Character classes.
  10048  case Magic('.'):
  10049  case Magic('i'):
  10050  case Magic('I'):
  10051  case Magic('k'):
  10052  case Magic('K'):
  10053  case Magic('f'):
  10054  case Magic('F'):
  10055  case Magic('p'):
  10056  case Magic('P'):
  10057  case Magic('s'):
  10058  case Magic('S'):
  10059  case Magic('d'):
  10060  case Magic('D'):
  10061  case Magic('x'):
  10062  case Magic('X'):
  10063  case Magic('o'):
  10064  case Magic('O'):
  10065  case Magic('w'):
  10066  case Magic('W'):
  10067  case Magic('h'):
  10068  case Magic('H'):
  10069  case Magic('a'):
  10070  case Magic('A'):
  10071  case Magic('l'):
  10072  case Magic('L'):
  10073  case Magic('u'):
  10074  case Magic('U'):
           // Locate the class letter in classchars; its offset is used below as
           // the index into nfa_classcodes.
  10075    p = (uint8_t *)vim_strchr((char *)classchars, no_Magic(c));
  10076    if (p == NULL) {
             // After "\_" any character can arrive here through the fallthrough,
             // so that is a user error.  Without "\_" only the case labels above
             // lead here, hence the internal error.
  10077      if (extra == NFA_ADD_NL) {
  10078        semsg(_(e_ill_char_class), (int64_t)c);
  10079        rc_did_emsg = true;
  10080        return FAIL;
  10081      }
  10082      siemsg("INTERNAL: Unknown character class char: %d", c);
  10083      return FAIL;
  10084    }
  10085    // When '.' is followed by a composing char ignore the dot, so that
  10086    // the composing char is matched here.
  10087    if (c == Magic('.') && utf_iscomposing_legacy(peekchr())) {
  10088      old_regparse = (uint8_t *)regparse;
  10089      c = getchr();
  10090      goto nfa_do_multibyte;
  10091    }
  10092    EMIT(nfa_classcodes[p - classchars]);
  10093    if (extra == NFA_ADD_NL) {
             // Postfix "<class> NFA_NEWL NFA_OR": the class or a line break.
  10094      EMIT(NFA_NEWL);
  10095      EMIT(NFA_OR);
  10096      regflags |= RF_HASNL;
  10097    }
  10098    break;
  10099 
  10100  case Magic('n'):
  10101    if (reg_string) {
  10102      // In a string "\n" matches a newline character.
  10103      EMIT(NL);
  10104    } else {
  10105      // In buffer text "\n" matches the end of a line.
  10106      EMIT(NFA_NEWL);
  10107      regflags |= RF_HASNL;
  10108    }
  10109    break;
  10110 
  10111  case Magic('('):
  10112    if (nfa_reg(REG_PAREN) == FAIL) {
  10113      return FAIL;                  // cascaded error
  10114    }
  10115    break;
  10116 
  10117  case Magic('|'):
  10118  case Magic('&'):
  10119  case Magic(')'):
  10120    semsg(_(e_misplaced), (char)no_Magic(c));
  10121    return FAIL;
  10122 
  10123  case Magic('='):
  10124  case Magic('?'):
  10125  case Magic('+'):
  10126  case Magic('@'):
  10127  case Magic('*'):
  10128  case Magic('{'):
  10129    // these should follow an atom, not form an atom
  10130    semsg(_(e_misplaced), (char)no_Magic(c));
  10131    return FAIL;
  10132 
  10133  case Magic('~'): {
  10134    uint8_t *lp;
  10135 
  10136    // Previous substitute pattern.
  10137    // Generated as "\%(pattern\)".
  10138    if (reg_prev_sub == NULL) {
  10139      emsg(_(e_nopresub));
  10140      return FAIL;
  10141    }
           // Emit every character of reg_prev_sub.  The first one has nothing
           // before it to be concatenated with, all later ones are followed by
           // NFA_CONCAT.
  10142    for (lp = (uint8_t *)reg_prev_sub; *lp != NUL; lp += utf_ptr2len((char *)lp)) {
  10143      EMIT(utf_ptr2char((char *)lp));
  10144      if (lp != (uint8_t *)reg_prev_sub) {
  10145        EMIT(NFA_CONCAT);
  10146      }
  10147    }
  10148    EMIT(NFA_NOPEN);
  10149    break;
  10150  }
  10151 
  10152  case Magic('1'):
  10153  case Magic('2'):
  10154  case Magic('3'):
  10155  case Magic('4'):
  10156  case Magic('5'):
  10157  case Magic('6'):
  10158  case Magic('7'):
  10159  case Magic('8'):
  10160  case Magic('9'): {
           // Back reference; refnum is zero-based ("\1" gives 0).
  10161    int refnum = no_Magic(c) - '1';
  10162 
  10163    if (!seen_endbrace(refnum + 1)) {
  10164      return FAIL;
  10165    }
  10166    EMIT(NFA_BACKREF1 + refnum);
  10167    rex.nfa_has_backref = true;
  10168  }
  10169  break;
  10170 
  10171  case Magic('z'):
           // "\z" prefix: the next character selects the item.
  10172    c = no_Magic(getchr());
  10173    switch (c) {
  10174    case 's':
  10175      EMIT(NFA_ZSTART);
  10176      if (!re_mult_next("\\zs")) {
               // NOTE(review): every other failure path in this function returns
               // FAIL; presumably FAIL and false have the same value -- confirm.
  10177        return false;
  10178      }
  10179      break;
  10180    case 'e':
  10181      EMIT(NFA_ZEND);
  10182      rex.nfa_has_zend = true;
  10183      if (!re_mult_next("\\ze")) {
               // NOTE(review): same remark as for "\zs" above.
  10184        return false;
  10185      }
  10186      break;
  10187    case '1':
  10188    case '2':
  10189    case '3':
  10190    case '4':
  10191    case '5':
  10192    case '6':
  10193    case '7':
  10194    case '8':
  10195    case '9':
  10196      // \z1...\z9
  10197      if ((reg_do_extmatch & REX_USE) == 0) {
  10198        EMSG_RET_FAIL(_(e_z1_not_allowed));
  10199      }
  10200      EMIT(NFA_ZREF1 + (no_Magic(c) - '1'));
  10201      // No need to set rex.nfa_has_backref, the sub-matches don't
  10202      // change when \z1 .. \z9 matches or not.
  10203      re_has_z = REX_USE;
  10204      break;
  10205    case '(':
  10206      // \z(
  10207      if (reg_do_extmatch != REX_SET) {
  10208        EMSG_RET_FAIL(_(e_z_not_allowed));
  10209      }
  10210      if (nfa_reg(REG_ZPAREN) == FAIL) {
  10211        return FAIL;                        // cascaded error
  10212      }
  10213      re_has_z = REX_SET;
  10214      break;
  10215    default:
  10216      semsg(_("E867: (NFA) Unknown operator '\\z%c'"),
  10217            no_Magic(c));
  10218      return FAIL;
  10219    }
  10220    break;
  10221 
  10222  case Magic('%'):
           // "\%" prefix: the next character selects the item.
  10223    c = no_Magic(getchr());
  10224    switch (c) {
  10225    // () without a back reference
  10226    case '(':
  10227      if (nfa_reg(REG_NPAREN) == FAIL) {
  10228        return FAIL;
  10229      }
  10230      EMIT(NFA_NOPEN);
  10231      break;
  10232 
  10233    case 'd':               // %d123 decimal
  10234    case 'o':               // %o123 octal
  10235    case 'x':               // %xab hex 2
  10236    case 'u':               // %uabcd hex 4
  10237    case 'U':               // %U1234abcd hex 8
  10238    {
             // Character given by its number; a negative nr is rejected below.
  10239      int64_t nr;
  10240 
  10241      switch (c) {
  10242      case 'd':
  10243        nr = getdecchrs(); break;
  10244      case 'o':
  10245        nr = getoctchrs(); break;
  10246      case 'x':
  10247        nr = gethexchrs(2); break;
  10248      case 'u':
  10249        nr = gethexchrs(4); break;
  10250      case 'U':
  10251        nr = gethexchrs(8); break;
  10252      default:
  10253        nr = -1; break;
  10254      }
  10255 
  10256      if (nr < 0 || nr > INT_MAX) {
  10257        EMSG2_RET_FAIL(_("E678: Invalid character after %s%%[dxouU]"),
  10258                       reg_magic == MAGIC_ALL);
  10259      }
  10260      // A NUL is stored in the text as NL
  10261      // TODO(vim): what if a composing character follows?
  10262      EMIT(nr == 0 ? 0x0a : (int)nr);
  10263    }
  10264    break;
  10265 
  10266    // Catch \%^ and \%$ regardless of where they appear in the
  10267    // pattern -- regardless of whether or not it makes sense.
  10268    case '^':
  10269      EMIT(NFA_BOF);
  10270      break;
  10271 
  10272    case '$':
  10273      EMIT(NFA_EOF);
  10274      break;
  10275 
  10276    case '#':
             // 48 and 50 are the ASCII codes of '0' and '2': this recognizes
             // "\%#=0" .. "\%#=2" appearing somewhere other than the start.
  10277      if (regparse[0] == '=' && regparse[1] >= 48
  10278          && regparse[1] <= 50) {
  10279        // misplaced \%#=1
  10280        semsg(_(e_atom_engine_must_be_at_start_of_pattern), regparse[1]);
  10281        return FAIL;
  10282      }
  10283      EMIT(NFA_CURSOR);
  10284      break;
  10285 
  10286    case 'V':
  10287      EMIT(NFA_VISUAL);
  10288      break;
  10289 
  10290    case 'C':
  10291      EMIT(NFA_ANY_COMPOSING);
  10292      break;
  10293 
  10294    case '[': {
  10295      int n;  // number of atoms parsed inside \%[]
  10296 
  10297      // \%[abc]
  10298      for (n = 0; (c = peekchr()) != ']'; n++) {
  10299        if (c == NUL) {
  10300          EMSG2_RET_FAIL(_(e_missing_sb),
  10301                         reg_magic == MAGIC_ALL);
  10302        }
  10303        // recursive call!
  10304        if (nfa_regatom() == FAIL) {
  10305          return FAIL;
  10306        }
  10307      }
  10308      (void)getchr();  // get the ]
  10309      if (n == 0) {
  10310        EMSG2_RET_FAIL(_(e_empty_sb), reg_magic == MAGIC_ALL);
  10311      }
             // The atom count follows NFA_OPT_CHARS as a separate item.
  10312      EMIT(NFA_OPT_CHARS);
  10313      EMIT(n);
  10314 
  10315      // Emit as "\%(\%[abc]\)" to be able to handle
  10316      // "\%[abc]*" which would cause the empty string to be
  10317      // matched an unlimited number of times. NFA_NOPEN is
  10318      // added only once at a position, while NFA_SPLIT is
  10319      // added multiple times.  This is more efficient than
  10320      // not allowing NFA_SPLIT multiple times, it is used
  10321      // a lot.
  10322      EMIT(NFA_NOPEN);
  10323      break;
  10324    }
  10325 
  10326    default: {
             // Position items: \%23l, \%<23c, \%>.v, \%'m and the like.
  10327      int64_t n = 0;  // number given before l/c/v
  10328      const int cmp = c;  // '<' or '>' selects the _LT / _GT item, anything else the plain one
  10329      bool cur = false;  // '.' was given: use the cursor position instead of a number
  10330      bool got_digit = false;
  10331 
  10332      if (c == '<' || c == '>') {
  10333        c = getchr();
  10334      }
  10335      if (no_Magic(c) == '.') {
  10336        cur = true;
  10337        c = getchr();
  10338      }
  10339      while (ascii_isdigit(c)) {
  10340        if (cur) {
  10341          semsg(_(e_regexp_number_after_dot_pos_search_chr), no_Magic(c));
  10342          return FAIL;
  10343        }
               // Checked before multiplying, so that n * 10 + digit stays
               // within INT32_MAX.
  10344        if (n > (INT32_MAX - (c - '0')) / 10) {
  10345          // overflow.
  10346          emsg(_(e_value_too_large));
  10347          return FAIL;
  10348        }
  10349        n = n * 10 + (c - '0');
  10350        c = getchr();
  10351        got_digit = true;
  10352      }
  10353      if (c == 'l' || c == 'c' || c == 'v') {
  10354        int32_t limit = INT32_MAX;
  10355 
  10356        if (!cur && !got_digit) {
  10357          semsg(_(e_nfa_regexp_missing_value_in_chr), no_Magic(c));
  10358          return FAIL;
  10359        }
  10360        if (c == 'l') {
  10361          if (cur) {
  10362            n = curwin->w_cursor.lnum;
  10363          }
  10364          // \%{n}l  \%{n}<l  \%{n}>l
  10365          EMIT(cmp == '<' ? NFA_LNUM_LT
  10366                          : cmp == '>' ? NFA_LNUM_GT : NFA_LNUM);
  10367          if (save_prev_at_start) {
  10368            at_start = true;
  10369          }
  10370        } else if (c == 'c') {
  10371          if (cur) {
                   // Cursor column plus one; presumably w_cursor.col is
                   // zero-based while the pattern counts from 1 -- confirm.
  10372            n = curwin->w_cursor.col;
  10373            n++;
  10374          }
  10375          // \%{n}c  \%{n}<c  \%{n}>c
  10376          EMIT(cmp == '<' ? NFA_COL_LT
  10377                          : cmp == '>' ? NFA_COL_GT : NFA_COL);
  10378        } else {
  10379          if (cur) {
  10380            colnr_T vcol = 0;
  10381            getvvcol(curwin, &curwin->w_cursor, NULL, NULL, &vcol);
  10382            n = ++vcol;
  10383          }
  10384          // \%{n}v  \%{n}<v  \%{n}>v
  10385          EMIT(cmp == '<' ? NFA_VCOL_LT
  10386                          : cmp == '>' ? NFA_VCOL_GT : NFA_VCOL);
                 // Virtual columns get a smaller upper bound than line and
                 // column numbers.
  10387          limit = INT32_MAX / MB_MAXBYTES;
  10388        }
  10389        if (n >= limit) {
  10390          emsg(_(e_value_too_large));
  10391          return FAIL;
  10392        }
               // The number follows the comparison item as a separate item.
  10393        EMIT((int)n);
  10394        break;
  10395      } else if (no_Magic(c) == '\'' && n == 0) {
  10396        // \%'m  \%<'m  \%>'m
  10397        EMIT(cmp == '<' ? NFA_MARK_LT
  10398                        : cmp == '>' ? NFA_MARK_GT : NFA_MARK);
               // The mark name is the next pattern character.
  10399        EMIT(getchr());
  10400        break;
  10401      }
  10402    }
             // Only reached when neither branch above took its "break":
             // nothing valid followed "\%".
  10403      semsg(_("E867: (NFA) Unknown operator '\\%%%c'"),
  10404            no_Magic(c));
  10405      return FAIL;
  10406    }
  10407    break;
  10408 
  10409  case Magic('['):
  10410 collection:
  10411    // [abc]  uses NFA_START_COLL - NFA_END_COLL
  10412    // [^abc] uses NFA_START_NEG_COLL - NFA_END_NEG_COLL
  10413    // Each character is produced as a regular state, using
  10414    // NFA_CONCAT to bind them together.
  10415    // Besides normal characters there can be:
  10416    // - character classes  NFA_CLASS_*
  10417    // - ranges, two characters followed by NFA_RANGE.
  10418 
           // When skip_anyof() does not stop at a ']' the '[' is not treated as
           // a collection, see the end of this case.
  10419    p = (uint8_t *)regparse;
  10420    endp = (uint8_t *)skip_anyof((char *)p);
  10421    if (*endp == ']') {
  10422      // Try to reverse engineer character classes. For example,
  10423      // recognize that [0-9] stands for \d and [A-Za-z_] for \h,
  10424      // and perform the necessary substitutions in the NFA.
  10425      int result = nfa_recognize_char_class((uint8_t *)regparse, endp, extra == NFA_ADD_NL);
  10426      if (result != FAIL) {
  10427        if (result >= NFA_FIRST_NL && result <= NFA_LAST_NL) {
                 // A class-plus-newline code: emit the plain class and add the
                 // line break as an alternative.
  10428          EMIT(result - NFA_ADD_NL);
  10429          EMIT(NFA_NEWL);
  10430          EMIT(NFA_OR);
  10431        } else {
  10432          EMIT(result);
  10433        }
               // Continue parsing after the closing ']'.
  10434        regparse = (char *)endp;
  10435        MB_PTR_ADV(regparse);
  10436        return OK;
  10437      }
  10438      // Failed to recognize a character class. Use the simple
  10439      // version that turns [abc] into 'a' OR 'b' OR 'c'
  10440      negated = false;
  10441      if (*regparse == '^') {                           // negated range
  10442        negated = true;
  10443        MB_PTR_ADV(regparse);
  10444        EMIT(NFA_START_NEG_COLL);
  10445      } else {
  10446        EMIT(NFA_START_COLL);
  10447      }
             // A '-' right at the start is a literal character, but it may still
             // be the start of a range.
  10448      if (*regparse == '-') {
  10449        startc = '-';
  10450        EMIT(startc);
  10451        EMIT(NFA_CONCAT);
  10452        MB_PTR_ADV(regparse);
  10453      }
  10454      // Emit the OR branches for each character in the []
  10455      emit_range = false;
  10456      while ((uint8_t *)regparse < endp) {
               // oldstartc is the character handled in the previous iteration
               // (or -1); it becomes the range start when a '-' follows.
  10457        int oldstartc = startc;
  10458        startc = -1;
  10459        got_coll_char = false;
  10460        if (*regparse == '[') {
  10461          // Check for [: :], [= =], [. .]
  10462          equiclass = collclass = 0;
  10463          charclass = get_char_class(&regparse);
  10464          if (charclass == CLASS_NONE) {
  10465            equiclass = get_equi_class(&regparse);
  10466            if (equiclass == 0) {
  10467              collclass = get_coll_element(&regparse);
  10468            }
  10469          }
  10470 
  10471          // Character class like [:alpha:]
  10472          if (charclass != CLASS_NONE) {
  10473            switch (charclass) {
  10474            case CLASS_ALNUM:
  10475              EMIT(NFA_CLASS_ALNUM);
  10476              break;
  10477            case CLASS_ALPHA:
  10478              EMIT(NFA_CLASS_ALPHA);
  10479              break;
  10480            case CLASS_BLANK:
  10481              EMIT(NFA_CLASS_BLANK);
  10482              break;
  10483            case CLASS_CNTRL:
  10484              EMIT(NFA_CLASS_CNTRL);
  10485              break;
  10486            case CLASS_DIGIT:
  10487              EMIT(NFA_CLASS_DIGIT);
  10488              break;
  10489            case CLASS_GRAPH:
  10490              EMIT(NFA_CLASS_GRAPH);
  10491              break;
  10492            case CLASS_LOWER:
  10493              wants_nfa = true;
  10494              EMIT(NFA_CLASS_LOWER);
  10495              break;
  10496            case CLASS_PRINT:
  10497              EMIT(NFA_CLASS_PRINT);
  10498              break;
  10499            case CLASS_PUNCT:
  10500              EMIT(NFA_CLASS_PUNCT);
  10501              break;
  10502            case CLASS_SPACE:
  10503              EMIT(NFA_CLASS_SPACE);
  10504              break;
  10505            case CLASS_UPPER:
  10506              wants_nfa = true;
  10507              EMIT(NFA_CLASS_UPPER);
  10508              break;
  10509            case CLASS_XDIGIT:
  10510              EMIT(NFA_CLASS_XDIGIT);
  10511              break;
  10512            case CLASS_TAB:
  10513              EMIT(NFA_CLASS_TAB);
  10514              break;
  10515            case CLASS_RETURN:
  10516              EMIT(NFA_CLASS_RETURN);
  10517              break;
  10518            case CLASS_BACKSPACE:
  10519              EMIT(NFA_CLASS_BACKSPACE);
  10520              break;
  10521            case CLASS_ESCAPE:
  10522              EMIT(NFA_CLASS_ESCAPE);
  10523              break;
  10524            case CLASS_IDENT:
  10525              EMIT(NFA_CLASS_IDENT);
  10526              break;
  10527            case CLASS_KEYWORD:
  10528              EMIT(NFA_CLASS_KEYWORD);
  10529              break;
  10530            case CLASS_FNAME:
  10531              EMIT(NFA_CLASS_FNAME);
  10532              break;
  10533            }
  10534            EMIT(NFA_CONCAT);
  10535            continue;
  10536          }
  10537          // Try equivalence class [=a=] and the like
  10538          if (equiclass != 0) {
  10539            nfa_emit_equi_class(equiclass);
  10540            continue;
  10541          }
  10542          // Try collating class like [. .]
  10543          if (collclass != 0) {
  10544            startc = collclass;                  // allow [.a.]-x as a range
  10545            // Will emit the proper atom at the end of the
  10546            // while loop.
  10547          }
  10548        }
  10549        // Try a range like 'a-x' or '\t-z'. Also allows '-' as a
  10550        // start character.
  10551        if (*regparse == '-' && oldstartc != -1) {
  10552          emit_range = true;
  10553          startc = oldstartc;
  10554          MB_PTR_ADV(regparse);
  10555          continue;                         // reading the end of the range
  10556        }
  10557 
  10558        // Now handle simple and escaped characters.
  10559        // Only "\]", "\^", "\-" and "\\" are special in Vi.  Vim
  10560        // accepts "\t", "\e", etc., but only when the 'l' flag in
  10561        // 'cpoptions' is not included.
  10562        if (*regparse == '\\'
  10563            && (uint8_t *)regparse + 1 <= endp
  10564            && (vim_strchr(REGEXP_INRANGE, (uint8_t)regparse[1]) != NULL
  10565                || (!reg_cpo_lit
  10566                    && vim_strchr(REGEXP_ABBR, (uint8_t)regparse[1])
  10567                    != NULL))) {
                 // Skip the backslash; regparse now points at the escaped char.
  10568          MB_PTR_ADV(regparse);
  10569 
  10570          if (*regparse == 'n') {
                   // As a range end point, or when matching in a string, "\n"
                   // is the NL character; otherwise it stands for a line break.
  10571            startc = (reg_string || emit_range || regparse[1] == '-')
  10572                     ? NL : NFA_NEWL;
  10573          } else if (*regparse == 'd'
  10574                     || *regparse == 'o'
  10575                     || *regparse == 'x'
  10576                     || *regparse == 'u'
  10577                     || *regparse == 'U') {
  10578            // TODO(RE): This needs more testing
  10579            startc = coll_get_char();
  10580            // max UTF-8 Codepoint is U+10FFFF,
  10581            // but allow values until INT_MAX
  10582            if (startc == INT_MAX) {
  10583              EMSG_RET_FAIL(_(e_unicode_val_too_large));
  10584            }
  10585            got_coll_char = true;
                   // NOTE(review): presumably moves regparse back by one
                   // character to compensate for the MB_PTR_ADV() at the end
                   // of the loop -- confirm against coll_get_char().
  10586            MB_PTR_BACK(old_regparse, regparse);
  10587          } else {
  10588            // \r,\t,\e,\b
  10589            startc = backslash_trans(*regparse);
  10590          }
  10591        }
  10592 
  10593        // Normal printable char
  10594        if (startc == -1) {
  10595          startc = utf_ptr2char(regparse);
  10596        }
  10597 
  10598        // Previous char was '-', so this char is end of range.
  10599        if (emit_range) {
  10600          int endc = startc;
  10601          startc = oldstartc;
  10602          if (startc > endc) {
  10603            EMSG_RET_FAIL(_(e_reverse_range));
  10604          }
  10605 
  10606          if (endc > startc + 2) {
  10607            // Emit a range instead of the sequence of
  10608            // individual characters.
  10609            if (startc == 0) {
  10610              // \x00 is translated to \x0a, start at \x01.
  10611              EMIT(1);
  10612            } else {
  10613              post_ptr--;                   // remove NFA_CONCAT
  10614            }
  10615            EMIT(endc);
  10616            EMIT(NFA_RANGE);
  10617            EMIT(NFA_CONCAT);
  10618          } else if (utf_char2len(startc) > 1
  10619                     || utf_char2len(endc) > 1) {
                   // NOTE(review): the body of this branch is identical to the
                   // body of the final "else" below.
  10620            // Emit the characters in the range.
  10621            // "startc" was already emitted, so skip it.
  10622            for (c = startc + 1; c <= endc; c++) {
  10623              EMIT(c);
  10624              EMIT(NFA_CONCAT);
  10625            }
  10626          } else {
  10627            // Emit the range. "startc" was already emitted, so
  10628            // skip it.
  10629            for (c = startc + 1; c <= endc; c++) {
  10630              EMIT(c);
  10631              EMIT(NFA_CONCAT);
  10632            }
  10633          }
                 // The range is complete; its end point cannot start another one.
  10634          emit_range = false;
  10635          startc = -1;
  10636        } else {
  10637          // This char (startc) is not part of a range. Just
  10638          // emit it.
  10639          // Normally, simply emit startc. But if we get char
  10640          // code=0 from a collating char, then replace it with
  10641          // 0x0a.
  10642          // This is needed to completely mimic the behaviour of
  10643          // the backtracking engine.
  10644          if (startc == NFA_NEWL) {
  10645            // Line break can't be matched as part of the
  10646            // collection, add an OR below. But not for negated
  10647            // range.
  10648            if (!negated) {
  10649              extra = NFA_ADD_NL;
  10650            }
  10651          } else {
  10652            if (got_coll_char == true && startc == 0) {
  10653              EMIT(0x0a);
  10654              EMIT(NFA_CONCAT);
  10655            } else {
  10656              EMIT(startc);
                     // When composing characters follow, the NFA_CONCAT is left
                     // out here; the block below emits them and the concats.
  10657              if (utf_ptr2len(regparse) == utfc_ptr2len(regparse)) {
  10658                EMIT(NFA_CONCAT);
  10659              }
  10660            }
  10661          }
  10662        }
  10663 
               // plen: length in bytes of the character at regparse including
               // its composing characters.
  10664        int plen;
  10665        if (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse))) {
  10666          int i = utf_ptr2len(regparse);
  10667 
  10668          c = utf_ptr2char(regparse + i);
  10669 
  10670          // Add composing characters
  10671          while (true) {
  10672            if (c == 0) {
  10673              // \x00 is translated to \x0a, start at \x01.
  10674              EMIT(1);
  10675            } else {
  10676              EMIT(c);
  10677            }
  10678            EMIT(NFA_CONCAT);
  10679            if ((i += utf_char2len(c)) >= plen) {
  10680              break;
  10681            }
  10682            c = utf_ptr2char(regparse + i);
  10683          }
  10684          EMIT(NFA_COMPOSING);
  10685          EMIT(NFA_CONCAT);
  10686        }
  10687        MB_PTR_ADV(regparse);
  10688      }           // while (regparse < endp)
  10689 
             // Look at the last character before the closing ']'.
  10690      MB_PTR_BACK(old_regparse, regparse);
  10691      if (*regparse == '-') {               // if last, '-' is just a char
  10692        EMIT('-');
  10693        EMIT(NFA_CONCAT);
  10694      }
  10695 
  10696      // skip the trailing ]
  10697      regparse = (char *)endp;
  10698      MB_PTR_ADV(regparse);
  10699 
  10700      // Mark end of the collection.
  10701      if (negated == true) {
  10702        EMIT(NFA_END_NEG_COLL);
  10703      } else {
  10704        EMIT(NFA_END_COLL);
  10705      }
  10706 
  10707      // \_[] also matches \n but it's not negated
  10708      if (extra == NFA_ADD_NL) {
  10709        EMIT(reg_string ? NL : NFA_NEWL);
  10710        EMIT(NFA_OR);
  10711      }
  10712 
  10713      return OK;
  10714    }         // if exists closing ]
  10715 
           // No closing ']' was found: an error when reg_strict is set,
           // otherwise the '[' is handled by the default case below.
  10716    if (reg_strict) {
  10717      EMSG_RET_FAIL(_(e_missingbracket));
  10718    }
  10719    FALLTHROUGH;
  10720 
  10721  default: {
  10722    int plen;
  10723 
  10724 nfa_do_multibyte:
  10725    // plen is length of current char with composing chars
  10726    if (utf_char2len(c) != (plen = utfc_ptr2len((char *)old_regparse))
  10727        || utf_iscomposing_legacy(c)) {
             // i: byte offset from old_regparse of the next character to emit.
  10728      int i = 0;
  10729 
  10730      // A base character plus composing characters, or just one
  10731      // or more composing characters.
  10732      // This requires creating a separate atom as if enclosing
  10733      // the characters in (), where NFA_COMPOSING is the ( and
  10734      // NFA_END_COMPOSING is the ). Note that right now we are
  10735      // building the postfix form, not the NFA itself;
  10736      // a composing char could be: a, b, c, NFA_COMPOSING
  10737      // where 'b' and 'c' are chars with codes > 256.
  10738      while (true) {
  10739        EMIT(c);
  10740        if (i > 0) {
  10741          EMIT(NFA_CONCAT);
  10742        }
  10743        if ((i += utf_char2len(c)) >= plen) {
  10744          break;
  10745        }
  10746        c = utf_ptr2char((char *)old_regparse + i);
  10747      }
  10748      EMIT(NFA_COMPOSING);
             // Continue parsing after the composing characters.
  10749      regparse = (char *)old_regparse + plen;
  10750    } else {
             // A single ordinary character: emit it without the Magic flag.
  10751      c = no_Magic(c);
  10752      EMIT(c);
  10753    }
  10754    return OK;
  10755  }
  10756  }
  10757 
  10758  return OK;
  10759 }
  10760 
  10761 // Parse something followed by possible [*+=].
  10762 //
  10763 // A piece is an atom, possibly followed by a multi, an indication of how many
  10764 // times the atom can be matched.  Example: "a*" matches any sequence of "a"
  10765 // characters: "", "a", "aa", etc.
  10766 //
  10767 // piece   ::=      atom
  10768 //      or  atom  multi
  10769 static int nfa_regpiece(void)
  10770 {
  10771  int i;
  10772  int op;
  10773  int ret;
  10774  int minval, maxval;
  10775  bool greedy = true;  // Braces are prefixed with '-' ?
  10776  parse_state_T old_state;
  10777  parse_state_T new_state;
  10778  int64_t c2;
  10779  int old_post_pos;
  10780  int my_post_start;
  10781  int quest;
  10782 
  10783  // Save the current parse state, so that we can use it if <atom>{m,n} is
  10784  // next.
  10785  save_parse_state(&old_state);
  10786 
  10787  // store current pos in the postfix form, for \{m,n} involving 0s
  10788  my_post_start = (int)(post_ptr - post_start);
  10789 
  10790  ret = nfa_regatom();
  10791  if (ret == FAIL) {
  10792    return FAIL;            // cascaded error
  10793  }
  10794  op = peekchr();
  10795  if (re_multi_type(op) == NOT_MULTI) {
  10796    return OK;
  10797  }
  10798 
  10799  skipchr();
  10800  switch (op) {
  10801  case Magic('*'):
  10802    EMIT(NFA_STAR);
  10803    break;
  10804 
  10805  case Magic('+'):
  10806    // Trick: Normally, (a*)\+ would match the whole input "aaa".  The
  10807    // first and only submatch would be "aaa". But the backtracking
  10808    // engine interprets the plus as "try matching one more time", and
  10809    // a* matches a second time at the end of the input, the empty
  10810    // string.
  10811    // The submatch will be the empty string.
  10812    //
  10813    // In order to be consistent with the old engine, we replace
  10814    // <atom>+ with <atom><atom>*
  10815    restore_parse_state(&old_state);
  10816    curchr = -1;
  10817    if (nfa_regatom() == FAIL) {
  10818      return FAIL;
  10819    }
  10820    EMIT(NFA_STAR);
  10821    EMIT(NFA_CONCAT);
  10822    skipchr();                  // skip the \+
  10823    break;
  10824 
  10825  case Magic('@'):
  10826    c2 = getdecchrs();
  10827    op = no_Magic(getchr());
  10828    i = 0;
  10829    switch (op) {
  10830    case '=':
  10831      // \@=
  10832      i = NFA_PREV_ATOM_NO_WIDTH;
  10833      break;
  10834    case '!':
  10835      // \@!
  10836      i = NFA_PREV_ATOM_NO_WIDTH_NEG;
  10837      break;
  10838    case '<':
  10839      op = no_Magic(getchr());
  10840      if (op == '=') {
  10841        // \@<=
  10842        i = NFA_PREV_ATOM_JUST_BEFORE;
  10843      } else if (op == '!') {
  10844        // \@<!
  10845        i = NFA_PREV_ATOM_JUST_BEFORE_NEG;
  10846      }
  10847      break;
  10848    case '>':
  10849      // \@>
  10850      i = NFA_PREV_ATOM_LIKE_PATTERN;
  10851      break;
  10852    }
  10853    if (i == 0) {
  10854      semsg(_("E869: (NFA) Unknown operator '\\@%c'"), op);
  10855      return FAIL;
  10856    }
  10857    EMIT(i);
  10858    if (i == NFA_PREV_ATOM_JUST_BEFORE
  10859        || i == NFA_PREV_ATOM_JUST_BEFORE_NEG) {
  10860      EMIT((int)c2);
  10861    }
  10862    break;
  10863 
  10864  case Magic('?'):
  10865  case Magic('='):
  10866    EMIT(NFA_QUEST);
  10867    break;
  10868 
  10869  case Magic('{'):
  10870    // a{2,5} will expand to 'aaa?a?a?'
  10871    // a{-1,3} will expand to 'aa??a??', where ?? is the nongreedy
  10872    // version of '?'
  10873    // \v(ab){2,3} will expand to '(ab)(ab)(ab)?', where all the
  10874    // parenthesis have the same id
  10875 
  10876    greedy = true;
  10877    c2 = peekchr();
  10878    if (c2 == '-' || c2 == Magic('-')) {
  10879      skipchr();
  10880      greedy = false;
  10881    }
  10882    if (!read_limits(&minval, &maxval)) {
  10883      EMSG_RET_FAIL(_("E870: (NFA regexp) Error reading repetition limits"));
  10884    }
  10885 
  10886    //  <atom>{0,inf}, <atom>{0,} and <atom>{}  are equivalent to
  10887    //  <atom>*
  10888    if (minval == 0 && maxval == MAX_LIMIT) {
  10889      if (greedy) {
  10890        // \{}, \{0,}
  10891        EMIT(NFA_STAR);
  10892      } else {
  10893        // \{-}, \{-0,}
  10894        EMIT(NFA_STAR_NONGREEDY);
  10895      }
  10896      break;
  10897    }
  10898 
  10899    // Special case: x{0} or x{-0}
  10900    if (maxval == 0) {
  10901      // Ignore result of previous call to nfa_regatom()
  10902      post_ptr = post_start + my_post_start;
  10903      // NFA_EMPTY is 0-length and works everywhere
  10904      EMIT(NFA_EMPTY);
  10905      return OK;
  10906    }
  10907 
  10908    // The engine is very inefficient (uses too many states) when the
  10909    // maximum is much larger than the minimum and when the maximum is
  10910    // large.  However, when maxval is MAX_LIMIT, it is okay, as this
  10911    // will emit NFA_STAR.
  10912    // Bail out if we can use the other engine, but only, when the
  10913    // pattern does not need the NFA engine like (e.g. [[:upper:]]\{2,\}
  10914    // does not work with characters > 8 bit with the BT engine)
  10915    if ((nfa_re_flags & RE_AUTO)
  10916        && (maxval > 500 || maxval > minval + 200)
  10917        && (maxval != MAX_LIMIT && minval < 200)
  10918        && !wants_nfa) {
  10919      return FAIL;
  10920    }
  10921 
  10922    // Ignore previous call to nfa_regatom()
  10923    post_ptr = post_start + my_post_start;
  10924    // Save parse state after the repeated atom and the \{}
  10925    save_parse_state(&new_state);
  10926 
  10927    quest = (greedy == true ? NFA_QUEST : NFA_QUEST_NONGREEDY);
  10928    for (i = 0; i < maxval; i++) {
  10929      // Goto beginning of the repeated atom
  10930      restore_parse_state(&old_state);
  10931      old_post_pos = (int)(post_ptr - post_start);
  10932      if (nfa_regatom() == FAIL) {
  10933        return FAIL;
  10934      }
  10935      // after "minval" times, atoms are optional
  10936      if (i + 1 > minval) {
  10937        if (maxval == MAX_LIMIT) {
  10938          if (greedy) {
  10939            EMIT(NFA_STAR);
  10940          } else {
  10941            EMIT(NFA_STAR_NONGREEDY);
  10942          }
  10943        } else {
  10944          EMIT(quest);
  10945        }
  10946      }
  10947      if (old_post_pos != my_post_start) {
  10948        EMIT(NFA_CONCAT);
  10949      }
  10950      if (i + 1 > minval && maxval == MAX_LIMIT) {
  10951        break;
  10952      }
  10953    }
  10954 
  10955    // Go to just after the repeated atom and the \{}
  10956    restore_parse_state(&new_state);
  10957    curchr = -1;
  10958 
  10959    break;
  10960 
  10961  default:
  10962    break;
  10963  }     // end switch
  10964 
  10965  if (re_multi_type(peekchr()) != NOT_MULTI) {
  10966    // Can't have a multi follow a multi.
  10967    EMSG_RET_FAIL(_("E871: (NFA regexp) Can't have a multi follow a multi"));
  10968  }
  10969 
  10970  return OK;
  10971 }
  10972 
  10973 // Parse one or more pieces, concatenated.  It matches a match for the
  10974 // first piece, followed by a match for the second piece, etc.  Example:
  10975 // "f[0-9]b", first matches "f", then a digit and then "b".
  10976 //
  10977 // concat  ::=      piece
  10978 //      or  piece piece
  10979 //      or  piece piece piece
  10980 //      etc.
  10981 static int nfa_regconcat(void)
  10982 {
  10983  bool cont = true;
  10984  bool first = true;
  10985 
  10986  while (cont) {
  10987    switch (peekchr()) {
  10988    case NUL:
  10989    case Magic('|'):
  10990    case Magic('&'):
  10991    case Magic(')'):
  10992      cont = false;
  10993      break;
  10994 
  10995    case Magic('Z'):
  10996      regflags |= RF_ICOMBINE;
  10997      skipchr_keepstart();
  10998      break;
  10999    case Magic('c'):
  11000      regflags |= RF_ICASE;
  11001      skipchr_keepstart();
  11002      break;
  11003    case Magic('C'):
  11004      regflags |= RF_NOICASE;
  11005      skipchr_keepstart();
  11006      break;
  11007    case Magic('v'):
  11008      reg_magic = MAGIC_ALL;
  11009      skipchr_keepstart();
  11010      curchr = -1;
  11011      break;
  11012    case Magic('m'):
  11013      reg_magic = MAGIC_ON;
  11014      skipchr_keepstart();
  11015      curchr = -1;
  11016      break;
  11017    case Magic('M'):
  11018      reg_magic = MAGIC_OFF;
  11019      skipchr_keepstart();
  11020      curchr = -1;
  11021      break;
  11022    case Magic('V'):
  11023      reg_magic = MAGIC_NONE;
  11024      skipchr_keepstart();
  11025      curchr = -1;
  11026      break;
  11027 
  11028    default:
  11029      if (nfa_regpiece() == FAIL) {
  11030        return FAIL;
  11031      }
  11032      if (first == false) {
  11033        EMIT(NFA_CONCAT);
  11034      } else {
  11035        first = false;
  11036      }
  11037      break;
  11038    }
  11039  }
  11040 
  11041  return OK;
  11042 }
  11043 
  11044 // Parse a branch, one or more concats, separated by "\&".  It matches the
  11045 // last concat, but only if all the preceding concats also match at the same
  11046 // position.  Examples:
  11047 //      "foobeep\&..." matches "foo" in "foobeep".
  11048 //      ".*Peter\&.*Bob" matches in a line containing both "Peter" and "Bob"
  11049 //
  11050 // branch ::=       concat
  11051 //              or  concat \& concat
  11052 //              or  concat \& concat \& concat
  11053 //              etc.
  11054 static int nfa_regbranch(void)
  11055 {
  11056  int old_post_pos;
  11057 
  11058  old_post_pos = (int)(post_ptr - post_start);
  11059 
  11060  // First branch, possibly the only one
  11061  if (nfa_regconcat() == FAIL) {
  11062    return FAIL;
  11063  }
  11064 
  11065  // Try next concats
  11066  while (peekchr() == Magic('&')) {
  11067    skipchr();
  11068    // if concat is empty do emit a node
  11069    if (old_post_pos == (int)(post_ptr - post_start)) {
  11070      EMIT(NFA_EMPTY);
  11071    }
  11072    EMIT(NFA_NOPEN);
  11073    EMIT(NFA_PREV_ATOM_NO_WIDTH);
  11074    old_post_pos = (int)(post_ptr - post_start);
  11075    if (nfa_regconcat() == FAIL) {
  11076      return FAIL;
  11077    }
  11078    // if concat is empty do emit a node
  11079    if (old_post_pos == (int)(post_ptr - post_start)) {
  11080      EMIT(NFA_EMPTY);
  11081    }
  11082    EMIT(NFA_CONCAT);
  11083  }
  11084 
  11085  // if a branch is empty, emit one node for it
  11086  if (old_post_pos == (int)(post_ptr - post_start)) {
  11087    EMIT(NFA_EMPTY);
  11088  }
  11089 
  11090  return OK;
  11091 }
  11092 
  11093 ///  Parse a pattern, one or more branches, separated by "\|".  It matches
  11094 ///  anything that matches one of the branches.  Example: "foo\|beep" matches
  11095 ///  "foo" and matches "beep".  If more than one branch matches, the first one
  11096 ///  is used.
  11097 ///
  11098 ///  pattern ::=     branch
  11099 ///      or  branch \| branch
  11100 ///      or  branch \| branch \| branch
  11101 ///      etc.
  11102 ///
  11103 /// @param paren  REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN
  11104 static int nfa_reg(int paren)
  11105 {
  11106  int parno = 0;
  11107 
  11108  if (paren == REG_PAREN) {
  11109    if (regnpar >= NSUBEXP) {   // Too many `('
  11110      EMSG_RET_FAIL(_("E872: (NFA regexp) Too many '('"));
  11111    }
  11112    parno = regnpar++;
  11113  } else if (paren == REG_ZPAREN) {
  11114    // Make a ZOPEN node.
  11115    if (regnzpar >= NSUBEXP) {
  11116      EMSG_RET_FAIL(_("E879: (NFA regexp) Too many \\z("));
  11117    }
  11118    parno = regnzpar++;
  11119  }
  11120 
  11121  if (nfa_regbranch() == FAIL) {
  11122    return FAIL;            // cascaded error
  11123  }
  11124  while (peekchr() == Magic('|')) {
  11125    skipchr();
  11126    if (nfa_regbranch() == FAIL) {
  11127      return FAIL;          // cascaded error
  11128    }
  11129    EMIT(NFA_OR);
  11130  }
  11131 
  11132  // Check for proper termination.
  11133  if (paren != REG_NOPAREN && getchr() != Magic(')')) {
  11134    if (paren == REG_NPAREN) {
  11135      EMSG2_RET_FAIL(_(e_unmatchedpp), reg_magic == MAGIC_ALL);
  11136    } else {
  11137      EMSG2_RET_FAIL(_(e_unmatchedp), reg_magic == MAGIC_ALL);
  11138    }
  11139  } else if (paren == REG_NOPAREN && peekchr() != NUL) {
  11140    if (peekchr() == Magic(')')) {
  11141      EMSG2_RET_FAIL(_(e_unmatchedpar), reg_magic == MAGIC_ALL);
  11142    } else {
  11143      EMSG_RET_FAIL(_("E873: (NFA regexp) proper termination error"));
  11144    }
  11145  }
  11146  // Here we set the flag allowing back references to this set of
  11147  // parentheses.
  11148  if (paren == REG_PAREN) {
  11149    had_endbrace[parno] = true;  // have seen the close paren
  11150    EMIT(NFA_MOPEN + parno);
  11151  } else if (paren == REG_ZPAREN) {
  11152    EMIT(NFA_ZOPEN + parno);
  11153  }
  11154 
  11155  return OK;
  11156 }
  11157 
  11158 #ifdef REGEXP_DEBUG
  11159 static uint8_t code[50];
  11160 
  11161 static void nfa_set_code(int c)
  11162 {
  11163  int addnl = false;
  11164 
  11165  if (c >= NFA_FIRST_NL && c <= NFA_LAST_NL) {
  11166    addnl = true;
  11167    c -= NFA_ADD_NL;
  11168  }
  11169 
  11170  STRCPY(code, "");
  11171  switch (c) {
  11172  case NFA_MATCH:
  11173    STRCPY(code, "NFA_MATCH "); break;
  11174  case NFA_SPLIT:
  11175    STRCPY(code, "NFA_SPLIT "); break;
  11176  case NFA_CONCAT:
  11177    STRCPY(code, "NFA_CONCAT "); break;
  11178  case NFA_NEWL:
  11179    STRCPY(code, "NFA_NEWL "); break;
  11180  case NFA_ZSTART:
  11181    STRCPY(code, "NFA_ZSTART"); break;
  11182  case NFA_ZEND:
  11183    STRCPY(code, "NFA_ZEND"); break;
  11184 
  11185  case NFA_BACKREF1:
  11186    STRCPY(code, "NFA_BACKREF1"); break;
  11187  case NFA_BACKREF2:
  11188    STRCPY(code, "NFA_BACKREF2"); break;
  11189  case NFA_BACKREF3:
  11190    STRCPY(code, "NFA_BACKREF3"); break;
  11191  case NFA_BACKREF4:
  11192    STRCPY(code, "NFA_BACKREF4"); break;
  11193  case NFA_BACKREF5:
  11194    STRCPY(code, "NFA_BACKREF5"); break;
  11195  case NFA_BACKREF6:
  11196    STRCPY(code, "NFA_BACKREF6"); break;
  11197  case NFA_BACKREF7:
  11198    STRCPY(code, "NFA_BACKREF7"); break;
  11199  case NFA_BACKREF8:
  11200    STRCPY(code, "NFA_BACKREF8"); break;
  11201  case NFA_BACKREF9:
  11202    STRCPY(code, "NFA_BACKREF9"); break;
  11203  case NFA_ZREF1:
  11204    STRCPY(code, "NFA_ZREF1"); break;
  11205  case NFA_ZREF2:
  11206    STRCPY(code, "NFA_ZREF2"); break;
  11207  case NFA_ZREF3:
  11208    STRCPY(code, "NFA_ZREF3"); break;
  11209  case NFA_ZREF4:
  11210    STRCPY(code, "NFA_ZREF4"); break;
  11211  case NFA_ZREF5:
  11212    STRCPY(code, "NFA_ZREF5"); break;
  11213  case NFA_ZREF6:
  11214    STRCPY(code, "NFA_ZREF6"); break;
  11215  case NFA_ZREF7:
  11216    STRCPY(code, "NFA_ZREF7"); break;
  11217  case NFA_ZREF8:
  11218    STRCPY(code, "NFA_ZREF8"); break;
  11219  case NFA_ZREF9:
  11220    STRCPY(code, "NFA_ZREF9"); break;
  11221  case NFA_SKIP:
  11222    STRCPY(code, "NFA_SKIP"); break;
  11223 
  11224  case NFA_PREV_ATOM_NO_WIDTH:
  11225    STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH"); break;
  11226  case NFA_PREV_ATOM_NO_WIDTH_NEG:
  11227    STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH_NEG"); break;
  11228  case NFA_PREV_ATOM_JUST_BEFORE:
  11229    STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE"); break;
  11230  case NFA_PREV_ATOM_JUST_BEFORE_NEG:
  11231    STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE_NEG"); break;
  11232  case NFA_PREV_ATOM_LIKE_PATTERN:
  11233    STRCPY(code, "NFA_PREV_ATOM_LIKE_PATTERN"); break;
  11234 
  11235  case NFA_NOPEN:
  11236    STRCPY(code, "NFA_NOPEN"); break;
  11237  case NFA_NCLOSE:
  11238    STRCPY(code, "NFA_NCLOSE"); break;
  11239  case NFA_START_INVISIBLE:
  11240    STRCPY(code, "NFA_START_INVISIBLE"); break;
  11241  case NFA_START_INVISIBLE_FIRST:
  11242    STRCPY(code, "NFA_START_INVISIBLE_FIRST"); break;
  11243  case NFA_START_INVISIBLE_NEG:
  11244    STRCPY(code, "NFA_START_INVISIBLE_NEG"); break;
  11245  case NFA_START_INVISIBLE_NEG_FIRST:
  11246    STRCPY(code, "NFA_START_INVISIBLE_NEG_FIRST"); break;
  11247  case NFA_START_INVISIBLE_BEFORE:
  11248    STRCPY(code, "NFA_START_INVISIBLE_BEFORE"); break;
  11249  case NFA_START_INVISIBLE_BEFORE_FIRST:
  11250    STRCPY(code, "NFA_START_INVISIBLE_BEFORE_FIRST"); break;
  11251  case NFA_START_INVISIBLE_BEFORE_NEG:
  11252    STRCPY(code, "NFA_START_INVISIBLE_BEFORE_NEG"); break;
  11253  case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
  11254    STRCPY(code, "NFA_START_INVISIBLE_BEFORE_NEG_FIRST"); break;
  11255  case NFA_START_PATTERN:
  11256    STRCPY(code, "NFA_START_PATTERN"); break;
  11257  case NFA_END_INVISIBLE:
  11258    STRCPY(code, "NFA_END_INVISIBLE"); break;
  11259  case NFA_END_INVISIBLE_NEG:
  11260    STRCPY(code, "NFA_END_INVISIBLE_NEG"); break;
  11261  case NFA_END_PATTERN:
  11262    STRCPY(code, "NFA_END_PATTERN"); break;
  11263 
  11264  case NFA_COMPOSING:
  11265    STRCPY(code, "NFA_COMPOSING"); break;
  11266  case NFA_END_COMPOSING:
  11267    STRCPY(code, "NFA_END_COMPOSING"); break;
  11268  case NFA_OPT_CHARS:
  11269    STRCPY(code, "NFA_OPT_CHARS"); break;
  11270 
  11271  case NFA_MOPEN:
  11272  case NFA_MOPEN1:
  11273  case NFA_MOPEN2:
  11274  case NFA_MOPEN3:
  11275  case NFA_MOPEN4:
  11276  case NFA_MOPEN5:
  11277  case NFA_MOPEN6:
  11278  case NFA_MOPEN7:
  11279  case NFA_MOPEN8:
  11280  case NFA_MOPEN9:
  11281    STRCPY(code, "NFA_MOPEN(x)");
  11282    code[10] = c - NFA_MOPEN + '0';
  11283    break;
  11284  case NFA_MCLOSE:
  11285  case NFA_MCLOSE1:
  11286  case NFA_MCLOSE2:
  11287  case NFA_MCLOSE3:
  11288  case NFA_MCLOSE4:
  11289  case NFA_MCLOSE5:
  11290  case NFA_MCLOSE6:
  11291  case NFA_MCLOSE7:
  11292  case NFA_MCLOSE8:
  11293  case NFA_MCLOSE9:
  11294    STRCPY(code, "NFA_MCLOSE(x)");
  11295    code[11] = c - NFA_MCLOSE + '0';
  11296    break;
  11297  case NFA_ZOPEN:
  11298  case NFA_ZOPEN1:
  11299  case NFA_ZOPEN2:
  11300  case NFA_ZOPEN3:
  11301  case NFA_ZOPEN4:
  11302  case NFA_ZOPEN5:
  11303  case NFA_ZOPEN6:
  11304  case NFA_ZOPEN7:
  11305  case NFA_ZOPEN8:
  11306  case NFA_ZOPEN9:
  11307    STRCPY(code, "NFA_ZOPEN(x)");
  11308    code[10] = c - NFA_ZOPEN + '0';
  11309    break;
  11310  case NFA_ZCLOSE:
  11311  case NFA_ZCLOSE1:
  11312  case NFA_ZCLOSE2:
  11313  case NFA_ZCLOSE3:
  11314  case NFA_ZCLOSE4:
  11315  case NFA_ZCLOSE5:
  11316  case NFA_ZCLOSE6:
  11317  case NFA_ZCLOSE7:
  11318  case NFA_ZCLOSE8:
  11319  case NFA_ZCLOSE9:
  11320    STRCPY(code, "NFA_ZCLOSE(x)");
  11321    code[11] = c - NFA_ZCLOSE + '0';
  11322    break;
  11323  case NFA_EOL:
  11324    STRCPY(code, "NFA_EOL "); break;
  11325  case NFA_BOL:
  11326    STRCPY(code, "NFA_BOL "); break;
  11327  case NFA_EOW:
  11328    STRCPY(code, "NFA_EOW "); break;
  11329  case NFA_BOW:
  11330    STRCPY(code, "NFA_BOW "); break;
  11331  case NFA_EOF:
  11332    STRCPY(code, "NFA_EOF "); break;
  11333  case NFA_BOF:
  11334    STRCPY(code, "NFA_BOF "); break;
  11335  case NFA_LNUM:
  11336    STRCPY(code, "NFA_LNUM "); break;
  11337  case NFA_LNUM_GT:
  11338    STRCPY(code, "NFA_LNUM_GT "); break;
  11339  case NFA_LNUM_LT:
  11340    STRCPY(code, "NFA_LNUM_LT "); break;
  11341  case NFA_COL:
  11342    STRCPY(code, "NFA_COL "); break;
  11343  case NFA_COL_GT:
  11344    STRCPY(code, "NFA_COL_GT "); break;
  11345  case NFA_COL_LT:
  11346    STRCPY(code, "NFA_COL_LT "); break;
  11347  case NFA_VCOL:
  11348    STRCPY(code, "NFA_VCOL "); break;
  11349  case NFA_VCOL_GT:
  11350    STRCPY(code, "NFA_VCOL_GT "); break;
  11351  case NFA_VCOL_LT:
  11352    STRCPY(code, "NFA_VCOL_LT "); break;
  11353  case NFA_MARK:
  11354    STRCPY(code, "NFA_MARK "); break;
  11355  case NFA_MARK_GT:
  11356    STRCPY(code, "NFA_MARK_GT "); break;
  11357  case NFA_MARK_LT:
  11358    STRCPY(code, "NFA_MARK_LT "); break;
  11359  case NFA_CURSOR:
  11360    STRCPY(code, "NFA_CURSOR "); break;
  11361  case NFA_VISUAL:
  11362    STRCPY(code, "NFA_VISUAL "); break;
  11363  case NFA_ANY_COMPOSING:
  11364    STRCPY(code, "NFA_ANY_COMPOSING "); break;
  11365 
  11366  case NFA_STAR:
  11367    STRCPY(code, "NFA_STAR "); break;
  11368  case NFA_STAR_NONGREEDY:
  11369    STRCPY(code, "NFA_STAR_NONGREEDY "); break;
  11370  case NFA_QUEST:
  11371    STRCPY(code, "NFA_QUEST"); break;
  11372  case NFA_QUEST_NONGREEDY:
  11373    STRCPY(code, "NFA_QUEST_NON_GREEDY"); break;
  11374  case NFA_EMPTY:
  11375    STRCPY(code, "NFA_EMPTY"); break;
  11376  case NFA_OR:
  11377    STRCPY(code, "NFA_OR"); break;
  11378 
  11379  case NFA_START_COLL:
  11380    STRCPY(code, "NFA_START_COLL"); break;
  11381  case NFA_END_COLL:
  11382    STRCPY(code, "NFA_END_COLL"); break;
  11383  case NFA_START_NEG_COLL:
  11384    STRCPY(code, "NFA_START_NEG_COLL"); break;
  11385  case NFA_END_NEG_COLL:
  11386    STRCPY(code, "NFA_END_NEG_COLL"); break;
  11387  case NFA_RANGE:
  11388    STRCPY(code, "NFA_RANGE"); break;
  11389  case NFA_RANGE_MIN:
  11390    STRCPY(code, "NFA_RANGE_MIN"); break;
  11391  case NFA_RANGE_MAX:
  11392    STRCPY(code, "NFA_RANGE_MAX"); break;
  11393 
  11394  case NFA_CLASS_ALNUM:
  11395    STRCPY(code, "NFA_CLASS_ALNUM"); break;
  11396  case NFA_CLASS_ALPHA:
  11397    STRCPY(code, "NFA_CLASS_ALPHA"); break;
  11398  case NFA_CLASS_BLANK:
  11399    STRCPY(code, "NFA_CLASS_BLANK"); break;
  11400  case NFA_CLASS_CNTRL:
  11401    STRCPY(code, "NFA_CLASS_CNTRL"); break;
  11402  case NFA_CLASS_DIGIT:
  11403    STRCPY(code, "NFA_CLASS_DIGIT"); break;
  11404  case NFA_CLASS_GRAPH:
  11405    STRCPY(code, "NFA_CLASS_GRAPH"); break;
  11406  case NFA_CLASS_LOWER:
  11407    STRCPY(code, "NFA_CLASS_LOWER"); break;
  11408  case NFA_CLASS_PRINT:
  11409    STRCPY(code, "NFA_CLASS_PRINT"); break;
  11410  case NFA_CLASS_PUNCT:
  11411    STRCPY(code, "NFA_CLASS_PUNCT"); break;
  11412  case NFA_CLASS_SPACE:
  11413    STRCPY(code, "NFA_CLASS_SPACE"); break;
  11414  case NFA_CLASS_UPPER:
  11415    STRCPY(code, "NFA_CLASS_UPPER"); break;
  11416  case NFA_CLASS_XDIGIT:
  11417    STRCPY(code, "NFA_CLASS_XDIGIT"); break;
  11418  case NFA_CLASS_TAB:
  11419    STRCPY(code, "NFA_CLASS_TAB"); break;
  11420  case NFA_CLASS_RETURN:
  11421    STRCPY(code, "NFA_CLASS_RETURN"); break;
  11422  case NFA_CLASS_BACKSPACE:
  11423    STRCPY(code, "NFA_CLASS_BACKSPACE"); break;
  11424  case NFA_CLASS_ESCAPE:
  11425    STRCPY(code, "NFA_CLASS_ESCAPE"); break;
  11426  case NFA_CLASS_IDENT:
  11427    STRCPY(code, "NFA_CLASS_IDENT"); break;
  11428  case NFA_CLASS_KEYWORD:
  11429    STRCPY(code, "NFA_CLASS_KEYWORD"); break;
  11430  case NFA_CLASS_FNAME:
  11431    STRCPY(code, "NFA_CLASS_FNAME"); break;
  11432 
  11433  case NFA_ANY:
  11434    STRCPY(code, "NFA_ANY"); break;
  11435  case NFA_IDENT:
  11436    STRCPY(code, "NFA_IDENT"); break;
  11437  case NFA_SIDENT:
  11438    STRCPY(code, "NFA_SIDENT"); break;
  11439  case NFA_KWORD:
  11440    STRCPY(code, "NFA_KWORD"); break;
  11441  case NFA_SKWORD:
  11442    STRCPY(code, "NFA_SKWORD"); break;
  11443  case NFA_FNAME:
  11444    STRCPY(code, "NFA_FNAME"); break;
  11445  case NFA_SFNAME:
  11446    STRCPY(code, "NFA_SFNAME"); break;
  11447  case NFA_PRINT:
  11448    STRCPY(code, "NFA_PRINT"); break;
  11449  case NFA_SPRINT:
  11450    STRCPY(code, "NFA_SPRINT"); break;
  11451  case NFA_WHITE:
  11452    STRCPY(code, "NFA_WHITE"); break;
  11453  case NFA_NWHITE:
  11454    STRCPY(code, "NFA_NWHITE"); break;
  11455  case NFA_DIGIT:
  11456    STRCPY(code, "NFA_DIGIT"); break;
  11457  case NFA_NDIGIT:
  11458    STRCPY(code, "NFA_NDIGIT"); break;
  11459  case NFA_HEX:
  11460    STRCPY(code, "NFA_HEX"); break;
  11461  case NFA_NHEX:
  11462    STRCPY(code, "NFA_NHEX"); break;
  11463  case NFA_OCTAL:
  11464    STRCPY(code, "NFA_OCTAL"); break;
  11465  case NFA_NOCTAL:
  11466    STRCPY(code, "NFA_NOCTAL"); break;
  11467  case NFA_WORD:
  11468    STRCPY(code, "NFA_WORD"); break;
  11469  case NFA_NWORD:
  11470    STRCPY(code, "NFA_NWORD"); break;
  11471  case NFA_HEAD:
  11472    STRCPY(code, "NFA_HEAD"); break;
  11473  case NFA_NHEAD:
  11474    STRCPY(code, "NFA_NHEAD"); break;
  11475  case NFA_ALPHA:
  11476    STRCPY(code, "NFA_ALPHA"); break;
  11477  case NFA_NALPHA:
  11478    STRCPY(code, "NFA_NALPHA"); break;
  11479  case NFA_LOWER:
  11480    STRCPY(code, "NFA_LOWER"); break;
  11481  case NFA_NLOWER:
  11482    STRCPY(code, "NFA_NLOWER"); break;
  11483  case NFA_UPPER:
  11484    STRCPY(code, "NFA_UPPER"); break;
  11485  case NFA_NUPPER:
  11486    STRCPY(code, "NFA_NUPPER"); break;
  11487  case NFA_LOWER_IC:
  11488    STRCPY(code, "NFA_LOWER_IC"); break;
  11489  case NFA_NLOWER_IC:
  11490    STRCPY(code, "NFA_NLOWER_IC"); break;
  11491  case NFA_UPPER_IC:
  11492    STRCPY(code, "NFA_UPPER_IC"); break;
  11493  case NFA_NUPPER_IC:
  11494    STRCPY(code, "NFA_NUPPER_IC"); break;
  11495 
  11496  default:
  11497    STRCPY(code, "CHAR(x)");
  11498    code[5] = c;
  11499  }
  11500 
  11501  if (addnl == true) {
  11502    strcat(code, " + NEWLINE ");
  11503  }
  11504 }
  11505 
  11506 static FILE *log_fd;
  11507 static const uint8_t e_log_open_failed[] =
  11508  N_("Could not open temporary log file for writing, displaying on stderr... ");
  11509 
  11510 // Print the postfix notation of the current regexp.
  11511 static void nfa_postfix_dump(uint8_t *expr, int retval)
  11512 {
  11513  int *p;
  11514  FILE *f;
  11515 
  11516  f = fopen(NFA_REGEXP_DUMP_LOG, "a");
  11517  if (f == NULL) {
  11518    return;
  11519  }
  11520 
  11521  fprintf(f, "\n-------------------------\n");
  11522  if (retval == FAIL) {
  11523    fprintf(f, ">>> NFA engine failed... \n");
  11524  } else if (retval == OK) {
  11525    fprintf(f, ">>> NFA engine succeeded !\n");
  11526  }
  11527  fprintf(f, "Regexp: \"%s\"\nPostfix notation (char): \"", expr);
  11528  for (p = post_start; *p && p < post_ptr; p++) {
  11529    nfa_set_code(*p);
  11530    fprintf(f, "%s, ", code);
  11531  }
  11532  fprintf(f, "\"\nPostfix notation (int): ");
  11533  for (p = post_start; *p && p < post_ptr; p++) {
  11534    fprintf(f, "%d ", *p);
  11535  }
  11536  fprintf(f, "\n\n");
  11537  fclose(f);
  11538 }
  11539 
  11540 // Print the NFA starting with a root node "state".
  11541 static void nfa_print_state(FILE *debugf, nfa_state_T *state)
  11542 {
  11543  garray_T indent;
  11544 
  11545  ga_init(&indent, 1, 64);
  11546  ga_append(&indent, NUL);
  11547  nfa_print_state2(debugf, state, &indent);
  11548  ga_clear(&indent);
  11549 }
  11550 
  11551 static void nfa_print_state2(FILE *debugf, nfa_state_T *state, garray_T *indent)
  11552 {
  11553  uint8_t *p;
  11554 
  11555  if (state == NULL) {
  11556    return;
  11557  }
  11558 
  11559  fprintf(debugf, "(%2d)", abs(state->id));
  11560 
  11561  // Output indent
  11562  p = (uint8_t *)indent->ga_data;
  11563  if (indent->ga_len >= 3) {
  11564    int last = indent->ga_len - 3;
  11565    uint8_t save[2];
  11566 
  11567    strncpy(save, &p[last], 2);  // NOLINT(runtime/printf)
  11568    memcpy(&p[last], "+-", 2);
  11569    fprintf(debugf, " %s", p);
  11570    strncpy(&p[last], save, 2);  // NOLINT(runtime/printf)
  11571  } else {
  11572    fprintf(debugf, " %s", p);
  11573  }
  11574 
  11575  nfa_set_code(state->c);
  11576  fprintf(debugf, "%s (%d) (id=%d) val=%d\n",
  11577          code,
  11578          state->c,
  11579          abs(state->id),
  11580          state->val);
  11581  if (state->id < 0) {
  11582    return;
  11583  }
  11584 
  11585  state->id = abs(state->id) * -1;
  11586 
  11587  // grow indent for state->out
  11588  indent->ga_len -= 1;
  11589  if (state->out1) {
  11590    GA_CONCAT_LITERAL(indent, "| ");
  11591  } else {
  11592    GA_CONCAT_LITERAL(indent, "  ");
  11593  }
  11594  ga_append(indent, NUL);
  11595 
  11596  nfa_print_state2(debugf, state->out, indent);
  11597 
  11598  // replace last part of indent for state->out1
  11599  indent->ga_len -= 3;
  11600  GA_CONCAT_LITERAL(indent, "  ");
  11601  ga_append(indent, NUL);
  11602 
  11603  nfa_print_state2(debugf, state->out1, indent);
  11604 
  11605  // shrink indent
  11606  indent->ga_len -= 3;
  11607  ga_append(indent, NUL);
  11608 }
  11609 
  11610 // Print the NFA state machine.
  11611 static void nfa_dump(nfa_regprog_T *prog)
  11612 {
  11613  FILE *debugf = fopen(NFA_REGEXP_DUMP_LOG, "a");
  11614 
  11615  if (debugf == NULL) {
  11616    return;
  11617  }
  11618 
  11619  nfa_print_state(debugf, prog->start);
  11620 
  11621  if (prog->reganch) {
  11622    fprintf(debugf, "reganch: %d\n", prog->reganch);
  11623  }
  11624  if (prog->regstart != NUL) {
  11625    fprintf(debugf, "regstart: %c (decimal: %d)\n",
  11626            prog->regstart, prog->regstart);
  11627  }
  11628  if (prog->match_text != NULL) {
  11629    fprintf(debugf, "match_text: \"%s\"\n", prog->match_text);
  11630  }
  11631 
  11632  fclose(debugf);
  11633 }
  11634 #endif  // REGEXP_DEBUG
  11635 
  11636 // Parse r.e. @expr and convert it into postfix form.
  11637 // Return the postfix string on success, NULL otherwise.
  11638 static int *re2post(void)
  11639 {
  11640  if (nfa_reg(REG_NOPAREN) == FAIL) {
  11641    return NULL;
  11642  }
  11643  EMIT(NFA_MOPEN);
  11644  return post_start;
  11645 }
  11646 
// NB. Some of the code below is inspired by Russ Cox's regexp code.

// Represents an NFA state plus zero or one or two arrows exiting.
// If c == MATCH, no arrows out; matching state.
// If c == SPLIT, unlabeled arrows to out and out1 (if != NULL).
// If c < 256, labeled arrow with character c to out.
  11653 
  11654 static nfa_state_T *state_ptr;  // points to nfa_prog->state
  11655 
  11656 // Allocate and initialize nfa_state_T.
  11657 static nfa_state_T *alloc_state(int c, nfa_state_T *out, nfa_state_T *out1)
  11658 {
  11659  nfa_state_T *s;
  11660 
  11661  if (istate >= nstate) {
  11662    return NULL;
  11663  }
  11664 
  11665  s = &state_ptr[istate++];
  11666 
  11667  s->c = c;
  11668  s->out = out;
  11669  s->out1 = out1;
  11670  s->val = 0;
  11671 
  11672  s->id = istate;
  11673  s->lastlist[0] = 0;
  11674  s->lastlist[1] = 0;
  11675 
  11676  return s;
  11677 }
  11678 
  11679 // A partially built NFA without the matching state filled in.
  11680 // Frag_T.start points at the start state.
  11681 // Frag_T.out is a list of places that need to be set to the
  11682 // next state for this fragment.
  11683 
  11684 // Initialize a Frag_T struct and return it.
  11685 static Frag_T frag(nfa_state_T *start, Ptrlist *out)
  11686 {
  11687  Frag_T n;
  11688 
  11689  n.start = start;
  11690  n.out = out;
  11691  return n;
  11692 }
  11693 
  11694 // Create singleton list containing just outp.
  11695 static Ptrlist *list1(nfa_state_T **outp)
  11696 {
  11697  Ptrlist *l;
  11698 
  11699  l = (Ptrlist *)outp;
  11700  l->next = NULL;
  11701  return l;
  11702 }
  11703 
  11704 // Patch the list of states at out to point to start.
  11705 static void patch(Ptrlist *l, nfa_state_T *s)
  11706 {
  11707  Ptrlist *next;
  11708 
  11709  for (; l; l = next) {
  11710    next = l->next;
  11711    l->s = s;
  11712  }
  11713 }
  11714 
  11715 // Join the two lists l1 and l2, returning the combination.
  11716 static Ptrlist *append(Ptrlist *l1, Ptrlist *l2)
  11717 {
  11718  Ptrlist *oldl1;
  11719 
  11720  oldl1 = l1;
  11721  while (l1->next) {
  11722    l1 = l1->next;
  11723  }
  11724  l1->next = l2;
  11725  return oldl1;
  11726 }
  11727 
  11728 // Stack used for transforming postfix form into NFA.
  11729 static Frag_T empty;
  11730 
  11731 static void st_error(int *postfix, int *end, int *p)
  11732 {
  11733 #ifdef NFA_REGEXP_ERROR_LOG
  11734  FILE *df;
  11735  int *p2;
  11736 
  11737  df = fopen(NFA_REGEXP_ERROR_LOG, "a");
  11738  if (df) {
  11739    fprintf(df, "Error popping the stack!\n");
  11740 # ifdef REGEXP_DEBUG
  11741    fprintf(df, "Current regexp is \"%s\"\n", nfa_regengine.expr);
  11742 # endif
  11743    fprintf(df, "Postfix form is: ");
  11744 # ifdef REGEXP_DEBUG
  11745    for (p2 = postfix; p2 < end; p2++) {
  11746      nfa_set_code(*p2);
  11747      fprintf(df, "%s, ", code);
  11748    }
  11749    nfa_set_code(*p);
  11750    fprintf(df, "\nCurrent position is: ");
  11751    for (p2 = postfix; p2 <= p; p2++) {
  11752      nfa_set_code(*p2);
  11753      fprintf(df, "%s, ", code);
  11754    }
  11755 # else
  11756    for (p2 = postfix; p2 < end; p2++) {
  11757      fprintf(df, "%d, ", *p2);
  11758    }
  11759    fprintf(df, "\nCurrent position is: ");
  11760    for (p2 = postfix; p2 <= p; p2++) {
  11761      fprintf(df, "%d, ", *p2);
  11762    }
  11763 # endif
  11764    fprintf(df, "\n--------------------------\n");
  11765    fclose(df);
  11766  }
  11767 #endif
  11768  emsg(_("E874: (NFA) Could not pop the stack!"));
  11769 }
  11770 
  11771 // Push an item onto the stack.
  11772 static void st_push(Frag_T s, Frag_T **p, Frag_T *stack_end)
  11773 {
  11774  Frag_T *stackp = *p;
  11775 
  11776  if (stackp >= stack_end) {
  11777    return;
  11778  }
  11779  *stackp = s;
  11780  *p = *p + 1;
  11781 }
  11782 
  11783 // Pop an item from the stack.
  11784 static Frag_T st_pop(Frag_T **p, Frag_T *stack)
  11785 {
  11786  Frag_T *stackp;
  11787 
  11788  *p = *p - 1;
  11789  stackp = *p;
  11790  if (stackp < stack) {
  11791    return empty;
  11792  }
  11793  return **p;
  11794 }
  11795 
  11796 // Estimate the maximum byte length of anything matching "state".
  11797 // When unknown or unlimited return -1.
  11798 static int nfa_max_width(nfa_state_T *startstate, int depth)
  11799 {
  11800  int l, r;
  11801  nfa_state_T *state = startstate;
  11802  int len = 0;
  11803 
  11804  // detect looping in a NFA_SPLIT
  11805  if (depth > 4) {
  11806    return -1;
  11807  }
  11808 
  11809  while (state != NULL) {
  11810    switch (state->c) {
  11811    case NFA_END_INVISIBLE:
  11812    case NFA_END_INVISIBLE_NEG:
  11813      // the end, return what we have
  11814      return len;
  11815 
  11816    case NFA_SPLIT:
  11817      // two alternatives, use the maximum
  11818      l = nfa_max_width(state->out, depth + 1);
  11819      r = nfa_max_width(state->out1, depth + 1);
  11820      if (l < 0 || r < 0) {
  11821        return -1;
  11822      }
  11823      return len + (l > r ? l : r);
  11824 
  11825    case NFA_ANY:
  11826    case NFA_START_COLL:
  11827    case NFA_START_NEG_COLL:
  11828      // Matches some character, including composing chars.
  11829      len += MB_MAXBYTES;
  11830      if (state->c != NFA_ANY) {
  11831        // Skip over the characters.
  11832        state = state->out1->out;
  11833        continue;
  11834      }
  11835      break;
  11836 
  11837    case NFA_DIGIT:
  11838    case NFA_WHITE:
  11839    case NFA_HEX:
  11840    case NFA_OCTAL:
  11841      // ascii
  11842      len++;
  11843      break;
  11844 
  11845    case NFA_IDENT:
  11846    case NFA_SIDENT:
  11847    case NFA_KWORD:
  11848    case NFA_SKWORD:
  11849    case NFA_FNAME:
  11850    case NFA_SFNAME:
  11851    case NFA_PRINT:
  11852    case NFA_SPRINT:
  11853    case NFA_NWHITE:
  11854    case NFA_NDIGIT:
  11855    case NFA_NHEX:
  11856    case NFA_NOCTAL:
  11857    case NFA_WORD:
  11858    case NFA_NWORD:
  11859    case NFA_HEAD:
  11860    case NFA_NHEAD:
  11861    case NFA_ALPHA:
  11862    case NFA_NALPHA:
  11863    case NFA_LOWER:
  11864    case NFA_NLOWER:
  11865    case NFA_UPPER:
  11866    case NFA_NUPPER:
  11867    case NFA_LOWER_IC:
  11868    case NFA_NLOWER_IC:
  11869    case NFA_UPPER_IC:
  11870    case NFA_NUPPER_IC:
  11871    case NFA_ANY_COMPOSING:
  11872      // possibly non-ascii
  11873      len += 3;
  11874      break;
  11875 
  11876    case NFA_START_INVISIBLE:
  11877    case NFA_START_INVISIBLE_NEG:
  11878    case NFA_START_INVISIBLE_BEFORE:
  11879    case NFA_START_INVISIBLE_BEFORE_NEG:
  11880      // zero-width, out1 points to the END state
  11881      state = state->out1->out;
  11882      continue;
  11883 
  11884    case NFA_BACKREF1:
  11885    case NFA_BACKREF2:
  11886    case NFA_BACKREF3:
  11887    case NFA_BACKREF4:
  11888    case NFA_BACKREF5:
  11889    case NFA_BACKREF6:
  11890    case NFA_BACKREF7:
  11891    case NFA_BACKREF8:
  11892    case NFA_BACKREF9:
  11893    case NFA_ZREF1:
  11894    case NFA_ZREF2:
  11895    case NFA_ZREF3:
  11896    case NFA_ZREF4:
  11897    case NFA_ZREF5:
  11898    case NFA_ZREF6:
  11899    case NFA_ZREF7:
  11900    case NFA_ZREF8:
  11901    case NFA_ZREF9:
  11902    case NFA_NEWL:
  11903    case NFA_SKIP:
  11904      // unknown width
  11905      return -1;
  11906 
  11907    case NFA_BOL:
  11908    case NFA_EOL:
  11909    case NFA_BOF:
  11910    case NFA_EOF:
  11911    case NFA_BOW:
  11912    case NFA_EOW:
  11913    case NFA_MOPEN:
  11914    case NFA_MOPEN1:
  11915    case NFA_MOPEN2:
  11916    case NFA_MOPEN3:
  11917    case NFA_MOPEN4:
  11918    case NFA_MOPEN5:
  11919    case NFA_MOPEN6:
  11920    case NFA_MOPEN7:
  11921    case NFA_MOPEN8:
  11922    case NFA_MOPEN9:
  11923    case NFA_ZOPEN:
  11924    case NFA_ZOPEN1:
  11925    case NFA_ZOPEN2:
  11926    case NFA_ZOPEN3:
  11927    case NFA_ZOPEN4:
  11928    case NFA_ZOPEN5:
  11929    case NFA_ZOPEN6:
  11930    case NFA_ZOPEN7:
  11931    case NFA_ZOPEN8:
  11932    case NFA_ZOPEN9:
  11933    case NFA_ZCLOSE:
  11934    case NFA_ZCLOSE1:
  11935    case NFA_ZCLOSE2:
  11936    case NFA_ZCLOSE3:
  11937    case NFA_ZCLOSE4:
  11938    case NFA_ZCLOSE5:
  11939    case NFA_ZCLOSE6:
  11940    case NFA_ZCLOSE7:
  11941    case NFA_ZCLOSE8:
  11942    case NFA_ZCLOSE9:
  11943    case NFA_MCLOSE:
  11944    case NFA_MCLOSE1:
  11945    case NFA_MCLOSE2:
  11946    case NFA_MCLOSE3:
  11947    case NFA_MCLOSE4:
  11948    case NFA_MCLOSE5:
  11949    case NFA_MCLOSE6:
  11950    case NFA_MCLOSE7:
  11951    case NFA_MCLOSE8:
  11952    case NFA_MCLOSE9:
  11953    case NFA_NOPEN:
  11954    case NFA_NCLOSE:
  11955 
  11956    case NFA_LNUM_GT:
  11957    case NFA_LNUM_LT:
  11958    case NFA_COL_GT:
  11959    case NFA_COL_LT:
  11960    case NFA_VCOL_GT:
  11961    case NFA_VCOL_LT:
  11962    case NFA_MARK_GT:
  11963    case NFA_MARK_LT:
  11964    case NFA_VISUAL:
  11965    case NFA_LNUM:
  11966    case NFA_CURSOR:
  11967    case NFA_COL:
  11968    case NFA_VCOL:
  11969    case NFA_MARK:
  11970 
  11971    case NFA_ZSTART:
  11972    case NFA_ZEND:
  11973    case NFA_OPT_CHARS:
  11974    case NFA_EMPTY:
  11975    case NFA_START_PATTERN:
  11976    case NFA_END_PATTERN:
  11977    case NFA_COMPOSING:
  11978    case NFA_END_COMPOSING:
  11979      // zero-width
  11980      break;
  11981 
  11982    default:
  11983      if (state->c < 0) {
  11984        // don't know what this is
  11985        return -1;
  11986      }
  11987      // normal character
  11988      len += utf_char2len(state->c);
  11989      break;
  11990    }
  11991 
  11992    // normal way to continue
  11993    state = state->out;
  11994  }
  11995 
  11996  // unrecognized, "cannot happen"
  11997  return -1;
  11998 }
  11999 
  12000 // Convert a postfix form into its equivalent NFA.
  12001 // Return the NFA start state on success, NULL otherwise.
  12002 static nfa_state_T *post2nfa(int *postfix, int *end, int nfa_calc_size)
  12003 {
  12004  int *p;
  12005  int mopen;
  12006  int mclose;
  12007  Frag_T *stack = NULL;
  12008  Frag_T *stackp = NULL;
  12009  Frag_T *stack_end = NULL;
  12010  Frag_T e1;
  12011  Frag_T e2;
  12012  Frag_T e;
  12013  nfa_state_T *s;
  12014  nfa_state_T *s1;
  12015  nfa_state_T *matchstate;
  12016  nfa_state_T *ret = NULL;
  12017 
  12018  if (postfix == NULL) {
  12019    return NULL;
  12020  }
  12021 
  12022 #define PUSH(s)     st_push((s), &stackp, stack_end)
  12023 #define POP()       st_pop(&stackp, stack); \
  12024  if (stackp < stack) { \
  12025    st_error(postfix, end, p); \
  12026    xfree(stack); \
  12027    return NULL; \
  12028  }
  12029 
  12030  if (nfa_calc_size == false) {
  12031    // Allocate space for the stack. Max states on the stack: "nstate".
  12032    stack = xmalloc((size_t)(nstate + 1) * sizeof(Frag_T));
  12033    stackp = stack;
  12034    stack_end = stack + (nstate + 1);
  12035  }
  12036 
  12037  for (p = postfix; p < end; p++) {
  12038    switch (*p) {
  12039    case NFA_CONCAT:
  12040      // Concatenation.
  12041      // Pay attention: this operator does not exist in the r.e. itself
  12042      // (it is implicit, really).  It is added when r.e. is translated
  12043      // to postfix form in re2post().
  12044      if (nfa_calc_size == true) {
  12045        // nstate += 0;
  12046        break;
  12047      }
  12048      e2 = POP();
  12049      e1 = POP();
  12050      patch(e1.out, e2.start);
  12051      PUSH(frag(e1.start, e2.out));
  12052      break;
  12053 
  12054    case NFA_OR:
  12055      // Alternation
  12056      if (nfa_calc_size == true) {
  12057        nstate++;
  12058        break;
  12059      }
  12060      e2 = POP();
  12061      e1 = POP();
  12062      s = alloc_state(NFA_SPLIT, e1.start, e2.start);
  12063      if (s == NULL) {
  12064        goto theend;
  12065      }
  12066      PUSH(frag(s, append(e1.out, e2.out)));
  12067      break;
  12068 
  12069    case NFA_STAR:
  12070      // Zero or more, prefer more
  12071      if (nfa_calc_size == true) {
  12072        nstate++;
  12073        break;
  12074      }
  12075      e = POP();
  12076      s = alloc_state(NFA_SPLIT, e.start, NULL);
  12077      if (s == NULL) {
  12078        goto theend;
  12079      }
  12080      patch(e.out, s);
  12081      PUSH(frag(s, list1(&s->out1)));
  12082      break;
  12083 
  12084    case NFA_STAR_NONGREEDY:
  12085      // Zero or more, prefer zero
  12086      if (nfa_calc_size == true) {
  12087        nstate++;
  12088        break;
  12089      }
  12090      e = POP();
  12091      s = alloc_state(NFA_SPLIT, NULL, e.start);
  12092      if (s == NULL) {
  12093        goto theend;
  12094      }
  12095      patch(e.out, s);
  12096      PUSH(frag(s, list1(&s->out)));
  12097      break;
  12098 
  12099    case NFA_QUEST:
  12100      // one or zero atoms=> greedy match
  12101      if (nfa_calc_size == true) {
  12102        nstate++;
  12103        break;
  12104      }
  12105      e = POP();
  12106      s = alloc_state(NFA_SPLIT, e.start, NULL);
  12107      if (s == NULL) {
  12108        goto theend;
  12109      }
  12110      PUSH(frag(s, append(e.out, list1(&s->out1))));
  12111      break;
  12112 
  12113    case NFA_QUEST_NONGREEDY:
  12114      // zero or one atoms => non-greedy match
  12115      if (nfa_calc_size == true) {
  12116        nstate++;
  12117        break;
  12118      }
  12119      e = POP();
  12120      s = alloc_state(NFA_SPLIT, NULL, e.start);
  12121      if (s == NULL) {
  12122        goto theend;
  12123      }
  12124      PUSH(frag(s, append(e.out, list1(&s->out))));
  12125      break;
  12126 
  12127    case NFA_END_COLL:
  12128    case NFA_END_NEG_COLL:
  12129      // On the stack is the sequence starting with NFA_START_COLL or
  12130      // NFA_START_NEG_COLL and all possible characters. Patch it to
  12131      // add the output to the start.
  12132      if (nfa_calc_size == true) {
  12133        nstate++;
  12134        break;
  12135      }
  12136      e = POP();
  12137      s = alloc_state(NFA_END_COLL, NULL, NULL);
  12138      if (s == NULL) {
  12139        goto theend;
  12140      }
  12141      patch(e.out, s);
  12142      e.start->out1 = s;
  12143      PUSH(frag(e.start, list1(&s->out)));
  12144      break;
  12145 
  12146    case NFA_RANGE:
  12147      // Before this are two characters, the low and high end of a
  12148      // range.  Turn them into two states with MIN and MAX.
  12149      if (nfa_calc_size == true) {
  12150        // nstate += 0;
  12151        break;
  12152      }
  12153      e2 = POP();
  12154      e1 = POP();
  12155      e2.start->val = e2.start->c;
  12156      e2.start->c = NFA_RANGE_MAX;
  12157      e1.start->val = e1.start->c;
  12158      e1.start->c = NFA_RANGE_MIN;
  12159      patch(e1.out, e2.start);
  12160      PUSH(frag(e1.start, e2.out));
  12161      break;
  12162 
  12163    case NFA_EMPTY:
  12164      // 0-length, used in a repetition with max/min count of 0
  12165      if (nfa_calc_size == true) {
  12166        nstate++;
  12167        break;
  12168      }
  12169      s = alloc_state(NFA_EMPTY, NULL, NULL);
  12170      if (s == NULL) {
  12171        goto theend;
  12172      }
  12173      PUSH(frag(s, list1(&s->out)));
  12174      break;
  12175 
  12176    case NFA_OPT_CHARS: {
  12177      int n;
  12178 
  12179      // \%[abc] implemented as:
  12180      //    NFA_SPLIT
  12181      //    +-CHAR(a)
  12182      //    | +-NFA_SPLIT
  12183      //    |   +-CHAR(b)
  12184      //    |   | +-NFA_SPLIT
  12185      //    |   |   +-CHAR(c)
  12186      //    |   |   | +-next
  12187      //    |   |   +- next
  12188      //    |   +- next
  12189      //    +- next
  12190      n = *++p;  // get number of characters
  12191      if (nfa_calc_size == true) {
  12192        nstate += n;
  12193        break;
  12194      }
  12195      s = NULL;       // avoid compiler warning
  12196      e1.out = NULL;       // stores list with out1's
  12197      s1 = NULL;       // previous NFA_SPLIT to connect to
  12198      while (n-- > 0) {
  12199        e = POP();         // get character
  12200        s = alloc_state(NFA_SPLIT, e.start, NULL);
  12201        if (s == NULL) {
  12202          goto theend;
  12203        }
  12204        if (e1.out == NULL) {
  12205          e1 = e;
  12206        }
  12207        patch(e.out, s1);
  12208        append(e1.out, list1(&s->out1));
  12209        s1 = s;
  12210      }
  12211      PUSH(frag(s, e1.out));
  12212      break;
  12213    }
  12214 
  12215    case NFA_PREV_ATOM_NO_WIDTH:
  12216    case NFA_PREV_ATOM_NO_WIDTH_NEG:
  12217    case NFA_PREV_ATOM_JUST_BEFORE:
  12218    case NFA_PREV_ATOM_JUST_BEFORE_NEG:
  12219    case NFA_PREV_ATOM_LIKE_PATTERN: {
  12220      int before = (*p == NFA_PREV_ATOM_JUST_BEFORE
  12221                    || *p == NFA_PREV_ATOM_JUST_BEFORE_NEG);
  12222      int pattern = (*p == NFA_PREV_ATOM_LIKE_PATTERN);
  12223      int start_state;
  12224      int end_state;
  12225      int n = 0;
  12226      nfa_state_T *zend;
  12227      nfa_state_T *skip;
  12228 
  12229      switch (*p) {
  12230      case NFA_PREV_ATOM_NO_WIDTH:
  12231        start_state = NFA_START_INVISIBLE;
  12232        end_state = NFA_END_INVISIBLE;
  12233        break;
  12234      case NFA_PREV_ATOM_NO_WIDTH_NEG:
  12235        start_state = NFA_START_INVISIBLE_NEG;
  12236        end_state = NFA_END_INVISIBLE_NEG;
  12237        break;
  12238      case NFA_PREV_ATOM_JUST_BEFORE:
  12239        start_state = NFA_START_INVISIBLE_BEFORE;
  12240        end_state = NFA_END_INVISIBLE;
  12241        break;
  12242      case NFA_PREV_ATOM_JUST_BEFORE_NEG:
  12243        start_state = NFA_START_INVISIBLE_BEFORE_NEG;
  12244        end_state = NFA_END_INVISIBLE_NEG;
  12245        break;
  12246      default:           // NFA_PREV_ATOM_LIKE_PATTERN:
  12247        start_state = NFA_START_PATTERN;
  12248        end_state = NFA_END_PATTERN;
  12249        break;
  12250      }
  12251 
  12252      if (before) {
  12253        n = *++p;         // get the count
  12254      }
  12255      // The \@= operator: match the preceding atom with zero width.
  12256      // The \@! operator: no match for the preceding atom.
  12257      // The \@<= operator: match for the preceding atom.
  12258      // The \@<! operator: no match for the preceding atom.
  12259      // Surrounds the preceding atom with START_INVISIBLE and
  12260      // END_INVISIBLE, similarly to MOPEN.
  12261 
  12262      if (nfa_calc_size == true) {
  12263        nstate += pattern ? 4 : 2;
  12264        break;
  12265      }
  12266      e = POP();
  12267      s1 = alloc_state(end_state, NULL, NULL);
  12268      if (s1 == NULL) {
  12269        goto theend;
  12270      }
  12271 
  12272      s = alloc_state(start_state, e.start, s1);
  12273      if (s == NULL) {
  12274        goto theend;
  12275      }
  12276      if (pattern) {
  12277        // NFA_ZEND -> NFA_END_PATTERN -> NFA_SKIP -> what follows.
  12278        skip = alloc_state(NFA_SKIP, NULL, NULL);
  12279        if (skip == NULL) {
  12280          goto theend;
  12281        }
  12282        zend = alloc_state(NFA_ZEND, s1, NULL);
  12283        if (zend == NULL) {
  12284          goto theend;
  12285        }
  12286        s1->out = skip;
  12287        patch(e.out, zend);
  12288        PUSH(frag(s, list1(&skip->out)));
  12289      } else {
  12290        patch(e.out, s1);
  12291        PUSH(frag(s, list1(&s1->out)));
  12292        if (before) {
  12293          if (n <= 0) {
  12294            // See if we can guess the maximum width, it avoids a
  12295            // lot of pointless tries.
  12296            n = nfa_max_width(e.start, 0);
  12297          }
  12298          s->val = n;           // store the count
  12299        }
  12300      }
  12301      break;
  12302    }
  12303 
  12304    case NFA_COMPOSING:         // char with composing char
  12305      FALLTHROUGH;
  12306 
  12307    case NFA_MOPEN:     // \( \) Submatch
  12308    case NFA_MOPEN1:
  12309    case NFA_MOPEN2:
  12310    case NFA_MOPEN3:
  12311    case NFA_MOPEN4:
  12312    case NFA_MOPEN5:
  12313    case NFA_MOPEN6:
  12314    case NFA_MOPEN7:
  12315    case NFA_MOPEN8:
  12316    case NFA_MOPEN9:
  12317    case NFA_ZOPEN:     // \z( \) Submatch
  12318    case NFA_ZOPEN1:
  12319    case NFA_ZOPEN2:
  12320    case NFA_ZOPEN3:
  12321    case NFA_ZOPEN4:
  12322    case NFA_ZOPEN5:
  12323    case NFA_ZOPEN6:
  12324    case NFA_ZOPEN7:
  12325    case NFA_ZOPEN8:
  12326    case NFA_ZOPEN9:
  12327    case NFA_NOPEN:     // \%( \) "Invisible Submatch"
  12328      if (nfa_calc_size == true) {
  12329        nstate += 2;
  12330        break;
  12331      }
  12332 
  12333      mopen = *p;
  12334      switch (*p) {
  12335      case NFA_NOPEN:
  12336        mclose = NFA_NCLOSE; break;
  12337      case NFA_ZOPEN:
  12338        mclose = NFA_ZCLOSE; break;
  12339      case NFA_ZOPEN1:
  12340        mclose = NFA_ZCLOSE1; break;
  12341      case NFA_ZOPEN2:
  12342        mclose = NFA_ZCLOSE2; break;
  12343      case NFA_ZOPEN3:
  12344        mclose = NFA_ZCLOSE3; break;
  12345      case NFA_ZOPEN4:
  12346        mclose = NFA_ZCLOSE4; break;
  12347      case NFA_ZOPEN5:
  12348        mclose = NFA_ZCLOSE5; break;
  12349      case NFA_ZOPEN6:
  12350        mclose = NFA_ZCLOSE6; break;
  12351      case NFA_ZOPEN7:
  12352        mclose = NFA_ZCLOSE7; break;
  12353      case NFA_ZOPEN8:
  12354        mclose = NFA_ZCLOSE8; break;
  12355      case NFA_ZOPEN9:
  12356        mclose = NFA_ZCLOSE9; break;
  12357      case NFA_COMPOSING:
  12358        mclose = NFA_END_COMPOSING; break;
  12359      default:
  12360        // NFA_MOPEN, NFA_MOPEN1 .. NFA_MOPEN9
  12361        mclose = *p + NSUBEXP;
  12362        break;
  12363      }
  12364 
  12365      // Allow "NFA_MOPEN" as a valid postfix representation for
  12366      // the empty regexp "". In this case, the NFA will be
  12367      // NFA_MOPEN -> NFA_MCLOSE. Note that this also allows
  12368      // empty groups of parenthesis, and empty mbyte chars
  12369      if (stackp == stack) {
  12370        s = alloc_state(mopen, NULL, NULL);
  12371        if (s == NULL) {
  12372          goto theend;
  12373        }
  12374        s1 = alloc_state(mclose, NULL, NULL);
  12375        if (s1 == NULL) {
  12376          goto theend;
  12377        }
  12378        patch(list1(&s->out), s1);
  12379        PUSH(frag(s, list1(&s1->out)));
  12380        break;
  12381      }
  12382 
  12383      // At least one node was emitted before NFA_MOPEN, so
  12384      // at least one node will be between NFA_MOPEN and NFA_MCLOSE
  12385      e = POP();
  12386      s = alloc_state(mopen, e.start, NULL);         // `('
  12387      if (s == NULL) {
  12388        goto theend;
  12389      }
  12390 
  12391      s1 = alloc_state(mclose, NULL, NULL);         // `)'
  12392      if (s1 == NULL) {
  12393        goto theend;
  12394      }
  12395      patch(e.out, s1);
  12396 
  12397      if (mopen == NFA_COMPOSING) {
  12398        // COMPOSING->out1 = END_COMPOSING
  12399        patch(list1(&s->out1), s1);
  12400      }
  12401 
  12402      PUSH(frag(s, list1(&s1->out)));
  12403      break;
  12404 
  12405    case NFA_BACKREF1:
  12406    case NFA_BACKREF2:
  12407    case NFA_BACKREF3:
  12408    case NFA_BACKREF4:
  12409    case NFA_BACKREF5:
  12410    case NFA_BACKREF6:
  12411    case NFA_BACKREF7:
  12412    case NFA_BACKREF8:
  12413    case NFA_BACKREF9:
  12414    case NFA_ZREF1:
  12415    case NFA_ZREF2:
  12416    case NFA_ZREF3:
  12417    case NFA_ZREF4:
  12418    case NFA_ZREF5:
  12419    case NFA_ZREF6:
  12420    case NFA_ZREF7:
  12421    case NFA_ZREF8:
  12422    case NFA_ZREF9:
  12423      if (nfa_calc_size == true) {
  12424        nstate += 2;
  12425        break;
  12426      }
  12427      s = alloc_state(*p, NULL, NULL);
  12428      if (s == NULL) {
  12429        goto theend;
  12430      }
  12431      s1 = alloc_state(NFA_SKIP, NULL, NULL);
  12432      if (s1 == NULL) {
  12433        goto theend;
  12434      }
  12435      patch(list1(&s->out), s1);
  12436      PUSH(frag(s, list1(&s1->out)));
  12437      break;
  12438 
  12439    case NFA_LNUM:
  12440    case NFA_LNUM_GT:
  12441    case NFA_LNUM_LT:
  12442    case NFA_VCOL:
  12443    case NFA_VCOL_GT:
  12444    case NFA_VCOL_LT:
  12445    case NFA_COL:
  12446    case NFA_COL_GT:
  12447    case NFA_COL_LT:
  12448    case NFA_MARK:
  12449    case NFA_MARK_GT:
  12450    case NFA_MARK_LT: {
  12451      int n = *++p;       // lnum, col or mark name
  12452 
  12453      if (nfa_calc_size == true) {
  12454        nstate += 1;
  12455        break;
  12456      }
  12457      s = alloc_state(p[-1], NULL, NULL);
  12458      if (s == NULL) {
  12459        goto theend;
  12460      }
  12461      s->val = n;
  12462      PUSH(frag(s, list1(&s->out)));
  12463      break;
  12464    }
  12465 
  12466    case NFA_ZSTART:
  12467    case NFA_ZEND:
  12468    default:
  12469      // Operands
  12470      if (nfa_calc_size == true) {
  12471        nstate++;
  12472        break;
  12473      }
  12474      s = alloc_state(*p, NULL, NULL);
  12475      if (s == NULL) {
  12476        goto theend;
  12477      }
  12478      PUSH(frag(s, list1(&s->out)));
  12479      break;
  12480    }     // switch(*p)
  12481  }   // for(p = postfix; *p; ++p)
  12482 
  12483  if (nfa_calc_size == true) {
  12484    nstate++;
  12485    goto theend;        // Return value when counting size is ignored anyway
  12486  }
  12487 
  12488  e = POP();
  12489  if (stackp != stack) {
  12490    xfree(stack);
  12491    EMSG_RET_NULL(_("E875: (NFA regexp) (While converting from postfix to NFA),"
  12492                    "too many states left on stack"));
  12493  }
  12494 
  12495  if (istate >= nstate) {
  12496    xfree(stack);
  12497    EMSG_RET_NULL(_("E876: (NFA regexp) "
  12498                    "Not enough space to store the whole NFA "));
  12499  }
  12500 
  12501  matchstate = &state_ptr[istate++];   // the match state
  12502  matchstate->c = NFA_MATCH;
  12503  matchstate->out = matchstate->out1 = NULL;
  12504  matchstate->id = 0;
  12505 
  12506  patch(e.out, matchstate);
  12507  ret = e.start;
  12508 
  12509 theend:
  12510  xfree(stack);
  12511  return ret;
  12512 
  12513 #undef POP1
  12514 #undef PUSH1
  12515 #undef POP2
  12516 #undef PUSH2
  12517 #undef POP
  12518 #undef PUSH
  12519 }
  12520 
  12521 // After building the NFA program, inspect it to add optimization hints.
  12522 static void nfa_postprocess(nfa_regprog_T *prog)
  12523 {
  12524  int i;
  12525  int c;
  12526 
  12527  for (i = 0; i < prog->nstate; i++) {
  12528    c = prog->state[i].c;
  12529    if (c == NFA_START_INVISIBLE
  12530        || c == NFA_START_INVISIBLE_NEG
  12531        || c == NFA_START_INVISIBLE_BEFORE
  12532        || c == NFA_START_INVISIBLE_BEFORE_NEG) {
  12533      int directly;
  12534 
  12535      // Do it directly when what follows is possibly the end of the
  12536      // match.
  12537      if (match_follows(prog->state[i].out1->out, 0)) {
  12538        directly = true;
  12539      } else {
  12540        int ch_invisible = failure_chance(prog->state[i].out, 0);
  12541        int ch_follows = failure_chance(prog->state[i].out1->out, 0);
  12542 
  12543        // Postpone when the invisible match is expensive or has a
  12544        // lower chance of failing.
  12545        if (c == NFA_START_INVISIBLE_BEFORE
  12546            || c == NFA_START_INVISIBLE_BEFORE_NEG) {
  12547          // "before" matches are very expensive when
  12548          // unbounded, always prefer what follows then,
  12549          // unless what follows will always match.
  12550          // Otherwise strongly prefer what follows.
  12551          if (prog->state[i].val <= 0 && ch_follows > 0) {
  12552            directly = false;
  12553          } else {
  12554            directly = ch_follows * 10 < ch_invisible;
  12555          }
  12556        } else {
  12557          // normal invisible, first do the one with the
  12558          // highest failure chance
  12559          directly = ch_follows < ch_invisible;
  12560        }
  12561      }
  12562      if (directly) {
  12563        // switch to the _FIRST state
  12564        prog->state[i].c++;
  12565      }
  12566    }
  12567  }
  12568 }
  12569 
  12570 /////////////////////////////////////////////////////////////////
  12571 // NFA execution code.
  12572 /////////////////////////////////////////////////////////////////
  12573 
  12574 // Values for done in nfa_pim_T.
  12575 #define NFA_PIM_UNUSED   0      // pim not used
  12576 #define NFA_PIM_TODO     1      // pim not done yet
  12577 #define NFA_PIM_MATCH    2      // pim executed, matches
  12578 #define NFA_PIM_NOMATCH  3      // pim executed, no match
  12579 
  12580 #ifdef REGEXP_DEBUG
  12581 static void log_subsexpr(regsubs_T *subs)
  12582 {
  12583  log_subexpr(&subs->norm);
  12584  if (rex.nfa_has_zsubexpr) {
  12585    log_subexpr(&subs->synt);
  12586  }
  12587 }
  12588 
  12589 static void log_subexpr(regsub_T *sub)
  12590 {
  12591  int j;
  12592 
  12593  for (j = 0; j < sub->in_use; j++) {
  12594    if (REG_MULTI) {
  12595      fprintf(log_fd, "*** group %d, start: c=%d, l=%d, end: c=%d, l=%d\n",
  12596              j,
  12597              sub->list.multi[j].start_col,
  12598              (int)sub->list.multi[j].start_lnum,
  12599              sub->list.multi[j].end_col,
  12600              (int)sub->list.multi[j].end_lnum);
  12601    } else {
  12602      char *s = (char *)sub->list.line[j].start;
  12603      char *e = (char *)sub->list.line[j].end;
  12604 
  12605      fprintf(log_fd, "*** group %d, start: \"%s\", end: \"%s\"\n",
  12606              j,
  12607              s == NULL ? "NULL" : s,
  12608              e == NULL ? "NULL" : e);
  12609    }
  12610  }
  12611 }
  12612 
  12613 static char *pim_info(const nfa_pim_T *pim)
  12614 {
  12615  static char buf[30];
  12616 
  12617  if (pim == NULL || pim->result == NFA_PIM_UNUSED) {
  12618    buf[0] = NUL;
  12619  } else {
  12620    snprintf(buf, sizeof(buf), " PIM col %d",
  12621             REG_MULTI
  12622             ? (int)pim->end.pos.col
  12623             : (int)(pim->end.ptr - rex.input));
  12624  }
  12625  return buf;
  12626 }
  12627 
  12628 #endif
  12629 
  12630 // Used during execution: whether a match has been found.
  12631 static int nfa_match;
  12632 static proftime_T *nfa_time_limit;
  12633 static int *nfa_timed_out;
  12634 static int nfa_time_count;
  12635 
  12636 // Copy postponed invisible match info from "from" to "to".
  12637 static void copy_pim(nfa_pim_T *to, nfa_pim_T *from)
  12638 {
  12639  to->result = from->result;
  12640  to->state = from->state;
  12641  copy_sub(&to->subs.norm, &from->subs.norm);
  12642  if (rex.nfa_has_zsubexpr) {
  12643    copy_sub(&to->subs.synt, &from->subs.synt);
  12644  }
  12645  to->end = from->end;
  12646 }
  12647 
  12648 static void clear_sub(regsub_T *sub)
  12649 {
  12650  if (REG_MULTI) {
  12651    // Use 0xff to set lnum to -1
  12652    memset(sub->list.multi, 0xff, sizeof(struct multipos) * (size_t)rex.nfa_nsubexpr);
  12653  } else {
  12654    memset(sub->list.line, 0, sizeof(struct linepos) * (size_t)rex.nfa_nsubexpr);
  12655  }
  12656  sub->in_use = 0;
  12657 }
  12658 
  12659 // Copy the submatches from "from" to "to".
  12660 static void copy_sub(regsub_T *to, regsub_T *from)
  12661 {
  12662  to->in_use = from->in_use;
  12663  if (from->in_use <= 0) {
  12664    return;
  12665  }
  12666 
  12667  // Copy the match start and end positions.
  12668  if (REG_MULTI) {
  12669    memmove(&to->list.multi[0], &from->list.multi[0],
  12670            sizeof(struct multipos) * (size_t)from->in_use);
  12671    to->orig_start_col = from->orig_start_col;
  12672  } else {
  12673    memmove(&to->list.line[0], &from->list.line[0],
  12674            sizeof(struct linepos) * (size_t)from->in_use);
  12675  }
  12676 }
  12677 
  12678 // Like copy_sub() but exclude the main match.
  12679 static void copy_sub_off(regsub_T *to, regsub_T *from)
  12680 {
  12681  if (to->in_use < from->in_use) {
  12682    to->in_use = from->in_use;
  12683  }
  12684  if (from->in_use <= 1) {
  12685    return;
  12686  }
  12687 
  12688  // Copy the match start and end positions.
  12689  if (REG_MULTI) {
  12690    memmove(&to->list.multi[1], &from->list.multi[1],
  12691            sizeof(struct multipos) * (size_t)(from->in_use - 1));
  12692  } else {
  12693    memmove(&to->list.line[1], &from->list.line[1],
  12694            sizeof(struct linepos) * (size_t)(from->in_use - 1));
  12695  }
  12696 }
  12697 
  12698 // Like copy_sub() but only do the end of the main match if \ze is present.
  12699 static void copy_ze_off(regsub_T *to, regsub_T *from)
  12700 {
  12701  if (!rex.nfa_has_zend) {
  12702    return;
  12703  }
  12704 
  12705  if (REG_MULTI) {
  12706    if (from->list.multi[0].end_lnum >= 0) {
  12707      to->list.multi[0].end_lnum = from->list.multi[0].end_lnum;
  12708      to->list.multi[0].end_col = from->list.multi[0].end_col;
  12709    }
  12710  } else {
  12711    if (from->list.line[0].end != NULL) {
  12712      to->list.line[0].end = from->list.line[0].end;
  12713    }
  12714  }
  12715 }
  12716 
  12717 // Return true if "sub1" and "sub2" have the same start positions.
  12718 // When using back-references also check the end position.
  12719 static bool sub_equal(regsub_T *sub1, regsub_T *sub2)
  12720 {
  12721  int i;
  12722  int todo;
  12723  linenr_T s1;
  12724  linenr_T s2;
  12725  uint8_t *sp1;
  12726  uint8_t *sp2;
  12727 
  12728  todo = sub1->in_use > sub2->in_use ? sub1->in_use : sub2->in_use;
  12729  if (REG_MULTI) {
  12730    for (i = 0; i < todo; i++) {
  12731      if (i < sub1->in_use) {
  12732        s1 = sub1->list.multi[i].start_lnum;
  12733      } else {
  12734        s1 = -1;
  12735      }
  12736      if (i < sub2->in_use) {
  12737        s2 = sub2->list.multi[i].start_lnum;
  12738      } else {
  12739        s2 = -1;
  12740      }
  12741      if (s1 != s2) {
  12742        return false;
  12743      }
  12744      if (s1 != -1 && sub1->list.multi[i].start_col
  12745          != sub2->list.multi[i].start_col) {
  12746        return false;
  12747      }
  12748      if (rex.nfa_has_backref) {
  12749        if (i < sub1->in_use) {
  12750          s1 = sub1->list.multi[i].end_lnum;
  12751        } else {
  12752          s1 = -1;
  12753        }
  12754        if (i < sub2->in_use) {
  12755          s2 = sub2->list.multi[i].end_lnum;
  12756        } else {
  12757          s2 = -1;
  12758        }
  12759        if (s1 != s2) {
  12760          return false;
  12761        }
  12762        if (s1 != -1
  12763            && sub1->list.multi[i].end_col != sub2->list.multi[i].end_col) {
  12764          return false;
  12765        }
  12766      }
  12767    }
  12768  } else {
  12769    for (i = 0; i < todo; i++) {
  12770      if (i < sub1->in_use) {
  12771        sp1 = sub1->list.line[i].start;
  12772      } else {
  12773        sp1 = NULL;
  12774      }
  12775      if (i < sub2->in_use) {
  12776        sp2 = sub2->list.line[i].start;
  12777      } else {
  12778        sp2 = NULL;
  12779      }
  12780      if (sp1 != sp2) {
  12781        return false;
  12782      }
  12783      if (rex.nfa_has_backref) {
  12784        if (i < sub1->in_use) {
  12785          sp1 = sub1->list.line[i].end;
  12786        } else {
  12787          sp1 = NULL;
  12788        }
  12789        if (i < sub2->in_use) {
  12790          sp2 = sub2->list.line[i].end;
  12791        } else {
  12792          sp2 = NULL;
  12793        }
  12794        if (sp1 != sp2) {
  12795          return false;
  12796        }
  12797      }
  12798    }
  12799  }
  12800 
  12801  return true;
  12802 }
  12803 
  12804 #ifdef REGEXP_DEBUG
  12805 static void open_debug_log(TriState result)
  12806 {
  12807  log_fd = fopen(NFA_REGEXP_RUN_LOG, "a");
  12808  if (log_fd == NULL) {
  12809    emsg(_(e_log_open_failed));
  12810    log_fd = stderr;
  12811  }
  12812 
  12813  fprintf(log_fd, "****************************\n");
  12814  fprintf(log_fd, "FINISHED RUNNING nfa_regmatch() recursively\n");
  12815  fprintf(log_fd, "MATCH = %s\n", result == kTrue ? "OK" : result == kNone ? "MAYBE" : "FALSE");
  12816  fprintf(log_fd, "****************************\n");
  12817 }
  12818 
  12819 static void report_state(char *action, regsub_T *sub, nfa_state_T *state, int lid, nfa_pim_T *pim)
  12820 {
  12821  int col;
  12822 
  12823  if (sub->in_use <= 0) {
  12824    col = -1;
  12825  } else if (REG_MULTI) {
  12826    col = sub->list.multi[0].start_col;
  12827  } else {
  12828    col = (int)(sub->list.line[0].start - rex.line);
  12829  }
  12830  nfa_set_code(state->c);
  12831  if (log_fd == NULL) {
  12832    open_debug_log(kNone);
  12833  }
  12834  fprintf(log_fd, "> %s state %d to list %d. char %d: %s (start col %d)%s\n",
  12835          action, abs(state->id), lid, state->c, code, col,
  12836          pim_info(pim));
  12837 }
  12838 
  12839 #endif
  12840 
  12841 /// @param l      runtime state list
  12842 /// @param state  state to update
  12843 /// @param subs   pointers to subexpressions
  12844 /// @param pim    postponed match or NULL
  12845 ///
  12846 /// @return  true if the same state is already in list "l" with the same
  12847 ///          positions as "subs".
  12848 static bool has_state_with_pos(nfa_list_T *l, nfa_state_T *state, regsubs_T *subs, nfa_pim_T *pim)
  12849  FUNC_ATTR_NONNULL_ARG(1, 2, 3)
  12850 {
  12851  for (int i = 0; i < l->n; i++) {
  12852    nfa_thread_T *thread = &l->t[i];
  12853    if (thread->state->id == state->id
  12854        && sub_equal(&thread->subs.norm, &subs->norm)
  12855        && (!rex.nfa_has_zsubexpr
  12856            || sub_equal(&thread->subs.synt, &subs->synt))
  12857        && pim_equal(&thread->pim, pim)) {
  12858      return true;
  12859    }
  12860  }
  12861  return false;
  12862 }
  12863 
  12864 // Return true if "one" and "two" are equal.  That includes when both are not
  12865 // set.
  12866 static bool pim_equal(const nfa_pim_T *one, const nfa_pim_T *two)
  12867 {
  12868  const bool one_unused = (one == NULL || one->result == NFA_PIM_UNUSED);
  12869  const bool two_unused = (two == NULL || two->result == NFA_PIM_UNUSED);
  12870 
  12871  if (one_unused) {
  12872    // one is unused: equal when two is also unused
  12873    return two_unused;
  12874  }
  12875  if (two_unused) {
  12876    // one is used and two is not: not equal
  12877    return false;
  12878  }
  12879  // compare the state id
  12880  if (one->state->id != two->state->id) {
  12881    return false;
  12882  }
  12883  // compare the position
  12884  if (REG_MULTI) {
  12885    return one->end.pos.lnum == two->end.pos.lnum
  12886           && one->end.pos.col == two->end.pos.col;
  12887  }
  12888  return one->end.ptr == two->end.ptr;
  12889 }
  12890 
  12891 // Return true if "state" leads to a NFA_MATCH without advancing the input.
  12892 static bool match_follows(const nfa_state_T *startstate, int depth)
  12893  FUNC_ATTR_NONNULL_ALL
  12894 {
  12895  const nfa_state_T *state = startstate;
  12896 
  12897  // avoid too much recursion
  12898  if (depth > 10) {
  12899    return false;
  12900  }
  12901  while (state != NULL) {
  12902    switch (state->c) {
  12903    case NFA_MATCH:
  12904    case NFA_MCLOSE:
  12905    case NFA_END_INVISIBLE:
  12906    case NFA_END_INVISIBLE_NEG:
  12907    case NFA_END_PATTERN:
  12908      return true;
  12909 
  12910    case NFA_SPLIT:
  12911      return match_follows(state->out, depth + 1)
  12912             || match_follows(state->out1, depth + 1);
  12913 
  12914    case NFA_START_INVISIBLE:
  12915    case NFA_START_INVISIBLE_FIRST:
  12916    case NFA_START_INVISIBLE_BEFORE:
  12917    case NFA_START_INVISIBLE_BEFORE_FIRST:
  12918    case NFA_START_INVISIBLE_NEG:
  12919    case NFA_START_INVISIBLE_NEG_FIRST:
  12920    case NFA_START_INVISIBLE_BEFORE_NEG:
  12921    case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
  12922    case NFA_COMPOSING:
  12923      // skip ahead to next state
  12924      state = state->out1->out;
  12925      continue;
  12926 
  12927    case NFA_ANY:
  12928    case NFA_ANY_COMPOSING:
  12929    case NFA_IDENT:
  12930    case NFA_SIDENT:
  12931    case NFA_KWORD:
  12932    case NFA_SKWORD:
  12933    case NFA_FNAME:
  12934    case NFA_SFNAME:
  12935    case NFA_PRINT:
  12936    case NFA_SPRINT:
  12937    case NFA_WHITE:
  12938    case NFA_NWHITE:
  12939    case NFA_DIGIT:
  12940    case NFA_NDIGIT:
  12941    case NFA_HEX:
  12942    case NFA_NHEX:
  12943    case NFA_OCTAL:
  12944    case NFA_NOCTAL:
  12945    case NFA_WORD:
  12946    case NFA_NWORD:
  12947    case NFA_HEAD:
  12948    case NFA_NHEAD:
  12949    case NFA_ALPHA:
  12950    case NFA_NALPHA:
  12951    case NFA_LOWER:
  12952    case NFA_NLOWER:
  12953    case NFA_UPPER:
  12954    case NFA_NUPPER:
  12955    case NFA_LOWER_IC:
  12956    case NFA_NLOWER_IC:
  12957    case NFA_UPPER_IC:
  12958    case NFA_NUPPER_IC:
  12959    case NFA_START_COLL:
  12960    case NFA_START_NEG_COLL:
  12961    case NFA_NEWL:
  12962      // state will advance input
  12963      return false;
  12964 
  12965    default:
  12966      if (state->c > 0) {
  12967        // state will advance input
  12968        return false;
  12969      }
  12970      // Others: zero-width or possibly zero-width, might still find
  12971      // a match at the same position, keep looking.
  12972      break;
  12973    }
  12974    state = state->out;
  12975  }
  12976  return false;
  12977 }
  12978 
  12979 /// @param l      runtime state list
  12980 /// @param state  state to update
  12981 /// @param subs   pointers to subexpressions
  12982 ///
  12983 /// @return  true if "state" is already in list "l".
  12984 static bool state_in_list(nfa_list_T *l, nfa_state_T *state, regsubs_T *subs)
  12985  FUNC_ATTR_NONNULL_ALL
  12986 {
  12987  if (state->lastlist[nfa_ll_index] == l->id) {
  12988    if (!rex.nfa_has_backref || has_state_with_pos(l, state, subs, NULL)) {
  12989      return true;
  12990    }
  12991  }
  12992  return false;
  12993 }
  12994 
  12995 // Offset used for "off" by addstate_here().
        // An "off_arg" of -ADDSTATE_HERE_OFFSET or lower passed to addstate() does
        // not hold a byte offset but a list index: -(off_arg + ADDSTATE_HERE_OFFSET).
  12996 #define ADDSTATE_HERE_OFFSET 10
  12997 
  12998 /// Add "state" and possibly what follows to state list "l".
  12999 ///
        /// Calls itself recursively for the "out" and "out1" states of nodes that are
        /// not added themselves.  The recursion depth is tracked in a function-static
        /// counter and limited to 5000.
        ///
  13000 /// @param l         runtime state list
  13001 /// @param state     state to update
  13002 /// @param subs_arg  pointers to subexpressions
  13003 /// @param pim       postponed look-behind match
  13004 /// @param off_arg   byte offset, when -1 go to next line; when
        ///                  <= -ADDSTATE_HERE_OFFSET it encodes the list index
        ///                  passed by addstate_here() and the byte offset is zero
  13005 ///
  13006 /// @return  "subs_arg", possibly copied into temp_subs.
  13007 ///          NULL when recursiveness is too deep, when "subs_arg" is NULL or
        ///          when growing the list would exceed 'maxmempattern'.
  13008 static regsubs_T *addstate(nfa_list_T *l, nfa_state_T *state, regsubs_T *subs_arg, nfa_pim_T *pim,
  13009                           int off_arg)
  13010  FUNC_ATTR_NONNULL_ARG(1, 2) FUNC_ATTR_WARN_UNUSED_RESULT
  13011 {
  13012  int subidx;
  13013  int off = off_arg;
  13014  int add_here = false;
  13015  int listindex = 0;
  13016  int k;
  13017  int found = false;
  13018  nfa_thread_T *thread;
  13019  struct multipos save_multipos;
  13020  int save_in_use;
  13021  uint8_t *save_ptr;
  13022  int i;
  13023  regsub_T *sub;
  13024  regsubs_T *subs = subs_arg;
         // Holds a copy of "subs" when the thread array of "l" gets reallocated.
         // Static, thus shared by all (recursive) calls.
  13025  static regsubs_T temp_subs;
  13026 #ifdef REGEXP_DEBUG
  13027  int did_print = false;
  13028 #endif
  13029  static int depth = 0;
  13030 
  13031  // This function is called recursively.  When the depth is too much we run
  13032  // out of stack and crash, limit recursiveness here.
         // A NULL "subs" comes from an earlier call that failed, pass it on.
  13033  if (++depth >= 5000 || subs == NULL) {
  13034    depth--;
  13035    return NULL;
  13036  }
  13037 
         // Called from addstate_here(): decode the list index, the offset is zero.
  13038  if (off_arg <= -ADDSTATE_HERE_OFFSET) {
  13039    add_here = true;
  13040    off = 0;
  13041    listindex = -(off_arg + ADDSTATE_HERE_OFFSET);
  13042  }
  13043 
         // First switch: decide whether "state" itself is added to the list.
  13044  switch (state->c) {
  13045  case NFA_NCLOSE:
  13046  case NFA_MCLOSE:
  13047  case NFA_MCLOSE1:
  13048  case NFA_MCLOSE2:
  13049  case NFA_MCLOSE3:
  13050  case NFA_MCLOSE4:
  13051  case NFA_MCLOSE5:
  13052  case NFA_MCLOSE6:
  13053  case NFA_MCLOSE7:
  13054  case NFA_MCLOSE8:
  13055  case NFA_MCLOSE9:
  13056  case NFA_ZCLOSE:
  13057  case NFA_ZCLOSE1:
  13058  case NFA_ZCLOSE2:
  13059  case NFA_ZCLOSE3:
  13060  case NFA_ZCLOSE4:
  13061  case NFA_ZCLOSE5:
  13062  case NFA_ZCLOSE6:
  13063  case NFA_ZCLOSE7:
  13064  case NFA_ZCLOSE8:
  13065  case NFA_ZCLOSE9:
  13066  case NFA_MOPEN:
  13067  case NFA_ZEND:
  13068  case NFA_SPLIT:
  13069  case NFA_EMPTY:
  13070    // These nodes are not added themselves but their "out" and/or
  13071    // "out1" may be added below.
  13072    break;
  13073 
  13074  case NFA_BOL:
  13075  case NFA_BOF:
  13076    // "^" won't match past end-of-line, don't bother trying.
  13077    // Except when at the end of the line, or when we are going to the
  13078    // next line for a look-behind match.
  13079    if (rex.input > rex.line
  13080        && *rex.input != NUL
  13081        && (nfa_endp == NULL
  13082            || !REG_MULTI
  13083            || rex.lnum == nfa_endp->se_u.pos.lnum)) {
  13084      goto skip_add;
  13085    }
  13086    FALLTHROUGH;
  13087 
  13088  case NFA_MOPEN1:
  13089  case NFA_MOPEN2:
  13090  case NFA_MOPEN3:
  13091  case NFA_MOPEN4:
  13092  case NFA_MOPEN5:
  13093  case NFA_MOPEN6:
  13094  case NFA_MOPEN7:
  13095  case NFA_MOPEN8:
  13096  case NFA_MOPEN9:
  13097  case NFA_ZOPEN:
  13098  case NFA_ZOPEN1:
  13099  case NFA_ZOPEN2:
  13100  case NFA_ZOPEN3:
  13101  case NFA_ZOPEN4:
  13102  case NFA_ZOPEN5:
  13103  case NFA_ZOPEN6:
  13104  case NFA_ZOPEN7:
  13105  case NFA_ZOPEN8:
  13106  case NFA_ZOPEN9:
  13107  case NFA_NOPEN:
  13108  case NFA_ZSTART:
  13109  // These nodes need to be added so that we can bail out when it
  13110  // was added to this list before at the same position to avoid an
  13111  // endless loop for "\(\)*"
  13112 
  13113  default:
  13114    if (state->lastlist[nfa_ll_index] == l->id && state->c != NFA_SKIP) {
  13115      // This state is already in the list, don't add it again,
  13116      // unless it is an MOPEN that is used for a backreference or
  13117      // when there is a PIM. For NFA_MATCH check the position,
  13118      // lower position is preferred.
  13119      if (!rex.nfa_has_backref && pim == NULL && !l->has_pim
  13120          && state->c != NFA_MATCH) {
  13121        // When called from addstate_here() do insert before
  13122        // existing states.
               // Only skip when the state is found before index "listindex".
  13123        if (add_here) {
  13124          for (k = 0; k < l->n && k < listindex; k++) {
  13125            if (l->t[k].state->id == state->id) {
  13126              found = true;
  13127              break;
  13128            }
  13129          }
  13130        }
  13131 
  13132        if (!add_here || found) {
                 // Also jumped to from above (NFA_BOL/NFA_BOF) and from below
                 // (state exists with the same positions).
  13133 skip_add:
  13134 #ifdef REGEXP_DEBUG
  13135          nfa_set_code(state->c);
  13136          fprintf(log_fd,
  13137                  "> Not adding state %d to list %d. char %d: %s pim: %s has_pim: %d found: %d\n",
  13138                  abs(state->id), l->id, state->c, code,
  13139                  pim == NULL ? "NULL" : "yes", l->has_pim, found);
  13140 #endif
  13141          depth--;
  13142          return subs;
  13143        }
  13144      }
  13145 
  13146      // Do not add the state again when it exists with the same
  13147      // positions.
  13148      if (has_state_with_pos(l, state, subs, pim)) {
  13149        goto skip_add;
  13150      }
  13151    }
  13152 
  13153    // When there are backreferences or PIMs the number of states may
  13154    // be (a lot) bigger than anticipated.
  13155    if (l->n == l->len) {
  13156      const int newlen = l->len * 3 / 2 + 50;
  13157      const size_t newsize = (size_t)newlen * sizeof(nfa_thread_T);
  13158 
             // "newsize >> 10" is the size in Kbyte, compared with 'maxmempattern'.
  13159      if ((int64_t)(newsize >> 10) >= p_mmp) {
  13160        emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
  13161        depth--;
  13162        return NULL;
  13163      }
  13164      if (subs != &temp_subs) {
  13165        // "subs" may point into the current array, need to make a
  13166        // copy before it becomes invalid.
  13167        copy_sub(&temp_subs.norm, &subs->norm);
  13168        if (rex.nfa_has_zsubexpr) {
  13169          copy_sub(&temp_subs.synt, &subs->synt);
  13170        }
  13171        subs = &temp_subs;
  13172      }
  13173 
  13174      nfa_thread_T *const newt = xrealloc(l->t, newsize);
  13175      l->t = newt;
  13176      l->len = newlen;
  13177    }
  13178 
  13179    // add the state to the list
  13180    state->lastlist[nfa_ll_index] = l->id;
  13181    thread = &l->t[l->n++];
  13182    thread->state = state;
  13183    if (pim == NULL) {
  13184      thread->pim.result = NFA_PIM_UNUSED;
  13185    } else {
  13186      copy_pim(&thread->pim, pim);
  13187      l->has_pim = true;
  13188    }
  13189    copy_sub(&thread->subs.norm, &subs->norm);
  13190    if (rex.nfa_has_zsubexpr) {
  13191      copy_sub(&thread->subs.synt, &subs->synt);
  13192    }
  13193 #ifdef REGEXP_DEBUG
  13194    report_state("Adding", &thread->subs.norm, state, l->id, pim);
  13195    did_print = true;
  13196 #endif
  13197  }
  13198 
  13199 #ifdef REGEXP_DEBUG
  13200  if (!did_print) {
  13201    report_state("Processing", &subs->norm, state, l->id, pim);
  13202  }
  13203 #endif
         // Second switch: follow "out" and/or "out1" for the node types below,
         // setting the subexpression start/end position on the way.
  13204  switch (state->c) {
  13205  case NFA_MATCH:
  13206    break;
  13207 
  13208  case NFA_SPLIT:
  13209    // order matters here
  13210    subs = addstate(l, state->out, subs, pim, off_arg);
  13211    subs = addstate(l, state->out1, subs, pim, off_arg);
  13212    break;
  13213 
  13214  case NFA_EMPTY:
  13215  case NFA_NOPEN:
  13216  case NFA_NCLOSE:
  13217    subs = addstate(l, state->out, subs, pim, off_arg);
  13218    break;
  13219 
  13220  case NFA_MOPEN:
  13221  case NFA_MOPEN1:
  13222  case NFA_MOPEN2:
  13223  case NFA_MOPEN3:
  13224  case NFA_MOPEN4:
  13225  case NFA_MOPEN5:
  13226  case NFA_MOPEN6:
  13227  case NFA_MOPEN7:
  13228  case NFA_MOPEN8:
  13229  case NFA_MOPEN9:
  13230  case NFA_ZOPEN:
  13231  case NFA_ZOPEN1:
  13232  case NFA_ZOPEN2:
  13233  case NFA_ZOPEN3:
  13234  case NFA_ZOPEN4:
  13235  case NFA_ZOPEN5:
  13236  case NFA_ZOPEN6:
  13237  case NFA_ZOPEN7:
  13238  case NFA_ZOPEN8:
  13239  case NFA_ZOPEN9:
  13240  case NFA_ZSTART:
           // Pick the subexpression index and list: NFA_ZSTART uses entry 0 of
           // "norm", NFA_ZOPEN* use "synt", NFA_MOPEN* use "norm".
  13241    if (state->c == NFA_ZSTART) {
  13242      subidx = 0;
  13243      sub = &subs->norm;
  13244    } else if (state->c >= NFA_ZOPEN && state->c <= NFA_ZOPEN9) {
  13245      subidx = state->c - NFA_ZOPEN;
  13246      sub = &subs->synt;
  13247    } else {
  13248      subidx = state->c - NFA_MOPEN;
  13249      sub = &subs->norm;
  13250    }
  13251 
  13252    // avoid compiler warnings
  13253    save_ptr = NULL;
  13254    CLEAR_FIELD(save_multipos);
  13255 
  13256    // Set the position (with "off" added) in the subexpression.  Save
  13257    // and restore it when it was in use.  Otherwise fill any gap.
           // "save_in_use" is -1 when the old position was saved, otherwise it
           // holds the old value of "in_use".
  13258    if (REG_MULTI) {
  13259      if (subidx < sub->in_use) {
  13260        save_multipos = sub->list.multi[subidx];
  13261        save_in_use = -1;
  13262      } else {
  13263        save_in_use = sub->in_use;
  13264        for (i = sub->in_use; i < subidx; i++) {
  13265          sub->list.multi[i].start_lnum = -1;
  13266          sub->list.multi[i].end_lnum = -1;
  13267        }
  13268        sub->in_use = subidx + 1;
  13269      }
  13270      if (off == -1) {
  13271        sub->list.multi[subidx].start_lnum = rex.lnum + 1;
  13272        sub->list.multi[subidx].start_col = 0;
  13273      } else {
  13274        sub->list.multi[subidx].start_lnum = rex.lnum;
  13275        sub->list.multi[subidx].start_col =
  13276          (colnr_T)(rex.input - rex.line + off);
  13277      }
  13278      sub->list.multi[subidx].end_lnum = -1;
  13279    } else {
  13280      if (subidx < sub->in_use) {
  13281        save_ptr = sub->list.line[subidx].start;
  13282        save_in_use = -1;
  13283      } else {
  13284        save_in_use = sub->in_use;
  13285        for (i = sub->in_use; i < subidx; i++) {
  13286          sub->list.line[i].start = NULL;
  13287          sub->list.line[i].end = NULL;
  13288        }
  13289        sub->in_use = subidx + 1;
  13290      }
  13291      sub->list.line[subidx].start = rex.input + off;
  13292    }
  13293 
  13294    subs = addstate(l, state->out, subs, pim, off_arg);
  13295    if (subs == NULL) {
  13296      break;
  13297    }
  13298    // "subs" may have changed, need to set "sub" again.
  13299    if (state->c >= NFA_ZOPEN && state->c <= NFA_ZOPEN9) {
  13300      sub = &subs->synt;
  13301    } else {
  13302      sub = &subs->norm;
  13303    }
  13304 
           // Undo the change made above: put back the saved position or the
           // saved "in_use" count.
  13305    if (save_in_use == -1) {
  13306      if (REG_MULTI) {
  13307        sub->list.multi[subidx] = save_multipos;
  13308      } else {
  13309        sub->list.line[subidx].start = save_ptr;
  13310      }
  13311    } else {
  13312      sub->in_use = save_in_use;
  13313    }
  13314    break;
  13315 
  13316  case NFA_MCLOSE:
  13317    if (rex.nfa_has_zend
  13318        && (REG_MULTI
  13319            ? subs->norm.list.multi[0].end_lnum >= 0
  13320            : subs->norm.list.line[0].end != NULL)) {
  13321      // Do not overwrite the position set by \ze.
  13322      subs = addstate(l, state->out, subs, pim, off_arg);
  13323      break;
  13324    }
  13325    FALLTHROUGH;
  13326  case NFA_MCLOSE1:
  13327  case NFA_MCLOSE2:
  13328  case NFA_MCLOSE3:
  13329  case NFA_MCLOSE4:
  13330  case NFA_MCLOSE5:
  13331  case NFA_MCLOSE6:
  13332  case NFA_MCLOSE7:
  13333  case NFA_MCLOSE8:
  13334  case NFA_MCLOSE9:
  13335  case NFA_ZCLOSE:
  13336  case NFA_ZCLOSE1:
  13337  case NFA_ZCLOSE2:
  13338  case NFA_ZCLOSE3:
  13339  case NFA_ZCLOSE4:
  13340  case NFA_ZCLOSE5:
  13341  case NFA_ZCLOSE6:
  13342  case NFA_ZCLOSE7:
  13343  case NFA_ZCLOSE8:
  13344  case NFA_ZCLOSE9:
  13345  case NFA_ZEND:
           // Pick the subexpression index and list: NFA_ZEND uses entry 0 of
           // "norm", NFA_ZCLOSE* use "synt", NFA_MCLOSE* use "norm".
  13346    if (state->c == NFA_ZEND) {
  13347      subidx = 0;
  13348      sub = &subs->norm;
  13349    } else if (state->c >= NFA_ZCLOSE && state->c <= NFA_ZCLOSE9) {
  13350      subidx = state->c - NFA_ZCLOSE;
  13351      sub = &subs->synt;
  13352    } else {
  13353      subidx = state->c - NFA_MCLOSE;
  13354      sub = &subs->norm;
  13355    }
  13356 
  13357    // We don't fill in gaps here, there must have been an MOPEN that
  13358    // has done that.
  13359    save_in_use = sub->in_use;
  13360    if (sub->in_use <= subidx) {
  13361      sub->in_use = subidx + 1;
  13362    }
           // Save the old entry and set the end position (with "off" added).
  13363    if (REG_MULTI) {
  13364      save_multipos = sub->list.multi[subidx];
  13365      if (off == -1) {
  13366        sub->list.multi[subidx].end_lnum = rex.lnum + 1;
  13367        sub->list.multi[subidx].end_col = 0;
  13368      } else {
  13369        sub->list.multi[subidx].end_lnum = rex.lnum;
  13370        sub->list.multi[subidx].end_col =
  13371          (colnr_T)(rex.input - rex.line + off);
  13372      }
  13373      // avoid compiler warnings
  13374      save_ptr = NULL;
  13375    } else {
  13376      save_ptr = sub->list.line[subidx].end;
  13377      sub->list.line[subidx].end = rex.input + off;
  13378      // avoid compiler warnings
  13379      CLEAR_FIELD(save_multipos);
  13380    }
  13381 
  13382    subs = addstate(l, state->out, subs, pim, off_arg);
  13383    if (subs == NULL) {
  13384      break;
  13385    }
  13386    // "subs" may have changed, need to set "sub" again.
  13387    if (state->c >= NFA_ZCLOSE && state->c <= NFA_ZCLOSE9) {
  13388      sub = &subs->synt;
  13389    } else {
  13390      sub = &subs->norm;
  13391    }
  13392 
           // Put back the saved entry and "in_use" count.
  13393    if (REG_MULTI) {
  13394      sub->list.multi[subidx] = save_multipos;
  13395    } else {
  13396      sub->list.line[subidx].end = save_ptr;
  13397    }
  13398    sub->in_use = save_in_use;
  13399    break;
  13400  }
  13401  depth--;
  13402  return subs;
  13403 }
  13404 
  13405 /// Like addstate(), but the new state(s) are put at position "*ip".
  13406 /// Used for zero-width matches, next state to use is the added one.
  13407 /// This makes sure the order of states to be tried does not change, which
  13408 /// matters for alternatives.
  13409 ///
  13410 /// @param l      runtime state list
  13411 /// @param state  state to update
  13412 /// @param subs   pointers to subexpressions
  13413 /// @param pim    postponed look-behind match
  13414 static regsubs_T *addstate_here(nfa_list_T *l, nfa_state_T *state, regsubs_T *subs, nfa_pim_T *pim,
  13415                                int *ip)
  13416  FUNC_ATTR_NONNULL_ARG(1, 2, 5) FUNC_ATTR_WARN_UNUSED_RESULT
  13417 {
  13418  int tlen = l->n;
  13419  int count;
  13420  int listidx = *ip;
  13421 
  13422  // First add the state(s) at the end, so that we know how many there are.
  13423  // Pass the listidx as offset (avoids adding another argument to
  13424  // addstate()).
  13425  regsubs_T *r = addstate(l, state, subs, pim, -listidx - ADDSTATE_HERE_OFFSET);
  13426  if (r == NULL) {
  13427    return NULL;
  13428  }
  13429 
  13430  // when "*ip" was at the end of the list, nothing to do
  13431  if (listidx + 1 == tlen) {
  13432    return r;
  13433  }
  13434 
  13435  // re-order to put the new state at the current position
  13436  count = l->n - tlen;
  13437  if (count == 0) {
  13438    return r;  // no state got added
  13439  }
  13440  if (count == 1) {
  13441    // overwrite the current state
  13442    l->t[listidx] = l->t[l->n - 1];
  13443  } else if (count > 1) {
  13444    if (l->n + count - 1 >= l->len) {
  13445      // not enough space to move the new states, reallocate the list
  13446      // and move the states to the right position
  13447      const int newlen = l->len * 3 / 2 + 50;
  13448      const size_t newsize = (size_t)newlen * sizeof(nfa_thread_T);
  13449 
  13450      if ((int64_t)(newsize >> 10) >= p_mmp) {
  13451        emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
  13452        return NULL;
  13453      }
  13454      nfa_thread_T *const newl = xmalloc(newsize);
  13455      l->len = newlen;
  13456      memmove(&(newl[0]),
  13457              &(l->t[0]),
  13458              sizeof(nfa_thread_T) * (size_t)listidx);
  13459      memmove(&(newl[listidx]),
  13460              &(l->t[l->n - count]),
  13461              sizeof(nfa_thread_T) * (size_t)count);
  13462      memmove(&(newl[listidx + count]),
  13463              &(l->t[listidx + 1]),
  13464              sizeof(nfa_thread_T) * (size_t)(l->n - count - listidx - 1));
  13465      xfree(l->t);
  13466      l->t = newl;
  13467    } else {
  13468      // make space for new states, then move them from the
  13469      // end to the current position
  13470      memmove(&(l->t[listidx + count]),
  13471              &(l->t[listidx + 1]),
  13472              sizeof(nfa_thread_T) * (size_t)(l->n - listidx - 1));
  13473      memmove(&(l->t[listidx]),
  13474              &(l->t[l->n - 1]),
  13475              sizeof(nfa_thread_T) * (size_t)count);
  13476    }
  13477  }
  13478  l->n--;
  13479  *ip = listidx - 1;
  13480 
  13481  return r;
  13482 }
  13483 
  13484 // Check character class "class" against current character c.
  13485 static int check_char_class(int cls, int c)
  13486 {
  13487  switch (cls) {
  13488  case NFA_CLASS_ALNUM:
  13489    if (c >= 1 && c < 128 && isalnum(c)) {
  13490      return OK;
  13491    }
  13492    break;
  13493  case NFA_CLASS_ALPHA:
  13494    if (c >= 1 && c < 128 && isalpha(c)) {
  13495      return OK;
  13496    }
  13497    break;
  13498  case NFA_CLASS_BLANK:
  13499    if (c == ' ' || c == '\t') {
  13500      return OK;
  13501    }
  13502    break;
  13503  case NFA_CLASS_CNTRL:
  13504    if (c >= 1 && c <= 127 && iscntrl(c)) {
  13505      return OK;
  13506    }
  13507    break;
  13508  case NFA_CLASS_DIGIT:
  13509    if (ascii_isdigit(c)) {
  13510      return OK;
  13511    }
  13512    break;
  13513  case NFA_CLASS_GRAPH:
  13514    if (c >= 1 && c <= 127 && isgraph(c)) {
  13515      return OK;
  13516    }
  13517    break;
  13518  case NFA_CLASS_LOWER:
  13519    if (mb_islower(c) && c != 170 && c != 186) {
  13520      return OK;
  13521    }
  13522    break;
  13523  case NFA_CLASS_PRINT:
  13524    if (vim_isprintc(c)) {
  13525      return OK;
  13526    }
  13527    break;
  13528  case NFA_CLASS_PUNCT:
  13529    if (c >= 1 && c < 128 && ispunct(c)) {
  13530      return OK;
  13531    }
  13532    break;
  13533  case NFA_CLASS_SPACE:
  13534    if ((c >= 9 && c <= 13) || (c == ' ')) {
  13535      return OK;
  13536    }
  13537    break;
  13538  case NFA_CLASS_UPPER:
  13539    if (mb_isupper(c)) {
  13540      return OK;
  13541    }
  13542    break;
  13543  case NFA_CLASS_XDIGIT:
  13544    if (ascii_isxdigit(c)) {
  13545      return OK;
  13546    }
  13547    break;
  13548  case NFA_CLASS_TAB:
  13549    if (c == '\t') {
  13550      return OK;
  13551    }
  13552    break;
  13553  case NFA_CLASS_RETURN:
  13554    if (c == '\r') {
  13555      return OK;
  13556    }
  13557    break;
  13558  case NFA_CLASS_BACKSPACE:
  13559    if (c == '\b') {
  13560      return OK;
  13561    }
  13562    break;
  13563  case NFA_CLASS_ESCAPE:
  13564    if (c == ESC) {
  13565      return OK;
  13566    }
  13567    break;
  13568  case NFA_CLASS_IDENT:
  13569    if (vim_isIDc(c)) {
  13570      return OK;
  13571    }
  13572    break;
  13573  case NFA_CLASS_KEYWORD:
  13574    if (reg_iswordc(c)) {
  13575      return OK;
  13576    }
  13577    break;
  13578  case NFA_CLASS_FNAME:
  13579    if (vim_isfilec(c)) {
  13580      return OK;
  13581    }
  13582    break;
  13583 
  13584  default:
  13585    // should not be here :P
  13586    siemsg(_(e_ill_char_class), (int64_t)cls);
  13587    return FAIL;
  13588  }
  13589  return FAIL;
  13590 }
  13591 
  13592 /// Check for a match with subexpression "subidx".
  13593 ///
  13594 /// @param sub      pointers to subexpressions
  13595 /// @param bytelen  out: length of match in bytes
  13596 ///
  13597 /// @return  true if it matches.
  13598 static int match_backref(regsub_T *sub, int subidx, int *bytelen)
  13599 {
  13600  int len;
  13601 
  13602  if (sub->in_use <= subidx) {
  13603 retempty:
  13604    // backref was not set, match an empty string
  13605    *bytelen = 0;
  13606    return true;
  13607  }
  13608 
  13609  if (REG_MULTI) {
  13610    if (sub->list.multi[subidx].start_lnum < 0
  13611        || sub->list.multi[subidx].end_lnum < 0) {
  13612      goto retempty;
  13613    }
  13614    if (sub->list.multi[subidx].start_lnum == rex.lnum
  13615        && sub->list.multi[subidx].end_lnum == rex.lnum) {
  13616      len = sub->list.multi[subidx].end_col
  13617            - sub->list.multi[subidx].start_col;
  13618      if (cstrncmp((char *)rex.line + sub->list.multi[subidx].start_col,
  13619                   (char *)rex.input, &len) == 0) {
  13620        *bytelen = len;
  13621        return true;
  13622      }
  13623    } else {
  13624      if (match_with_backref(sub->list.multi[subidx].start_lnum,
  13625                             sub->list.multi[subidx].start_col,
  13626                             sub->list.multi[subidx].end_lnum,
  13627                             sub->list.multi[subidx].end_col,
  13628                             bytelen) == RA_MATCH) {
  13629        return true;
  13630      }
  13631    }
  13632  } else {
  13633    if (sub->list.line[subidx].start == NULL
  13634        || sub->list.line[subidx].end == NULL) {
  13635      goto retempty;
  13636    }
  13637    len = (int)(sub->list.line[subidx].end - sub->list.line[subidx].start);
  13638    if (cstrncmp((char *)sub->list.line[subidx].start, (char *)rex.input, &len) == 0) {
  13639      *bytelen = len;
  13640      return true;
  13641    }
  13642  }
  13643  return false;
  13644 }
  13645 
  13646 /// Check for a match with \z subexpression "subidx".
  13647 ///
  13648 /// @param bytelen  out: length of match in bytes
  13649 ///
  13650 /// @return  true if it matches.
  13651 static int match_zref(int subidx, int *bytelen)
  13652 {
  13653  int len;
  13654 
  13655  cleanup_zsubexpr();
  13656  if (re_extmatch_in == NULL || re_extmatch_in->matches[subidx] == NULL) {
  13657    // backref was not set, match an empty string
  13658    *bytelen = 0;
  13659    return true;
  13660  }
  13661 
  13662  len = (int)strlen((char *)re_extmatch_in->matches[subidx]);
  13663  if (cstrncmp((char *)re_extmatch_in->matches[subidx], (char *)rex.input, &len) == 0) {
  13664    *bytelen = len;
  13665    return true;
  13666  }
  13667  return false;
  13668 }
  13669 
  13670 // Save list IDs for all NFA states of "prog" into "list".
  13671 // Also reset the IDs to zero.
  13672 // Only used for the recursive value lastlist[1].
  13673 static void nfa_save_listids(nfa_regprog_T *prog, int *list)
  13674 {
  13675  int i;
  13676  nfa_state_T *p;
  13677 
  13678  // Order in the list is reverse, it's a bit faster that way.
  13679  p = &prog->state[0];
  13680  for (i = prog->nstate; --i >= 0;) {
  13681    list[i] = p->lastlist[1];
  13682    p->lastlist[1] = 0;
  13683    p++;
  13684  }
  13685 }
  13686 
  13687 // Restore list IDs from "list" to all NFA states.
  13688 static void nfa_restore_listids(nfa_regprog_T *prog, const int *list)
  13689 {
  13690  int i;
  13691  nfa_state_T *p;
  13692 
  13693  p = &prog->state[0];
  13694  for (i = prog->nstate; --i >= 0;) {
  13695    p->lastlist[1] = list[i];
  13696    p++;
  13697  }
  13698 }
  13699 
  13700 static bool nfa_re_num_cmp(uintmax_t val, int op, uintmax_t pos)
  13701 {
  13702  if (op == 1) {
  13703    return pos > val;
  13704  }
  13705  if (op == 2) {
  13706    return pos < val;
  13707  }
  13708  return val == pos;
  13709 }
  13710 
  13711 // Recursively call nfa_regmatch()
  13712 // "pim" is NULL or contains info about a Postponed Invisible Match (start
  13713 // position).
  13714 static int recursive_regmatch(nfa_state_T *state, nfa_pim_T *pim, nfa_regprog_T *prog,
  13715                              regsubs_T *submatch, regsubs_T *m, int **listids, int *listids_len)
  13716  FUNC_ATTR_NONNULL_ARG(1, 3, 5, 6, 7)
  13717 {
  13718  const int save_reginput_col = (int)(rex.input - rex.line);
  13719  const int save_reglnum = rex.lnum;
  13720  const int save_nfa_match = nfa_match;
  13721  const int save_nfa_listid = rex.nfa_listid;
  13722  save_se_T *const save_nfa_endp = nfa_endp;
  13723  save_se_T endpos;
  13724  save_se_T *endposp = NULL;
  13725  int need_restore = false;
  13726 
  13727  if (pim != NULL) {
  13728    // start at the position where the postponed match was
  13729    if (REG_MULTI) {
  13730      rex.input = rex.line + pim->end.pos.col;
  13731    } else {
  13732      rex.input = pim->end.ptr;
  13733    }
  13734  }
  13735 
  13736  if (state->c == NFA_START_INVISIBLE_BEFORE
  13737      || state->c == NFA_START_INVISIBLE_BEFORE_FIRST
  13738      || state->c == NFA_START_INVISIBLE_BEFORE_NEG
  13739      || state->c == NFA_START_INVISIBLE_BEFORE_NEG_FIRST) {
  13740    // The recursive match must end at the current position. When "pim" is
  13741    // not NULL it specifies the current position.
  13742    endposp = &endpos;
  13743    if (REG_MULTI) {
  13744      if (pim == NULL) {
  13745        endpos.se_u.pos.col = (int)(rex.input - rex.line);
  13746        endpos.se_u.pos.lnum = rex.lnum;
  13747      } else {
  13748        endpos.se_u.pos = pim->end.pos;
  13749      }
  13750    } else {
  13751      if (pim == NULL) {
  13752        endpos.se_u.ptr = rex.input;
  13753      } else {
  13754        endpos.se_u.ptr = pim->end.ptr;
  13755      }
  13756    }
  13757 
  13758    // Go back the specified number of bytes, or as far as the
  13759    // start of the previous line, to try matching "\@<=" or
  13760    // not matching "\@<!". This is very inefficient, limit the number of
  13761    // bytes if possible.
  13762    if (state->val <= 0) {
  13763      if (REG_MULTI) {
  13764        rex.line = (uint8_t *)reg_getline(--rex.lnum);
  13765        if (rex.line == NULL) {
  13766          // can't go before the first line
  13767          rex.line = (uint8_t *)reg_getline(++rex.lnum);
  13768        }
  13769      }
  13770      rex.input = rex.line;
  13771    } else {
  13772      if (REG_MULTI && (int)(rex.input - rex.line) < state->val) {
  13773        // Not enough bytes in this line, go to end of
  13774        // previous line.
  13775        rex.line = (uint8_t *)reg_getline(--rex.lnum);
  13776        if (rex.line == NULL) {
  13777          // can't go before the first line
  13778          rex.line = (uint8_t *)reg_getline(++rex.lnum);
  13779          rex.input = rex.line;
  13780        } else {
  13781          rex.input = rex.line + reg_getline_len(rex.lnum);
  13782        }
  13783      }
  13784      if ((int)(rex.input - rex.line) >= state->val) {
  13785        rex.input -= state->val;
  13786        rex.input -= utf_head_off((char *)rex.line, (char *)rex.input);
  13787      } else {
  13788        rex.input = rex.line;
  13789      }
  13790    }
  13791  }
  13792 
  13793 #ifdef REGEXP_DEBUG
  13794  if (log_fd != stderr) {
  13795    fclose(log_fd);
  13796  }
  13797  log_fd = NULL;
  13798 #endif
  13799  // Have to clear the lastlist field of the NFA nodes, so that
  13800  // nfa_regmatch() and addstate() can run properly after recursion.
  13801  if (nfa_ll_index == 1) {
  13802    // Already calling nfa_regmatch() recursively.  Save the lastlist[1]
  13803    // values and clear them.
  13804    if (*listids == NULL || *listids_len < prog->nstate) {
  13805      xfree(*listids);
  13806      *listids = xmalloc(sizeof(**listids) * (size_t)prog->nstate);
  13807      *listids_len = prog->nstate;
  13808    }
  13809    nfa_save_listids(prog, *listids);
  13810    need_restore = true;
  13811    // any value of rex.nfa_listid will do
  13812  } else {
  13813    // First recursive nfa_regmatch() call, switch to the second lastlist
  13814    // entry.  Make sure rex.nfa_listid is different from a previous
  13815    // recursive call, because some states may still have this ID.
  13816    nfa_ll_index++;
  13817    if (rex.nfa_listid <= rex.nfa_alt_listid) {
  13818      rex.nfa_listid = rex.nfa_alt_listid;
  13819    }
  13820  }
  13821 
  13822  // Call nfa_regmatch() to check if the current concat matches at this
  13823  // position. The concat ends with the node NFA_END_INVISIBLE
  13824  nfa_endp = endposp;
  13825  const int result = nfa_regmatch(prog, state->out, submatch, m);
  13826 
  13827  if (need_restore) {
  13828    nfa_restore_listids(prog, *listids);
  13829  } else {
  13830    nfa_ll_index--;
  13831    rex.nfa_alt_listid = rex.nfa_listid;
  13832  }
  13833 
  13834  // restore position in input text
  13835  rex.lnum = save_reglnum;
  13836  if (REG_MULTI) {
  13837    rex.line = (uint8_t *)reg_getline(rex.lnum);
  13838  }
  13839  rex.input = rex.line + save_reginput_col;
  13840  if (result != NFA_TOO_EXPENSIVE) {
  13841    nfa_match = save_nfa_match;
  13842    rex.nfa_listid = save_nfa_listid;
  13843  }
  13844  nfa_endp = save_nfa_endp;
  13845 
  13846 #ifdef REGEXP_DEBUG
  13847  open_debug_log(result);
  13848 #endif
  13849 
  13850  return result;
  13851 }
  13852 
  13853 // Estimate the chance of a match with "state" failing; a higher value means more likely to fail.
  13854 // empty match: 0
  13855 // NFA_ANY: 1
  13856 // specific character: 95 (anything not handled in the switch, e.g. character classes: 50)
  13857 static int failure_chance(nfa_state_T *state, int depth)  // "depth" is the recursion level, incremented on each nested call
  13858 {
  13859  int c = state->c;
  13860  int l, r;
  13861 
  13862  // detect looping
  13863  if (depth > 4) {
  13864    return 1;  // stop following states and report a low failure chance
  13865  }
  13866 
  13867  switch (c) {
  13868  case NFA_SPLIT:
  13869    if (state->out->c == NFA_SPLIT || state->out1->c == NFA_SPLIT) {
  13870      // avoid recursive stuff
  13871      return 1;
  13872    }
  13873    // two alternatives, use the lowest failure chance
  13874    l = failure_chance(state->out, depth + 1);
  13875    r = failure_chance(state->out1, depth + 1);
  13876    return l < r ? l : r;
  13877 
  13878  case NFA_ANY:
  13879    // matches anything, unlikely to fail
  13880    return 1;
  13881 
  13882  case NFA_MATCH:
  13883  case NFA_MCLOSE:
  13884  case NFA_ANY_COMPOSING:
  13885    // empty match works always
  13886    return 0;
  13887 
  13888  case NFA_START_INVISIBLE:
  13889  case NFA_START_INVISIBLE_FIRST:
  13890  case NFA_START_INVISIBLE_NEG:
  13891  case NFA_START_INVISIBLE_NEG_FIRST:
  13892  case NFA_START_INVISIBLE_BEFORE:
  13893  case NFA_START_INVISIBLE_BEFORE_FIRST:
  13894  case NFA_START_INVISIBLE_BEFORE_NEG:
  13895  case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
  13896  case NFA_START_PATTERN:
  13897    // recursive regmatch is expensive, use low failure chance
  13898    return 5;
  13899 
  13900  case NFA_BOL:
  13901  case NFA_EOL:
  13902  case NFA_BOF:
  13903  case NFA_EOF:
  13904  case NFA_NEWL:
  13905    return 99;  // the highest estimate this function returns
  13906 
  13907  case NFA_BOW:
  13908  case NFA_EOW:
  13909    return 90;
  13910 
  13911  case NFA_MOPEN:
  13912  case NFA_MOPEN1:
  13913  case NFA_MOPEN2:
  13914  case NFA_MOPEN3:
  13915  case NFA_MOPEN4:
  13916  case NFA_MOPEN5:
  13917  case NFA_MOPEN6:
  13918  case NFA_MOPEN7:
  13919  case NFA_MOPEN8:
  13920  case NFA_MOPEN9:
  13921  case NFA_ZOPEN:
  13922  case NFA_ZOPEN1:
  13923  case NFA_ZOPEN2:
  13924  case NFA_ZOPEN3:
  13925  case NFA_ZOPEN4:
  13926  case NFA_ZOPEN5:
  13927  case NFA_ZOPEN6:
  13928  case NFA_ZOPEN7:
  13929  case NFA_ZOPEN8:
  13930  case NFA_ZOPEN9:
  13931  case NFA_ZCLOSE:
  13932  case NFA_ZCLOSE1:
  13933  case NFA_ZCLOSE2:
  13934  case NFA_ZCLOSE3:
  13935  case NFA_ZCLOSE4:
  13936  case NFA_ZCLOSE5:
  13937  case NFA_ZCLOSE6:
  13938  case NFA_ZCLOSE7:
  13939  case NFA_ZCLOSE8:
  13940  case NFA_ZCLOSE9:
  13941  case NFA_NOPEN:
  13942  case NFA_MCLOSE1:
  13943  case NFA_MCLOSE2:
  13944  case NFA_MCLOSE3:
  13945  case NFA_MCLOSE4:
  13946  case NFA_MCLOSE5:
  13947  case NFA_MCLOSE6:
  13948  case NFA_MCLOSE7:
  13949  case NFA_MCLOSE8:
  13950  case NFA_MCLOSE9:
  13951  case NFA_NCLOSE:
  13952    return failure_chance(state->out, depth + 1);  // no estimate of its own: use that of the state that follows
  13953 
  13954  case NFA_BACKREF1:
  13955  case NFA_BACKREF2:
  13956  case NFA_BACKREF3:
  13957  case NFA_BACKREF4:
  13958  case NFA_BACKREF5:
  13959  case NFA_BACKREF6:
  13960  case NFA_BACKREF7:
  13961  case NFA_BACKREF8:
  13962  case NFA_BACKREF9:
  13963  case NFA_ZREF1:
  13964  case NFA_ZREF2:
  13965  case NFA_ZREF3:
  13966  case NFA_ZREF4:
  13967  case NFA_ZREF5:
  13968  case NFA_ZREF6:
  13969  case NFA_ZREF7:
  13970  case NFA_ZREF8:
  13971  case NFA_ZREF9:
  13972    // backreferences don't match in many places
  13973    return 94;
  13974 
  13975  case NFA_LNUM_GT:
  13976  case NFA_LNUM_LT:
  13977  case NFA_COL_GT:
  13978  case NFA_COL_LT:
  13979  case NFA_VCOL_GT:
  13980  case NFA_VCOL_LT:
  13981  case NFA_MARK_GT:
  13982  case NFA_MARK_LT:
  13983  case NFA_VISUAL:
  13984    // before/after positions don't match very often
  13985    return 85;
  13986 
  13987  case NFA_LNUM:
  13988    return 90;
  13989 
  13990  case NFA_CURSOR:
  13991  case NFA_COL:
  13992  case NFA_VCOL:
  13993  case NFA_MARK:
  13994    // specific positions rarely match
  13995    return 98;
  13996 
  13997  case NFA_COMPOSING:
  13998    return 95;
  13999 
  14000  default:
  14001    if (c > 0) {
  14002      // character match fails often
  14003      return 95;
  14004    }
  14005  }  // a code <= 0 without its own case leaves the switch and gets the generic estimate below
  14006 
  14007  // something else, includes character classes
  14008  return 50;
  14009 }
  14010 
  14011 // Skip forward in rex.line, starting at column "*colp", until the char "c" we know a match must start with.
  14012 static int skip_to_start(int c, colnr_T *colp)  // returns OK with "*colp" updated, or FAIL with "*colp" untouched
  14013 {
  14014  const uint8_t *const s = (uint8_t *)cstrchr((char *)rex.line + *colp, c);  // NULL when cstrchr() finds no "c"
  14015  if (s == NULL) {
  14016    return FAIL;
  14017  }
  14018  *colp = (int)(s - rex.line);  // byte offset of the hit from the start of rex.line
  14019  return OK;
  14020 }
  14021 
  14022 // Check for a match with match_text, which is compared against the text that follows regstart in rex.line.
  14023 // Called after skip_to_start() has found regstart; "*startcol" is the column of that regstart.
  14024 // Returns zero for no match, 1 for a match.  On a match submatch 0 is stored in "rex" and "*startcol" is the match column; otherwise "*startcol" is the column the search had advanced to.
  14025 static int find_match_text(colnr_T *startcol, int regstart, uint8_t *match_text)
  14026 {
  14027  colnr_T col = *startcol;
  14028  const int regstart_len = utf_char2len(regstart);
  14029 
  14030  while (true) {  // one iteration per candidate occurrence of regstart
  14031    bool match = true;
  14032    uint8_t *s1 = match_text;
  14033    // skip regstart
  14034    int regstart_len2 = regstart_len;
  14035    if (regstart_len2 > 1 && utf_ptr2len((char *)rex.line + col) != regstart_len2) {
  14036      // because of case-folding of the previously matched text, we may need
  14037      // to skip fewer bytes than utf_char2len(regstart)
  14038      regstart_len2 = utf_char2len(utf_fold(regstart));
  14039    }
  14040    uint8_t *s2 = rex.line + col + regstart_len2;
  14041    while (*s1) {  // walk match_text (s1) and the line text (s2) one character at a time
  14042      int c1_len = utf_ptr2len((char *)s1);
  14043      int c1 = utf_ptr2char((char *)s1);
  14044      int c2_len = utf_ptr2len((char *)s2);
  14045      int c2 = utf_ptr2char((char *)s2);
  14046      if (c1 != c2 && (!rex.reg_ic || utf_fold(c1) != utf_fold(c2))) {  // differ, and also after folding when rex.reg_ic is set
  14047        match = false;
  14048        break;
  14049      }
  14050      s1 += c1_len;
  14051      s2 += c2_len;
  14052    }
  14053    if (match
  14054        // check that no composing char follows
  14055        && !utf_iscomposing_legacy(utf_ptr2char((char *)s2))) {
  14056      cleanup_subexpr();
  14057      if (REG_MULTI) {
  14058        rex.reg_startpos[0].lnum = rex.lnum;
  14059        rex.reg_startpos[0].col = col;
  14060        rex.reg_endpos[0].lnum = rex.lnum;
  14061        rex.reg_endpos[0].col = (colnr_T)(s2 - rex.line);  // s2 now points just past the matched text
  14062      } else {
  14063        rex.reg_startp[0] = rex.line + col;
  14064        rex.reg_endp[0] = s2;
  14065      }
  14066      *startcol = col;
  14067      return 1L;
  14068    }
  14069 
  14070    // Try finding regstart after the current match.
  14071    col += regstart_len;  // skip regstart; NOTE(review): advances by regstart_len, not regstart_len2 - confirm this cannot step past a shorter char in the text
  14072    if (skip_to_start(regstart, &col) == FAIL) {
  14073      break;
  14074    }
  14075  }
  14076 
  14077  *startcol = col;  // no match: hand back the column reached when the search gave up
  14078  return 0L;
  14079 }
  14080 
  14081 static int nfa_did_time_out(void)  // Returns true when "nfa_time_limit" is set and profile_passed_limit() reports it as passed, false otherwise.
  14082 {
  14083  if (nfa_time_limit != NULL && profile_passed_limit(*nfa_time_limit)) {  // a NULL limit means there is nothing to check
  14084    if (nfa_timed_out != NULL) {
  14085      *nfa_timed_out = true;  // also record the timeout in the optional flag
  14086    }
  14087    return true;
  14088  }
  14089  return false;
  14090 }
  14091 
  14092 /// Main matching routine.
  14093 ///
  14094 /// Run NFA to determine whether it matches rex.input.
  14095 ///
  14096 /// When "nfa_endp" is not NULL it is a required end-of-match position.
  14097 ///
  14098 /// Return true if there is a match, false if there is no match,
  14099 /// NFA_TOO_EXPENSIVE if we end up with too many states.
  14100 /// When there is a match "submatch" contains the positions.
  14101 ///
  14102 /// Note: Caller must ensure that: start != NULL.
  14103 static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *submatch, regsubs_T *m)
  14104  FUNC_ATTR_NONNULL_ARG(1, 2, 4)
  14105 {
  14106  int result = false;
  14107  int flag = 0;
  14108  bool go_to_nextline = false;
  14109  nfa_thread_T *t;
  14110  nfa_list_T list[2];
  14111  int listidx;
  14112  nfa_list_T *thislist;
  14113  nfa_list_T *nextlist;
  14114  int *listids = NULL;
  14115  int listids_len = 0;
  14116  nfa_state_T *add_state;
  14117  bool add_here;
  14118  int add_count;
  14119  int add_off = 0;
  14120  int toplevel = start->c == NFA_MOPEN;
  14121  regsubs_T *r;
  14122  // Some patterns may take a long time to match, especially when using
  14123  // recursive_regmatch(). Allow interrupting them with CTRL-C.
  14124  reg_breakcheck();
  14125  if (got_int) {
  14126    return false;
  14127  }
  14128  if (nfa_did_time_out()) {
  14129    return false;
  14130  }
  14131 
  14132 #ifdef NFA_REGEXP_DEBUG_LOG
  14133  FILE *debug = fopen(NFA_REGEXP_DEBUG_LOG, "a");
  14134 
  14135  if (debug == NULL) {
  14136    semsg("(NFA) COULD NOT OPEN %s!", NFA_REGEXP_DEBUG_LOG);
  14137    return false;
  14138  }
  14139 #endif
  14140  nfa_match = false;
  14141 
  14142  // Allocate memory for the lists of nodes.
  14143  size_t size = (size_t)(prog->nstate + 1) * sizeof(nfa_thread_T);
  14144  list[0].t = xmalloc(size);
  14145  list[0].len = prog->nstate + 1;
  14146  list[1].t = xmalloc(size);
  14147  list[1].len = prog->nstate + 1;
  14148 
  14149 #ifdef REGEXP_DEBUG
  14150  log_fd = fopen(NFA_REGEXP_RUN_LOG, "a");
  14151  if (log_fd == NULL) {
  14152    emsg(_(e_log_open_failed));
  14153    log_fd = stderr;
  14154  }
  14155  fprintf(log_fd, "**********************************\n");
  14156  nfa_set_code(start->c);
  14157  fprintf(log_fd, " RUNNING nfa_regmatch() starting with state %d, code %s\n",
  14158          abs(start->id), code);
  14159  fprintf(log_fd, "**********************************\n");
  14160 #endif
  14161 
  14162  thislist = &list[0];
  14163  thislist->n = 0;
  14164  thislist->has_pim = false;
  14165  nextlist = &list[1];
  14166  nextlist->n = 0;
  14167  nextlist->has_pim = false;
  14168 #ifdef REGEXP_DEBUG
  14169  fprintf(log_fd, "(---) STARTSTATE first\n");
  14170 #endif
  14171  thislist->id = rex.nfa_listid + 1;
  14172 
  14173  // Inline optimized code for addstate(thislist, start, m, 0) if we know
  14174  // it's the first MOPEN.
  14175  if (toplevel) {
  14176    if (REG_MULTI) {
  14177      m->norm.list.multi[0].start_lnum = rex.lnum;
  14178      m->norm.list.multi[0].start_col = (colnr_T)(rex.input - rex.line);
  14179      m->norm.orig_start_col = m->norm.list.multi[0].start_col;
  14180    } else {
  14181      m->norm.list.line[0].start = rex.input;
  14182    }
  14183    m->norm.in_use = 1;
  14184    r = addstate(thislist, start->out, m, NULL, 0);
  14185  } else {
  14186    r = addstate(thislist, start, m, NULL, 0);
  14187  }
  14188  if (r == NULL) {
  14189    nfa_match = NFA_TOO_EXPENSIVE;
  14190    goto theend;
  14191  }
  14192 
  14193 #define ADD_STATE_IF_MATCH(state) \
  14194  if (result) { \
  14195    add_state = (state)->out; \
  14196    add_off = clen; \
  14197  }
  14198 
  14199  // Run for each character.
  14200  while (true) {
  14201    int curc = utf_ptr2char((char *)rex.input);
  14202    int clen = utfc_ptr2len((char *)rex.input);
  14203    if (curc == NUL) {
  14204      clen = 0;
  14205      go_to_nextline = false;
  14206    }
  14207 
  14208    // swap lists
  14209    thislist = &list[flag];
  14210    nextlist = &list[flag ^= 1];
  14211    nextlist->n = 0;                // clear nextlist
  14212    nextlist->has_pim = false;
  14213    rex.nfa_listid++;
  14214    if (prog->re_engine == AUTOMATIC_ENGINE
  14215        && (rex.nfa_listid >= NFA_MAX_STATES)) {
  14216      // Too many states, retry with old engine.
  14217      nfa_match = NFA_TOO_EXPENSIVE;
  14218      goto theend;
  14219    }
  14220 
  14221    thislist->id = rex.nfa_listid;
  14222    nextlist->id = rex.nfa_listid + 1;
  14223 
  14224 #ifdef REGEXP_DEBUG
  14225    fprintf(log_fd, "------------------------------------------\n");
  14226    fprintf(log_fd, ">>> Reginput is \"%s\"\n", rex.input);
  14227    fprintf(log_fd,
  14228            ">>> Advanced one character... Current char is %c (code %d) \n",
  14229            curc,
  14230            (int)curc);
  14231    fprintf(log_fd, ">>> Thislist has %d states available: ", thislist->n);
  14232    {
  14233      int i;
  14234 
  14235      for (i = 0; i < thislist->n; i++) {
  14236        fprintf(log_fd, "%d  ", abs(thislist->t[i].state->id));
  14237      }
  14238    }
  14239    fprintf(log_fd, "\n");
  14240 #endif
  14241 
  14242 #ifdef NFA_REGEXP_DEBUG_LOG
  14243    fprintf(debug, "\n-------------------\n");
  14244 #endif
  14245    // If the state lists are empty we can stop.
  14246    if (thislist->n == 0) {
  14247      break;
  14248    }
  14249 
  14250    // compute nextlist
  14251    for (listidx = 0; listidx < thislist->n; listidx++) {
  14252      // If the list gets very long there probably is something wrong.
  14253      // At least allow interrupting with CTRL-C.
  14254      reg_breakcheck();
  14255      if (got_int) {
  14256        break;
  14257      }
  14258      if (nfa_time_limit != NULL && ++nfa_time_count == 20) {
  14259        nfa_time_count = 0;
  14260        if (nfa_did_time_out()) {
  14261          break;
  14262        }
  14263      }
  14264      t = &thislist->t[listidx];
  14265 
  14266 #ifdef NFA_REGEXP_DEBUG_LOG
  14267      nfa_set_code(t->state->c);
  14268      fprintf(debug, "%s, ", code);
  14269 #endif
  14270 #ifdef REGEXP_DEBUG
  14271      {
  14272        int col;
  14273 
  14274        if (t->subs.norm.in_use <= 0) {
  14275          col = -1;
  14276        } else if (REG_MULTI) {
  14277          col = t->subs.norm.list.multi[0].start_col;
  14278        } else {
  14279          col = (int)(t->subs.norm.list.line[0].start - rex.line);
  14280        }
  14281        nfa_set_code(t->state->c);
  14282        fprintf(log_fd, "(%d) char %d %s (start col %d)%s... \n",
  14283                abs(t->state->id), (int)t->state->c, code, col,
  14284                pim_info(&t->pim));
  14285      }
  14286 #endif
  14287 
  14288      // Handle the possible codes of the current state.
  14289      // The most important is NFA_MATCH.
  14290      add_state = NULL;
  14291      add_here = false;
  14292      add_count = 0;
  14293      switch (t->state->c) {
  14294      case NFA_MATCH:
  14295        // If the match is not at the start of the line, ends before a
  14296        // composing characters and rex.reg_icombine is not set, that
  14297        // is not really a match.
  14298        if (!rex.reg_icombine
  14299            && rex.input != rex.line
  14300            && utf_iscomposing_legacy(curc)) {
  14301          break;
  14302        }
  14303        nfa_match = true;
  14304        copy_sub(&submatch->norm, &t->subs.norm);
  14305        if (rex.nfa_has_zsubexpr) {
  14306          copy_sub(&submatch->synt, &t->subs.synt);
  14307        }
  14308 #ifdef REGEXP_DEBUG
  14309        log_subsexpr(&t->subs);
  14310 #endif
  14311        // Found the left-most longest match, do not look at any other
  14312        // states at this position.  When the list of states is going
  14313        // to be empty quit without advancing, so that "rex.input" is
  14314        // correct.
  14315        if (nextlist->n == 0) {
  14316          clen = 0;
  14317        }
  14318        goto nextchar;
  14319 
  14320      case NFA_END_INVISIBLE:
  14321      case NFA_END_INVISIBLE_NEG:
  14322      case NFA_END_PATTERN:
  14323        // This is only encountered after a NFA_START_INVISIBLE or
  14324        // NFA_START_INVISIBLE_BEFORE node.
  14325        // They surround a zero-width group, used with "\@=", "\&",
  14326        // "\@!", "\@<=" and "\@<!".
  14327        // If we got here, it means that the current "invisible" group
  14328        // finished successfully, so return control to the parent
  14329        // nfa_regmatch().  For a look-behind match only when it ends
  14330        // in the position in "nfa_endp".
  14331        // Submatches are stored in *m, and used in the parent call.
  14332 #ifdef REGEXP_DEBUG
  14333        if (nfa_endp != NULL) {
  14334          if (REG_MULTI) {
  14335            fprintf(log_fd,
  14336                    "Current lnum: %d, endp lnum: %d;"
  14337                    " current col: %d, endp col: %d\n",
  14338                    (int)rex.lnum,
  14339                    (int)nfa_endp->se_u.pos.lnum,
  14340                    (int)(rex.input - rex.line),
  14341                    nfa_endp->se_u.pos.col);
  14342          } else {
  14343            fprintf(log_fd, "Current col: %d, endp col: %d\n",
  14344                    (int)(rex.input - rex.line),
  14345                    (int)(nfa_endp->se_u.ptr - rex.input));
  14346          }
  14347        }
  14348 #endif
  14349        // If "nfa_endp" is set it's only a match if it ends at
  14350        // "nfa_endp"
  14351        if (nfa_endp != NULL
  14352            && (REG_MULTI
  14353                ? (rex.lnum != nfa_endp->se_u.pos.lnum
  14354                   || (int)(rex.input - rex.line) != nfa_endp->se_u.pos.col)
  14355                : rex.input != nfa_endp->se_u.ptr)) {
  14356          break;
  14357        }
  14358        // do not set submatches for \@!
  14359        if (t->state->c != NFA_END_INVISIBLE_NEG) {
  14360          copy_sub(&m->norm, &t->subs.norm);
  14361          if (rex.nfa_has_zsubexpr) {
  14362            copy_sub(&m->synt, &t->subs.synt);
  14363          }
  14364        }
  14365 #ifdef REGEXP_DEBUG
  14366        fprintf(log_fd, "Match found:\n");
  14367        log_subsexpr(m);
  14368 #endif
  14369        nfa_match = true;
  14370        // See comment above at "goto nextchar".
  14371        if (nextlist->n == 0) {
  14372          clen = 0;
  14373        }
  14374        goto nextchar;
  14375 
  14376      case NFA_START_INVISIBLE:
  14377      case NFA_START_INVISIBLE_FIRST:
  14378      case NFA_START_INVISIBLE_NEG:
  14379      case NFA_START_INVISIBLE_NEG_FIRST:
  14380      case NFA_START_INVISIBLE_BEFORE:
  14381      case NFA_START_INVISIBLE_BEFORE_FIRST:
  14382      case NFA_START_INVISIBLE_BEFORE_NEG:
  14383      case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
  14384 #ifdef REGEXP_DEBUG
  14385        fprintf(log_fd, "Failure chance invisible: %d, what follows: %d\n",
  14386                failure_chance(t->state->out, 0),
  14387                failure_chance(t->state->out1->out, 0));
  14388 #endif
  14389        // Do it directly if there already is a PIM or when
  14390        // nfa_postprocess() detected it will work better.
  14391        if (t->pim.result != NFA_PIM_UNUSED
  14392            || t->state->c == NFA_START_INVISIBLE_FIRST
  14393            || t->state->c == NFA_START_INVISIBLE_NEG_FIRST
  14394            || t->state->c == NFA_START_INVISIBLE_BEFORE_FIRST
  14395            || t->state->c == NFA_START_INVISIBLE_BEFORE_NEG_FIRST) {
  14396          int in_use = m->norm.in_use;
  14397 
  14398          // Copy submatch info for the recursive call, opposite
  14399          // of what happens on success below.
  14400          copy_sub_off(&m->norm, &t->subs.norm);
  14401          if (rex.nfa_has_zsubexpr) {
  14402            copy_sub_off(&m->synt, &t->subs.synt);
  14403          }
  14404          // First try matching the invisible match, then what
  14405          // follows.
  14406          result = recursive_regmatch(t->state, NULL, prog, submatch, m,
  14407                                      &listids, &listids_len);
  14408          if (result == NFA_TOO_EXPENSIVE) {
  14409            nfa_match = result;
  14410            goto theend;
  14411          }
  14412 
  14413          // for \@! and \@<! it is a match when the result is
  14414          // false
  14415          if (result != (t->state->c == NFA_START_INVISIBLE_NEG
  14416                         || t->state->c == NFA_START_INVISIBLE_NEG_FIRST
  14417                         || t->state->c
  14418                         == NFA_START_INVISIBLE_BEFORE_NEG
  14419                         || t->state->c
  14420                         == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)) {
  14421            // Copy submatch info from the recursive call
  14422            copy_sub_off(&t->subs.norm, &m->norm);
  14423            if (rex.nfa_has_zsubexpr) {
  14424              copy_sub_off(&t->subs.synt, &m->synt);
  14425            }
  14426            // If the pattern has \ze and it matched in the
  14427            // sub pattern, use it.
  14428            copy_ze_off(&t->subs.norm, &m->norm);
  14429 
  14430            // t->state->out1 is the corresponding
  14431            // END_INVISIBLE node; Add its out to the current
  14432            // list (zero-width match).
  14433            add_here = true;
  14434            add_state = t->state->out1->out;
  14435          }
  14436          m->norm.in_use = in_use;
  14437        } else {
  14438          nfa_pim_T pim;
  14439 
  14440          // First try matching what follows.  Only if a match
  14441          // is found verify the invisible match matches.  Add a
  14442          // nfa_pim_T to the following states, it contains info
  14443          // about the invisible match.
  14444          pim.state = t->state;
  14445          pim.result = NFA_PIM_TODO;
  14446          pim.subs.norm.in_use = 0;
  14447          pim.subs.synt.in_use = 0;
  14448          if (REG_MULTI) {
  14449            pim.end.pos.col = (int)(rex.input - rex.line);
  14450            pim.end.pos.lnum = rex.lnum;
  14451          } else {
  14452            pim.end.ptr = rex.input;
  14453          }
  14454          // t->state->out1 is the corresponding END_INVISIBLE
  14455          // node; Add its out to the current list (zero-width
  14456          // match).
  14457          if (addstate_here(thislist, t->state->out1->out, &t->subs,
  14458                            &pim, &listidx) == NULL) {
  14459            nfa_match = NFA_TOO_EXPENSIVE;
  14460            goto theend;
  14461          }
  14462        }
  14463        break;
  14464 
  14465      case NFA_START_PATTERN: {
  14466        nfa_state_T *skip = NULL;
  14467 #ifdef REGEXP_DEBUG
  14468        int skip_lid = 0;
  14469 #endif
  14470 
  14471        // There is no point in trying to match the pattern if the
  14472        // output state is not going to be added to the list.
  14473        if (state_in_list(nextlist, t->state->out1->out, &t->subs)) {
  14474          skip = t->state->out1->out;
  14475 #ifdef REGEXP_DEBUG
  14476          skip_lid = nextlist->id;
  14477 #endif
  14478        } else if (state_in_list(nextlist,
  14479                                 t->state->out1->out->out, &t->subs)) {
  14480          skip = t->state->out1->out->out;
  14481 #ifdef REGEXP_DEBUG
  14482          skip_lid = nextlist->id;
  14483 #endif
  14484        } else if (state_in_list(thislist,
  14485                                 t->state->out1->out->out, &t->subs)) {
  14486          skip = t->state->out1->out->out;
  14487 #ifdef REGEXP_DEBUG
  14488          skip_lid = thislist->id;
  14489 #endif
  14490        }
  14491        if (skip != NULL) {
  14492 #ifdef REGEXP_DEBUG
  14493          nfa_set_code(skip->c);
  14494          fprintf(log_fd,
  14495                  "> Not trying to match pattern, output state %d is already in list %d. char %d: %s\n",
  14496                  abs(skip->id), skip_lid, skip->c, code);
  14497 #endif
  14498          break;
  14499        }
  14500        // Copy submatch info to the recursive call, opposite of what
  14501        // happens afterwards.
  14502        copy_sub_off(&m->norm, &t->subs.norm);
  14503        if (rex.nfa_has_zsubexpr) {
  14504          copy_sub_off(&m->synt, &t->subs.synt);
  14505        }
  14506 
  14507        // First try matching the pattern.
  14508        result = recursive_regmatch(t->state, NULL, prog, submatch, m,
  14509                                    &listids, &listids_len);
  14510        if (result == NFA_TOO_EXPENSIVE) {
  14511          nfa_match = result;
  14512          goto theend;
  14513        }
  14514        if (result) {
  14515          int bytelen;
  14516 
  14517 #ifdef REGEXP_DEBUG
  14518          fprintf(log_fd, "NFA_START_PATTERN matches:\n");
  14519          log_subsexpr(m);
  14520 #endif
  14521          // Copy submatch info from the recursive call
  14522          copy_sub_off(&t->subs.norm, &m->norm);
  14523          if (rex.nfa_has_zsubexpr) {
  14524            copy_sub_off(&t->subs.synt, &m->synt);
  14525          }
  14526          // Now we need to skip over the matched text and then
  14527          // continue with what follows.
  14528          if (REG_MULTI) {
  14529            // TODO(RE): multi-line match
  14530            bytelen = m->norm.list.multi[0].end_col
  14531                      - (int)(rex.input - rex.line);
  14532          } else {
  14533            bytelen = (int)(m->norm.list.line[0].end - rex.input);
  14534          }
  14535 
  14536 #ifdef REGEXP_DEBUG
  14537          fprintf(log_fd, "NFA_START_PATTERN length: %d\n", bytelen);
  14538 #endif
  14539          if (bytelen == 0) {
  14540            // empty match, output of corresponding
  14541            // NFA_END_PATTERN/NFA_SKIP to be used at current
  14542            // position
  14543            add_here = true;
  14544            add_state = t->state->out1->out->out;
  14545          } else if (bytelen <= clen) {
  14546            // match current character, output of corresponding
  14547            // NFA_END_PATTERN to be used at next position.
  14548            add_state = t->state->out1->out->out;
  14549            add_off = clen;
  14550          } else {
  14551            // skip over the matched characters, set character
  14552            // count in NFA_SKIP
  14553            add_state = t->state->out1->out;
  14554            add_off = bytelen;
  14555            add_count = bytelen - clen;
  14556          }
  14557        }
  14558        break;
  14559      }
  14560 
  14561      case NFA_BOL:
  14562        if (rex.input == rex.line) {
  14563          add_here = true;
  14564          add_state = t->state->out;
  14565        }
  14566        break;
  14567 
  14568      case NFA_EOL:
  14569        if (curc == NUL) {
  14570          add_here = true;
  14571          add_state = t->state->out;
  14572        }
  14573        break;
  14574 
  14575      case NFA_BOW:
  14576        result = true;
  14577 
  14578        if (curc == NUL) {
  14579          result = false;
  14580        } else {
  14581          int this_class;
  14582 
  14583          // Get class of current and previous char (if it exists).
  14584          this_class = mb_get_class_tab((char *)rex.input, rex.reg_buf->b_chartab);
  14585          if (this_class <= 1) {
  14586            result = false;
  14587          } else if (reg_prev_class() == this_class) {
  14588            result = false;
  14589          }
  14590        }
  14591        if (result) {
  14592          add_here = true;
  14593          add_state = t->state->out;
  14594        }
  14595        break;
  14596 
  14597      case NFA_EOW:
  14598        result = true;
  14599        if (rex.input == rex.line) {
  14600          result = false;
  14601        } else {
  14602          int this_class, prev_class;
  14603 
  14604          // Get class of current and previous char (if it exists).
  14605          this_class = mb_get_class_tab((char *)rex.input, rex.reg_buf->b_chartab);
  14606          prev_class = reg_prev_class();
  14607          if (this_class == prev_class
  14608              || prev_class == 0 || prev_class == 1) {
  14609            result = false;
  14610          }
  14611        }
  14612        if (result) {
  14613          add_here = true;
  14614          add_state = t->state->out;
  14615        }
  14616        break;
  14617 
  14618      case NFA_BOF:
  14619        if (rex.lnum == 0 && rex.input == rex.line
  14620            && (!REG_MULTI || rex.reg_firstlnum == 1)) {
  14621          add_here = true;
  14622          add_state = t->state->out;
  14623        }
  14624        break;
  14625 
  14626      case NFA_EOF:
  14627        if (rex.lnum == rex.reg_maxline && curc == NUL) {
  14628          add_here = true;
  14629          add_state = t->state->out;
  14630        }
  14631        break;
  14632 
  14633      case NFA_COMPOSING: {
  14634        int mc = curc;
  14635        int len = 0;
  14636        nfa_state_T *end;
  14637        nfa_state_T *sta;
  14638        int cchars[MAX_MCO];
  14639        int ccount = 0;
  14640        int j;
  14641 
  14642        sta = t->state->out;
  14643        len = 0;
  14644        if (utf_iscomposing_legacy(sta->c)) {
  14645          // Only match composing character(s), ignore base
  14646          // character.  Used for ".{composing}" and "{composing}"
  14647          // (no preceding character).
  14648          len += utf_char2len(mc);
  14649        }
  14650        if (rex.reg_icombine && len == 0) {
  14651          // If \Z was present, then ignore composing characters.
  14652          // When ignoring the base character this always matches.
  14653          if (sta->c != curc) {
  14654            result = FAIL;
  14655          } else {
  14656            result = OK;
  14657          }
  14658          while (sta->c != NFA_END_COMPOSING) {
  14659            sta = sta->out;
  14660          }
  14661        } else if (len > 0 || mc == sta->c) {
  14662          // Check base character matches first, unless ignored.
  14663          if (len == 0) {
  14664            len += utf_char2len(mc);
  14665            sta = sta->out;
  14666          }
  14667 
  14668          // We don't care about the order of composing characters.
  14669          // Get them into cchars[] first.
  14670          while (len < clen) {
  14671            mc = utf_ptr2char((char *)rex.input + len);
  14672            cchars[ccount++] = mc;
  14673            len += utf_char2len(mc);
  14674            if (ccount == MAX_MCO) {
  14675              break;
  14676            }
  14677          }
  14678 
  14679          // Check that each composing char in the pattern matches a
  14680          // composing char in the text.  We do not check if all
  14681          // composing chars are matched.
  14682          result = OK;
  14683          while (sta->c != NFA_END_COMPOSING) {
  14684            for (j = 0; j < ccount; j++) {
  14685              if (cchars[j] == sta->c) {
  14686                break;
  14687              }
  14688            }
  14689            if (j == ccount) {
  14690              result = FAIL;
  14691              break;
  14692            }
  14693            sta = sta->out;
  14694          }
  14695        } else {
  14696          result = FAIL;
  14697        }
  14698 
  14699        end = t->state->out1;               // NFA_END_COMPOSING
  14700        ADD_STATE_IF_MATCH(end);
  14701        break;
  14702      }
  14703 
  14704      case NFA_NEWL:
  14705        if (curc == NUL && !rex.reg_line_lbr && REG_MULTI
  14706            && rex.lnum <= rex.reg_maxline) {
  14707          go_to_nextline = true;
  14708          // Pass -1 for the offset, which means taking the position
  14709          // at the start of the next line.
  14710          add_state = t->state->out;
  14711          add_off = -1;
  14712        } else if (curc == '\n' && rex.reg_line_lbr) {
  14713          // match \n as if it is an ordinary character
  14714          add_state = t->state->out;
  14715          add_off = 1;
  14716        }
  14717        break;
  14718 
  14719      case NFA_START_COLL:
  14720      case NFA_START_NEG_COLL: {
  14721        // What follows is a list of characters, until NFA_END_COLL.
  14722        // One of them must match or none of them must match.
  14723        nfa_state_T *state;
  14724        int result_if_matched;
  14725        int c1, c2;
  14726 
  14727        // Never match EOL. If it's part of the collection it is added
  14728        // as a separate state with an OR.
  14729        if (curc == NUL) {
  14730          break;
  14731        }
  14732 
  14733        state = t->state->out;
  14734        result_if_matched = (t->state->c == NFA_START_COLL);
  14735        while (true) {
  14736          if (state->c == NFA_COMPOSING) {
  14737            int mc = curc;
  14738            int len = 0;
  14739            nfa_state_T *end;
  14740            nfa_state_T *sta;
  14741            int cchars[MAX_MCO];
  14742            int ccount = 0;
  14743            int j;
  14744 
  14745            sta = t->state->out->out;
  14746            if (utf_iscomposing_legacy(sta->c)) {
  14747              // Only match composing character(s), ignore base
  14748              // character.  Used for ".{composing}" and "{composing}"
  14749              // (no preceding character).
  14750              len += utf_char2len(mc);
  14751            }
  14752            if (rex.reg_icombine && len == 0) {
  14753              // If \Z was present, then ignore composing characters.
  14754              // When ignoring the base character this always matches.
  14755              if (sta->c != curc) {
  14756                result = FAIL;
  14757              } else {
  14758                result = OK;
  14759              }
  14760              while (sta->c != NFA_END_COMPOSING) {
  14761                sta = sta->out;
  14762              }
  14763            }
  14764            // Check base character matches first, unless ignored.
  14765            else if (len > 0 || mc == sta->c) {
  14766              if (len == 0) {
  14767                len += utf_char2len(mc);
  14768                sta = sta->out;
  14769              }
  14770 
  14771              // We don't care about the order of composing characters.
  14772              // Get them into cchars[] first.
  14773              while (len < clen) {
  14774                mc = utf_ptr2char((char *)rex.input + len);
  14775                cchars[ccount++] = mc;
  14776                len += utf_char2len(mc);
  14777                if (ccount == MAX_MCO) {
  14778                  break;
  14779                }
  14780              }
  14781 
  14782              // Check that each composing char in the pattern matches a
  14783              // composing char in the text.  We do not check if all
  14784              // composing chars are matched.
  14785              result = OK;
  14786              while (sta->c != NFA_END_COMPOSING) {
  14787                for (j = 0; j < ccount; j++) {
  14788                  if (cchars[j] == sta->c) {
  14789                    break;
  14790                  }
  14791                }
  14792                if (j == ccount) {
  14793                  result = FAIL;
  14794                  break;
  14795                }
  14796                sta = sta->out;
  14797              }
  14798            } else {
  14799              result = FAIL;
  14800            }
  14801 
  14802            if (t->state->out->out1 != NULL
  14803                && t->state->out->out1->c == NFA_END_COMPOSING) {
  14804              end = t->state->out->out1;
  14805              ADD_STATE_IF_MATCH(end);
  14806            }
  14807            break;
  14808          }
  14809          if (state->c == NFA_END_COLL) {
  14810            result = !result_if_matched;
  14811            break;
  14812          }
  14813          if (state->c == NFA_RANGE_MIN) {
  14814            c1 = state->val;
  14815            state = state->out;             // advance to NFA_RANGE_MAX
  14816            c2 = state->val;
  14817 #ifdef REGEXP_DEBUG
  14818            fprintf(log_fd, "NFA_RANGE_MIN curc=%d c1=%d c2=%d\n",
  14819                    curc, c1, c2);
  14820 #endif
  14821            if (curc >= c1 && curc <= c2) {
  14822              result = result_if_matched;
  14823              break;
  14824            }
  14825            if (rex.reg_ic) {
  14826              int curc_low = utf_fold(curc);
  14827              int done = false;
  14828 
  14829              for (; c1 <= c2; c1++) {
  14830                if (utf_fold(c1) == curc_low) {
  14831                  result = result_if_matched;
  14832                  done = true;
  14833                  break;
  14834                }
  14835              }
  14836              if (done) {
  14837                break;
  14838              }
  14839            }
  14840          } else if (state->c < 0 ? check_char_class(state->c, curc)
  14841                                  : (curc == state->c
  14842                                     || (rex.reg_ic
  14843                                         && utf_fold(curc) == utf_fold(state->c)))) {
  14844            result = result_if_matched;
  14845            break;
  14846          }
  14847          state = state->out;
  14848        }
  14849        if (result) {
  14850          // next state is in out of the NFA_END_COLL, out1 of
  14851          // START points to the END state
  14852          add_state = t->state->out1->out;
  14853          add_off = clen;
  14854        }
  14855        break;
  14856      }
  14857 
  14858      case NFA_ANY:
  14859        // Any char except NUL, (end of input) does not match.
  14860        if (curc > 0) {
  14861          add_state = t->state->out;
  14862          add_off = clen;
  14863        }
  14864        break;
  14865 
  14866      case NFA_ANY_COMPOSING:
  14867        // On a composing character skip over it.  Otherwise do
  14868        // nothing.  Always matches.
  14869        if (utf_iscomposing_legacy(curc)) {
  14870          add_off = clen;
  14871        } else {
  14872          add_here = true;
  14873          add_off = 0;
  14874        }
  14875        add_state = t->state->out;
  14876        break;
  14877 
  14878      // Character classes like \a for alpha, \d for digit etc.
  14879      case NFA_IDENT:           //  \i
  14880        result = vim_isIDc(curc);
  14881        ADD_STATE_IF_MATCH(t->state);
  14882        break;
  14883 
  14884      case NFA_SIDENT:          //  \I
  14885        result = !ascii_isdigit(curc) && vim_isIDc(curc);
  14886        ADD_STATE_IF_MATCH(t->state);
  14887        break;
  14888 
  14889      case NFA_KWORD:           //  \k
  14890        result = vim_iswordp_buf((char *)rex.input, rex.reg_buf);
  14891        ADD_STATE_IF_MATCH(t->state);
  14892        break;
  14893 
  14894      case NFA_SKWORD:          //  \K
  14895        result = !ascii_isdigit(curc)
  14896                 && vim_iswordp_buf((char *)rex.input, rex.reg_buf);
  14897        ADD_STATE_IF_MATCH(t->state);
  14898        break;
  14899 
  14900      case NFA_FNAME:           //  \f
  14901        result = vim_isfilec(curc);
  14902        ADD_STATE_IF_MATCH(t->state);
  14903        break;
  14904 
  14905      case NFA_SFNAME:          //  \F
  14906        result = !ascii_isdigit(curc) && vim_isfilec(curc);
  14907        ADD_STATE_IF_MATCH(t->state);
  14908        break;
  14909 
  14910      case NFA_PRINT:           //  \p
  14911        result = vim_isprintc(utf_ptr2char((char *)rex.input));
  14912        ADD_STATE_IF_MATCH(t->state);
  14913        break;
  14914 
  14915      case NFA_SPRINT:          //  \P
  14916        result = !ascii_isdigit(curc) && vim_isprintc(utf_ptr2char((char *)rex.input));
  14917        ADD_STATE_IF_MATCH(t->state);
  14918        break;
  14919 
  14920      case NFA_WHITE:           //  \s
  14921        result = ascii_iswhite(curc);
  14922        ADD_STATE_IF_MATCH(t->state);
  14923        break;
  14924 
  14925      case NFA_NWHITE:          //  \S
  14926        result = curc != NUL && !ascii_iswhite(curc);
  14927        ADD_STATE_IF_MATCH(t->state);
  14928        break;
  14929 
  14930      case NFA_DIGIT:           //  \d
  14931        result = ri_digit(curc);
  14932        ADD_STATE_IF_MATCH(t->state);
  14933        break;
  14934 
  14935      case NFA_NDIGIT:          //  \D
  14936        result = curc != NUL && !ri_digit(curc);
  14937        ADD_STATE_IF_MATCH(t->state);
  14938        break;
  14939 
  14940      case NFA_HEX:             //  \x
  14941        result = ri_hex(curc);
  14942        ADD_STATE_IF_MATCH(t->state);
  14943        break;
  14944 
  14945      case NFA_NHEX:            //  \X
  14946        result = curc != NUL && !ri_hex(curc);
  14947        ADD_STATE_IF_MATCH(t->state);
  14948        break;
  14949 
  14950      case NFA_OCTAL:           //  \o
  14951        result = ri_octal(curc);
  14952        ADD_STATE_IF_MATCH(t->state);
  14953        break;
  14954 
  14955      case NFA_NOCTAL:          //  \O
  14956        result = curc != NUL && !ri_octal(curc);
  14957        ADD_STATE_IF_MATCH(t->state);
  14958        break;
  14959 
  14960      case NFA_WORD:            //  \w
  14961        result = ri_word(curc);
  14962        ADD_STATE_IF_MATCH(t->state);
  14963        break;
  14964 
  14965      case NFA_NWORD:           //  \W
  14966        result = curc != NUL && !ri_word(curc);
  14967        ADD_STATE_IF_MATCH(t->state);
  14968        break;
  14969 
  14970      case NFA_HEAD:            //  \h
  14971        result = ri_head(curc);
  14972        ADD_STATE_IF_MATCH(t->state);
  14973        break;
  14974 
  14975      case NFA_NHEAD:           //  \H
  14976        result = curc != NUL && !ri_head(curc);
  14977        ADD_STATE_IF_MATCH(t->state);
  14978        break;
  14979 
  14980      case NFA_ALPHA:           //  \a
  14981        result = ri_alpha(curc);
  14982        ADD_STATE_IF_MATCH(t->state);
  14983        break;
  14984 
  14985      case NFA_NALPHA:          //  \A
  14986        result = curc != NUL && !ri_alpha(curc);
  14987        ADD_STATE_IF_MATCH(t->state);
  14988        break;
  14989 
  14990      case NFA_LOWER:           //  \l
  14991        result = ri_lower(curc);
  14992        ADD_STATE_IF_MATCH(t->state);
  14993        break;
  14994 
  14995      case NFA_NLOWER:          //  \L
  14996        result = curc != NUL && !ri_lower(curc);
  14997        ADD_STATE_IF_MATCH(t->state);
  14998        break;
  14999 
  15000      case NFA_UPPER:           //  \u
  15001        result = ri_upper(curc);
  15002        ADD_STATE_IF_MATCH(t->state);
  15003        break;
  15004 
  15005      case NFA_NUPPER:          // \U
  15006        result = curc != NUL && !ri_upper(curc);
  15007        ADD_STATE_IF_MATCH(t->state);
  15008        break;
  15009 
  15010      case NFA_LOWER_IC:        // [a-z]
  15011        result = ri_lower(curc) || (rex.reg_ic && ri_upper(curc));
  15012        ADD_STATE_IF_MATCH(t->state);
  15013        break;
  15014 
  15015      case NFA_NLOWER_IC:       // [^a-z]
  15016        result = curc != NUL
  15017                 && !(ri_lower(curc) || (rex.reg_ic && ri_upper(curc)));
  15018        ADD_STATE_IF_MATCH(t->state);
  15019        break;
  15020 
  15021      case NFA_UPPER_IC:        // [A-Z]
  15022        result = ri_upper(curc) || (rex.reg_ic && ri_lower(curc));
  15023        ADD_STATE_IF_MATCH(t->state);
  15024        break;
  15025 
  15026      case NFA_NUPPER_IC:       // [^A-Z]
  15027        result = curc != NUL
  15028                 && !(ri_upper(curc) || (rex.reg_ic && ri_lower(curc)));
  15029        ADD_STATE_IF_MATCH(t->state);
  15030        break;
  15031 
  15032      case NFA_BACKREF1:
  15033      case NFA_BACKREF2:
  15034      case NFA_BACKREF3:
  15035      case NFA_BACKREF4:
  15036      case NFA_BACKREF5:
  15037      case NFA_BACKREF6:
  15038      case NFA_BACKREF7:
  15039      case NFA_BACKREF8:
  15040      case NFA_BACKREF9:
  15041      case NFA_ZREF1:
  15042      case NFA_ZREF2:
  15043      case NFA_ZREF3:
  15044      case NFA_ZREF4:
  15045      case NFA_ZREF5:
  15046      case NFA_ZREF6:
  15047      case NFA_ZREF7:
  15048      case NFA_ZREF8:
  15049      case NFA_ZREF9:
  15050        // \1 .. \9  \z1 .. \z9
  15051      {
  15052        int subidx;
  15053        int bytelen;
  15054 
  15055        if (t->state->c >= NFA_BACKREF1 && t->state->c <= NFA_BACKREF9) {
  15056          subidx = t->state->c - NFA_BACKREF1 + 1;
  15057          result = match_backref(&t->subs.norm, subidx, &bytelen);
  15058        } else {
  15059          subidx = t->state->c - NFA_ZREF1 + 1;
  15060          result = match_zref(subidx, &bytelen);
  15061        }
  15062 
  15063        if (result) {
  15064          if (bytelen == 0) {
  15065            // empty match always works, output of NFA_SKIP to be
  15066            // used next
  15067            add_here = true;
  15068            add_state = t->state->out->out;
  15069          } else if (bytelen <= clen) {
  15070            // match current character, jump ahead to out of
  15071            // NFA_SKIP
  15072            add_state = t->state->out->out;
  15073            add_off = clen;
  15074          } else {
  15075            // skip over the matched characters, set character
  15076            // count in NFA_SKIP
  15077            add_state = t->state->out;
  15078            add_off = bytelen;
  15079            add_count = bytelen - clen;
  15080          }
  15081        }
  15082        break;
  15083      }
  15084      case NFA_SKIP:
  15085        // character of previous matching \1 .. \9  or \@>
  15086        if (t->count - clen <= 0) {
  15087          // end of match, go to what follows
  15088          add_state = t->state->out;
  15089          add_off = clen;
  15090        } else {
  15091          // add state again with decremented count
  15092          add_state = t->state;
  15093          add_off = 0;
  15094          add_count = t->count - clen;
  15095        }
  15096        break;
  15097 
  15098      case NFA_LNUM:
  15099      case NFA_LNUM_GT:
  15100      case NFA_LNUM_LT:
  15101        assert(t->state->val >= 0
  15102               && !((rex.reg_firstlnum > 0
  15103                     && rex.lnum > LONG_MAX - rex.reg_firstlnum)
  15104                    || (rex.reg_firstlnum < 0
  15105                        && rex.lnum < LONG_MIN + rex.reg_firstlnum))
  15106               && rex.lnum + rex.reg_firstlnum >= 0);
  15107        result = (REG_MULTI
  15108                  && nfa_re_num_cmp((uintmax_t)t->state->val,
  15109                                    t->state->c - NFA_LNUM,
  15110                                    (uintmax_t)rex.lnum + (uintmax_t)rex.reg_firstlnum));
  15111        if (result) {
  15112          add_here = true;
  15113          add_state = t->state->out;
  15114        }
  15115        break;
  15116 
  15117      case NFA_COL:
  15118      case NFA_COL_GT:
  15119      case NFA_COL_LT:
  15120        assert(t->state->val >= 0
  15121               && rex.input >= rex.line
  15122               && (uintmax_t)(rex.input - rex.line) <= UINTMAX_MAX - 1);
  15123        result = nfa_re_num_cmp((uintmax_t)t->state->val,
  15124                                t->state->c - NFA_COL,
  15125                                (uintmax_t)(rex.input - rex.line + 1));
  15126        if (result) {
  15127          add_here = true;
  15128          add_state = t->state->out;
  15129        }
  15130        break;
  15131 
  15132      case NFA_VCOL:
  15133      case NFA_VCOL_GT:
  15134      case NFA_VCOL_LT: {
  15135        int op = t->state->c - NFA_VCOL;
  15136        colnr_T col = (colnr_T)(rex.input - rex.line);
  15137 
  15138        // Bail out quickly when there can't be a match, avoid the overhead of
  15139        // win_linetabsize() on long lines.
  15140        if (op != 1 && col > t->state->val * MB_MAXBYTES) {
  15141          break;
  15142        }
  15143 
  15144        result = false;
  15145        win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win;
  15146        if (op == 1 && col - 1 > t->state->val && col > 100) {
  15147          int64_t ts = (int64_t)wp->w_buffer->b_p_ts;
  15148 
  15149          // Guess that a character won't use more columns than 'tabstop',
  15150          // with a minimum of 4.
  15151          if (ts < 4) {
  15152            ts = 4;
  15153          }
  15154          result = col > t->state->val * ts;
  15155        }
  15156        if (!result) {
  15157          linenr_T lnum = REG_MULTI ? rex.reg_firstlnum + rex.lnum : 1;
  15158          if (REG_MULTI && (lnum <= 0 || lnum > wp->w_buffer->b_ml.ml_line_count)) {
  15159            lnum = 1;
  15160          }
  15161          int vcol = win_linetabsize(wp, lnum, (char *)rex.line, col);
  15162          assert(t->state->val >= 0);
  15163          result = nfa_re_num_cmp((uintmax_t)t->state->val, op, (uintmax_t)vcol + 1);
  15164        }
  15165        if (result) {
  15166          add_here = true;
  15167          add_state = t->state->out;
  15168        }
  15169      }
  15170      break;
  15171 
  15172      case NFA_MARK:
  15173      case NFA_MARK_GT:
  15174      case NFA_MARK_LT: {
  15175        size_t col = REG_MULTI ? (size_t)(rex.input - rex.line) : 0;
  15176        fmark_T *fm = mark_get(rex.reg_buf, curwin, NULL, kMarkBufLocal, t->state->val);
  15177 
  15178        // Line may have been freed, get it again.
  15179        if (REG_MULTI) {
  15180          rex.line = (uint8_t *)reg_getline(rex.lnum);
  15181          rex.input = rex.line + col;
  15182        }
  15183 
  15184        // Compare the mark position to the match position, if the mark
  15185        // exists and mark is set in reg_buf.
  15186        if (fm != NULL && fm->mark.lnum > 0) {
  15187          pos_T *pos = &fm->mark;
  15188          const colnr_T pos_col = pos->lnum == rex.lnum + rex.reg_firstlnum
  15189                                  && pos->col == MAXCOL
  15190                                  ? reg_getline_len(pos->lnum - rex.reg_firstlnum)
  15191                                  : pos->col;
  15192 
  15193          result = pos->lnum == rex.lnum + rex.reg_firstlnum
  15194                   ? (pos_col == (colnr_T)(rex.input - rex.line)
  15195                      ? t->state->c == NFA_MARK
  15196                      : (pos_col < (colnr_T)(rex.input - rex.line)
  15197                         ? t->state->c == NFA_MARK_GT
  15198                         : t->state->c == NFA_MARK_LT))
  15199                   : (pos->lnum < rex.lnum + rex.reg_firstlnum
  15200                      ? t->state->c == NFA_MARK_GT
  15201                      : t->state->c == NFA_MARK_LT);
  15202          if (result) {
  15203            add_here = true;
  15204            add_state = t->state->out;
  15205          }
  15206        }
  15207        break;
  15208      }
  15209 
  15210      case NFA_CURSOR:
  15211        result = rex.reg_win != NULL
  15212                 && (rex.lnum + rex.reg_firstlnum == rex.reg_win->w_cursor.lnum)
  15213                 && ((colnr_T)(rex.input - rex.line) == rex.reg_win->w_cursor.col);
  15214        if (result) {
  15215          add_here = true;
  15216          add_state = t->state->out;
  15217        }
  15218        break;
  15219 
  15220      case NFA_VISUAL:
  15221        result = reg_match_visual();
  15222        if (result) {
  15223          add_here = true;
  15224          add_state = t->state->out;
  15225        }
  15226        break;
  15227 
  15228      case NFA_MOPEN1:
  15229      case NFA_MOPEN2:
  15230      case NFA_MOPEN3:
  15231      case NFA_MOPEN4:
  15232      case NFA_MOPEN5:
  15233      case NFA_MOPEN6:
  15234      case NFA_MOPEN7:
  15235      case NFA_MOPEN8:
  15236      case NFA_MOPEN9:
  15237      case NFA_ZOPEN:
  15238      case NFA_ZOPEN1:
  15239      case NFA_ZOPEN2:
  15240      case NFA_ZOPEN3:
  15241      case NFA_ZOPEN4:
  15242      case NFA_ZOPEN5:
  15243      case NFA_ZOPEN6:
  15244      case NFA_ZOPEN7:
  15245      case NFA_ZOPEN8:
  15246      case NFA_ZOPEN9:
  15247      case NFA_NOPEN:
  15248      case NFA_ZSTART:
  15249        // These states are only added to be able to bail out when
  15250        // they are added again, nothing is to be done.
  15251        break;
  15252 
  15253      default:          // regular character
  15254      {
  15255        int c = t->state->c;
  15256 
  15257 #ifdef REGEXP_DEBUG
  15258        if (c < 0) {
  15259          siemsg("INTERNAL: Negative state char: %" PRId64, (int64_t)c);
  15260        }
  15261 #endif
  15262        result = (c == curc);
  15263 
  15264        if (!result && rex.reg_ic) {
  15265          result = utf_fold(c) == utf_fold(curc);
  15266        }
  15267 
  15268        // If rex.reg_icombine is not set only skip over the character
  15269        // itself.  When it is set skip over composing characters.
  15270        if (result && !rex.reg_icombine) {
  15271          clen = utf_ptr2len((char *)rex.input);
  15272        }
  15273 
  15274        ADD_STATE_IF_MATCH(t->state);
  15275        break;
  15276      }
  15277      }       // switch (t->state->c)
  15278 
  15279      if (add_state != NULL) {
  15280        nfa_pim_T *pim;
  15281        nfa_pim_T pim_copy;
  15282 
  15283        if (t->pim.result == NFA_PIM_UNUSED) {
  15284          pim = NULL;
  15285        } else {
  15286          pim = &t->pim;
  15287        }
  15288 
  15289        // Handle the postponed invisible match if the match might end
  15290        // without advancing and before the end of the line.
  15291        if (pim != NULL && (clen == 0 || match_follows(add_state, 0))) {
  15292          if (pim->result == NFA_PIM_TODO) {
  15293 #ifdef REGEXP_DEBUG
  15294            fprintf(log_fd, "\n");
  15295            fprintf(log_fd, "==================================\n");
  15296            fprintf(log_fd, "Postponed recursive nfa_regmatch()\n");
  15297            fprintf(log_fd, "\n");
  15298 #endif
  15299            result = recursive_regmatch(pim->state, pim, prog, submatch, m,
  15300                                        &listids, &listids_len);
  15301            pim->result = result ? NFA_PIM_MATCH : NFA_PIM_NOMATCH;
  15302            // for \@! and \@<! it is a match when the result is
  15303            // false
  15304            if (result != (pim->state->c == NFA_START_INVISIBLE_NEG
  15305                           || pim->state->c == NFA_START_INVISIBLE_NEG_FIRST
  15306                           || pim->state->c
  15307                           == NFA_START_INVISIBLE_BEFORE_NEG
  15308                           || pim->state->c
  15309                           == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)) {
  15310              // Copy submatch info from the recursive call
  15311              copy_sub_off(&pim->subs.norm, &m->norm);
  15312              if (rex.nfa_has_zsubexpr) {
  15313                copy_sub_off(&pim->subs.synt, &m->synt);
  15314              }
  15315            }
  15316          } else {
  15317            result = (pim->result == NFA_PIM_MATCH);
  15318 #ifdef REGEXP_DEBUG
  15319            fprintf(log_fd, "\n");
  15320            fprintf(log_fd,
  15321                    "Using previous recursive nfa_regmatch() result, result == %d\n",
  15322                    pim->result);
  15323            fprintf(log_fd, "MATCH = %s\n", result ? "OK" : "false");
  15324            fprintf(log_fd, "\n");
  15325 #endif
  15326          }
  15327 
  15328          // for \@! and \@<! it is a match when result is false
  15329          if (result != (pim->state->c == NFA_START_INVISIBLE_NEG
  15330                         || pim->state->c == NFA_START_INVISIBLE_NEG_FIRST
  15331                         || pim->state->c
  15332                         == NFA_START_INVISIBLE_BEFORE_NEG
  15333                         || pim->state->c
  15334                         == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)) {
  15335            // Copy submatch info from the recursive call
  15336            copy_sub_off(&t->subs.norm, &pim->subs.norm);
  15337            if (rex.nfa_has_zsubexpr) {
  15338              copy_sub_off(&t->subs.synt, &pim->subs.synt);
  15339            }
  15340          } else {
  15341            // look-behind match failed, don't add the state
  15342            continue;
  15343          }
  15344 
  15345          // Postponed invisible match was handled, don't add it to
  15346          // following states.
  15347          pim = NULL;
  15348        }
  15349 
  15350        // If "pim" points into l->t it will become invalid when
  15351        // adding the state causes the list to be reallocated.  Make a
  15352        // local copy to avoid that.
  15353        if (pim == &t->pim) {
  15354          copy_pim(&pim_copy, pim);
  15355          pim = &pim_copy;
  15356        }
  15357 
  15358        if (add_here) {
  15359          r = addstate_here(thislist, add_state, &t->subs, pim, &listidx);
  15360        } else {
  15361          r = addstate(nextlist, add_state, &t->subs, pim, add_off);
  15362          if (add_count > 0) {
  15363            nextlist->t[nextlist->n - 1].count = add_count;
  15364          }
  15365        }
  15366        if (r == NULL) {
  15367          nfa_match = NFA_TOO_EXPENSIVE;
  15368          goto theend;
  15369        }
  15370      }
  15371    }     // for (thislist = thislist; thislist->state; thislist++)
  15372 
  15373    // Look for the start of a match in the current position by adding the
  15374    // start state to the list of states.
  15375    // The first found match is the leftmost one, thus the order of states
  15376    // matters!
  15377    // Do not add the start state in recursive calls of nfa_regmatch(),
  15378    // because recursive calls should only start in the first position.
  15379    // Unless "nfa_endp" is not NULL, then we match the end position.
  15380    // Also don't start a match past the first line.
  15381    if (!nfa_match
  15382        && ((toplevel
  15383             && rex.lnum == 0
  15384             && clen != 0
  15385             && (rex.reg_maxcol == 0
  15386                 || (colnr_T)(rex.input - rex.line) < rex.reg_maxcol))
  15387            || (nfa_endp != NULL
  15388                && (REG_MULTI
  15389                    ? (rex.lnum < nfa_endp->se_u.pos.lnum
  15390                       || (rex.lnum == nfa_endp->se_u.pos.lnum
  15391                           && (int)(rex.input - rex.line)
  15392                           < nfa_endp->se_u.pos.col))
  15393                    : rex.input < nfa_endp->se_u.ptr)))) {
  15394 #ifdef REGEXP_DEBUG
  15395      fprintf(log_fd, "(---) STARTSTATE\n");
  15396 #endif
  15397      // Inline optimized code for addstate() if we know the state is
  15398      // the first MOPEN.
  15399      if (toplevel) {
  15400        int add = true;
  15401 
  15402        if (prog->regstart != NUL && clen != 0) {
  15403          if (nextlist->n == 0) {
  15404            colnr_T col = (colnr_T)(rex.input - rex.line) + clen;
  15405 
  15406            // Nextlist is empty, we can skip ahead to the
  15407            // character that must appear at the start.
  15408            if (skip_to_start(prog->regstart, &col) == FAIL) {
  15409              break;
  15410            }
  15411 #ifdef REGEXP_DEBUG
  15412            fprintf(log_fd, "  Skipping ahead %d bytes to regstart\n",
  15413                    col - ((colnr_T)(rex.input - rex.line) + clen));
  15414 #endif
  15415            rex.input = rex.line + col - clen;
  15416          } else {
  15417            // Checking if the required start character matches is
  15418            // cheaper than adding a state that won't match.
  15419            const int c = utf_ptr2char((char *)rex.input + clen);
  15420            if (c != prog->regstart
  15421                && (!rex.reg_ic
  15422                    || utf_fold(c) != utf_fold(prog->regstart))) {
  15423 #ifdef REGEXP_DEBUG
  15424              fprintf(log_fd,
  15425                      "  Skipping start state, regstart does not match\n");
  15426 #endif
  15427              add = false;
  15428            }
  15429          }
  15430        }
  15431 
  15432        if (add) {
  15433          if (REG_MULTI) {
  15434            m->norm.list.multi[0].start_col =
  15435              (colnr_T)(rex.input - rex.line) + clen;
  15436            m->norm.orig_start_col =
  15437              m->norm.list.multi[0].start_col;
  15438          } else {
  15439            m->norm.list.line[0].start = rex.input + clen;
  15440          }
  15441          if (addstate(nextlist, start->out, m, NULL, clen) == NULL) {
  15442            nfa_match = NFA_TOO_EXPENSIVE;
  15443            goto theend;
  15444          }
  15445        }
  15446      } else {
  15447        if (addstate(nextlist, start, m, NULL, clen) == NULL) {
  15448          nfa_match = NFA_TOO_EXPENSIVE;
  15449          goto theend;
  15450        }
  15451      }
  15452    }
  15453 
  15454 #ifdef REGEXP_DEBUG
  15455    fprintf(log_fd, ">>> Thislist had %d states available: ", thislist->n);
  15456    {
  15457      int i;
  15458 
  15459      for (i = 0; i < thislist->n; i++) {
  15460        fprintf(log_fd, "%d  ", abs(thislist->t[i].state->id));
  15461      }
  15462    }
  15463    fprintf(log_fd, "\n");
  15464 #endif
  15465 
  15466 nextchar:
  15467    // Advance to the next character, or advance to the next line, or
  15468    // finish.
  15469    if (clen != 0) {
  15470      rex.input += clen;
  15471    } else if (go_to_nextline || (nfa_endp != NULL && REG_MULTI
  15472                                  && rex.lnum < nfa_endp->se_u.pos.lnum)) {
  15473      reg_nextline();
  15474    } else {
  15475      break;
  15476    }
  15477 
  15478    // Allow interrupting with CTRL-C.
  15479    reg_breakcheck();
  15480    if (got_int) {
  15481      break;
  15482    }
  15483    // Check for timeout once every twenty times to avoid overhead.
  15484    if (nfa_time_limit != NULL && ++nfa_time_count == 20) {
  15485      nfa_time_count = 0;
  15486      if (nfa_did_time_out()) {
  15487        break;
  15488      }
  15489    }
  15490  }
  15491 
  15492 #ifdef REGEXP_DEBUG
  15493  if (log_fd != stderr) {
  15494    fclose(log_fd);
  15495  }
  15496  log_fd = NULL;
  15497 #endif
  15498 
  15499 theend:
  15500  // Free memory
  15501  xfree(list[0].t);
  15502  xfree(list[1].t);
  15503  xfree(listids);
  15504 #undef ADD_STATE_IF_MATCH
  15505 #ifdef NFA_REGEXP_DEBUG_LOG
  15506  fclose(debug);
  15507 #endif
  15508 
  15509  return nfa_match;
  15510 }
  15511 
  15512 /// Try match of "prog" with at rex.line["col"].
  15513 ///
  15514 /// @param tm         timeout limit or NULL
  15515 /// @param timed_out  flag set on timeout or NULL
  15516 ///
  15517 /// @return  <= 0 for failure, number of lines contained in the match otherwise.
  15518 static int nfa_regtry(nfa_regprog_T *prog, colnr_T col, proftime_T *tm, int *timed_out)
  15519 {
  15520  int i;
  15521  regsubs_T subs, m;
  15522  nfa_state_T *start = prog->start;
  15523 #ifdef REGEXP_DEBUG
  15524  FILE *f;
  15525 #endif
  15526 
  15527  rex.input = rex.line + col;
  15528  nfa_time_limit = tm;
  15529  nfa_timed_out = timed_out;
  15530  nfa_time_count = 0;
  15531 
  15532 #ifdef REGEXP_DEBUG
  15533  f = fopen(NFA_REGEXP_RUN_LOG, "a");
  15534  if (f != NULL) {
  15535    fprintf(f,
  15536            "\n\n\t=======================================================\n");
  15537 # ifdef REGEXP_DEBUG
  15538    fprintf(f, "\tRegexp is \"%s\"\n", nfa_regengine.expr);
  15539 # endif
  15540    fprintf(f, "\tInput text is \"%s\" \n", rex.input);
  15541    fprintf(f, "\t=======================================================\n\n");
  15542    nfa_print_state(f, start);
  15543    fprintf(f, "\n\n");
  15544    fclose(f);
  15545  } else {
  15546    emsg("Could not open temporary log file for writing");
  15547  }
  15548 #endif
  15549 
  15550  clear_sub(&subs.norm);
  15551  clear_sub(&m.norm);
  15552  clear_sub(&subs.synt);
  15553  clear_sub(&m.synt);
  15554 
  15555  int result = nfa_regmatch(prog, start, &subs, &m);
  15556  if (!result) {
  15557    return 0;
  15558  } else if (result == NFA_TOO_EXPENSIVE) {
  15559    return result;
  15560  }
  15561 
  15562  cleanup_subexpr();
  15563  if (REG_MULTI) {
  15564    for (i = 0; i < subs.norm.in_use; i++) {
  15565      rex.reg_startpos[i].lnum = subs.norm.list.multi[i].start_lnum;
  15566      rex.reg_startpos[i].col = subs.norm.list.multi[i].start_col;
  15567 
  15568      rex.reg_endpos[i].lnum = subs.norm.list.multi[i].end_lnum;
  15569      rex.reg_endpos[i].col = subs.norm.list.multi[i].end_col;
  15570    }
  15571    if (rex.reg_mmatch != NULL) {
  15572      rex.reg_mmatch->rmm_matchcol = subs.norm.orig_start_col;
  15573    }
  15574 
  15575    if (rex.reg_startpos[0].lnum < 0) {
  15576      rex.reg_startpos[0].lnum = 0;
  15577      rex.reg_startpos[0].col = col;
  15578    }
  15579    if (rex.reg_endpos[0].lnum < 0) {
  15580      // pattern has a \ze but it didn't match, use current end
  15581      rex.reg_endpos[0].lnum = rex.lnum;
  15582      rex.reg_endpos[0].col = (int)(rex.input - rex.line);
  15583    } else {
  15584      // Use line number of "\ze".
  15585      rex.lnum = rex.reg_endpos[0].lnum;
  15586    }
  15587  } else {
  15588    for (i = 0; i < subs.norm.in_use; i++) {
  15589      rex.reg_startp[i] = subs.norm.list.line[i].start;
  15590      rex.reg_endp[i] = subs.norm.list.line[i].end;
  15591    }
  15592 
  15593    if (rex.reg_startp[0] == NULL) {
  15594      rex.reg_startp[0] = rex.line + col;
  15595    }
  15596    if (rex.reg_endp[0] == NULL) {
  15597      rex.reg_endp[0] = rex.input;
  15598    }
  15599  }
  15600 
  15601  // Package any found \z(...\) matches for export. Default is none.
  15602  unref_extmatch(re_extmatch_out);
  15603  re_extmatch_out = NULL;
  15604 
  15605  if (prog->reghasz == REX_SET) {
  15606    cleanup_zsubexpr();
  15607    re_extmatch_out = make_extmatch();
  15608    // Loop over \z1, \z2, etc.  There is no \z0.
  15609    for (i = 1; i < subs.synt.in_use; i++) {
  15610      if (REG_MULTI) {
  15611        struct multipos *mpos = &subs.synt.list.multi[i];
  15612 
  15613        // Only accept single line matches that are valid.
  15614        if (mpos->start_lnum >= 0
  15615            && mpos->start_lnum == mpos->end_lnum
  15616            && mpos->end_col >= mpos->start_col) {
  15617          re_extmatch_out->matches[i] =
  15618            (uint8_t *)xstrnsave(reg_getline(mpos->start_lnum) + mpos->start_col,
  15619                                 (size_t)(mpos->end_col - mpos->start_col));
  15620        }
  15621      } else {
  15622        struct linepos *lpos = &subs.synt.list.line[i];
  15623 
  15624        if (lpos->start != NULL && lpos->end != NULL) {
  15625          re_extmatch_out->matches[i] =
  15626            (uint8_t *)xstrnsave((char *)lpos->start, (size_t)(lpos->end - lpos->start));
  15627        }
  15628      }
  15629    }
  15630  }
  15631 
  15632  return 1 + rex.lnum;
  15633 }
  15634 
  15635 /// Match a regexp against a string ("line" points to the string) or multiple
  15636 /// lines (if "line" is NULL, use reg_getline()).
  15637 ///
  15638 /// @param line String in which to search or NULL
  15639 /// @param startcol Column to start looking for match
  15640 /// @param tm Timeout limit or NULL
  15641 /// @param timed_out Flag set on timeout or NULL
  15642 ///
  15643 /// @return <= 0 if there is no match and number of lines contained in the
  15644 /// match otherwise.
  15645 static int nfa_regexec_both(uint8_t *line, colnr_T startcol, proftime_T *tm, int *timed_out)
  15646 {
  15647  nfa_regprog_T *prog;
  15648  int retval = 0;
  15649  colnr_T col = startcol;
  15650 
  15651  if (REG_MULTI) {
  15652    prog = (nfa_regprog_T *)rex.reg_mmatch->regprog;
  15653    line = (uint8_t *)reg_getline(0);  // relative to the cursor
  15654    rex.reg_startpos = rex.reg_mmatch->startpos;
  15655    rex.reg_endpos = rex.reg_mmatch->endpos;
  15656  } else {
  15657    prog = (nfa_regprog_T *)rex.reg_match->regprog;
  15658    rex.reg_startp = (uint8_t **)rex.reg_match->startp;
  15659    rex.reg_endp = (uint8_t **)rex.reg_match->endp;
  15660  }
  15661 
  15662  // Be paranoid...
  15663  if (prog == NULL || line == NULL) {
  15664    iemsg(_(e_null));
  15665    goto theend;
  15666  }
  15667 
  15668  // If pattern contains "\c" or "\C": overrule value of rex.reg_ic
  15669  if (prog->regflags & RF_ICASE) {
  15670    rex.reg_ic = true;
  15671  } else if (prog->regflags & RF_NOICASE) {
  15672    rex.reg_ic = false;
  15673  }
  15674 
  15675  // If pattern contains "\Z" overrule value of rex.reg_icombine
  15676  if (prog->regflags & RF_ICOMBINE) {
  15677    rex.reg_icombine = true;
  15678  }
  15679 
  15680  rex.line = line;
  15681  rex.lnum = 0;  // relative to line
  15682 
  15683  rex.nfa_has_zend = prog->has_zend;
  15684  rex.nfa_has_backref = prog->has_backref;
  15685  rex.nfa_nsubexpr = prog->nsubexp;
  15686  rex.nfa_listid = 1;
  15687  rex.nfa_alt_listid = 2;
  15688 #ifdef REGEXP_DEBUG
  15689  nfa_regengine.expr = prog->pattern;
  15690 #endif
  15691 
  15692  if (prog->reganch && col > 0) {
  15693    return 0L;
  15694  }
  15695 
  15696  rex.need_clear_subexpr = true;
  15697  // Clear the external match subpointers if necessary.
  15698  if (prog->reghasz == REX_SET) {
  15699    rex.nfa_has_zsubexpr = true;
  15700    rex.need_clear_zsubexpr = true;
  15701  } else {
  15702    rex.nfa_has_zsubexpr = false;
  15703    rex.need_clear_zsubexpr = false;
  15704  }
  15705 
  15706  if (prog->regstart != NUL) {
  15707    // Skip ahead until a character we know the match must start with.
  15708    // When there is none there is no match.
  15709    if (skip_to_start(prog->regstart, &col) == FAIL) {
  15710      return 0L;
  15711    }
  15712 
  15713    // If match_text is set it contains the full text that must match.
  15714    // Nothing else to try. Doesn't handle combining chars well.
  15715    if (prog->match_text != NULL && *prog->match_text != NUL && !rex.reg_icombine) {
  15716      retval = find_match_text(&col, prog->regstart, prog->match_text);
  15717      if (REG_MULTI) {
  15718        rex.reg_mmatch->rmm_matchcol = col;
  15719      } else {
  15720        rex.reg_match->rm_matchcol = col;
  15721      }
  15722      return retval;
  15723    }
  15724  }
  15725 
  15726  // If the start column is past the maximum column: no need to try.
  15727  if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol) {
  15728    goto theend;
  15729  }
  15730 
  15731  // Set the "nstate" used by nfa_regcomp() to zero to trigger an error when
  15732  // it's accidentally used during execution.
  15733  nstate = 0;
  15734  for (int i = 0; i < prog->nstate; i++) {
  15735    prog->state[i].id = i;
  15736    prog->state[i].lastlist[0] = 0;
  15737    prog->state[i].lastlist[1] = 0;
  15738  }
  15739 
  15740  retval = nfa_regtry(prog, col, tm, timed_out);
  15741 
  15742 #ifdef REGEXP_DEBUG
  15743  nfa_regengine.expr = NULL;
  15744 #endif
  15745 
  15746 theend:
  15747  if (retval > 0) {
  15748    // Make sure the end is never before the start.  Can happen when \zs and
  15749    // \ze are used.
  15750    if (REG_MULTI) {
  15751      const lpos_T *const start = &rex.reg_mmatch->startpos[0];
  15752      const lpos_T *const end = &rex.reg_mmatch->endpos[0];
  15753 
  15754      if (end->lnum < start->lnum
  15755          || (end->lnum == start->lnum && end->col < start->col)) {
  15756        rex.reg_mmatch->endpos[0] = rex.reg_mmatch->startpos[0];
  15757      }
  15758    } else {
  15759      if (rex.reg_match->endp[0] < rex.reg_match->startp[0]) {
  15760        rex.reg_match->endp[0] = rex.reg_match->startp[0];
  15761      }
  15762 
  15763      // startpos[0] may be set by "\zs", also return the column where
  15764      // the whole pattern matched.
  15765      rex.reg_match->rm_matchcol = col;
  15766    }
  15767  }
  15768 
  15769  return retval;
  15770 }
  15771 
  15772 // Compile a regular expression into internal code for the NFA matcher.
  15773 // Returns the program in allocated space.  Returns NULL for an error.
  15774 static regprog_T *nfa_regcomp(uint8_t *expr, int re_flags)
  15775 {
  15776  nfa_regprog_T *prog = NULL;
  15777  int *postfix;
  15778 
  15779  if (expr == NULL) {
  15780    return NULL;
  15781  }
  15782 
  15783 #ifdef REGEXP_DEBUG
  15784  nfa_regengine.expr = expr;
  15785 #endif
  15786  nfa_re_flags = re_flags;
  15787 
  15788  init_class_tab();
  15789 
  15790  nfa_regcomp_start(expr, re_flags);
  15791 
  15792  // Build postfix form of the regexp. Needed to build the NFA
  15793  // (and count its size).
  15794  postfix = re2post();
  15795  if (postfix == NULL) {
  15796    goto fail;              // Cascaded (syntax?) error
  15797  }
  15798 
  15799  // In order to build the NFA, we parse the input regexp twice:
  15800  // 1. first pass to count size (so we can allocate space)
  15801  // 2. second to emit code
  15802 #ifdef REGEXP_DEBUG
  15803  {
  15804    FILE *f = fopen(NFA_REGEXP_RUN_LOG, "a");
  15805 
  15806    if (f != NULL) {
  15807      fprintf(f,
  15808              "\n*****************************\n\n\n\n\t"
  15809              "Compiling regexp \"%s\"... hold on !\n",
  15810              expr);
  15811      fclose(f);
  15812    }
  15813  }
  15814 #endif
  15815 
  15816  // PASS 1
  15817  // Count number of NFA states in "nstate". Do not build the NFA.
  15818  post2nfa(postfix, post_ptr, true);
  15819 
  15820  // allocate the regprog with space for the compiled regexp
  15821  size_t prog_size = offsetof(nfa_regprog_T, state) + sizeof(nfa_state_T) * (size_t)nstate;
  15822  prog = xmalloc(prog_size);
  15823  state_ptr = prog->state;
  15824  prog->re_in_use = false;
  15825 
  15826  // PASS 2
  15827  // Build the NFA
  15828  prog->start = post2nfa(postfix, post_ptr, false);
  15829  if (prog->start == NULL) {
  15830    goto fail;
  15831  }
  15832  prog->regflags = regflags;
  15833  prog->engine = &nfa_regengine;
  15834  prog->nstate = nstate;
  15835  prog->has_zend = rex.nfa_has_zend;
  15836  prog->has_backref = rex.nfa_has_backref;
  15837  prog->nsubexp = regnpar;
  15838 
  15839  nfa_postprocess(prog);
  15840 
  15841  prog->reganch = nfa_get_reganch(prog->start, 0);
  15842  prog->regstart = nfa_get_regstart(prog->start, 0);
  15843  prog->match_text = nfa_get_match_text(prog->start);
  15844 
  15845 #ifdef REGEXP_DEBUG
  15846  nfa_postfix_dump(expr, OK);
  15847  nfa_dump(prog);
  15848 #endif
  15849  // Remember whether this pattern has any \z specials in it.
  15850  prog->reghasz = re_has_z;
  15851  prog->pattern = xstrdup((char *)expr);
  15852 #ifdef REGEXP_DEBUG
  15853  nfa_regengine.expr = NULL;
  15854 #endif
  15855 
  15856 out:
  15857  xfree(post_start);
  15858  post_start = post_ptr = post_end = NULL;
  15859  state_ptr = NULL;
  15860  return (regprog_T *)prog;
  15861 
  15862 fail:
  15863  XFREE_CLEAR(prog);
  15864 #ifdef REGEXP_DEBUG
  15865  nfa_postfix_dump(expr, FAIL);
  15866  nfa_regengine.expr = NULL;
  15867 #endif
  15868  goto out;
  15869 }
  15870 
  15871 // Free a compiled regexp program, returned by nfa_regcomp().
  15872 static void nfa_regfree(regprog_T *prog)
  15873 {
  15874  if (prog == NULL) {
  15875    return;
  15876  }
  15877 
  15878  xfree(((nfa_regprog_T *)prog)->match_text);
  15879  xfree(((nfa_regprog_T *)prog)->pattern);
  15880  xfree(prog);
  15881 }
  15882 
  15883 /// Match a regexp against a string.
  15884 /// "rmp->regprog" is a compiled regexp as returned by nfa_regcomp().
  15885 /// Uses curbuf for line count and 'iskeyword'.
  15886 /// If "line_lbr" is true, consider a "\n" in "line" to be a line break.
  15887 ///
  15888 /// @param line  string to match against
  15889 /// @param col   column to start looking for match
  15890 ///
  15891 /// @return  <= 0 for failure, number of lines contained in the match otherwise.
  15892 static int nfa_regexec_nl(regmatch_T *rmp, uint8_t *line, colnr_T col, bool line_lbr)
  15893 {
  15894  rex.reg_match = rmp;
  15895  rex.reg_mmatch = NULL;
  15896  rex.reg_maxline = 0;
  15897  rex.reg_line_lbr = line_lbr;
  15898  rex.reg_buf = curbuf;
  15899  rex.reg_win = NULL;
  15900  rex.reg_ic = rmp->rm_ic;
  15901  rex.reg_icombine = false;
  15902  rex.reg_nobreak = rmp->regprog->re_flags & RE_NOBREAK;
  15903  rex.reg_maxcol = 0;
  15904  return nfa_regexec_both(line, col, NULL, NULL);
  15905 }
  15906 
  15907 /// Matches a regexp against multiple lines.
  15908 /// "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
  15909 /// Uses curbuf for line count and 'iskeyword'.
  15910 ///
  15911 /// @param win Window in which to search or NULL
  15912 /// @param buf Buffer in which to search
  15913 /// @param lnum Number of line to start looking for match
  15914 /// @param col Column to start looking for match
  15915 /// @param tm Timeout limit or NULL
  15916 /// @param timed_out Flag set on timeout or NULL
  15917 ///
  15918 /// @return <= 0 if there is no match and number of lines contained in the match
  15919 /// otherwise.
  15920 ///
  15921 /// @note The body is the same as bt_regexec() except for nfa_regexec_both()
  15922 ///
  15923 /// @warning
  15924 /// Match may actually be in another line. e.g.:
  15925 /// when r.e. is \nc, cursor is at 'a' and the text buffer looks like
  15926 ///
  15927 /// @par
  15928 ///
  15929 ///     +-------------------------+
  15930 ///     |a                        |
  15931 ///     |b                        |
  15932 ///     |c                        |
  15933 ///     |                         |
  15934 ///     +-------------------------+
  15935 ///
  15936 /// @par
  15937 /// then nfa_regexec_multi() returns 3. while the original vim_regexec_multi()
  15938 /// returns 0 and a second call at line 2 will return 2.
  15939 ///
  15940 /// @par
  15941 /// FIXME if this behavior is not compatible.
  15942 static int nfa_regexec_multi(regmmatch_T *rmp, win_T *win, buf_T *buf, linenr_T lnum, colnr_T col,
  15943                             proftime_T *tm, int *timed_out)
  15944 {
  15945  init_regexec_multi(rmp, win, buf, lnum);
  15946  return nfa_regexec_both(NULL, col, tm, timed_out);
  15947 }
  15948 // }}}1
  15949 
  15950 static regengine_T bt_regengine = {
  15951  bt_regcomp,
  15952  bt_regfree,
  15953  bt_regexec_nl,
  15954  bt_regexec_multi,
  15955 #ifdef REGEXP_DEBUG
  15956  "",
  15957 #endif
  15958 };
  15959 
  15960 static regengine_T nfa_regengine = {
  15961  nfa_regcomp,
  15962  nfa_regfree,
  15963  nfa_regexec_nl,
  15964  nfa_regexec_multi,
  15965 #ifdef REGEXP_DEBUG
  15966  "",
  15967 #endif
  15968 };
  15969 
  15970 // Which regexp engine to use? Needed for vim_regcomp().
  15971 // Must match with 'regexpengine'.
  15972 static int regexp_engine = 0;
  15973 
  15974 #ifdef REGEXP_DEBUG
  15975 static uint8_t regname[][30] = {
  15976  "AUTOMATIC Regexp Engine",
  15977  "BACKTRACKING Regexp Engine",
  15978  "NFA Regexp Engine"
  15979 };
  15980 #endif
  15981 
  15982 // Compile a regular expression into internal code.
  15983 // Returns the program in allocated memory.
  15984 // Use vim_regfree() to free the memory.
  15985 // Returns NULL for an error.
  15986 regprog_T *vim_regcomp(const char *expr_arg, int re_flags)
  15987 {
  15988  regprog_T *prog = NULL;
  15989  const char *expr = expr_arg;
  15990 
  15991  regexp_engine = (int)p_re;
  15992 
  15993  // Check for prefix "\%#=", that sets the regexp engine
  15994  if (strncmp(expr, "\\%#=", 4) == 0) {
  15995    int newengine = expr[4] - '0';
  15996 
  15997    if (newengine == AUTOMATIC_ENGINE
  15998        || newengine == BACKTRACKING_ENGINE
  15999        || newengine == NFA_ENGINE) {
  16000      regexp_engine = expr[4] - '0';
  16001      expr += 5;
  16002 #ifdef REGEXP_DEBUG
  16003      smsg(0, "New regexp mode selected (%d): %s",
  16004           regexp_engine,
  16005           regname[newengine]);
  16006 #endif
  16007    } else {
  16008      emsg(_("E864: \\%#= can only be followed by 0, 1, or 2. The automatic engine will be used "));
  16009      regexp_engine = AUTOMATIC_ENGINE;
  16010    }
  16011  }
  16012 #ifdef REGEXP_DEBUG
  16013  bt_regengine.expr = expr;
  16014  nfa_regengine.expr = expr;
  16015 #endif
  16016  // reg_iswordc() uses rex.reg_buf
  16017  rex.reg_buf = curbuf;
  16018 
  16019  //
  16020  // First try the NFA engine, unless backtracking was requested.
  16021  //
  16022  const int called_emsg_before = called_emsg;
  16023  if (regexp_engine != BACKTRACKING_ENGINE) {
  16024    prog = nfa_regengine.regcomp((uint8_t *)expr,
  16025                                 re_flags + (regexp_engine == AUTOMATIC_ENGINE ? RE_AUTO : 0));
  16026  } else {
  16027    prog = bt_regengine.regcomp((uint8_t *)expr, re_flags);
  16028  }
  16029 
  16030  // Check for error compiling regexp with initial engine.
  16031  if (prog == NULL) {
  16032 #ifdef BT_REGEXP_DEBUG_LOG
  16033    // Debugging log for BT engine.
  16034    if (regexp_engine != BACKTRACKING_ENGINE) {
  16035      FILE *f = fopen(BT_REGEXP_DEBUG_LOG_NAME, "a");
  16036      if (f) {
  16037        fprintf(f, "Syntax error in \"%s\"\n", expr);
  16038        fclose(f);
  16039      } else {
  16040        semsg("(NFA) Could not open \"%s\" to write !!!",
  16041              BT_REGEXP_DEBUG_LOG_NAME);
  16042      }
  16043    }
  16044 #endif
  16045    // If the NFA engine failed, try the backtracking engine. The NFA engine
  16046    // also fails for patterns that it can't handle well but are still valid
  16047    // patterns, thus a retry should work.
  16048    // But don't try if an error message was given.
  16049    if (regexp_engine == AUTOMATIC_ENGINE && called_emsg == called_emsg_before) {
  16050      regexp_engine = BACKTRACKING_ENGINE;
  16051      report_re_switch(expr);
  16052      prog = bt_regengine.regcomp((uint8_t *)expr, re_flags);
  16053    }
  16054  }
  16055 
  16056  if (prog != NULL) {
  16057    // Store the info needed to call regcomp() again when the engine turns out
  16058    // to be very slow when executing it.
  16059    prog->re_engine = (unsigned)regexp_engine;
  16060    prog->re_flags = (unsigned)re_flags;
  16061  }
  16062 
  16063  return prog;
  16064 }
  16065 
  16066 // Free a compiled regexp program, returned by vim_regcomp().
  16067 void vim_regfree(regprog_T *prog)
  16068 {
  16069  if (prog != NULL) {
  16070    prog->engine->regfree(prog);
  16071  }
  16072 }
  16073 
  16074 #if defined(EXITFREE)
  16075 void free_regexp_stuff(void)
  16076 {
  16077  ga_clear(&regstack);
  16078  ga_clear(&backpos);
  16079  xfree(reg_tofree);
  16080  xfree(reg_prev_sub);
  16081 }
  16082 
  16083 #endif
  16084 
  16085 static void report_re_switch(const char *pat)
  16086 {
  16087  if (p_verbose > 0) {
  16088    verbose_enter();
  16089    msg_puts(_("Switching to backtracking RE engine for pattern: "));
  16090    msg_puts(pat);
  16091    verbose_leave();
  16092  }
  16093 }
  16094 
  16095 /// Match a regexp against a string.
  16096 /// "rmp->regprog" must be a compiled regexp as returned by vim_regcomp().
  16097 /// Note: "rmp->regprog" may be freed and changed.
  16098 /// Uses curbuf for line count and 'iskeyword'.
  16099 /// When "nl" is true consider a "\n" in "line" to be a line break.
  16100 ///
  16101 /// @param rmp
  16102 /// @param line the string to match against
  16103 /// @param col  the column to start looking for match
  16104 /// @param nl
  16105 ///
  16106 /// @return true if there is a match, false if not.
  16107 static bool vim_regexec_string(regmatch_T *rmp, const char *line, colnr_T col, bool nl)
  16108 {
  16109  regexec_T rex_save;
  16110  bool rex_in_use_save = rex_in_use;
  16111 
  16112  // Cannot use the same prog recursively, it contains state.
  16113  if (rmp->regprog->re_in_use) {
  16114    emsg(_(e_recursive));
  16115    return false;
  16116  }
  16117  rmp->regprog->re_in_use = true;
  16118 
  16119  if (rex_in_use) {
  16120    // Being called recursively, save the state.
  16121    rex_save = rex;
  16122  }
  16123  rex_in_use = true;
  16124 
  16125  rex.reg_startp = NULL;
  16126  rex.reg_endp = NULL;
  16127  rex.reg_startpos = NULL;
  16128  rex.reg_endpos = NULL;
  16129 
  16130  int result = rmp->regprog->engine->regexec_nl(rmp, (uint8_t *)line, col, nl);
  16131  rmp->regprog->re_in_use = false;
  16132 
  16133  // NFA engine aborted because it's very slow, use backtracking engine instead.
  16134  if (rmp->regprog->re_engine == AUTOMATIC_ENGINE
  16135      && result == NFA_TOO_EXPENSIVE) {
  16136    int save_p_re = (int)p_re;
  16137    int re_flags = (int)rmp->regprog->re_flags;
  16138    char *pat = xstrdup(((nfa_regprog_T *)rmp->regprog)->pattern);
  16139 
  16140    p_re = BACKTRACKING_ENGINE;
  16141    vim_regfree(rmp->regprog);
  16142    report_re_switch(pat);
  16143    rmp->regprog = vim_regcomp(pat, re_flags);
  16144    if (rmp->regprog != NULL) {
  16145      rmp->regprog->re_in_use = true;
  16146      result = rmp->regprog->engine->regexec_nl(rmp, (uint8_t *)line, col, nl);
  16147      rmp->regprog->re_in_use = false;
  16148    }
  16149 
  16150    xfree(pat);
  16151    p_re = save_p_re;
  16152  }
  16153 
  16154  rex_in_use = rex_in_use_save;
  16155  if (rex_in_use) {
  16156    rex = rex_save;
  16157  }
  16158 
  16159  return result > 0;
  16160 }
  16161 
  16162 // Note: "*prog" may be freed and changed.
  16163 // Return true if there is a match, false if not.
  16164 bool vim_regexec_prog(regprog_T **prog, bool ignore_case, const char *line, colnr_T col)
  16165 {
  16166  regmatch_T regmatch = { .regprog = *prog, .rm_ic = ignore_case };
  16167  bool r = vim_regexec_string(&regmatch, line, col, false);
  16168  *prog = regmatch.regprog;
  16169  return r;
  16170 }
  16171 
  16172 // Note: "rmp->regprog" may be freed and changed.
  16173 // Return true if there is a match, false if not.
  16174 bool vim_regexec(regmatch_T *rmp, const char *line, colnr_T col)
  16175 {
  16176  return vim_regexec_string(rmp, line, col, false);
  16177 }
  16178 
  16179 // Like vim_regexec(), but consider a "\n" in "line" to be a line break.
  16180 // Note: "rmp->regprog" may be freed and changed.
  16181 // Return true if there is a match, false if not.
  16182 bool vim_regexec_nl(regmatch_T *rmp, const char *line, colnr_T col)
  16183 {
  16184  return vim_regexec_string(rmp, line, col, true);
  16185 }
  16186 
  16187 /// Match a regexp against multiple lines.
  16188 /// "rmp->regprog" must be a compiled regexp as returned by vim_regcomp().
  16189 /// Note: "rmp->regprog" may be freed and changed, even set to NULL.
  16190 /// Uses curbuf for line count and 'iskeyword'.
  16191 ///
  16192 /// @param win        window in which to search or NULL
  16193 /// @param buf        buffer in which to search
  16194 /// @param lnum       nr of line to start looking for match
  16195 /// @param col        column to start looking for match
  16196 /// @param tm         timeout limit or NULL
  16197 /// @param timed_out  flag is set when timeout limit reached
  16198 ///
  16199 /// @return  zero if there is no match.  Return number of lines contained in the
  16200 ///          match otherwise.
  16201 int vim_regexec_multi(regmmatch_T *rmp, win_T *win, buf_T *buf, linenr_T lnum, colnr_T col,
  16202                      proftime_T *tm, int *timed_out)
  16203  FUNC_ATTR_NONNULL_ARG(1)
  16204 {
  16205  regexec_T rex_save;
  16206  bool rex_in_use_save = rex_in_use;
  16207 
  16208  // Cannot use the same prog recursively, it contains state.
  16209  if (rmp->regprog->re_in_use) {
  16210    emsg(_(e_recursive));
  16211    return false;
  16212  }
  16213  rmp->regprog->re_in_use = true;
  16214 
  16215  if (rex_in_use) {
  16216    // Being called recursively, save the state.
  16217    rex_save = rex;
  16218  }
  16219  rex_in_use = true;
  16220 
  16221  int result = rmp->regprog->engine->regexec_multi(rmp, win, buf, lnum, col, tm, timed_out);
  16222  rmp->regprog->re_in_use = false;
  16223 
  16224  // NFA engine aborted because it's very slow, use backtracking engine instead.
  16225  if (rmp->regprog->re_engine == AUTOMATIC_ENGINE
  16226      && result == NFA_TOO_EXPENSIVE) {
  16227    int save_p_re = (int)p_re;
  16228    int re_flags = (int)rmp->regprog->re_flags;
  16229    char *pat = xstrdup(((nfa_regprog_T *)rmp->regprog)->pattern);
  16230 
  16231    p_re = BACKTRACKING_ENGINE;
  16232    regprog_T *prev_prog = rmp->regprog;
  16233 
  16234    report_re_switch(pat);
  16235    // checking for \z misuse was already done when compiling for NFA,
  16236    // allow all here
  16237    reg_do_extmatch = REX_ALL;
  16238    rmp->regprog = vim_regcomp(pat, re_flags);
  16239    reg_do_extmatch = 0;
  16240 
  16241    if (rmp->regprog == NULL) {
  16242      // Somehow compiling the pattern failed now, put back the
  16243      // previous one to avoid "regprog" becoming NULL.
  16244      rmp->regprog = prev_prog;
  16245    } else {
  16246      vim_regfree(prev_prog);
  16247 
  16248      rmp->regprog->re_in_use = true;
  16249      result = rmp->regprog->engine->regexec_multi(rmp, win, buf, lnum, col, tm, timed_out);
  16250      rmp->regprog->re_in_use = false;
  16251    }
  16252 
  16253    xfree(pat);
  16254    p_re = save_p_re;
  16255  }
  16256 
  16257  rex_in_use = rex_in_use_save;
  16258  if (rex_in_use) {
  16259    rex = rex_save;
  16260  }
  16261 
  16262  return result <= 0 ? 0 : result;
  16263 }