regexp.c (454742B)
// Handling of regular expressions: vim_regcomp(), vim_regexec(), vim_regsub()

// By default: do not create debugging logs or files related to regular
// expressions, even when compiling with -DDEBUG.
// Uncomment the second line to get the regexp debugging.
// #undef REGEXP_DEBUG
// #define REGEXP_DEBUG

#include <assert.h>
#include <ctype.h>
#include <inttypes.h>
#include <limits.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <uv.h>

#include "nvim/ascii_defs.h"
#include "nvim/buffer_defs.h"
#include "nvim/charset.h"
#include "nvim/errors.h"
#include "nvim/eval.h"
#include "nvim/eval/typval.h"
#include "nvim/eval/userfunc.h"
#include "nvim/garray.h"
#include "nvim/garray_defs.h"
#include "nvim/gettext_defs.h"
#include "nvim/globals.h"
#include "nvim/keycodes.h"
#include "nvim/macros_defs.h"
#include "nvim/mark.h"
#include "nvim/mark_defs.h"
#include "nvim/mbyte.h"
#include "nvim/mbyte_defs.h"
#include "nvim/memline.h"
#include "nvim/memory.h"
#include "nvim/message.h"
#include "nvim/option_vars.h"
#include "nvim/os/input.h"
#include "nvim/plines.h"
#include "nvim/pos_defs.h"
#include "nvim/profile.h"
#include "nvim/regexp.h"
#include "nvim/regexp_defs.h"
#include "nvim/strings.h"
#include "nvim/types_defs.h"
#include "nvim/vim_defs.h"

/// Bit flags (each value is a distinct bit, so they can be OR-ed together).
typedef enum {
  RGLF_LINE = 0x01,
  RGLF_LENGTH = 0x02,
  RGLF_SUBMATCH = 0x04,
} reg_getline_flags_T;

enum {
  /// In the NFA engine: how many braces are allowed.
  /// TODO(RE): Use dynamic memory allocation instead of static, like here
  NFA_MAX_BRACES = 20,
};

enum {
  /// In the NFA engine: how many states are allowed.
  NFA_MAX_STATES = 100000,
  NFA_TOO_EXPENSIVE = -1,
};

/// Which regexp engine to use? Needed for vim_regcomp().
/// Must match with 'regexpengine'.
enum {
  AUTOMATIC_ENGINE = 0,
  BACKTRACKING_ENGINE = 1,
  NFA_ENGINE = 2,
};

/// Structure returned by vim_regcomp() to pass on to vim_regexec().
/// This is the general structure. For the actual matcher, two specific
/// structures are used. See code below.
///
/// bt_regprog_T and nfa_regprog_T both start with these same five members in
/// the same order, so keep the three definitions in sync.
struct regprog {
  regengine_T *engine;
  unsigned regflags;   ///< RF_ flags
  unsigned re_engine;  ///< Automatic, backtracking or NFA engine.
  unsigned re_flags;   ///< Second argument for vim_regcomp().
  bool re_in_use;      ///< prog is being executed
};

/// Structure used by the back track matcher.
/// These fields are only to be used in regexp.c!
/// See regexp.c for an explanation.
typedef struct {
  // These five members implement regprog_T (same order as struct regprog).
  regengine_T *engine;
  unsigned regflags;
  unsigned re_engine;
  unsigned re_flags;
  bool re_in_use;

  int regstart;
  uint8_t reganch;
  uint8_t *regmust;
  int regmlen;
  uint8_t reghasz;
  uint8_t program[];  // flexible array member: must stay last
} bt_regprog_T;

/// Structure representing a NFA state.
/// An NFA state may have no outgoing edge, when it is a NFA_MATCH state.
typedef struct nfa_state nfa_state_T;
struct nfa_state {
  int c;
  nfa_state_T *out;
  nfa_state_T *out1;
  int id;
  int lastlist[2];  ///< 0: normal, 1: recursive
  int val;
};

/// Structure used by the NFA matcher.
typedef struct {
  // These five members implement regprog_T (same order as struct regprog).
  regengine_T *engine;
  unsigned regflags;
  unsigned re_engine;
  unsigned re_flags;
  bool re_in_use;

  nfa_state_T *start;   ///< points into state[]

  int reganch;          ///< pattern starts with ^
  int regstart;         ///< char at start of pattern
  uint8_t *match_text;  ///< plain text to match with

  int has_zend;         ///< pattern contains \ze
  int has_backref;      ///< pattern contains \1 .. \9
  int reghasz;
  char *pattern;
  int nsubexp;          ///< number of ()
  int nstate;
  nfa_state_T state[];  // flexible array member: must stay last
} nfa_regprog_T;

/// Table of function pointers implementing one regexp engine.
struct regengine {
  /// bt_regcomp or nfa_regcomp
  regprog_T *(*regcomp)(uint8_t *, int);
  /// bt_regfree or nfa_regfree
  void (*regfree)(regprog_T *);
  /// bt_regexec_nl or nfa_regexec_nl
  int (*regexec_nl)(regmatch_T *, uint8_t *, colnr_T, bool);
  /// bt_regexec_mult or nfa_regexec_mult
  int (*regexec_multi)(regmmatch_T *, win_T *, buf_T *, linenr_T, colnr_T, proftime_T *, int *);
#ifdef REGEXP_DEBUG
  uint8_t *expr;
#endif
};

// Structure used to save the current input state, when it needs to be
// restored after trying a match.  Used by reg_save() and reg_restore().
// Also stores the length of "backpos".
typedef struct {
  union {
    uint8_t *ptr;  // rex.input pointer, for single-line regexp
    lpos_T pos;    // rex.input pos, for multi-line regexp
  } rs_u;
  int rs_len;
} regsave_T;

// struct to save start/end pointer/position in for \(\)
typedef struct {
  union {
    uint8_t *ptr;
    lpos_T pos;
  } se_u;
} save_se_T;

// Values for rs_state in regitem_T.
typedef enum regstate_E {
  RS_NOPEN = 0,     // NOPEN and NCLOSE
  RS_MOPEN,         // MOPEN + [0-9]
  RS_MCLOSE,        // MCLOSE + [0-9]
  RS_ZOPEN,         // ZOPEN + [0-9]
  RS_ZCLOSE,        // ZCLOSE + [0-9]
  RS_BRANCH,        // BRANCH
  RS_BRCPLX_MORE,   // BRACE_COMPLEX and trying one more match
  RS_BRCPLX_LONG,   // BRACE_COMPLEX and trying longest match
  RS_BRCPLX_SHORT,  // BRACE_COMPLEX and trying shortest match
  RS_NOMATCH,       // NOMATCH
  RS_BEHIND1,       // BEHIND / NOBEHIND matching rest
  RS_BEHIND2,       // BEHIND / NOBEHIND matching behind part
  RS_STAR_LONG,     // STAR/PLUS/BRACE_SIMPLE longest match
  RS_STAR_SHORT,    // STAR/PLUS/BRACE_SIMPLE shortest match
} regstate_T;

// When there are alternatives a regstate_T is put on the regstack to remember
// what we are doing.
// Before it may be another type of item, depending on rs_state, to remember
// more things.
typedef struct regitem_S {
  regstate_T rs_state;  // what we are doing, one of RS_ above
  int16_t rs_no;        // submatch nr or BEHIND/NOBEHIND
  uint8_t *rs_scan;     // current node in program
  union {
    save_se_T sesave;
    regsave_T regsave;
  } rs_un;              // room for saving rex.input
} regitem_T;

// used for BEHIND and NOBEHIND matching
typedef struct regbehind_S {
  regsave_T save_after;
  regsave_T save_behind;
  int save_need_clear_subexpr;
  save_se_T save_start[NSUBEXP];
  save_se_T save_end[NSUBEXP];
} regbehind_T;

// Since the out pointers in the list are always
// uninitialized, we use the pointers themselves
// as storage for the Ptrlists.
typedef union Ptrlist Ptrlist;
union Ptrlist {
  Ptrlist *next;
  nfa_state_T *s;
};

// A partially built NFA fragment: its start state plus the list of out
// pointers ("out") kept alongside it.
struct Frag {
  nfa_state_T *start;
  Ptrlist *out;
};
typedef struct Frag Frag_T;

// Submatch positions for one set of subexpressions.
typedef struct {
  int in_use;  ///< number of subexpr with useful info

  // When REG_MULTI is true list.multi is used, otherwise list.line.
  union {
    struct multipos {
      linenr_T start_lnum;
      linenr_T end_lnum;
      colnr_T start_col;
      colnr_T end_col;
    } multi[NSUBEXP];
    struct linepos {
      uint8_t *start;
      uint8_t *end;
    } line[NSUBEXP];
  } list;
  colnr_T orig_start_col;  // list.multi[0].start_col without \zs
} regsub_T;

typedef struct {
  regsub_T norm;  // \( .. \) matches
  regsub_T synt;  // \z( .. \) matches
} regsubs_T;

// nfa_pim_T stores a Postponed Invisible Match.
typedef struct nfa_pim_S nfa_pim_T;
struct nfa_pim_S {
  int result;          // NFA_PIM_*, see below
  nfa_state_T *state;  // the invisible match start state
  regsubs_T subs;      // submatch info, only partly used
  union {
    lpos_T pos;
    uint8_t *ptr;
  } end;               // where the match must end
};

// nfa_thread_T contains execution information of a NFA state
typedef struct {
  nfa_state_T *state;
  int count;
  nfa_pim_T pim;   // if pim.result != NFA_PIM_UNUSED: postponed
                   // invisible match
  regsubs_T subs;  // submatch info, only partly used
} nfa_thread_T;

// nfa_list_T contains the alternative NFA execution states.
typedef struct {
  nfa_thread_T *t;  ///< allocated array of states
  int n;            ///< nr of states currently in "t"
  int len;          ///< max nr of states in "t"
  int id;           ///< ID of the list
  int has_pim;      ///< true when any state has a PIM
} nfa_list_T;

#ifdef REGEXP_DEBUG
// show/save debugging data when BT engine is used
# define BT_REGEXP_DUMP
// save the debugging data to a file instead of displaying it
# define BT_REGEXP_LOG
# define BT_REGEXP_DEBUG_LOG
# define BT_REGEXP_DEBUG_LOG_NAME "bt_regexp_debug.log"
#endif

// Magic characters have a special meaning, they don't match literally.
// Magic characters are negative.  This separates them from literal characters
// (possibly multi-byte).  Only ASCII characters can be Magic.
297 #define Magic(x) ((int)(x) - 256) 298 #define un_Magic(x) ((x) + 256) 299 #define is_Magic(x) ((x) < 0) 300 301 typedef void (*fptr_T)(int *, int); 302 303 static int no_Magic(int x) 304 { 305 if (is_Magic(x)) { 306 return un_Magic(x); 307 } 308 return x; 309 } 310 311 static int toggle_Magic(int x) 312 { 313 if (is_Magic(x)) { 314 return un_Magic(x); 315 } 316 return Magic(x); 317 } 318 319 // The first byte of the BT regexp internal "program" is actually this magic 320 // number; the start node begins in the second byte. It's used to catch the 321 // most severe mutilation of the program by the caller. 322 #define REGMAGIC 0234 323 324 // Utility definitions. 325 #define UCHARAT(p) ((int)(*(uint8_t *)(p))) 326 327 // Used for an error (down from) vim_regcomp(): give the error message, set 328 // rc_did_emsg and return NULL 329 #define EMSG_RET_NULL(m) return (emsg(m), rc_did_emsg = true, (void *)NULL) 330 #define IEMSG_RET_NULL(m) return (iemsg(m), rc_did_emsg = true, (void *)NULL) 331 #define EMSG_RET_FAIL(m) return (emsg(m), rc_did_emsg = true, FAIL) 332 #define EMSG2_RET_NULL(m, c) \ 333 return (semsg((m), (c) ? "" : "\\"), rc_did_emsg = true, (void *)NULL) 334 #define EMSG3_RET_NULL(m, c, a) \ 335 return (semsg((m), (c) ? "" : "\\", (a)), rc_did_emsg = true, (void *)NULL) 336 #define EMSG2_RET_FAIL(m, c) \ 337 return (semsg((m), (c) ? 
"" : "\\"), rc_did_emsg = true, FAIL) 338 #define EMSG_ONE_RET_NULL EMSG2_RET_NULL(_(e_invalid_item_in_str_brackets), reg_magic == MAGIC_ALL) 339 340 #define MAX_LIMIT (32767 << 16) 341 342 static const char e_invalid_character_after_str_at[] 343 = N_("E59: Invalid character after %s@"); 344 static const char e_invalid_use_of_underscore[] 345 = N_("E63: Invalid use of \\_"); 346 static const char e_pattern_uses_more_memory_than_maxmempattern[] 347 = N_("E363: Pattern uses more memory than 'maxmempattern'"); 348 static const char e_invalid_item_in_str_brackets[] 349 = N_("E369: Invalid item in %s%%[]"); 350 static const char e_missing_delimiter_after_search_pattern_str[] 351 = N_("E654: Missing delimiter after search pattern: %s"); 352 static const char e_missingbracket[] = N_("E769: Missing ] after %s["); 353 static const char e_reverse_range[] = N_("E944: Reverse range in character class"); 354 static const char e_large_class[] = N_("E945: Range too large in character class"); 355 static const char e_unmatchedpp[] = N_("E53: Unmatched %s%%("); 356 static const char e_unmatchedp[] = N_("E54: Unmatched %s("); 357 static const char e_unmatchedpar[] = N_("E55: Unmatched %s)"); 358 static const char e_z_not_allowed[] = N_("E66: \\z( not allowed here"); 359 static const char e_z1_not_allowed[] = N_("E67: \\z1 - \\z9 not allowed here"); 360 static const char e_missing_sb[] = N_("E69: Missing ] after %s%%["); 361 static const char e_empty_sb[] = N_("E70: Empty %s%%[]"); 362 static const char e_recursive[] = N_("E956: Cannot use pattern recursively"); 363 static const char e_regexp_number_after_dot_pos_search_chr[] 364 = N_("E1204: No Number allowed after .: '\\%%%c'"); 365 static const char e_nfa_regexp_missing_value_in_chr[] 366 = N_("E1273: (NFA regexp) missing value in '\\%%%c'"); 367 static const char e_atom_engine_must_be_at_start_of_pattern[] 368 = N_("E1281: Atom '\\%%#=%c' must be at the start of the pattern"); 369 static const char e_substitute_nesting_too_deep[] 
= N_("E1290: substitute nesting too deep"); 370 static const char e_unicode_val_too_large[] 371 = N_("E1541: Value too large, max Unicode codepoint is U+10FFFF"); 372 373 #define NOT_MULTI 0 374 #define MULTI_ONE 1 375 #define MULTI_MULT 2 376 377 // return values for regmatch() 378 #define RA_FAIL 1 // something failed, abort 379 #define RA_CONT 2 // continue in inner loop 380 #define RA_BREAK 3 // break inner loop 381 #define RA_MATCH 4 // successful match 382 #define RA_NOMATCH 5 // didn't match 383 384 /// Return NOT_MULTI if c is not a "multi" operator. 385 /// Return MULTI_ONE if c is a single "multi" operator. 386 /// Return MULTI_MULT if c is a multi "multi" operator. 387 static int re_multi_type(int c) 388 { 389 if (c == Magic('@') || c == Magic('=') || c == Magic('?')) { 390 return MULTI_ONE; 391 } 392 if (c == Magic('*') || c == Magic('+') || c == Magic('{')) { 393 return MULTI_MULT; 394 } 395 return NOT_MULTI; 396 } 397 398 static char *reg_prev_sub = NULL; 399 static size_t reg_prev_sublen = 0; 400 401 // REGEXP_INRANGE contains all characters which are always special in a [] 402 // range after '\'. 403 // REGEXP_ABBR contains all characters which act as abbreviations after '\'. 404 // These are: 405 // \n - New line (NL). 406 // \r - Carriage Return (CR). 407 // \t - Tab (TAB). 408 // \e - Escape (ESC). 409 // \b - Backspace (Ctrl_H). 410 // \d - Character code in decimal, eg \d123 411 // \o - Character code in octal, eg \o80 412 // \x - Character code in hex, eg \x4a 413 // \u - Multibyte character code, eg \u20ac 414 // \U - Long multibyte character code, eg \U12345678 415 static char REGEXP_INRANGE[] = "]^-n\\"; 416 static char REGEXP_ABBR[] = "nrtebdoxuU"; 417 418 // Translate '\x' to its control character, except "\n", which is Magic. 
419 static int backslash_trans(int c) 420 { 421 switch (c) { 422 case 'r': 423 return CAR; 424 case 't': 425 return TAB; 426 case 'e': 427 return ESC; 428 case 'b': 429 return BS; 430 } 431 return c; 432 } 433 434 enum { 435 CLASS_ALNUM = 0, 436 CLASS_ALPHA, 437 CLASS_BLANK, 438 CLASS_CNTRL, 439 CLASS_DIGIT, 440 CLASS_GRAPH, 441 CLASS_LOWER, 442 CLASS_PRINT, 443 CLASS_PUNCT, 444 CLASS_SPACE, 445 CLASS_UPPER, 446 CLASS_XDIGIT, 447 CLASS_TAB, 448 CLASS_RETURN, 449 CLASS_BACKSPACE, 450 CLASS_ESCAPE, 451 CLASS_IDENT, 452 CLASS_KEYWORD, 453 CLASS_FNAME, 454 CLASS_NONE = 99, 455 }; 456 457 /// Check for a character class name "[:name:]". "pp" points to the '['. 458 /// Returns one of the CLASS_ items. CLASS_NONE means that no item was 459 /// recognized. Otherwise "pp" is advanced to after the item. 460 static int get_char_class(char **pp) 461 { 462 // must be sorted by the 'value' field because it is used by bsearch()! 463 static keyvalue_T char_class_tab[] = { 464 KEYVALUE_ENTRY(CLASS_ALNUM, "alnum:]"), 465 KEYVALUE_ENTRY(CLASS_ALPHA, "alpha:]"), 466 KEYVALUE_ENTRY(CLASS_BACKSPACE, "backspace:]"), 467 KEYVALUE_ENTRY(CLASS_BLANK, "blank:]"), 468 KEYVALUE_ENTRY(CLASS_CNTRL, "cntrl:]"), 469 KEYVALUE_ENTRY(CLASS_DIGIT, "digit:]"), 470 KEYVALUE_ENTRY(CLASS_ESCAPE, "escape:]"), 471 KEYVALUE_ENTRY(CLASS_FNAME, "fname:]"), 472 KEYVALUE_ENTRY(CLASS_GRAPH, "graph:]"), 473 KEYVALUE_ENTRY(CLASS_IDENT, "ident:]"), 474 KEYVALUE_ENTRY(CLASS_KEYWORD, "keyword:]"), 475 KEYVALUE_ENTRY(CLASS_LOWER, "lower:]"), 476 KEYVALUE_ENTRY(CLASS_PRINT, "print:]"), 477 KEYVALUE_ENTRY(CLASS_PUNCT, "punct:]"), 478 KEYVALUE_ENTRY(CLASS_RETURN, "return:]"), 479 KEYVALUE_ENTRY(CLASS_SPACE, "space:]"), 480 KEYVALUE_ENTRY(CLASS_TAB, "tab:]"), 481 KEYVALUE_ENTRY(CLASS_UPPER, "upper:]"), 482 KEYVALUE_ENTRY(CLASS_XDIGIT, "xdigit:]") 483 }; 484 485 // check that the value of "pp" has a chance of matching 486 if ((*pp)[1] == ':' && ASCII_ISLOWER((*pp)[2]) 487 && ASCII_ISLOWER((*pp)[3]) && 
ASCII_ISLOWER((*pp)[4])) { 488 // this function can be called repeatedly with the same value for "pp" 489 // so we cache the last found entry. 490 static keyvalue_T *last_entry = NULL; 491 492 keyvalue_T target = { 493 .key = 0, 494 .value = *pp + 2, 495 .length = 0, // not used, see cmp_keyvalue_value_n() 496 }; 497 498 keyvalue_T *entry; 499 if (last_entry != NULL && cmp_keyvalue_value_n(&target, last_entry) == 0) { 500 entry = last_entry; 501 } else { 502 entry = (keyvalue_T *)bsearch(&target, &char_class_tab, 503 ARRAY_SIZE(char_class_tab), 504 sizeof(char_class_tab[0]), cmp_keyvalue_value_n); 505 } 506 if (entry != NULL) { 507 last_entry = entry; 508 *pp += entry->length + 2; 509 return entry->key; 510 } 511 } 512 return CLASS_NONE; 513 } 514 515 // Specific version of character class functions. 516 // Using a table to keep this fast. 517 static int16_t class_tab[256]; 518 519 #define RI_DIGIT 0x01 520 #define RI_HEX 0x02 521 #define RI_OCTAL 0x04 522 #define RI_WORD 0x08 523 #define RI_HEAD 0x10 524 #define RI_ALPHA 0x20 525 #define RI_LOWER 0x40 526 #define RI_UPPER 0x80 527 #define RI_WHITE 0x100 528 529 static void init_class_tab(void) 530 { 531 int i; 532 static int done = false; 533 534 if (done) { 535 return; 536 } 537 538 for (i = 0; i < 256; i++) { 539 if (i >= '0' && i <= '7') { 540 class_tab[i] = RI_DIGIT + RI_HEX + RI_OCTAL + RI_WORD; 541 } else if (i >= '8' && i <= '9') { 542 class_tab[i] = RI_DIGIT + RI_HEX + RI_WORD; 543 } else if (i >= 'a' && i <= 'f') { 544 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER; 545 } else if (i >= 'g' && i <= 'z') { 546 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER; 547 } else if (i >= 'A' && i <= 'F') { 548 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER; 549 } else if (i >= 'G' && i <= 'Z') { 550 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER; 551 } else if (i == '_') { 552 class_tab[i] = RI_WORD + RI_HEAD; 553 } else { 554 class_tab[i] = 0; 555 } 556 } 557 
class_tab[' '] |= RI_WHITE; 558 class_tab['\t'] |= RI_WHITE; 559 done = true; 560 } 561 562 #define ri_digit(c) ((c) < 0x100 && (class_tab[c] & RI_DIGIT)) 563 #define ri_hex(c) ((c) < 0x100 && (class_tab[c] & RI_HEX)) 564 #define ri_octal(c) ((c) < 0x100 && (class_tab[c] & RI_OCTAL)) 565 #define ri_word(c) ((c) < 0x100 && (class_tab[c] & RI_WORD)) 566 #define ri_head(c) ((c) < 0x100 && (class_tab[c] & RI_HEAD)) 567 #define ri_alpha(c) ((c) < 0x100 && (class_tab[c] & RI_ALPHA)) 568 #define ri_lower(c) ((c) < 0x100 && (class_tab[c] & RI_LOWER)) 569 #define ri_upper(c) ((c) < 0x100 && (class_tab[c] & RI_UPPER)) 570 #define ri_white(c) ((c) < 0x100 && (class_tab[c] & RI_WHITE)) 571 572 // flags for regflags 573 #define RF_ICASE 1 // ignore case 574 #define RF_NOICASE 2 // don't ignore case 575 #define RF_HASNL 4 // can match a NL 576 #define RF_ICOMBINE 8 // ignore combining characters 577 #define RF_LOOKBH 16 // uses "\@<=" or "\@<!" 578 579 // Global work variables for vim_regcomp(). 580 581 static char *regparse; ///< Input-scan pointer. 582 static int regnpar; ///< () count. 583 static bool wants_nfa; ///< regex should use NFA engine 584 static int regnzpar; ///< \z() count. 585 static int re_has_z; ///< \z item detected 586 static unsigned regflags; ///< RF_ flags for prog 587 static int had_eol; ///< true when EOL found by vim_regcomp() 588 589 static magic_T reg_magic; ///< magicness of the pattern 590 591 static int reg_string; // matching with a string instead of a buffer 592 // line 593 static int reg_strict; // "[abc" is illegal 594 595 // META contains all characters that may be magic, except '^' and '$'. 596 597 // uncrustify:off 598 599 // META[] is used often enough to justify turning it into a table. 600 static uint8_t META_flags[] = { 601 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 602 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 603 // % & ( ) * + . 604 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 605 // 1 2 3 4 5 6 7 8 9 < = > ? 
606 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 607 // @ A C D F H I K L M O 608 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 609 // P S U V W X Z [ _ 610 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 611 // a c d f h i k l m n o 612 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 613 // p s u v w x z { | ~ 614 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1 615 }; 616 617 // uncrustify:on 618 619 static int curchr; // currently parsed character 620 // Previous character. Note: prevchr is sometimes -1 when we are not at the 621 // start, eg in /[ ^I]^ the pattern was never found even if it existed, 622 // because ^ was taken to be magic -- webb 623 static int prevchr; 624 static int prevprevchr; // previous-previous character 625 static int nextchr; // used for ungetchr() 626 627 // arguments for reg() 628 #define REG_NOPAREN 0 // toplevel reg() 629 #define REG_PAREN 1 // \(\) 630 #define REG_ZPAREN 2 // \z(\) 631 #define REG_NPAREN 3 // \%(\) 632 633 typedef struct { 634 char *regparse; 635 int prevchr_len; 636 int curchr; 637 int prevchr; 638 int prevprevchr; 639 int nextchr; 640 int at_start; 641 int prev_at_start; 642 int regnpar; 643 } parse_state_T; 644 645 static regengine_T bt_regengine; 646 static regengine_T nfa_regengine; 647 648 #include "regexp.c.generated.h" 649 650 // Return true if compiled regular expression "prog" can match a line break. 651 int re_multiline(const regprog_T *prog) 652 FUNC_ATTR_NONNULL_ALL 653 { 654 return prog->regflags & RF_HASNL; 655 } 656 657 // Check for an equivalence class name "[=a=]". "pp" points to the '['. 658 // Returns a character representing the class. Zero means that no item was 659 // recognized. Otherwise "pp" is advanced to after the item. 
660 static int get_equi_class(char **pp) 661 { 662 int c; 663 int l = 1; 664 char *p = *pp; 665 666 if (p[1] == '=' && p[2] != NUL) { 667 l = utfc_ptr2len(p + 2); 668 if (p[l + 2] == '=' && p[l + 3] == ']') { 669 c = utf_ptr2char(p + 2); 670 *pp += l + 4; 671 return c; 672 } 673 } 674 return 0; 675 } 676 677 // Check for a collating element "[.a.]". "pp" points to the '['. 678 // Returns a character. Zero means that no item was recognized. Otherwise 679 // "pp" is advanced to after the item. 680 // Currently only single characters are recognized! 681 static int get_coll_element(char **pp) 682 { 683 int c; 684 int l = 1; 685 char *p = *pp; 686 687 if (p[0] != NUL && p[1] == '.' && p[2] != NUL) { 688 l = utfc_ptr2len(p + 2); 689 if (p[l + 2] == '.' && p[l + 3] == ']') { 690 c = utf_ptr2char(p + 2); 691 *pp += l + 4; 692 return c; 693 } 694 } 695 return 0; 696 } 697 698 static int reg_cpo_lit; // 'cpoptions' contains 'l' flag 699 700 static void get_cpo_flags(void) 701 { 702 reg_cpo_lit = vim_strchr(p_cpo, CPO_LITERAL) != NULL; 703 } 704 705 /// Skip over a "[]" range. 706 /// "p" must point to the character after the '['. 707 /// The returned pointer is on the matching ']', or the terminating NUL. 708 static char *skip_anyof(char *p) 709 { 710 int l; 711 712 if (*p == '^') { // Complement of range. 
713 p++; 714 } 715 if (*p == ']' || *p == '-') { 716 p++; 717 } 718 while (*p != NUL && *p != ']') { 719 if ((l = utfc_ptr2len(p)) > 1) { 720 p += l; 721 } else if (*p == '-') { 722 p++; 723 if (*p != ']' && *p != NUL) { 724 MB_PTR_ADV(p); 725 } 726 } else if (*p == '\\' 727 && (vim_strchr(REGEXP_INRANGE, (uint8_t)p[1]) != NULL 728 || (!reg_cpo_lit 729 && vim_strchr(REGEXP_ABBR, (uint8_t)p[1]) != NULL))) { 730 p += 2; 731 } else if (*p == '[') { 732 if (get_char_class(&p) == CLASS_NONE 733 && get_equi_class(&p) == 0 734 && get_coll_element(&p) == 0 735 && *p != NUL) { 736 p++; // It is not a class name and not NUL 737 } 738 } else { 739 p++; 740 } 741 } 742 743 return p; 744 } 745 746 /// Skip past regular expression. 747 /// Stop at end of "startp" or where "delim" is found ('/', '?', etc). 748 /// Take care of characters with a backslash in front of it. 749 /// Skip strings inside [ and ]. 750 char *skip_regexp(char *startp, int delim, int magic) 751 { 752 return skip_regexp_ex(startp, delim, magic, NULL, NULL, NULL); 753 } 754 755 /// Call skip_regexp() and when the delimiter does not match give an error and 756 /// return NULL. 757 char *skip_regexp_err(char *startp, int delim, int magic) 758 { 759 char *p = skip_regexp(startp, delim, magic); 760 761 if (*p != delim) { 762 semsg(_(e_missing_delimiter_after_search_pattern_str), startp); 763 return NULL; 764 } 765 return p; 766 } 767 768 /// skip_regexp() with extra arguments: 769 /// When "newp" is not NULL and "dirc" is '?', make an allocated copy of the 770 /// expression and change "\?" to "?". If "*newp" is not NULL the expression 771 /// is changed in-place. 772 /// If a "\?" is changed to "?" then "dropped" is incremented, unless NULL. 
773 /// If "magic_val" is not NULL, returns the effective magicness of the pattern 774 char *skip_regexp_ex(char *startp, int dirc, int magic, char **newp, int *dropped, 775 magic_T *magic_val) 776 { 777 magic_T mymagic; 778 char *p = startp; 779 size_t startplen = 0; 780 781 if (magic) { 782 mymagic = MAGIC_ON; 783 } else { 784 mymagic = MAGIC_OFF; 785 } 786 get_cpo_flags(); 787 788 for (; p[0] != NUL; MB_PTR_ADV(p)) { 789 if (p[0] == dirc) { // found end of regexp 790 break; 791 } 792 if ((p[0] == '[' && mymagic >= MAGIC_ON) 793 || (p[0] == '\\' && p[1] == '[' && mymagic <= MAGIC_OFF)) { 794 p = skip_anyof(p + 1); 795 if (p[0] == NUL) { 796 break; 797 } 798 } else if (p[0] == '\\' && p[1] != NUL) { 799 if (dirc == '?' && newp != NULL && p[1] == '?') { 800 // change "\?" to "?", make a copy first. 801 if (startplen == 0) { 802 startplen = strlen(startp); 803 } 804 if (*newp == NULL) { 805 *newp = xstrnsave(startp, startplen); 806 p = *newp + (p - startp); 807 startp = *newp; 808 } 809 if (dropped != NULL) { 810 (*dropped)++; 811 } 812 memmove(p, p + 1, startplen - (size_t)((p + 1) - startp) + 1); 813 } else { 814 p++; // skip next character 815 } 816 if (*p == 'v') { 817 mymagic = MAGIC_ALL; 818 } else if (*p == 'V') { 819 mymagic = MAGIC_NONE; 820 } 821 } 822 } 823 if (magic_val != NULL) { 824 *magic_val = mymagic; 825 } 826 return p; 827 } 828 829 // variables used for parsing 830 static int prevchr_len; // byte length of previous char 831 static int at_start; // True when on the first character 832 static int prev_at_start; // True when on the second character 833 834 // Start parsing at "str". 835 static void initchr(char *str) 836 { 837 regparse = str; 838 prevchr_len = 0; 839 curchr = prevprevchr = prevchr = nextchr = -1; 840 at_start = true; 841 prev_at_start = false; 842 } 843 844 // Save the current parse state, so that it can be restored and parsing 845 // starts in the same state again. 
static void save_parse_state(parse_state_T *ps)
{
  // Copies every lexer global, plus the "()" count, into "ps".
  ps->regparse = regparse;
  ps->prevchr_len = prevchr_len;
  ps->curchr = curchr;
  ps->prevchr = prevchr;
  ps->prevprevchr = prevprevchr;
  ps->nextchr = nextchr;
  ps->at_start = at_start;
  ps->prev_at_start = prev_at_start;
  ps->regnpar = regnpar;
}

// Restore a previously saved parse state.
// Exact mirror of save_parse_state(): the same fields, copied back.
static void restore_parse_state(parse_state_T *ps)
{
  regparse = ps->regparse;
  prevchr_len = ps->prevchr_len;
  curchr = ps->curchr;
  prevchr = ps->prevchr;
  prevprevchr = ps->prevprevchr;
  nextchr = ps->nextchr;
  at_start = ps->at_start;
  prev_at_start = ps->prev_at_start;
  regnpar = ps->regnpar;
}

// Get the next character without advancing.
//
// The result is cached in "curchr": while curchr is not -1 it is returned
// as-is.  Characters that are magic in the current context are returned as
// Magic(c), which is negative.  "regparse" is left where it was.
static int peekchr(void)
{
  // Non-zero while peekchr() is being called recursively for the character
  // that follows a backslash (see the '\\' case below).
  static int after_slash = false;

  if (curchr != -1) {
    return curchr;
  }

  switch (curchr = (uint8_t)regparse[0]) {
  case '.':
  case '[':
  case '~':
    // magic when 'magic' is on
    if (reg_magic >= MAGIC_ON) {
      curchr = Magic(curchr);
    }
    break;
  case '(':
  case ')':
  case '{':
  case '%':
  case '+':
  case '=':
  case '?':
  case '@':
  case '!':
  case '&':
  case '|':
  case '<':
  case '>':
  case '#':     // future ext.
  case '"':     // future ext.
  case '\'':    // future ext.
  case ',':     // future ext.
  case '-':     // future ext.
  case ':':     // future ext.
  case ';':     // future ext.
  case '`':     // future ext.
  case '/':     // Can't be used in / command
    // magic only after "\v"
    if (reg_magic == MAGIC_ALL) {
      curchr = Magic(curchr);
    }
    break;
  case '*':
    // * is not magic as the very first character, eg "?*ptr", when
    // after '^', eg "/^*ptr" and when after "\(", "\|", "\&".  But
    // "\(\*" is not magic, thus must be magic if "after_slash"
    if (reg_magic >= MAGIC_ON
        && !at_start
        && !(prev_at_start && prevchr == Magic('^'))
        && (after_slash
            || (prevchr != Magic('(')
                && prevchr != Magic('&')
                && prevchr != Magic('|')))) {
      curchr = Magic('*');
    }
    break;
  case '^':
    // '^' is only magic as the very first character and if it's after
    // "\(", "\|", "\&" or "\n"
    if (reg_magic >= MAGIC_OFF
        && (at_start
            || reg_magic == MAGIC_ALL
            || prevchr == Magic('(')
            || prevchr == Magic('|')
            || prevchr == Magic('&')
            || prevchr == Magic('n')
            || (no_Magic(prevchr) == '('
                && prevprevchr == Magic('%')))) {
      curchr = Magic('^');
      // What follows a magic '^' is treated as being at the start again.
      at_start = true;
      prev_at_start = false;
    }
    break;
  case '$':
    // '$' is only magic as the very last char and if it's in front of
    // either "\|", "\)", "\&", or "\n"
    if (reg_magic >= MAGIC_OFF) {
      uint8_t *p = (uint8_t *)regparse + 1;
      bool is_magic_all = (reg_magic == MAGIC_ALL);

      // ignore \c \C \m \M \v \V and \Z after '$'
      // (but track whether they switch "very magic" on or off)
      while (p[0] == '\\' && (p[1] == 'c' || p[1] == 'C'
                              || p[1] == 'm' || p[1] == 'M'
                              || p[1] == 'v' || p[1] == 'V'
                              || p[1] == 'Z')) {
        if (p[1] == 'v') {
          is_magic_all = true;
        } else if (p[1] == 'm' || p[1] == 'M' || p[1] == 'V') {
          is_magic_all = false;
        }
        p += 2;
      }
      if (p[0] == NUL
          || (p[0] == '\\'
              && (p[1] == '|' || p[1] == '&' || p[1] == ')'
                  || p[1] == 'n'))
          || (is_magic_all
              && (p[0] == '|' || p[0] == '&' || p[0] == ')'))
          || reg_magic == MAGIC_ALL) {
        curchr = Magic('$');
      }
    }
    break;
  case '\\': {
    int c = (uint8_t)regparse[1];

    if (c == NUL) {
      curchr = '\\';  // trailing '\'
    } else if (c <= '~' && META_flags[c]) {
      // META contains everything that may be magic sometimes,
      // except ^ and $ ("\^" and "\$" are only magic after
      // "\V").  We now fetch the next character and toggle its
      // magicness.  Therefore, \ is so meta-magic that it is
      // not in META.
      curchr = -1;  // drop the cached '\\' so the recursive call lexes afresh
      prev_at_start = at_start;
      at_start = false;  // be able to say "/\*ptr"
      // Lex the character after the backslash, then step back so that
      // regparse still points at the backslash.
      regparse++;
      after_slash++;
      (void)peekchr();
      regparse--;
      after_slash--;
      curchr = toggle_Magic(curchr);
    } else if (vim_strchr(REGEXP_ABBR, c)) {
      // Handle abbreviations, like "\t" for TAB -- webb
      curchr = backslash_trans(c);
    } else if (reg_magic == MAGIC_NONE && (c == '$' || c == '^')) {
      curchr = toggle_Magic(c);
    } else {
      // Next character can never be (made) magic?
      // Then backslashing it won't do anything.
      curchr = utf_ptr2char(regparse + 1);
    }
    break;
  }

  default:
    curchr = utf_ptr2char(regparse);
  }

  return curchr;
}

// Eat one lexed character.  Do this in a way that we can undo it.
// The number of bytes consumed is kept in "prevchr_len", which ungetchr()
// uses to move "regparse" back.
static void skipchr(void)
{
  // peekchr() eats a backslash, do the same here
  if (*regparse == '\\') {
    prevchr_len = 1;
  } else {
    prevchr_len = 0;
  }
  if (regparse[prevchr_len] != NUL) {
    // Exclude composing chars that utfc_ptr2len does include.
    prevchr_len += utf_ptr2len(regparse + prevchr_len);
  }
  regparse += prevchr_len;
  prev_at_start = at_start;
  at_start = false;
  prevprevchr = prevchr;
  prevchr = curchr;
  curchr = nextchr;  // use previously unget char, or -1
  nextchr = -1;
}

// Skip a character while keeping the value of prev_at_start for at_start.
// prevchr and prevprevchr are also kept.
static void skipchr_keepstart(void)
{
  // Remember the values that skipchr() is about to overwrite.
  int as = prev_at_start;
  int pr = prevchr;
  int prpr = prevprevchr;

  skipchr();
  at_start = as;
  prevchr = pr;
  prevprevchr = prpr;
}

// Get the next character from the pattern. We know about magic and such, so
// therefore we need a lexical analyzer.
1057 static int getchr(void) 1058 { 1059 int chr = peekchr(); 1060 1061 skipchr(); 1062 return chr; 1063 } 1064 1065 // put character back. Works only once! 1066 static void ungetchr(void) 1067 { 1068 nextchr = curchr; 1069 curchr = prevchr; 1070 prevchr = prevprevchr; 1071 at_start = prev_at_start; 1072 prev_at_start = false; 1073 1074 // Backup regparse, so that it's at the same position as before the 1075 // getchr(). 1076 regparse -= prevchr_len; 1077 } 1078 1079 // Get and return the value of the hex string at the current position. 1080 // Return -1 if there is no valid hex number. 1081 // The position is updated: 1082 // blahblah\%x20asdf 1083 // before-^ ^-after 1084 // The parameter controls the maximum number of input characters. This will be 1085 // 2 when reading a \%x20 sequence and 4 when reading a \%u20AC sequence. 1086 static int64_t gethexchrs(int maxinputlen) 1087 { 1088 int64_t nr = 0; 1089 int c; 1090 int i; 1091 1092 for (i = 0; i < maxinputlen; i++) { 1093 c = (uint8_t)regparse[0]; 1094 if (!ascii_isxdigit(c)) { 1095 break; 1096 } 1097 nr <<= 4; 1098 nr |= hex2nr(c); 1099 regparse++; 1100 } 1101 1102 if (i == 0) { 1103 return -1; 1104 } 1105 return nr; 1106 } 1107 1108 // Get and return the value of the decimal string immediately after the 1109 // current position. Return -1 for invalid. Consumes all digits. 1110 static int64_t getdecchrs(void) 1111 { 1112 int64_t nr = 0; 1113 int c; 1114 int i; 1115 1116 for (i = 0;; i++) { 1117 c = (uint8_t)regparse[0]; 1118 if (c < '0' || c > '9') { 1119 break; 1120 } 1121 nr *= 10; 1122 nr += c - '0'; 1123 regparse++; 1124 curchr = -1; // no longer valid 1125 } 1126 1127 if (i == 0) { 1128 return -1; 1129 } 1130 return nr; 1131 } 1132 1133 // get and return the value of the octal string immediately after the current 1134 // position. Return -1 for invalid, or 0-255 for valid. 
Smart enough to handle 1135 // numbers > 377 correctly (for example, 400 is treated as 40) and doesn't 1136 // treat 8 or 9 as recognised characters. Position is updated: 1137 // blahblah\%o210asdf 1138 // before-^ ^-after 1139 static int64_t getoctchrs(void) 1140 { 1141 int64_t nr = 0; 1142 int c; 1143 int i; 1144 1145 for (i = 0; i < 3 && nr < 040; i++) { 1146 c = (uint8_t)regparse[0]; 1147 if (c < '0' || c > '7') { 1148 break; 1149 } 1150 nr <<= 3; 1151 nr |= hex2nr(c); 1152 regparse++; 1153 } 1154 1155 if (i == 0) { 1156 return -1; 1157 } 1158 return nr; 1159 } 1160 1161 // read_limits - Read two integers to be taken as a minimum and maximum. 1162 // If the first character is '-', then the range is reversed. 1163 // Should end with 'end'. If minval is missing, zero is default, if maxval is 1164 // missing, a very big number is the default. 1165 static int read_limits(int *minval, int *maxval) 1166 { 1167 int reverse = false; 1168 char *first_char; 1169 int tmp; 1170 1171 if (*regparse == '-') { 1172 // Starts with '-', so reverse the range later. 1173 regparse++; 1174 reverse = true; 1175 } 1176 first_char = regparse; 1177 *minval = getdigits_int(®parse, false, 0); 1178 if (*regparse == ',') { // There is a comma. 1179 if (ascii_isdigit(*++regparse)) { 1180 *maxval = getdigits_int(®parse, false, MAX_LIMIT); 1181 } else { 1182 *maxval = MAX_LIMIT; 1183 } 1184 } else if (ascii_isdigit(*first_char)) { 1185 *maxval = *minval; // It was \{n} or \{-n} 1186 } else { 1187 *maxval = MAX_LIMIT; // It was \{} or \{-} 1188 } 1189 if (*regparse == '\\') { 1190 regparse++; // Allow either \{...} or \{...\} 1191 } 1192 if (*regparse != '}') { 1193 EMSG2_RET_FAIL(_("E554: Syntax error in %s{...}"), reg_magic == MAGIC_ALL); 1194 } 1195 1196 // Reverse the range if there was a '-', or make sure it is in the right 1197 // order otherwise. 
1198 if ((!reverse && *minval > *maxval) || (reverse && *minval < *maxval)) { 1199 tmp = *minval; 1200 *minval = *maxval; 1201 *maxval = tmp; 1202 } 1203 skipchr(); // let's be friends with the lexer again 1204 return OK; 1205 } 1206 1207 // vim_regexec and friends 1208 1209 // Global work variables for vim_regexec(). 1210 1211 // Sometimes need to save a copy of a line. Since alloc()/free() is very 1212 // slow, we keep one allocated piece of memory and only re-allocate it when 1213 // it's too small. It's freed in bt_regexec_both() when finished. 1214 static uint8_t *reg_tofree = NULL; 1215 static unsigned reg_tofreelen; 1216 1217 // Structure used to store the execution state of the regex engine. 1218 // Which ones are set depends on whether a single-line or multi-line match is 1219 // done: 1220 // single-line multi-line 1221 // reg_match ®match_T NULL 1222 // reg_mmatch NULL ®mmatch_T 1223 // reg_startp reg_match->startp <invalid> 1224 // reg_endp reg_match->endp <invalid> 1225 // reg_startpos <invalid> reg_mmatch->startpos 1226 // reg_endpos <invalid> reg_mmatch->endpos 1227 // reg_win NULL window in which to search 1228 // reg_buf curbuf buffer in which to search 1229 // reg_firstlnum <invalid> first line in which to search 1230 // reg_maxline 0 last line nr 1231 // reg_line_lbr false or true false 1232 typedef struct { 1233 regmatch_T *reg_match; 1234 regmmatch_T *reg_mmatch; 1235 1236 uint8_t **reg_startp; 1237 uint8_t **reg_endp; 1238 lpos_T *reg_startpos; 1239 lpos_T *reg_endpos; 1240 1241 win_T *reg_win; 1242 buf_T *reg_buf; 1243 linenr_T reg_firstlnum; 1244 linenr_T reg_maxline; 1245 bool reg_line_lbr; // "\n" in string is line break 1246 1247 // The current match-position is remembered with these variables: 1248 linenr_T lnum; ///< line number, relative to first line 1249 uint8_t *line; ///< start of current line 1250 uint8_t *input; ///< current input, points into "line" 1251 1252 int need_clear_subexpr; ///< subexpressions still need to be cleared 
1253 int need_clear_zsubexpr; ///< extmatch subexpressions still need to be 1254 ///< cleared 1255 1256 // Internal copy of 'ignorecase'. It is set at each call to vim_regexec(). 1257 // Normally it gets the value of "rm_ic" or "rmm_ic", but when the pattern 1258 // contains '\c' or '\C' the value is overruled. 1259 bool reg_ic; 1260 1261 // Similar to "reg_ic", but only for 'combining' characters. Set with \Z 1262 // flag in the regexp. Defaults to false, always. 1263 bool reg_icombine; 1264 1265 bool reg_nobreak; 1266 1267 // Copy of "rmm_maxcol": maximum column to search for a match. Zero when 1268 // there is no maximum. 1269 colnr_T reg_maxcol; 1270 1271 // State for the NFA engine regexec. 1272 int nfa_has_zend; ///< NFA regexp \ze operator encountered. 1273 int nfa_has_backref; ///< NFA regexp \1 .. \9 encountered. 1274 int nfa_nsubexpr; ///< Number of sub expressions actually being used 1275 ///< during execution. 1 if only the whole match 1276 ///< (subexpr 0) is used. 1277 // listid is global, so that it increases on recursive calls to 1278 // nfa_regmatch(), which means we don't have to clear the lastlist field of 1279 // all the states. 1280 int nfa_listid; 1281 int nfa_alt_listid; 1282 1283 int nfa_has_zsubexpr; ///< NFA regexp has \z( ), set zsubexpr. 1284 } regexec_T; 1285 1286 static regexec_T rex; 1287 static bool rex_in_use = false; 1288 1289 static void reg_breakcheck(void) 1290 { 1291 if (!rex.reg_nobreak) { 1292 fast_breakcheck(); 1293 } 1294 } 1295 1296 // Return true if character 'c' is included in 'iskeyword' option for 1297 // "reg_buf" buffer. 1298 static bool reg_iswordc(int c) 1299 { 1300 return vim_iswordc_buf(c, rex.reg_buf); 1301 } 1302 1303 static bool can_f_submatch = false; ///< true when submatch() can be used 1304 1305 /// These pointers are used for reg_submatch(). Needed for when the 1306 /// substitution string is an expression that contains a call to substitute() 1307 /// and submatch(). 
1308 typedef struct { 1309 regmatch_T *sm_match; 1310 regmmatch_T *sm_mmatch; 1311 linenr_T sm_firstlnum; 1312 linenr_T sm_maxline; 1313 int sm_line_lbr; 1314 } regsubmatch_T; 1315 1316 static regsubmatch_T rsm; ///< can only be used when can_f_submatch is true 1317 1318 /// Common code for reg_getline(), reg_getline_len(), reg_getline_submatch() and 1319 /// reg_getline_submatch_len(). 1320 /// 1321 /// @param flags a bitmask that controls what info is to be returned 1322 /// and whether or not submatch is in effect. 1323 static void reg_getline_common(linenr_T lnum, reg_getline_flags_T flags, char **line, 1324 colnr_T *length) 1325 { 1326 bool get_line = flags & RGLF_LINE; 1327 bool get_length = flags & RGLF_LENGTH; 1328 linenr_T firstlnum; 1329 linenr_T maxline; 1330 1331 if (flags & RGLF_SUBMATCH) { 1332 firstlnum = rsm.sm_firstlnum + lnum; 1333 maxline = rsm.sm_maxline; 1334 } else { 1335 firstlnum = rex.reg_firstlnum + lnum; 1336 maxline = rex.reg_maxline; 1337 } 1338 1339 // when looking behind for a match/no-match lnum is negative. but we 1340 // can't go before line 1. 1341 if (firstlnum < 1) { 1342 if (get_line) { 1343 *line = NULL; 1344 } 1345 if (get_length) { 1346 *length = 0; 1347 } 1348 1349 return; 1350 } 1351 1352 if (lnum > maxline) { 1353 // must have matched the "\n" in the last line. 1354 if (get_line) { 1355 *line = ""; 1356 } 1357 if (get_length) { 1358 *length = 0; 1359 } 1360 1361 return; 1362 } 1363 1364 if (get_line) { 1365 *line = ml_get_buf(rex.reg_buf, firstlnum); 1366 } 1367 if (get_length) { 1368 *length = ml_get_buf_len(rex.reg_buf, firstlnum); 1369 } 1370 } 1371 1372 /// Get pointer to the line "lnum", which is relative to "reg_firstlnum". 1373 static char *reg_getline(linenr_T lnum) 1374 { 1375 char *line; 1376 reg_getline_common(lnum, RGLF_LINE, &line, NULL); 1377 return line; 1378 } 1379 1380 /// Get length of line "lnum", which is relative to "reg_firstlnum". 
1381 static colnr_T reg_getline_len(linenr_T lnum) 1382 { 1383 colnr_T length; 1384 reg_getline_common(lnum, RGLF_LENGTH, NULL, &length); 1385 return length; 1386 } 1387 1388 static uint8_t *reg_startzp[NSUBEXP]; // Workspace to mark beginning 1389 static uint8_t *reg_endzp[NSUBEXP]; // and end of \z(...\) matches 1390 static lpos_T reg_startzpos[NSUBEXP]; // idem, beginning pos 1391 static lpos_T reg_endzpos[NSUBEXP]; // idem, end pos 1392 1393 // true if using multi-line regexp. 1394 #define REG_MULTI (rex.reg_match == NULL) 1395 1396 // Create a new extmatch and mark it as referenced once. 1397 static reg_extmatch_T *make_extmatch(void) 1398 FUNC_ATTR_NONNULL_RET 1399 { 1400 reg_extmatch_T *em = xcalloc(1, sizeof(reg_extmatch_T)); 1401 em->refcnt = 1; 1402 return em; 1403 } 1404 1405 // Add a reference to an extmatch. 1406 reg_extmatch_T *ref_extmatch(reg_extmatch_T *em) 1407 { 1408 if (em != NULL) { 1409 em->refcnt++; 1410 } 1411 return em; 1412 } 1413 1414 // Remove a reference to an extmatch. If there are no references left, free 1415 // the info. 1416 void unref_extmatch(reg_extmatch_T *em) 1417 { 1418 int i; 1419 1420 if (em != NULL && --em->refcnt <= 0) { 1421 for (i = 0; i < NSUBEXP; i++) { 1422 xfree(em->matches[i]); 1423 } 1424 xfree(em); 1425 } 1426 } 1427 1428 // Get class of previous character. 1429 static int reg_prev_class(void) 1430 { 1431 if (rex.input > rex.line) { 1432 return mb_get_class_tab((char *)rex.input - 1 - 1433 utf_head_off((char *)rex.line, (char *)rex.input - 1), 1434 rex.reg_buf->b_chartab); 1435 } 1436 return -1; 1437 } 1438 1439 // Return true if the current rex.input position matches the Visual area. 1440 static bool reg_match_visual(void) 1441 { 1442 pos_T top, bot; 1443 linenr_T lnum; 1444 colnr_T col; 1445 win_T *wp = rex.reg_win == NULL ? 
curwin : rex.reg_win; 1446 int mode; 1447 colnr_T start, end; 1448 colnr_T start2, end2; 1449 colnr_T curswant; 1450 1451 // Check if the buffer is the current buffer and not using a string. 1452 if (rex.reg_buf != curbuf || VIsual.lnum == 0 || !REG_MULTI) { 1453 return false; 1454 } 1455 1456 if (VIsual_active) { 1457 if (lt(VIsual, wp->w_cursor)) { 1458 top = VIsual; 1459 bot = wp->w_cursor; 1460 } else { 1461 top = wp->w_cursor; 1462 bot = VIsual; 1463 } 1464 mode = VIsual_mode; 1465 curswant = wp->w_curswant; 1466 } else { 1467 if (lt(curbuf->b_visual.vi_start, curbuf->b_visual.vi_end)) { 1468 top = curbuf->b_visual.vi_start; 1469 bot = curbuf->b_visual.vi_end; 1470 } else { 1471 top = curbuf->b_visual.vi_end; 1472 bot = curbuf->b_visual.vi_start; 1473 } 1474 // a substitute command may have removed some lines 1475 if (bot.lnum > curbuf->b_ml.ml_line_count) { 1476 bot.lnum = curbuf->b_ml.ml_line_count; 1477 } 1478 mode = curbuf->b_visual.vi_mode; 1479 curswant = curbuf->b_visual.vi_curswant; 1480 } 1481 lnum = rex.lnum + rex.reg_firstlnum; 1482 if (lnum < top.lnum || lnum > bot.lnum) { 1483 return false; 1484 } 1485 1486 col = (colnr_T)(rex.input - rex.line); 1487 if (mode == 'v') { 1488 if ((lnum == top.lnum && col < top.col) 1489 || (lnum == bot.lnum && col >= bot.col + (*p_sel != 'e'))) { 1490 return false; 1491 } 1492 } else if (mode == Ctrl_V) { 1493 getvvcol(wp, &top, &start, NULL, &end); 1494 getvvcol(wp, &bot, &start2, NULL, &end2); 1495 if (start2 < start) { 1496 start = start2; 1497 } 1498 if (end2 > end) { 1499 end = end2; 1500 } 1501 if (top.col == MAXCOL || bot.col == MAXCOL || curswant == MAXCOL) { 1502 end = MAXCOL; 1503 } 1504 1505 // getvvcol() flushes rex.line, need to get it again 1506 rex.line = (uint8_t *)reg_getline(rex.lnum); 1507 rex.input = rex.line + col; 1508 1509 colnr_T cols = win_linetabsize(wp, rex.reg_firstlnum + rex.lnum, (char *)rex.line, col); 1510 if (cols < start || cols > end - (*p_sel == 'e')) { 1511 return false; 1512 } 
1513 } 1514 return true; 1515 } 1516 1517 // Check the regexp program for its magic number. 1518 // Return true if it's wrong. 1519 static int prog_magic_wrong(void) 1520 { 1521 regprog_T *prog; 1522 1523 prog = REG_MULTI ? rex.reg_mmatch->regprog : rex.reg_match->regprog; 1524 if (prog->engine == &nfa_regengine) { 1525 // For NFA matcher we don't check the magic 1526 return false; 1527 } 1528 1529 if (UCHARAT(((bt_regprog_T *)prog)->program) != REGMAGIC) { 1530 emsg(_(e_re_corr)); 1531 return true; 1532 } 1533 return false; 1534 } 1535 1536 // Cleanup the subexpressions, if this wasn't done yet. 1537 // This construction is used to clear the subexpressions only when they are 1538 // used (to increase speed). 1539 static void cleanup_subexpr(void) 1540 { 1541 if (!rex.need_clear_subexpr) { 1542 return; 1543 } 1544 1545 if (REG_MULTI) { 1546 // Use 0xff to set lnum to -1 1547 memset(rex.reg_startpos, 0xff, sizeof(lpos_T) * NSUBEXP); 1548 memset(rex.reg_endpos, 0xff, sizeof(lpos_T) * NSUBEXP); 1549 } else { 1550 memset(rex.reg_startp, 0, sizeof(char *) * NSUBEXP); 1551 memset(rex.reg_endp, 0, sizeof(char *) * NSUBEXP); 1552 } 1553 rex.need_clear_subexpr = false; 1554 } 1555 1556 static void cleanup_zsubexpr(void) 1557 { 1558 if (!rex.need_clear_zsubexpr) { 1559 return; 1560 } 1561 1562 if (REG_MULTI) { 1563 // Use 0xff to set lnum to -1 1564 memset(reg_startzpos, 0xff, sizeof(lpos_T) * NSUBEXP); 1565 memset(reg_endzpos, 0xff, sizeof(lpos_T) * NSUBEXP); 1566 } else { 1567 memset(reg_startzp, 0, sizeof(char *) * NSUBEXP); 1568 memset(reg_endzp, 0, sizeof(char *) * NSUBEXP); 1569 } 1570 rex.need_clear_zsubexpr = false; 1571 } 1572 1573 // Advance rex.lnum, rex.line and rex.input to the next line. 1574 static void reg_nextline(void) 1575 { 1576 rex.line = (uint8_t *)reg_getline(++rex.lnum); 1577 rex.input = rex.line; 1578 reg_breakcheck(); 1579 } 1580 1581 // Check whether a backreference matches. 1582 // Returns RA_FAIL, RA_NOMATCH or RA_MATCH. 
1583 // If "bytelen" is not NULL, it is set to the byte length of the match in the 1584 // last line. 1585 // Optional: ignore case if rex.reg_ic is set. 1586 static int match_with_backref(linenr_T start_lnum, colnr_T start_col, linenr_T end_lnum, 1587 colnr_T end_col, int *bytelen) 1588 { 1589 linenr_T clnum = start_lnum; 1590 colnr_T ccol = start_col; 1591 int len; 1592 char *p; 1593 1594 if (bytelen != NULL) { 1595 *bytelen = 0; 1596 } 1597 while (true) { 1598 // Since getting one line may invalidate the other, need to make copy. 1599 // Slow! 1600 if (rex.line != reg_tofree) { 1601 len = (int)strlen((char *)rex.line); 1602 if (reg_tofree == NULL || len >= (int)reg_tofreelen) { 1603 len += 50; // get some extra 1604 xfree(reg_tofree); 1605 reg_tofree = xmalloc((size_t)len); 1606 reg_tofreelen = (unsigned)len; 1607 } 1608 STRCPY(reg_tofree, rex.line); 1609 rex.input = reg_tofree + (rex.input - rex.line); 1610 rex.line = reg_tofree; 1611 } 1612 1613 // Get the line to compare with. 1614 p = reg_getline(clnum); 1615 assert(p); 1616 1617 if (clnum == end_lnum) { 1618 len = end_col - ccol; 1619 } else { 1620 len = reg_getline_len(clnum) - ccol; 1621 } 1622 1623 if ((!rex.reg_ic && cstrncmp(p + ccol, (char *)rex.input, &len) != 0) 1624 || (rex.reg_ic && mb_strnicmp(p + ccol, (char *)rex.input, (size_t)len) != 0)) { 1625 return RA_NOMATCH; // doesn't match 1626 } 1627 if (bytelen != NULL) { 1628 *bytelen += len; 1629 } 1630 if (clnum == end_lnum) { 1631 break; // match and at end! 1632 } 1633 if (rex.lnum >= rex.reg_maxline) { 1634 return RA_NOMATCH; // text too short 1635 } 1636 1637 // Advance to next line. 1638 reg_nextline(); 1639 if (bytelen != NULL) { 1640 *bytelen = 0; 1641 } 1642 clnum++; 1643 ccol = 0; 1644 if (got_int) { 1645 return RA_FAIL; 1646 } 1647 } 1648 1649 // found a match! Note that rex.line may now point to a copy of the line, 1650 // that should not matter. 1651 return RA_MATCH; 1652 } 1653 1654 /// Used in a place where no * or \+ can follow. 
1655 static bool re_mult_next(char *what) 1656 { 1657 if (re_multi_type(peekchr()) == MULTI_MULT) { 1658 semsg(_("E888: (NFA regexp) cannot repeat %s"), what); 1659 rc_did_emsg = true; 1660 return false; 1661 } 1662 return true; 1663 } 1664 1665 typedef struct { 1666 int a, b, c; 1667 } decomp_T; 1668 1669 // 0xfb20 - 0xfb4f 1670 static decomp_T decomp_table[0xfb4f - 0xfb20 + 1] = { 1671 { 0x5e2, 0, 0 }, // 0xfb20 alt ayin 1672 { 0x5d0, 0, 0 }, // 0xfb21 alt alef 1673 { 0x5d3, 0, 0 }, // 0xfb22 alt dalet 1674 { 0x5d4, 0, 0 }, // 0xfb23 alt he 1675 { 0x5db, 0, 0 }, // 0xfb24 alt kaf 1676 { 0x5dc, 0, 0 }, // 0xfb25 alt lamed 1677 { 0x5dd, 0, 0 }, // 0xfb26 alt mem-sofit 1678 { 0x5e8, 0, 0 }, // 0xfb27 alt resh 1679 { 0x5ea, 0, 0 }, // 0xfb28 alt tav 1680 { '+', 0, 0 }, // 0xfb29 alt plus 1681 { 0x5e9, 0x5c1, 0 }, // 0xfb2a shin+shin-dot 1682 { 0x5e9, 0x5c2, 0 }, // 0xfb2b shin+sin-dot 1683 { 0x5e9, 0x5c1, 0x5bc }, // 0xfb2c shin+shin-dot+dagesh 1684 { 0x5e9, 0x5c2, 0x5bc }, // 0xfb2d shin+sin-dot+dagesh 1685 { 0x5d0, 0x5b7, 0 }, // 0xfb2e alef+patah 1686 { 0x5d0, 0x5b8, 0 }, // 0xfb2f alef+qamats 1687 { 0x5d0, 0x5b4, 0 }, // 0xfb30 alef+hiriq 1688 { 0x5d1, 0x5bc, 0 }, // 0xfb31 bet+dagesh 1689 { 0x5d2, 0x5bc, 0 }, // 0xfb32 gimel+dagesh 1690 { 0x5d3, 0x5bc, 0 }, // 0xfb33 dalet+dagesh 1691 { 0x5d4, 0x5bc, 0 }, // 0xfb34 he+dagesh 1692 { 0x5d5, 0x5bc, 0 }, // 0xfb35 vav+dagesh 1693 { 0x5d6, 0x5bc, 0 }, // 0xfb36 zayin+dagesh 1694 { 0xfb37, 0, 0 }, // 0xfb37 -- UNUSED 1695 { 0x5d8, 0x5bc, 0 }, // 0xfb38 tet+dagesh 1696 { 0x5d9, 0x5bc, 0 }, // 0xfb39 yud+dagesh 1697 { 0x5da, 0x5bc, 0 }, // 0xfb3a kaf sofit+dagesh 1698 { 0x5db, 0x5bc, 0 }, // 0xfb3b kaf+dagesh 1699 { 0x5dc, 0x5bc, 0 }, // 0xfb3c lamed+dagesh 1700 { 0xfb3d, 0, 0 }, // 0xfb3d -- UNUSED 1701 { 0x5de, 0x5bc, 0 }, // 0xfb3e mem+dagesh 1702 { 0xfb3f, 0, 0 }, // 0xfb3f -- UNUSED 1703 { 0x5e0, 0x5bc, 0 }, // 0xfb40 nun+dagesh 1704 { 0x5e1, 0x5bc, 0 }, // 0xfb41 samech+dagesh 1705 { 0xfb42, 0, 0 }, // 0xfb42 -- 
UNUSED 1706 { 0x5e3, 0x5bc, 0 }, // 0xfb43 pe sofit+dagesh 1707 { 0x5e4, 0x5bc, 0 }, // 0xfb44 pe+dagesh 1708 { 0xfb45, 0, 0 }, // 0xfb45 -- UNUSED 1709 { 0x5e6, 0x5bc, 0 }, // 0xfb46 tsadi+dagesh 1710 { 0x5e7, 0x5bc, 0 }, // 0xfb47 qof+dagesh 1711 { 0x5e8, 0x5bc, 0 }, // 0xfb48 resh+dagesh 1712 { 0x5e9, 0x5bc, 0 }, // 0xfb49 shin+dagesh 1713 { 0x5ea, 0x5bc, 0 }, // 0xfb4a tav+dagesh 1714 { 0x5d5, 0x5b9, 0 }, // 0xfb4b vav+holam 1715 { 0x5d1, 0x5bf, 0 }, // 0xfb4c bet+rafe 1716 { 0x5db, 0x5bf, 0 }, // 0xfb4d kaf+rafe 1717 { 0x5e4, 0x5bf, 0 }, // 0xfb4e pe+rafe 1718 { 0x5d0, 0x5dc, 0 } // 0xfb4f alef-lamed 1719 }; 1720 1721 static void mb_decompose(int c, int *c1, int *c2, int *c3) 1722 { 1723 decomp_T d; 1724 1725 if (c >= 0xfb20 && c <= 0xfb4f) { 1726 d = decomp_table[c - 0xfb20]; 1727 *c1 = d.a; 1728 *c2 = d.b; 1729 *c3 = d.c; 1730 } else { 1731 *c1 = c; 1732 *c2 = 0; 1733 *c3 = 0; 1734 } 1735 } 1736 1737 /// Compare two strings, ignore case if rex.reg_ic set. 1738 /// Return 0 if strings match, non-zero otherwise. 1739 /// Correct the length "*n" when composing characters are ignored 1740 /// or when both utf codepoints are considered equal because of 1741 /// case-folding but have different length (e.g. 
's' and 'Å¿') 1742 static int cstrncmp(char *s1, char *s2, int *n) 1743 { 1744 int result; 1745 1746 if (!rex.reg_ic) { 1747 result = strncmp(s1, s2, (size_t)(*n)); 1748 } else { 1749 char *p = s1; 1750 int n2 = 0; 1751 int n1 = *n; 1752 // count the number of characters for byte-length of s1 1753 while (n1 > 0 && *p != NUL) { 1754 n1 -= utfc_ptr2len(s1); 1755 MB_PTR_ADV(p); 1756 n2++; 1757 } 1758 // count the number of bytes to advance the same number of chars for s2 1759 p = s2; 1760 while (n2-- > 0 && *p != NUL) { 1761 MB_PTR_ADV(p); 1762 } 1763 1764 n2 = (int)(p - s2); 1765 1766 result = utf_strnicmp(s1, s2, (size_t)(*n), (size_t)n2); 1767 if (result == 0 && n2 < *n) { 1768 *n = n2; 1769 } 1770 } 1771 1772 // if it failed and it's utf8 and we want to combineignore: 1773 if (result != 0 && rex.reg_icombine) { 1774 const char *str1, *str2; 1775 int c1, c2, c11, c12; 1776 int junk; 1777 1778 // we have to handle the strcmp ourselves, since it is necessary to 1779 // deal with the composing characters by ignoring them: 1780 str1 = s1; 1781 str2 = s2; 1782 c1 = c2 = 0; 1783 while ((int)(str1 - s1) < *n) { 1784 c1 = mb_ptr2char_adv(&str1); 1785 c2 = mb_ptr2char_adv(&str2); 1786 1787 // decompose the character if necessary, into 'base' characters 1788 // because I don't care about Arabic, I will hard-code the Hebrew 1789 // which I *do* care about! So sue me... 1790 if (c1 != c2 && (!rex.reg_ic || utf_fold(c1) != utf_fold(c2))) { 1791 // decomposition necessary? 1792 mb_decompose(c1, &c11, &junk, &junk); 1793 mb_decompose(c2, &c12, &junk, &junk); 1794 c1 = c11; 1795 c2 = c12; 1796 if (c11 != c12 && (!rex.reg_ic || utf_fold(c11) != utf_fold(c12))) { 1797 break; 1798 } 1799 } 1800 } 1801 result = c2 - c1; 1802 if (result == 0) { 1803 *n = (int)(str2 - s2); 1804 } 1805 } 1806 1807 return result; 1808 } 1809 1810 /// Wrapper around strchr which accounts for case-insensitive searches and 1811 /// non-ASCII characters. 
1812 /// 1813 /// This function is used a lot for simple searches, keep it fast! 1814 /// 1815 /// @param s string to search 1816 /// @param c character to find in @a s 1817 /// 1818 /// @return NULL if no match, otherwise pointer to the position in @a s 1819 static inline char *cstrchr(const char *const s, const int c) 1820 FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL 1821 FUNC_ATTR_ALWAYS_INLINE 1822 { 1823 if (!rex.reg_ic) { 1824 return vim_strchr(s, c); 1825 } 1826 1827 int cc, lc; 1828 if (c > 0x80) { 1829 cc = utf_fold(c); 1830 lc = cc; 1831 } else if (ASCII_ISUPPER(c)) { 1832 cc = TOLOWER_ASC(c); 1833 lc = cc; 1834 } else if (ASCII_ISLOWER(c)) { 1835 cc = TOUPPER_ASC(c); 1836 lc = c; 1837 } else { 1838 return vim_strchr(s, c); 1839 } 1840 1841 for (const char *p = s; *p != NUL; p += utfc_ptr2len(p)) { 1842 const int uc = utf_ptr2char(p); 1843 if (c > 0x80 || uc > 0x80) { 1844 // Do not match an illegal byte. E.g. 0xff matches 0xc3 0xbf, not 0xff. 1845 // Compare with lower case of the character. 1846 if ((uc < 0x80 || uc != (uint8_t)(*p)) && utf_fold(uc) == lc) { 1847 return (char *)p; 1848 } 1849 } else if ((uint8_t)(*p) == c || (uint8_t)(*p) == cc) { 1850 return (char *)p; 1851 } 1852 } 1853 1854 return NULL; 1855 } 1856 1857 //////////////////////////////////////////////////////////////// 1858 // regsub stuff // 1859 //////////////////////////////////////////////////////////////// 1860 1861 static void do_upper(int *d, int c) 1862 { 1863 *d = mb_toupper(c); 1864 } 1865 1866 static void do_lower(int *d, int c) 1867 { 1868 *d = mb_tolower(c); 1869 } 1870 1871 /// regtilde(): Replace tildes in the pattern by the old pattern. 1872 /// 1873 /// Short explanation of the tilde: It stands for the previous replacement 1874 /// pattern. If that previous pattern also contains a ~ we should go back a 1875 /// step further... But we insert the previous pattern into the current one 1876 /// and remember that. 
/// This still does not handle the case where "magic" changes.  So require the
/// user to keep his hands off of "magic".
///
/// The tildes are parsed once before the first call to vim_regsub().
///
/// @param source   replacement string; a tilde may be removed from it in
///                 place when there is no previous replacement to insert
/// @param magic    when non-zero the tilde is written "~", otherwise "\~"
/// @param preview  when true "reg_prev_sub" is left unchanged
///
/// @return "source" when no allocation was needed or when the result would
///         be longer than MAXCOL (an error is given then), otherwise a newly
///         allocated string with the tildes expanded.
char *regtilde(char *source, int magic, bool preview)
{
  char *newsub = source;
  size_t newsublen = 0;  // 0 means: length of "newsub" not computed yet
  char tilde[3] = { '~', NUL, NUL };
  size_t tildelen = 1;
  bool error = false;

  if (!magic) {
    tilde[0] = '\\';
    tilde[1] = '~';
    tilde[2] = NUL;
    tildelen = 2;
  }

  char *p;
  for (p = newsub; *p; p++) {
    if (strncmp(p, tilde, tildelen) == 0) {
      size_t prefixlen = (size_t)(p - newsub);  // not including the tilde
      char *postfix = p + tildelen;
      size_t postfixlen;
      size_t tmpsublen;

      if (newsublen == 0) {
        newsublen = strlen(newsub);
      }
      newsublen -= tildelen;
      postfixlen = newsublen - prefixlen;
      tmpsublen = prefixlen + reg_prev_sublen + postfixlen;

      if (tmpsublen > 0 && reg_prev_sub != NULL) {
        // Avoid making the text longer than MAXCOL, it will cause
        // trouble at some point.
        if (tmpsublen > MAXCOL) {
          emsg(_(e_resulting_text_too_long));
          error = true;
          break;
        }

        char *tmpsub = xmalloc(tmpsublen + 1);
        // copy prefix
        memmove(tmpsub, newsub, prefixlen);
        // interpret tilde
        memmove(tmpsub + prefixlen, reg_prev_sub, reg_prev_sublen);
        // copy postfix
        STRCPY(tmpsub + prefixlen + reg_prev_sublen, postfix);

        if (newsub != source) {  // allocated newsub before
          xfree(newsub);
        }
        newsub = tmpsub;
        newsublen = tmpsublen;
        // Continue after the inserted text, in the new buffer.
        p = newsub + prefixlen + reg_prev_sublen;
      } else {
        memmove(p, postfix, postfixlen + 1);  // remove the tilde (+1 for the NUL)
      }
      p--;  // compensate for the p++ of the loop
    } else {
      if (*p == '\\' && p[1]) {  // skip escaped characters
        p++;
      }
      // Skip the whole character; the loop's p++ accounts for one byte.
      p += utfc_ptr2len(p) - 1;
    }
  }

  if (error) {
    if (newsub != source) {
      xfree(newsub);
    }
    return source;
  }

  // Only change reg_prev_sub when not previewing.
  if (!preview) {
    // Store a copy of newsub  in reg_prev_sub.  It is always allocated,
    // because recursive calls may make the returned string invalid.
    // Only store it if there something to store.
    newsublen = (size_t)(p - newsub);  // the loop left "p" at the final NUL
    if (newsublen == 0) {
      XFREE_CLEAR(reg_prev_sub);
    } else {
      xfree(reg_prev_sub);
      reg_prev_sub = xstrnsave(newsub, newsublen);
    }
    reg_prev_sublen = newsublen;
  }

  return newsub;
}

/// Put the submatches in "argv[argskip]" which is a list passed into
/// call_func() by vim_regsub_both().
///
/// @return "argskip" when the called function takes no further argument,
///         otherwise "argskip + 1" after filling the list with allocated
///         copies of the ten submatches of "rsm.sm_match" (NULL for an unset
///         one).
static int fill_submatch_list(int argc FUNC_ATTR_UNUSED, typval_T *argv, int argskip, ufunc_T *fp)
  FUNC_ATTR_NONNULL_ALL
{
  typval_T *listarg = argv + argskip;

  if (!fp->uf_varargs && fp->uf_args.ga_len <= argskip) {
    // called function doesn't take a submatches argument
    return argskip;
  }

  // Relies on sl_list to be the first item in staticList10_T.
  tv_list_init_static10((staticList10_T *)listarg->vval.v_list);

  // There are always 10 list items in staticList10_T.
  listitem_T *li = tv_list_first(listarg->vval.v_list);
  for (int i = 0; i < 10; i++) {
    char *s = rsm.sm_match->startp[i];
    if (s == NULL || rsm.sm_match->endp[i] == NULL) {
      s = NULL;
    } else {
      s = xstrnsave(s, (size_t)(rsm.sm_match->endp[i] - s));
    }
    TV_LIST_ITEM_TV(li)->v_type = VAR_STRING;
    TV_LIST_ITEM_TV(li)->vval.v_string = s;
    li = TV_LIST_ITEM_NEXT(argv->vval.v_list, li);
  }
  return argskip + 1;
}

/// Free the strings that fill_submatch_list() stored in the items of "sl".
static void clear_submatch_list(staticList10_T *sl)
{
  TV_LIST_ITER(&sl->sl_list, li, {
    xfree(TV_LIST_ITEM_TV(li)->vval.v_string);
  });
}

/// vim_regsub() - perform substitutions after a vim_regexec() or
/// vim_regexec_multi() match.
///
/// If "flags" has REGSUB_COPY really copy into "dest[destlen]".
/// Otherwise nothing is copied, only compute the length of the result.
///
/// If "flags" has REGSUB_MAGIC then behave like 'magic' is set.
///
/// If "flags" has REGSUB_BACKSLASH a backslash will be removed later, need to
/// double them to keep them, and insert a backslash before a CR to avoid it
/// being replaced with a line break later.
///
/// Note: The matched text must not change between the call of
/// vim_regexec()/vim_regexec_multi() and vim_regsub()!  It would make the back
/// references invalid!
///
/// Returns the size of the replacement, including terminating NUL.
int vim_regsub(regmatch_T *rmp, char *source, typval_T *expr, char *dest, int destlen, int flags)
{
  regexec_T rex_save;
  bool rex_in_use_save = rex_in_use;

  if (rex_in_use) {
    // Being called recursively, save the state.
    rex_save = rex;
  }
  rex_in_use = true;

  // Set up "rex" for a single-line match (see the table at regexec_T).
  rex.reg_match = rmp;
  rex.reg_mmatch = NULL;
  rex.reg_maxline = 0;
  rex.reg_buf = curbuf;
  rex.reg_line_lbr = true;
  int result = vim_regsub_both(source, expr, dest, destlen, flags);

  // "rex_save" was only filled in when this was a recursive call.
  rex_in_use = rex_in_use_save;
  if (rex_in_use) {
    rex = rex_save;
  }

  return result;
}

/// Like vim_regsub(), but for a multi-line match in the current buffer.
///
/// @param rmp   the multi-line match
/// @param lnum  stored as "rex.reg_firstlnum", the line number that the
///              match positions are relative to
///
/// The other arguments are passed on to vim_regsub_both(), with a NULL
/// "expr".
int vim_regsub_multi(regmmatch_T *rmp, linenr_T lnum, char *source, char *dest, int destlen,
                     int flags)
{
  regexec_T rex_save;
  bool rex_in_use_save = rex_in_use;

  if (rex_in_use) {
    // Being called recursively, save the state.
    rex_save = rex;
  }
  rex_in_use = true;

  rex.reg_match = NULL;
  rex.reg_mmatch = rmp;
  rex.reg_buf = curbuf;  // always works on the current buffer!
  rex.reg_firstlnum = lnum;
  rex.reg_maxline = curbuf->b_ml.ml_line_count - lnum;
  rex.reg_line_lbr = false;
  int result = vim_regsub_both(source, NULL, dest, destlen, flags);

  // "rex_save" was only filled in when this was a recursive call.
  rex_in_use = rex_in_use_save;
  if (rex_in_use) {
    rex = rex_save;
  }

  return result;
}

// When nesting more than a couple levels it's probably a mistake.
#define MAX_REGSUB_NESTING 4
// One saved expression result per nesting level of vim_regsub_both().
static char *eval_result[MAX_REGSUB_NESTING] = { NULL, NULL, NULL, NULL };

#if defined(EXITFREE)
// Free all saved expression results.
void free_resub_eval_result(void)
{
  for (int i = 0; i < MAX_REGSUB_NESTING; i++) {
    XFREE_CLEAR(eval_result[i]);
  }
}
#endif

static int vim_regsub_both(char *source, typval_T *expr, char *dest, int destlen, int flags)
{
  char *src;
  char *dst;
  char *s;
  int c;
  int cc;
  int no = -1;
  fptr_T func_all = (fptr_T)NULL;
  fptr_T func_one = (fptr_T)NULL;
  linenr_T clnum = 0;           // init for GCC
  int len = 0;                  // init for GCC
  static int nesting = 0;
  bool copy = flags & REGSUB_COPY;

  // Be paranoid...
2109 if ((source == NULL && expr == NULL) || dest == NULL) { 2110 emsg(_(e_null)); 2111 return 0; 2112 } 2113 if (prog_magic_wrong()) { 2114 return 0; 2115 } 2116 if (nesting == MAX_REGSUB_NESTING) { 2117 emsg(_(e_substitute_nesting_too_deep)); 2118 return 0; 2119 } 2120 int nested = nesting; 2121 src = source; 2122 dst = dest; 2123 2124 // When the substitute part starts with "\=" evaluate it as an expression. 2125 if (expr != NULL || (source[0] == '\\' && source[1] == '=')) { 2126 // To make sure that the length doesn't change between checking the 2127 // length and copying the string, and to speed up things, the 2128 // resulting string is saved from the call with 2129 // "flags & REGSUB_COPY" == 0 to the call with 2130 // "flags & REGSUB_COPY" != 0. 2131 if (copy) { 2132 if (eval_result[nested] != NULL) { 2133 size_t eval_len = strlen(eval_result[nested]); 2134 if (eval_len < (size_t)destlen) { 2135 STRCPY(dest, eval_result[nested]); 2136 dst += eval_len; 2137 XFREE_CLEAR(eval_result[nested]); 2138 } 2139 } 2140 } else { 2141 const bool prev_can_f_submatch = can_f_submatch; 2142 regsubmatch_T rsm_save; 2143 2144 XFREE_CLEAR(eval_result[nested]); 2145 2146 // The expression may contain substitute(), which calls us 2147 // recursively. Make sure submatch() gets the text from the first 2148 // level. 2149 if (can_f_submatch) { 2150 rsm_save = rsm; 2151 } 2152 can_f_submatch = true; 2153 rsm.sm_match = rex.reg_match; 2154 rsm.sm_mmatch = rex.reg_mmatch; 2155 rsm.sm_firstlnum = rex.reg_firstlnum; 2156 rsm.sm_maxline = rex.reg_maxline; 2157 rsm.sm_line_lbr = rex.reg_line_lbr; 2158 2159 // Although unlikely, it is possible that the expression invokes a 2160 // substitute command (it might fail, but still). Therefore keep 2161 // an array of eval results. 
2162 nesting++; 2163 2164 if (expr != NULL) { 2165 typval_T argv[2]; 2166 typval_T rettv; 2167 staticList10_T matchList = TV_LIST_STATIC10_INIT; 2168 rettv.v_type = VAR_STRING; 2169 rettv.vval.v_string = NULL; 2170 argv[0].v_type = VAR_LIST; 2171 argv[0].vval.v_list = &matchList.sl_list; 2172 funcexe_T funcexe = FUNCEXE_INIT; 2173 funcexe.fe_argv_func = fill_submatch_list; 2174 funcexe.fe_evaluate = true; 2175 if (expr->v_type == VAR_FUNC) { 2176 s = expr->vval.v_string; 2177 call_func(s, -1, &rettv, 1, argv, &funcexe); 2178 } else if (expr->v_type == VAR_PARTIAL) { 2179 partial_T *partial = expr->vval.v_partial; 2180 2181 s = partial_name(partial); 2182 funcexe.fe_partial = partial; 2183 call_func(s, -1, &rettv, 1, argv, &funcexe); 2184 } 2185 if (tv_list_len(&matchList.sl_list) > 0) { 2186 // fill_submatch_list() was called. 2187 clear_submatch_list(&matchList); 2188 } 2189 if (rettv.v_type == VAR_UNKNOWN) { 2190 // something failed, no need to report another error 2191 eval_result[nested] = NULL; 2192 } else { 2193 char buf[NUMBUFLEN]; 2194 eval_result[nested] = (char *)tv_get_string_buf_chk(&rettv, buf); 2195 if (eval_result[nested] != NULL) { 2196 eval_result[nested] = xstrdup(eval_result[nested]); 2197 } 2198 } 2199 tv_clear(&rettv); 2200 } else { 2201 eval_result[nested] = eval_to_string(source + 2, true, false); 2202 } 2203 nesting--; 2204 2205 if (eval_result[nested] != NULL) { 2206 int had_backslash = false; 2207 2208 for (s = eval_result[nested]; *s != NUL; MB_PTR_ADV(s)) { 2209 // Change NL to CR, so that it becomes a line break, 2210 // unless called from vim_regexec_nl(). 2211 // Skip over a backslashed character. 2212 if (*s == NL && !rsm.sm_line_lbr) { 2213 *s = CAR; 2214 } else if (*s == '\\' && s[1] != NUL) { 2215 s++; 2216 // Change NL to CR here too, so that this works: 2217 // :s/abc\\\ndef/\="aaa\\\nbbb"/ on text: 2218 // abc{backslash} 2219 // def 2220 // Not when called from vim_regexec_nl(). 
2221 if (*s == NL && !rsm.sm_line_lbr) { 2222 *s = CAR; 2223 } 2224 had_backslash = true; 2225 } 2226 } 2227 if (had_backslash && (flags & REGSUB_BACKSLASH)) { 2228 // Backslashes will be consumed, need to double them. 2229 s = vim_strsave_escaped(eval_result[nested], "\\"); 2230 xfree(eval_result[nested]); 2231 eval_result[nested] = s; 2232 } 2233 2234 dst += strlen(eval_result[nested]); 2235 } 2236 2237 can_f_submatch = prev_can_f_submatch; 2238 if (can_f_submatch) { 2239 rsm = rsm_save; 2240 } 2241 } 2242 } else { 2243 while ((c = (uint8_t)(*src++)) != NUL) { 2244 if (c == '&' && (flags & REGSUB_MAGIC)) { 2245 no = 0; 2246 } else if (c == '\\' && *src != NUL) { 2247 if (*src == '&' && !(flags & REGSUB_MAGIC)) { 2248 src++; 2249 no = 0; 2250 } else if ('0' <= *src && *src <= '9') { 2251 no = *src++ - '0'; 2252 } else if (vim_strchr("uUlLeE", (uint8_t)(*src))) { 2253 switch (*src++) { 2254 case 'u': 2255 func_one = do_upper; 2256 continue; 2257 case 'U': 2258 func_all = do_upper; 2259 continue; 2260 case 'l': 2261 func_one = do_lower; 2262 continue; 2263 case 'L': 2264 func_all = do_lower; 2265 continue; 2266 case 'e': 2267 case 'E': 2268 func_one = func_all = (fptr_T)NULL; 2269 continue; 2270 } 2271 } 2272 } 2273 if (no < 0) { // Ordinary character. 2274 if (c == K_SPECIAL && src[0] != NUL && src[1] != NUL) { 2275 // Copy a special key as-is. 2276 if (copy) { 2277 if (dst + 3 > dest + destlen) { 2278 iemsg("vim_regsub_both(): not enough space"); 2279 return 0; 2280 } 2281 *dst++ = (char)c; 2282 *dst++ = *src++; 2283 *dst++ = *src++; 2284 } else { 2285 dst += 3; 2286 src += 2; 2287 } 2288 continue; 2289 } 2290 2291 if (c == '\\' && *src != NUL) { 2292 // Check for abbreviations -- webb 2293 switch (*src) { 2294 case 'r': 2295 c = CAR; ++src; break; 2296 case 'n': 2297 c = NL; ++src; break; 2298 case 't': 2299 c = TAB; ++src; break; 2300 // Oh no! 
\e already has meaning in subst pat :-( 2301 // case 'e': c = ESC; ++src; break; 2302 case 'b': 2303 c = Ctrl_H; ++src; break; 2304 2305 // If "backslash" is true the backslash will be removed 2306 // later. Used to insert a literal CR. 2307 default: 2308 if (flags & REGSUB_BACKSLASH) { 2309 if (copy) { 2310 if (dst + 1 > dest + destlen) { 2311 iemsg("vim_regsub_both(): not enough space"); 2312 return 0; 2313 } 2314 *dst = '\\'; 2315 } 2316 dst++; 2317 } 2318 c = (uint8_t)(*src++); 2319 } 2320 } else { 2321 c = utf_ptr2char(src - 1); 2322 } 2323 2324 // Write to buffer, if copy is set. 2325 if (func_one != NULL) { 2326 func_one(&cc, c); 2327 func_one = NULL; 2328 } else if (func_all != NULL) { 2329 func_all(&cc, c); 2330 } else { 2331 // just copy 2332 cc = c; 2333 } 2334 2335 int totlen = utfc_ptr2len(src - 1); 2336 int charlen = utf_char2len(cc); 2337 2338 if (copy) { 2339 if (dst + charlen > dest + destlen) { 2340 iemsg("vim_regsub_both(): not enough space"); 2341 return 0; 2342 } 2343 utf_char2bytes(cc, dst); 2344 } 2345 dst += charlen - 1; 2346 int clen = utf_ptr2len(src - 1); 2347 2348 // If the character length is shorter than "totlen", there 2349 // are composing characters; copy them as-is. 
2350 if (clen < totlen) { 2351 if (copy) { 2352 if (dst + totlen - clen > dest + destlen) { 2353 iemsg("vim_regsub_both(): not enough space"); 2354 return 0; 2355 } 2356 memmove(dst + 1, src - 1 + clen, (size_t)(totlen - clen)); 2357 } 2358 dst += totlen - clen; 2359 } 2360 src += totlen - 1; 2361 dst++; 2362 } else { 2363 if (REG_MULTI) { 2364 clnum = rex.reg_mmatch->startpos[no].lnum; 2365 if (clnum < 0 || rex.reg_mmatch->endpos[no].lnum < 0) { 2366 s = NULL; 2367 } else { 2368 s = reg_getline(clnum) + rex.reg_mmatch->startpos[no].col; 2369 if (rex.reg_mmatch->endpos[no].lnum == clnum) { 2370 len = rex.reg_mmatch->endpos[no].col 2371 - rex.reg_mmatch->startpos[no].col; 2372 } else { 2373 len = reg_getline_len(clnum) - rex.reg_mmatch->startpos[no].col; 2374 } 2375 } 2376 } else { 2377 s = rex.reg_match->startp[no]; 2378 if (rex.reg_match->endp[no] == NULL) { 2379 s = NULL; 2380 } else { 2381 len = (int)(rex.reg_match->endp[no] - s); 2382 } 2383 } 2384 if (s != NULL) { 2385 while (true) { 2386 if (len == 0) { 2387 if (REG_MULTI) { 2388 if (rex.reg_mmatch->endpos[no].lnum == clnum) { 2389 break; 2390 } 2391 if (copy) { 2392 if (dst + 1 > dest + destlen) { 2393 iemsg("vim_regsub_both(): not enough space"); 2394 return 0; 2395 } 2396 *dst = CAR; 2397 } 2398 dst++; 2399 s = reg_getline(++clnum); 2400 if (rex.reg_mmatch->endpos[no].lnum == clnum) { 2401 len = rex.reg_mmatch->endpos[no].col; 2402 } else { 2403 len = reg_getline_len(clnum); 2404 } 2405 } else { 2406 break; 2407 } 2408 } else if (*s == NUL) { // we hit NUL. 2409 if (copy) { 2410 iemsg(_(e_re_damg)); 2411 } 2412 goto exit; 2413 } else { 2414 if ((flags & REGSUB_BACKSLASH) && (*s == CAR || *s == '\\')) { 2415 // Insert a backslash in front of a CR, otherwise 2416 // it will be replaced by a line break. 2417 // Number of backslashes will be halved later, 2418 // double them here. 
2419 if (copy) { 2420 if (dst + 2 > dest + destlen) { 2421 iemsg("vim_regsub_both(): not enough space"); 2422 return 0; 2423 } 2424 dst[0] = '\\'; 2425 dst[1] = *s; 2426 } 2427 dst += 2; 2428 } else { 2429 c = utf_ptr2char(s); 2430 2431 if (func_one != (fptr_T)NULL) { 2432 func_one(&cc, c); 2433 func_one = NULL; 2434 } else if (func_all != (fptr_T)NULL) { 2435 func_all(&cc, c); 2436 } else { // just copy 2437 cc = c; 2438 } 2439 2440 { 2441 int l; 2442 int charlen; 2443 2444 // Copy composing characters separately, one 2445 // at a time. 2446 l = utf_ptr2len(s) - 1; 2447 2448 s += l; 2449 len -= l; 2450 charlen = utf_char2len(cc); 2451 if (copy) { 2452 if (dst + charlen > dest + destlen) { 2453 iemsg("vim_regsub_both(): not enough space"); 2454 return 0; 2455 } 2456 utf_char2bytes(cc, dst); 2457 } 2458 dst += charlen - 1; 2459 } 2460 dst++; 2461 } 2462 2463 s++; 2464 len--; 2465 } 2466 } 2467 } 2468 no = -1; 2469 } 2470 } 2471 } 2472 if (copy) { 2473 *dst = NUL; 2474 } 2475 2476 exit: 2477 return (int)((dst - dest) + 1); 2478 } 2479 2480 static char *reg_getline_submatch(linenr_T lnum) 2481 { 2482 char *line; 2483 reg_getline_common(lnum, RGLF_LINE | RGLF_SUBMATCH, &line, NULL); 2484 return line; 2485 } 2486 2487 static colnr_T reg_getline_submatch_len(linenr_T lnum) 2488 { 2489 colnr_T length; 2490 reg_getline_common(lnum, RGLF_LENGTH | RGLF_SUBMATCH, NULL, &length); 2491 return length; 2492 } 2493 2494 /// Used for the submatch() function: get the string from the n'th submatch in 2495 /// allocated memory. 2496 /// 2497 /// @return NULL when not in a ":s" command and for a non-existing submatch. 2498 char *reg_submatch(int no) 2499 { 2500 char *retval = NULL; 2501 char *s; 2502 int round; 2503 linenr_T lnum; 2504 2505 if (!can_f_submatch || no < 0) { 2506 return NULL; 2507 } 2508 2509 if (rsm.sm_match == NULL) { 2510 ssize_t len; 2511 2512 // First round: compute the length and allocate memory. 2513 // Second round: copy the text. 
2514 for (round = 1; round <= 2; round++) { 2515 lnum = rsm.sm_mmatch->startpos[no].lnum; 2516 if (lnum < 0 || rsm.sm_mmatch->endpos[no].lnum < 0) { 2517 return NULL; 2518 } 2519 2520 s = reg_getline_submatch(lnum); 2521 if (s == NULL) { // anti-crash check, cannot happen? 2522 break; 2523 } 2524 s += rsm.sm_mmatch->startpos[no].col; 2525 if (rsm.sm_mmatch->endpos[no].lnum == lnum) { 2526 // Within one line: take form start to end col. 2527 len = rsm.sm_mmatch->endpos[no].col - rsm.sm_mmatch->startpos[no].col; 2528 if (round == 2) { 2529 xmemcpyz(retval, s, (size_t)len); 2530 } 2531 len++; 2532 } else { 2533 // Multiple lines: take start line from start col, middle 2534 // lines completely and end line up to end col. 2535 len = reg_getline_submatch_len(lnum) - rsm.sm_mmatch->startpos[no].col; 2536 if (round == 2) { 2537 STRCPY(retval, s); 2538 retval[len] = '\n'; 2539 } 2540 len++; 2541 lnum++; 2542 while (lnum < rsm.sm_mmatch->endpos[no].lnum) { 2543 s = reg_getline_submatch(lnum); 2544 if (round == 2) { 2545 STRCPY(retval + len, s); 2546 } 2547 len += reg_getline_submatch_len(lnum); 2548 if (round == 2) { 2549 retval[len] = '\n'; 2550 } 2551 len++; 2552 lnum++; 2553 } 2554 if (round == 2) { 2555 strncpy(retval + len, // NOLINT(runtime/printf) 2556 reg_getline_submatch(lnum), 2557 (size_t)rsm.sm_mmatch->endpos[no].col); 2558 } 2559 len += rsm.sm_mmatch->endpos[no].col; 2560 if (round == 2) { 2561 retval[len] = NUL; 2562 } 2563 len++; 2564 } 2565 2566 if (retval == NULL) { 2567 retval = xmalloc((size_t)len); 2568 } 2569 } 2570 } else { 2571 s = rsm.sm_match->startp[no]; 2572 if (s == NULL || rsm.sm_match->endp[no] == NULL) { 2573 retval = NULL; 2574 } else { 2575 retval = xstrnsave(s, (size_t)(rsm.sm_match->endp[no] - s)); 2576 } 2577 } 2578 2579 return retval; 2580 } 2581 2582 // Used for the submatch() function with the optional non-zero argument: get 2583 // the list of strings from the n'th submatch in allocated memory with NULs 2584 // represented in NLs. 
2585 // Returns a list of allocated strings. Returns NULL when not in a ":s" 2586 // command, for a non-existing submatch and for any error. 2587 list_T *reg_submatch_list(int no) 2588 { 2589 if (!can_f_submatch || no < 0) { 2590 return NULL; 2591 } 2592 2593 linenr_T slnum; 2594 linenr_T elnum; 2595 list_T *list; 2596 const char *s; 2597 2598 if (rsm.sm_match == NULL) { 2599 slnum = rsm.sm_mmatch->startpos[no].lnum; 2600 elnum = rsm.sm_mmatch->endpos[no].lnum; 2601 if (slnum < 0 || elnum < 0) { 2602 return NULL; 2603 } 2604 2605 colnr_T scol = rsm.sm_mmatch->startpos[no].col; 2606 colnr_T ecol = rsm.sm_mmatch->endpos[no].col; 2607 2608 list = tv_list_alloc(elnum - slnum + 1); 2609 2610 s = reg_getline_submatch(slnum) + scol; 2611 if (slnum == elnum) { 2612 tv_list_append_string(list, s, ecol - scol); 2613 } else { 2614 int max_lnum = elnum - slnum; 2615 tv_list_append_string(list, s, -1); 2616 for (int i = 1; i < max_lnum; i++) { 2617 s = reg_getline_submatch(slnum + i); 2618 tv_list_append_string(list, s, -1); 2619 } 2620 s = reg_getline_submatch(elnum); 2621 tv_list_append_string(list, s, ecol); 2622 } 2623 } else { 2624 s = rsm.sm_match->startp[no]; 2625 if (s == NULL || rsm.sm_match->endp[no] == NULL) { 2626 return NULL; 2627 } 2628 list = tv_list_alloc(1); 2629 tv_list_append_string(list, s, rsm.sm_match->endp[no] - s); 2630 } 2631 2632 tv_list_ref(list); 2633 return list; 2634 } 2635 2636 /// Initialize the values used for matching against multiple lines 2637 /// 2638 /// @param win window in which to search or NULL 2639 /// @param buf buffer in which to search 2640 /// @param lnum nr of line to start looking for match 2641 static void init_regexec_multi(regmmatch_T *rmp, win_T *win, buf_T *buf, linenr_T lnum) 2642 { 2643 rex.reg_match = NULL; 2644 rex.reg_mmatch = rmp; 2645 rex.reg_buf = buf; 2646 rex.reg_win = win; 2647 rex.reg_firstlnum = lnum; 2648 rex.reg_maxline = rex.reg_buf->b_ml.ml_line_count - lnum; 2649 rex.reg_line_lbr = false; 2650 rex.reg_ic = 
rmp->rmm_ic; 2651 rex.reg_icombine = false; 2652 rex.reg_nobreak = rmp->regprog->re_flags & RE_NOBREAK; 2653 rex.reg_maxcol = rmp->rmm_maxcol; 2654 } 2655 2656 // regexp_bt.c {{{1 2657 2658 // Backtracking regular expression implementation. 2659 // 2660 // NOTICE: 2661 // 2662 // This is NOT the original regular expression code as written by Henry 2663 // Spencer. This code has been modified specifically for use with the VIM 2664 // editor, and should not be used separately from Vim. If you want a good 2665 // regular expression library, get the original code. The copyright notice 2666 // that follows is from the original. 2667 // 2668 // END NOTICE 2669 // 2670 // Copyright (c) 1986 by University of Toronto. 2671 // Written by Henry Spencer. Not derived from licensed software. 2672 // 2673 // Permission is granted to anyone to use this software for any 2674 // purpose on any computer system, and to redistribute it freely, 2675 // subject to the following restrictions: 2676 // 2677 // 1. The author is not responsible for the consequences of use of 2678 // this software, no matter how awful, even if they arise 2679 // from defects in it. 2680 // 2681 // 2. The origin of this software must not be misrepresented, either 2682 // by explicit claim or by omission. 2683 // 2684 // 3. Altered versions must be plainly marked as such, and must not 2685 // be misrepresented as being the original software. 2686 // 2687 // Beware that some of this code is subtly aware of the way operator 2688 // precedence is structured in regular expressions. Serious changes in 2689 // regular-expression syntax might require a total rethink. 2690 // 2691 // Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert 2692 // Webb, Ciaran McCreesh and Bram Moolenaar. 
// Named character class support added by Walter Briscoe (1998 Jul 01)

// The "internal use only" fields in regexp_defs.h are present to pass info from
// compile to execute that permits the execute phase to run lots faster on
// simple cases.  They are:
//
// regstart     char that must begin a match; NUL if none obvious; Can be a
//              multi-byte character.
// reganch      is the match anchored (at beginning-of-line only)?
// regmust      string (pointer into program) that match must include, or NULL
// regmlen      length of regmust string
// regflags     RF_ values or'ed together
//
// Regstart and reganch permit very fast decisions on suitable starting points
// for a match, cutting down the work a lot.  Regmust permits fast rejection
// of lines that cannot possibly match.  The regmust tests are costly enough
// that vim_regcomp() supplies a regmust only if the r.e. contains something
// potentially expensive (at present, the only such thing detected is * or +
// at the start of the r.e., which can involve a lot of backup).  Regmlen is
// supplied because the test in vim_regexec() needs it and vim_regcomp() is
// computing it anyway.

// Structure for regexp "program".  This is essentially a linear encoding
// of a nondeterministic finite-state machine (aka syntax charts or
// "railroad normal form" in parsing technology).  Each node is an opcode
// plus a "next" pointer, possibly plus an operand.  "Next" pointers of
// all nodes except BRANCH and BRACE_COMPLEX implement concatenation; a "next"
// pointer with a BRANCH on both ends of it is connecting two alternatives.
// (Here we have one of the subtle syntax dependencies: an individual BRANCH
// (as opposed to a collection of them) is never concatenated with anything
// because of operator precedence).
The "next" pointer of a BRACES_COMPLEX 2724 // node points to the node after the stuff to be repeated. 2725 // The operand of some types of node is a literal string; for others, it is a 2726 // node leading into a sub-FSM. In particular, the operand of a BRANCH node 2727 // is the first node of the branch. 2728 // (NB this is *not* a tree structure: the tail of the branch connects to the 2729 // thing following the set of BRANCHes.) 2730 // 2731 // pattern is coded like: 2732 // 2733 // +-----------------+ 2734 // | V 2735 // <aa>\|<bb> BRANCH <aa> BRANCH <bb> --> END 2736 // | ^ | ^ 2737 // +------+ +----------+ 2738 // 2739 // 2740 // +------------------+ 2741 // V | 2742 // <aa>* BRANCH BRANCH <aa> --> BACK BRANCH --> NOTHING --> END 2743 // | | ^ ^ 2744 // | +---------------+ | 2745 // +---------------------------------------------+ 2746 // 2747 // 2748 // +----------------------+ 2749 // V | 2750 // <aa>\+ BRANCH <aa> --> BRANCH --> BACK BRANCH --> NOTHING --> END 2751 // | | ^ ^ 2752 // | +-----------+ | 2753 // +--------------------------------------------------+ 2754 // 2755 // 2756 // +-------------------------+ 2757 // V | 2758 // <aa>\{} BRANCH BRACE_LIMITS --> BRACE_COMPLEX <aa> --> BACK END 2759 // | | ^ 2760 // | +----------------+ 2761 // +-----------------------------------------------+ 2762 // 2763 // 2764 // <aa>\@!<bb> BRANCH NOMATCH <aa> --> END <bb> --> END 2765 // | | ^ ^ 2766 // | +----------------+ | 2767 // +--------------------------------+ 2768 // 2769 // +---------+ 2770 // | V 2771 // \z[abc] BRANCH BRANCH a BRANCH b BRANCH c BRANCH NOTHING --> END 2772 // | | | | ^ ^ 2773 // | | | +-----+ | 2774 // | | +----------------+ | 2775 // | +---------------------------+ | 2776 // +------------------------------------------------------+ 2777 // 2778 // They all start with a BRANCH for "\|" alternatives, even when there is only 2779 // one alternative. 2780 2781 // The opcodes are: 2782 2783 // definition number opnd? 
// Columns: definition, number, opnd? (kind of operand, if any), meaning
#define END             0       //      End of program or NOMATCH operand.
#define BOL             1       //      Match "" at beginning of line.
#define EOL             2       //      Match "" at end of line.
#define BRANCH          3       // node Match this alternative, or the
                                //      next...
#define BACK            4       //      Match "", "next" ptr points backward.
#define EXACTLY         5       // str  Match this string.
#define NOTHING         6       //      Match empty string.
#define STAR            7       // node Match this (simple) thing 0 or more
                                //      times.
#define PLUS            8       // node Match this (simple) thing 1 or more
                                //      times.
#define MATCH           9       // node match the operand zero-width
#define NOMATCH         10      // node check for no match with operand
#define BEHIND          11      // node look behind for a match with operand
#define NOBEHIND        12      // node look behind for no match with operand
#define SUBPAT          13      // node match the operand here
#define BRACE_SIMPLE    14      // node Match this (simple) thing between m and
                                //      n times (\{m,n\}).
#define BOW             15      //      Match "" after [^a-zA-Z0-9_]
#define EOW             16      //      Match "" at    [^a-zA-Z0-9_]
#define BRACE_LIMITS    17      // nr nr  define the min & max for BRACE_SIMPLE
                                //      and BRACE_COMPLEX.
#define NEWL            18      //      Match line-break
#define BHPOS           19      //      End position for BEHIND or NOBEHIND

// character classes: 20-48 normal, 50-78 include a line-break
#define ADD_NL          30
// Parenthesized so that the expansion stays one operand in any expression.
#define FIRST_NL        (ANY + ADD_NL)
#define ANY             20      //      Match any one character.
#define ANYOF           21      // str  Match any character in this string.
#define ANYBUT          22      // str  Match any character not in this
                                //      string.
#define IDENT           23      //      Match identifier char
#define SIDENT          24      //      Match identifier char but no digit
#define KWORD           25      //      Match keyword char
#define SKWORD          26      //      Match word char but no digit
#define FNAME           27      //      Match file name char
#define SFNAME          28      //      Match file name char but no digit
#define PRINT           29      //      Match printable char
#define SPRINT          30      //      Match printable char but no digit
#define WHITE           31      //      Match whitespace char
#define NWHITE          32      //      Match non-whitespace char
#define DIGIT           33      //      Match digit char
#define NDIGIT          34      //      Match non-digit char
#define HEX             35      //      Match hex char
#define NHEX            36      //      Match non-hex char
#define OCTAL           37      //      Match octal char
#define NOCTAL          38      //      Match non-octal char
#define WORD            39      //      Match word char
#define NWORD           40      //      Match non-word char
#define HEAD            41      //      Match head char
#define NHEAD           42      //      Match non-head char
#define ALPHA           43      //      Match alpha char
#define NALPHA          44      //      Match non-alpha char
#define LOWER           45      //      Match lowercase char
#define NLOWER          46      //      Match non-lowercase char
#define UPPER           47      //      Match uppercase char
#define NUPPER          48      //      Match non-uppercase char
#define LAST_NL         (NUPPER + ADD_NL)
// True when opcode "op" is one of the classes that also match a line-break.
#define WITH_NL(op)     ((op) >= FIRST_NL && (op) <= LAST_NL)

#define MOPEN           80   // -89 Mark this point in input as start of
                             //     \( … \) subexpr.  MOPEN + 0 marks start of
                             //     match.
#define MCLOSE          90   // -99 Analogous to MOPEN.  MCLOSE + 0 marks
                             //     end of match.
#define BACKREF         100  // -109 node Match same string again \1-\9.

#define ZOPEN           110  // -119 Mark this point in input as start of
                             //      \z( … \) subexpr.
#define ZCLOSE          120  // -129 Analogous to ZOPEN.
2856 #define ZREF 130 // -139 node Match external submatch \z1-\z9 2857 2858 #define BRACE_COMPLEX 140 // -149 node Match nodes between m & n times 2859 2860 #define NOPEN 150 // Mark this point in input as start of 2861 // \%( subexpr. 2862 #define NCLOSE 151 // Analogous to NOPEN. 2863 2864 #define MULTIBYTECODE 200 // mbc Match one multi-byte character 2865 #define RE_BOF 201 // Match "" at beginning of file. 2866 #define RE_EOF 202 // Match "" at end of file. 2867 #define CURSOR 203 // Match location of cursor. 2868 2869 #define RE_LNUM 204 // nr cmp Match line number 2870 #define RE_COL 205 // nr cmp Match column number 2871 #define RE_VCOL 206 // nr cmp Match virtual column number 2872 2873 #define RE_MARK 207 // mark cmp Match mark position 2874 #define RE_VISUAL 208 // Match Visual area 2875 #define RE_COMPOSING 209 // any composing characters 2876 2877 // Flags to be passed up and down. 2878 #define HASWIDTH 0x1 // Known never to match null string. 2879 #define SIMPLE 0x2 // Simple enough to be STAR/PLUS operand. 2880 #define SPSTART 0x4 // Starts with * or +. 2881 #define HASNL 0x8 // Contains some \n. 2882 #define HASLOOKBH 0x10 // Contains "\@<=" or "\@<!". 2883 #define WORST 0 // Worst case. 2884 2885 static int prevchr_len; ///< byte length of previous char 2886 static int num_complex_braces; ///< Complex \{...} count 2887 static uint8_t *regcode; ///< Code-emit pointer, or JUST_CALC_SIZE 2888 static int64_t regsize; ///< Code size. 2889 static int reg_toolong; ///< true when offset out of range 2890 static uint8_t had_endbrace[NSUBEXP]; ///< flags, true if end of () found 2891 static int64_t brace_min[10]; ///< Minimums for complex brace repeats 2892 static int64_t brace_max[10]; ///< Maximums for complex brace repeats 2893 static int brace_count[10]; ///< Current counts for complex brace repeats 2894 static int one_exactly = false; ///< only do one char for EXACTLY 2895 2896 // When making changes to classchars also change nfa_classcodes. 
2897 static uint8_t *classchars = (uint8_t *)".iIkKfFpPsSdDxXoOwWhHaAlLuU"; 2898 static int classcodes[] = { 2899 ANY, IDENT, SIDENT, KWORD, SKWORD, 2900 FNAME, SFNAME, PRINT, SPRINT, 2901 WHITE, NWHITE, DIGIT, NDIGIT, 2902 HEX, NHEX, OCTAL, NOCTAL, 2903 WORD, NWORD, HEAD, NHEAD, 2904 ALPHA, NALPHA, LOWER, NLOWER, 2905 UPPER, NUPPER 2906 }; 2907 2908 // When regcode is set to this value, code is not emitted and size is computed 2909 // instead. 2910 #define JUST_CALC_SIZE ((uint8_t *)-1) 2911 2912 // used for STAR, PLUS and BRACE_SIMPLE matching 2913 typedef struct regstar_S { 2914 int nextb; // next byte 2915 int nextb_ic; // next byte reverse case 2916 int64_t count; 2917 int64_t minval; 2918 int64_t maxval; 2919 } regstar_T; 2920 2921 // used to store input position when a BACK was encountered, so that we now if 2922 // we made any progress since the last time. 2923 typedef struct backpos_S { 2924 uint8_t *bp_scan; // "scan" where BACK was encountered 2925 regsave_T bp_pos; // last input position 2926 } backpos_T; 2927 2928 // "regstack" and "backpos" are used by regmatch(). They are kept over calls 2929 // to avoid invoking malloc() and free() often. 2930 // "regstack" is a stack with regitem_T items, sometimes preceded by regstar_T 2931 // or regbehind_T. 2932 // "backpos_T" is a table with backpos_T for BACK 2933 static garray_T regstack = GA_EMPTY_INIT_VALUE; 2934 static garray_T backpos = GA_EMPTY_INIT_VALUE; 2935 2936 static regsave_T behind_pos; 2937 2938 // Both for regstack and backpos tables we use the following strategy of 2939 // allocation (to reduce malloc/free calls): 2940 // - Initial size is fairly small. 2941 // - When needed, the tables are grown bigger (8 times at first, double after 2942 // that). 2943 // - After executing the match we free the memory only if the array has grown. 2944 // Thus the memory is kept allocated when it's at the initial size. 2945 // This makes it fast while not keeping a lot of memory allocated. 
// A three times speed increase was observed when using many simple patterns.
#define REGSTACK_INITIAL        2048
#define BACKPOS_INITIAL         64

// Opcode notes:
//
// BRANCH       The set of branches constituting a single choice are hooked
//              together with their "next" pointers, since precedence prevents
//              anything being concatenated to any individual branch.  The
//              "next" pointer of the last BRANCH in a choice points to the
//              thing following the whole choice.  This is also where the
//              final "next" pointer of each individual branch points; each
//              branch starts with the operand node of a BRANCH node.
//
// BACK         Normal "next" pointers all implicitly point forward; BACK
//              exists to make loop structures possible.
//
// STAR,PLUS    '=', and complex '*' and '+', are implemented as circular
//              BRANCH structures using BACK.  Simple cases (one character
//              per match) are implemented with STAR and PLUS for speed
//              and to minimize recursive plunges.
//
// BRACE_LIMITS This is always followed by a BRACE_SIMPLE or BRACE_COMPLEX
//              node, and defines the min and max limits to be used for that
//              node.
//
// MOPEN,MCLOSE ...are numbered at compile time.
// ZOPEN,ZCLOSE ...ditto
//
//
//
// A node is one char of opcode followed by two chars of "next" pointer.
// "Next" pointers are stored as two 8-bit bytes, high order first.  The
// value is a positive offset from the opcode of the node containing it.
// An operand, if any, simply follows the node.  (Note that much of the
// code generation knows about this implicit relationship.)
//
// Using two bytes for the "next" pointer is vast overkill for most things,
// but allows patterns to get big without disasters.
2985 #define OP(p) ((int)(*(p))) 2986 #define NEXT(p) (((*((p) + 1) & 0377) << 8) + (*((p) + 2) & 0377)) 2987 #define OPERAND(p) ((p) + 3) 2988 // Obtain an operand that was stored as four bytes, MSB first. 2989 #define OPERAND_MIN(p) (((int64_t)(p)[3] << 24) + ((int64_t)(p)[4] << 16) \ 2990 + ((int64_t)(p)[5] << 8) + (int64_t)(p)[6]) 2991 // Obtain a second operand stored as four bytes. 2992 #define OPERAND_MAX(p) OPERAND_MIN((p) + 4) 2993 // Obtain a second single-byte operand stored after a four bytes operand. 2994 #define OPERAND_CMP(p) (p)[7] 2995 2996 static uint8_t *reg(int paren, int *flagp); 2997 2998 #ifdef BT_REGEXP_DUMP 2999 static void regdump(uint8_t *, bt_regprog_T *); 3000 #endif 3001 3002 #ifdef REGEXP_DEBUG 3003 static uint8_t *regprop(uint8_t *); 3004 3005 static int regnarrate = 0; 3006 #endif 3007 3008 // Setup to parse the regexp. Used once to get the length and once to do it. 3009 static void regcomp_start(uint8_t *expr, int re_flags) // see vim_regcomp() 3010 { 3011 initchr((char *)expr); 3012 if (re_flags & RE_MAGIC) { 3013 reg_magic = MAGIC_ON; 3014 } else { 3015 reg_magic = MAGIC_OFF; 3016 } 3017 reg_string = (re_flags & RE_STRING); 3018 reg_strict = (re_flags & RE_STRICT); 3019 get_cpo_flags(); 3020 3021 num_complex_braces = 0; 3022 regnpar = 1; 3023 CLEAR_FIELD(had_endbrace); 3024 regnzpar = 1; 3025 re_has_z = 0; 3026 regsize = 0L; 3027 reg_toolong = false; 3028 regflags = 0; 3029 had_eol = false; 3030 } 3031 3032 // Return true if MULTIBYTECODE should be used instead of EXACTLY for 3033 // character "c". 
3034 static bool use_multibytecode(int c) 3035 { 3036 return utf_char2len(c) > 1 3037 && (re_multi_type(peekchr()) != NOT_MULTI 3038 || utf_iscomposing_legacy(c)); 3039 } 3040 3041 // Emit (if appropriate) a byte of code 3042 static void regc(int b) 3043 { 3044 if (regcode == JUST_CALC_SIZE) { 3045 regsize++; 3046 } else { 3047 *regcode++ = (uint8_t)b; 3048 } 3049 } 3050 3051 // Emit (if appropriate) a multi-byte character of code 3052 static void regmbc(int c) 3053 { 3054 if (regcode == JUST_CALC_SIZE) { 3055 regsize += utf_char2len(c); 3056 } else { 3057 regcode += utf_char2bytes(c, (char *)regcode); 3058 } 3059 } 3060 3061 // Produce the bytes for equivalence class "c". 3062 // Currently only handles latin1, latin9 and utf-8. 3063 // NOTE: When changing this function, also change nfa_emit_equi_class() 3064 static void reg_equi_class(int c) 3065 { 3066 { 3067 switch (c) { 3068 // Do not use '\300' style, it results in a negative number. 3069 case 'A': 3070 case 0xc0: 3071 case 0xc1: 3072 case 0xc2: 3073 case 0xc3: 3074 case 0xc4: 3075 case 0xc5: 3076 case 0x100: 3077 case 0x102: 3078 case 0x104: 3079 case 0x1cd: 3080 case 0x1de: 3081 case 0x1e0: 3082 case 0x1fa: 3083 case 0x202: 3084 case 0x226: 3085 case 0x23a: 3086 case 0x1e00: 3087 case 0x1ea0: 3088 case 0x1ea2: 3089 case 0x1ea4: 3090 case 0x1ea6: 3091 case 0x1ea8: 3092 case 0x1eaa: 3093 case 0x1eac: 3094 case 0x1eae: 3095 case 0x1eb0: 3096 case 0x1eb2: 3097 case 0x1eb4: 3098 case 0x1eb6: 3099 regmbc('A'); regmbc(0xc0); regmbc(0xc1); regmbc(0xc2); 3100 regmbc(0xc3); regmbc(0xc4); regmbc(0xc5); 3101 regmbc(0x100); regmbc(0x102); regmbc(0x104); 3102 regmbc(0x1cd); regmbc(0x1de); regmbc(0x1e0); 3103 regmbc(0x1fa); regmbc(0x202); regmbc(0x226); 3104 regmbc(0x23a); regmbc(0x1e00); regmbc(0x1ea0); 3105 regmbc(0x1ea2); regmbc(0x1ea4); regmbc(0x1ea6); 3106 regmbc(0x1ea8); regmbc(0x1eaa); regmbc(0x1eac); 3107 regmbc(0x1eae); regmbc(0x1eb0); regmbc(0x1eb2); 3108 regmbc(0x1eb4); regmbc(0x1eb6); 3109 return; 3110 case 
'B': 3111 case 0x181: 3112 case 0x243: 3113 case 0x1e02: 3114 case 0x1e04: 3115 case 0x1e06: 3116 regmbc('B'); 3117 regmbc(0x181); regmbc(0x243); regmbc(0x1e02); 3118 regmbc(0x1e04); regmbc(0x1e06); 3119 return; 3120 case 'C': 3121 case 0xc7: 3122 case 0x106: 3123 case 0x108: 3124 case 0x10a: 3125 case 0x10c: 3126 case 0x187: 3127 case 0x23b: 3128 case 0x1e08: 3129 case 0xa792: 3130 regmbc('C'); regmbc(0xc7); 3131 regmbc(0x106); regmbc(0x108); regmbc(0x10a); 3132 regmbc(0x10c); regmbc(0x187); regmbc(0x23b); 3133 regmbc(0x1e08); regmbc(0xa792); 3134 return; 3135 case 'D': 3136 case 0x10e: 3137 case 0x110: 3138 case 0x18a: 3139 case 0x1e0a: 3140 case 0x1e0c: 3141 case 0x1e0e: 3142 case 0x1e10: 3143 case 0x1e12: 3144 regmbc('D'); regmbc(0x10e); regmbc(0x110); 3145 regmbc(0x18a); regmbc(0x1e0a); regmbc(0x1e0c); 3146 regmbc(0x1e0e); regmbc(0x1e10); regmbc(0x1e12); 3147 return; 3148 case 'E': 3149 case 0xc8: 3150 case 0xc9: 3151 case 0xca: 3152 case 0xcb: 3153 case 0x112: 3154 case 0x114: 3155 case 0x116: 3156 case 0x118: 3157 case 0x11a: 3158 case 0x204: 3159 case 0x206: 3160 case 0x228: 3161 case 0x246: 3162 case 0x1e14: 3163 case 0x1e16: 3164 case 0x1e18: 3165 case 0x1e1a: 3166 case 0x1e1c: 3167 case 0x1eb8: 3168 case 0x1eba: 3169 case 0x1ebc: 3170 case 0x1ebe: 3171 case 0x1ec0: 3172 case 0x1ec2: 3173 case 0x1ec4: 3174 case 0x1ec6: 3175 regmbc('E'); regmbc(0xc8); regmbc(0xc9); 3176 regmbc(0xca); regmbc(0xcb); regmbc(0x112); 3177 regmbc(0x114); regmbc(0x116); regmbc(0x118); 3178 regmbc(0x11a); regmbc(0x204); regmbc(0x206); 3179 regmbc(0x228); regmbc(0x246); regmbc(0x1e14); 3180 regmbc(0x1e16); regmbc(0x1e18); regmbc(0x1e1a); 3181 regmbc(0x1e1c); regmbc(0x1eb8); regmbc(0x1eba); 3182 regmbc(0x1ebc); regmbc(0x1ebe); regmbc(0x1ec0); 3183 regmbc(0x1ec2); regmbc(0x1ec4); regmbc(0x1ec6); 3184 return; 3185 case 'F': 3186 case 0x191: 3187 case 0x1e1e: 3188 case 0xa798: 3189 regmbc('F'); regmbc(0x191); regmbc(0x1e1e); 3190 regmbc(0xa798); 3191 return; 3192 case 'G': 3193 case 
0x11c: 3194 case 0x11e: 3195 case 0x120: 3196 case 0x122: 3197 case 0x193: 3198 case 0x1e4: 3199 case 0x1e6: 3200 case 0x1f4: 3201 case 0x1e20: 3202 case 0xa7a0: 3203 regmbc('G'); regmbc(0x11c); regmbc(0x11e); 3204 regmbc(0x120); regmbc(0x122); regmbc(0x193); 3205 regmbc(0x1e4); regmbc(0x1e6); regmbc(0x1f4); 3206 regmbc(0x1e20); regmbc(0xa7a0); 3207 return; 3208 case 'H': 3209 case 0x124: 3210 case 0x126: 3211 case 0x21e: 3212 case 0x1e22: 3213 case 0x1e24: 3214 case 0x1e26: 3215 case 0x1e28: 3216 case 0x1e2a: 3217 case 0x2c67: 3218 regmbc('H'); regmbc(0x124); regmbc(0x126); 3219 regmbc(0x21e); regmbc(0x1e22); regmbc(0x1e24); 3220 regmbc(0x1e26); regmbc(0x1e28); regmbc(0x1e2a); 3221 regmbc(0x2c67); 3222 return; 3223 case 'I': 3224 case 0xcc: 3225 case 0xcd: 3226 case 0xce: 3227 case 0xcf: 3228 case 0x128: 3229 case 0x12a: 3230 case 0x12c: 3231 case 0x12e: 3232 case 0x130: 3233 case 0x197: 3234 case 0x1cf: 3235 case 0x208: 3236 case 0x20a: 3237 case 0x1e2c: 3238 case 0x1e2e: 3239 case 0x1ec8: 3240 case 0x1eca: 3241 regmbc('I'); regmbc(0xcc); regmbc(0xcd); 3242 regmbc(0xce); regmbc(0xcf); regmbc(0x128); 3243 regmbc(0x12a); regmbc(0x12c); regmbc(0x12e); 3244 regmbc(0x130); regmbc(0x197); regmbc(0x1cf); 3245 regmbc(0x208); regmbc(0x20a); regmbc(0x1e2c); 3246 regmbc(0x1e2e); regmbc(0x1ec8); regmbc(0x1eca); 3247 return; 3248 case 'J': 3249 case 0x134: 3250 case 0x248: 3251 regmbc('J'); regmbc(0x134); regmbc(0x248); 3252 return; 3253 case 'K': 3254 case 0x136: 3255 case 0x198: 3256 case 0x1e8: 3257 case 0x1e30: 3258 case 0x1e32: 3259 case 0x1e34: 3260 case 0x2c69: 3261 case 0xa740: 3262 regmbc('K'); regmbc(0x136); regmbc(0x198); 3263 regmbc(0x1e8); regmbc(0x1e30); regmbc(0x1e32); 3264 regmbc(0x1e34); regmbc(0x2c69); regmbc(0xa740); 3265 return; 3266 case 'L': 3267 case 0x139: 3268 case 0x13b: 3269 case 0x13d: 3270 case 0x13f: 3271 case 0x141: 3272 case 0x23d: 3273 case 0x1e36: 3274 case 0x1e38: 3275 case 0x1e3a: 3276 case 0x1e3c: 3277 case 0x2c60: 3278 regmbc('L'); 
regmbc(0x139); regmbc(0x13b); 3279 regmbc(0x13d); regmbc(0x13f); regmbc(0x141); 3280 regmbc(0x23d); regmbc(0x1e36); regmbc(0x1e38); 3281 regmbc(0x1e3a); regmbc(0x1e3c); regmbc(0x2c60); 3282 return; 3283 case 'M': 3284 case 0x1e3e: 3285 case 0x1e40: 3286 case 0x1e42: 3287 regmbc('M'); regmbc(0x1e3e); regmbc(0x1e40); 3288 regmbc(0x1e42); 3289 return; 3290 case 'N': 3291 case 0xd1: 3292 case 0x143: 3293 case 0x145: 3294 case 0x147: 3295 case 0x1f8: 3296 case 0x1e44: 3297 case 0x1e46: 3298 case 0x1e48: 3299 case 0x1e4a: 3300 case 0xa7a4: 3301 regmbc('N'); regmbc(0xd1); 3302 regmbc(0x143); regmbc(0x145); regmbc(0x147); 3303 regmbc(0x1f8); regmbc(0x1e44); regmbc(0x1e46); 3304 regmbc(0x1e48); regmbc(0x1e4a); regmbc(0xa7a4); 3305 return; 3306 case 'O': 3307 case 0xd2: 3308 case 0xd3: 3309 case 0xd4: 3310 case 0xd5: 3311 case 0xd6: 3312 case 0xd8: 3313 case 0x14c: 3314 case 0x14e: 3315 case 0x150: 3316 case 0x19f: 3317 case 0x1a0: 3318 case 0x1d1: 3319 case 0x1ea: 3320 case 0x1ec: 3321 case 0x1fe: 3322 case 0x20c: 3323 case 0x20e: 3324 case 0x22a: 3325 case 0x22c: 3326 case 0x22e: 3327 case 0x230: 3328 case 0x1e4c: 3329 case 0x1e4e: 3330 case 0x1e50: 3331 case 0x1e52: 3332 case 0x1ecc: 3333 case 0x1ece: 3334 case 0x1ed0: 3335 case 0x1ed2: 3336 case 0x1ed4: 3337 case 0x1ed6: 3338 case 0x1ed8: 3339 case 0x1eda: 3340 case 0x1edc: 3341 case 0x1ede: 3342 case 0x1ee0: 3343 case 0x1ee2: 3344 regmbc('O'); regmbc(0xd2); regmbc(0xd3); regmbc(0xd4); 3345 regmbc(0xd5); regmbc(0xd6); regmbc(0xd8); 3346 regmbc(0x14c); regmbc(0x14e); regmbc(0x150); 3347 regmbc(0x19f); regmbc(0x1a0); regmbc(0x1d1); 3348 regmbc(0x1ea); regmbc(0x1ec); regmbc(0x1fe); 3349 regmbc(0x20c); regmbc(0x20e); regmbc(0x22a); 3350 regmbc(0x22c); regmbc(0x22e); regmbc(0x230); 3351 regmbc(0x1e4c); regmbc(0x1e4e); regmbc(0x1e50); 3352 regmbc(0x1e52); regmbc(0x1ecc); regmbc(0x1ece); 3353 regmbc(0x1ed0); regmbc(0x1ed2); regmbc(0x1ed4); 3354 regmbc(0x1ed6); regmbc(0x1ed8); regmbc(0x1eda); 3355 regmbc(0x1edc); regmbc(0x1ede); 
regmbc(0x1ee0); 3356 regmbc(0x1ee2); 3357 return; 3358 case 'P': 3359 case 0x1a4: 3360 case 0x1e54: 3361 case 0x1e56: 3362 case 0x2c63: 3363 regmbc('P'); regmbc(0x1a4); regmbc(0x1e54); 3364 regmbc(0x1e56); regmbc(0x2c63); 3365 return; 3366 case 'Q': 3367 case 0x24a: 3368 regmbc('Q'); regmbc(0x24a); 3369 return; 3370 case 'R': 3371 case 0x154: 3372 case 0x156: 3373 case 0x158: 3374 case 0x210: 3375 case 0x212: 3376 case 0x24c: 3377 case 0x1e58: 3378 case 0x1e5a: 3379 case 0x1e5c: 3380 case 0x1e5e: 3381 case 0x2c64: 3382 case 0xa7a6: 3383 regmbc('R'); regmbc(0x154); regmbc(0x156); 3384 regmbc(0x210); regmbc(0x212); regmbc(0x158); 3385 regmbc(0x24c); regmbc(0x1e58); regmbc(0x1e5a); 3386 regmbc(0x1e5c); regmbc(0x1e5e); regmbc(0x2c64); 3387 regmbc(0xa7a6); 3388 return; 3389 case 'S': 3390 case 0x15a: 3391 case 0x15c: 3392 case 0x15e: 3393 case 0x160: 3394 case 0x218: 3395 case 0x1e60: 3396 case 0x1e62: 3397 case 0x1e64: 3398 case 0x1e66: 3399 case 0x1e68: 3400 case 0x2c7e: 3401 case 0xa7a8: 3402 regmbc('S'); regmbc(0x15a); regmbc(0x15c); 3403 regmbc(0x15e); regmbc(0x160); regmbc(0x218); 3404 regmbc(0x1e60); regmbc(0x1e62); regmbc(0x1e64); 3405 regmbc(0x1e66); regmbc(0x1e68); regmbc(0x2c7e); 3406 regmbc(0xa7a8); 3407 return; 3408 case 'T': 3409 case 0x162: 3410 case 0x164: 3411 case 0x166: 3412 case 0x1ac: 3413 case 0x1ae: 3414 case 0x21a: 3415 case 0x23e: 3416 case 0x1e6a: 3417 case 0x1e6c: 3418 case 0x1e6e: 3419 case 0x1e70: 3420 regmbc('T'); regmbc(0x162); regmbc(0x164); 3421 regmbc(0x166); regmbc(0x1ac); regmbc(0x23e); 3422 regmbc(0x1ae); regmbc(0x21a); regmbc(0x1e6a); 3423 regmbc(0x1e6c); regmbc(0x1e6e); regmbc(0x1e70); 3424 return; 3425 case 'U': 3426 case 0xd9: 3427 case 0xda: 3428 case 0xdb: 3429 case 0xdc: 3430 case 0x168: 3431 case 0x16a: 3432 case 0x16c: 3433 case 0x16e: 3434 case 0x170: 3435 case 0x172: 3436 case 0x1af: 3437 case 0x1d3: 3438 case 0x1d5: 3439 case 0x1d7: 3440 case 0x1d9: 3441 case 0x1db: 3442 case 0x214: 3443 case 0x216: 3444 case 0x244: 3445 
case 0x1e72: 3446 case 0x1e74: 3447 case 0x1e76: 3448 case 0x1e78: 3449 case 0x1e7a: 3450 case 0x1ee4: 3451 case 0x1ee6: 3452 case 0x1ee8: 3453 case 0x1eea: 3454 case 0x1eec: 3455 case 0x1eee: 3456 case 0x1ef0: 3457 regmbc('U'); regmbc(0xd9); regmbc(0xda); 3458 regmbc(0xdb); regmbc(0xdc); regmbc(0x168); 3459 regmbc(0x16a); regmbc(0x16c); regmbc(0x16e); 3460 regmbc(0x170); regmbc(0x172); regmbc(0x1af); 3461 regmbc(0x1d3); regmbc(0x1d5); regmbc(0x1d7); 3462 regmbc(0x1d9); regmbc(0x1db); regmbc(0x214); 3463 regmbc(0x216); regmbc(0x244); regmbc(0x1e72); 3464 regmbc(0x1e74); regmbc(0x1e76); regmbc(0x1e78); 3465 regmbc(0x1e7a); regmbc(0x1ee4); regmbc(0x1ee6); 3466 regmbc(0x1ee8); regmbc(0x1eea); regmbc(0x1eec); 3467 regmbc(0x1eee); regmbc(0x1ef0); 3468 return; 3469 case 'V': 3470 case 0x1b2: 3471 case 0x1e7c: 3472 case 0x1e7e: 3473 regmbc('V'); regmbc(0x1b2); regmbc(0x1e7c); 3474 regmbc(0x1e7e); 3475 return; 3476 case 'W': 3477 case 0x174: 3478 case 0x1e80: 3479 case 0x1e82: 3480 case 0x1e84: 3481 case 0x1e86: 3482 case 0x1e88: 3483 regmbc('W'); regmbc(0x174); regmbc(0x1e80); 3484 regmbc(0x1e82); regmbc(0x1e84); regmbc(0x1e86); 3485 regmbc(0x1e88); 3486 return; 3487 case 'X': 3488 case 0x1e8a: 3489 case 0x1e8c: 3490 regmbc('X'); regmbc(0x1e8a); regmbc(0x1e8c); 3491 return; 3492 case 'Y': 3493 case 0xdd: 3494 case 0x176: 3495 case 0x178: 3496 case 0x1b3: 3497 case 0x232: 3498 case 0x24e: 3499 case 0x1e8e: 3500 case 0x1ef2: 3501 case 0x1ef6: 3502 case 0x1ef4: 3503 case 0x1ef8: 3504 regmbc('Y'); regmbc(0xdd); regmbc(0x176); 3505 regmbc(0x178); regmbc(0x1b3); regmbc(0x232); 3506 regmbc(0x24e); regmbc(0x1e8e); regmbc(0x1ef2); 3507 regmbc(0x1ef4); regmbc(0x1ef6); regmbc(0x1ef8); 3508 return; 3509 case 'Z': 3510 case 0x179: 3511 case 0x17b: 3512 case 0x17d: 3513 case 0x1b5: 3514 case 0x1e90: 3515 case 0x1e92: 3516 case 0x1e94: 3517 case 0x2c6b: 3518 regmbc('Z'); regmbc(0x179); regmbc(0x17b); 3519 regmbc(0x17d); regmbc(0x1b5); regmbc(0x1e90); 3520 regmbc(0x1e92); regmbc(0x1e94); 
regmbc(0x2c6b); 3521 return; 3522 case 'a': 3523 case 0xe0: 3524 case 0xe1: 3525 case 0xe2: 3526 case 0xe3: 3527 case 0xe4: 3528 case 0xe5: 3529 case 0x101: 3530 case 0x103: 3531 case 0x105: 3532 case 0x1ce: 3533 case 0x1df: 3534 case 0x1e1: 3535 case 0x1fb: 3536 case 0x201: 3537 case 0x203: 3538 case 0x227: 3539 case 0x1d8f: 3540 case 0x1e01: 3541 case 0x1e9a: 3542 case 0x1ea1: 3543 case 0x1ea3: 3544 case 0x1ea5: 3545 case 0x1ea7: 3546 case 0x1ea9: 3547 case 0x1eab: 3548 case 0x1ead: 3549 case 0x1eaf: 3550 case 0x1eb1: 3551 case 0x1eb3: 3552 case 0x1eb5: 3553 case 0x1eb7: 3554 case 0x2c65: 3555 regmbc('a'); regmbc(0xe0); regmbc(0xe1); 3556 regmbc(0xe2); regmbc(0xe3); regmbc(0xe4); 3557 regmbc(0xe5); regmbc(0x101); regmbc(0x103); 3558 regmbc(0x105); regmbc(0x1ce); regmbc(0x1df); 3559 regmbc(0x1e1); regmbc(0x1fb); regmbc(0x201); 3560 regmbc(0x203); regmbc(0x227); regmbc(0x1d8f); 3561 regmbc(0x1e01); regmbc(0x1e9a); regmbc(0x1ea1); 3562 regmbc(0x1ea3); regmbc(0x1ea5); regmbc(0x1ea7); 3563 regmbc(0x1ea9); regmbc(0x1eab); regmbc(0x1ead); 3564 regmbc(0x1eaf); regmbc(0x1eb1); regmbc(0x1eb3); 3565 regmbc(0x1eb5); regmbc(0x1eb7); regmbc(0x2c65); 3566 return; 3567 case 'b': 3568 case 0x180: 3569 case 0x253: 3570 case 0x1d6c: 3571 case 0x1d80: 3572 case 0x1e03: 3573 case 0x1e05: 3574 case 0x1e07: 3575 regmbc('b'); 3576 regmbc(0x180); regmbc(0x253); regmbc(0x1d6c); 3577 regmbc(0x1d80); regmbc(0x1e03); regmbc(0x1e05); 3578 regmbc(0x1e07); 3579 return; 3580 case 'c': 3581 case 0xe7: 3582 case 0x107: 3583 case 0x109: 3584 case 0x10b: 3585 case 0x10d: 3586 case 0x188: 3587 case 0x23c: 3588 case 0x1e09: 3589 case 0xa793: 3590 case 0xa794: 3591 regmbc('c'); regmbc(0xe7); regmbc(0x107); 3592 regmbc(0x109); regmbc(0x10b); regmbc(0x10d); 3593 regmbc(0x188); regmbc(0x23c); regmbc(0x1e09); 3594 regmbc(0xa793); regmbc(0xa794); 3595 return; 3596 case 'd': 3597 case 0x10f: 3598 case 0x111: 3599 case 0x257: 3600 case 0x1d6d: 3601 case 0x1d81: 3602 case 0x1d91: 3603 case 0x1e0b: 3604 case 
0x1e0d: 3605 case 0x1e0f: 3606 case 0x1e11: 3607 case 0x1e13: 3608 regmbc('d'); regmbc(0x10f); regmbc(0x111); 3609 regmbc(0x257); regmbc(0x1d6d); regmbc(0x1d81); 3610 regmbc(0x1d91); regmbc(0x1e0b); regmbc(0x1e0d); 3611 regmbc(0x1e0f); regmbc(0x1e11); regmbc(0x1e13); 3612 return; 3613 case 'e': 3614 case 0xe8: 3615 case 0xe9: 3616 case 0xea: 3617 case 0xeb: 3618 case 0x113: 3619 case 0x115: 3620 case 0x117: 3621 case 0x119: 3622 case 0x11b: 3623 case 0x205: 3624 case 0x207: 3625 case 0x229: 3626 case 0x247: 3627 case 0x1d92: 3628 case 0x1e15: 3629 case 0x1e17: 3630 case 0x1e19: 3631 case 0x1e1b: 3632 case 0x1eb9: 3633 case 0x1ebb: 3634 case 0x1e1d: 3635 case 0x1ebd: 3636 case 0x1ebf: 3637 case 0x1ec1: 3638 case 0x1ec3: 3639 case 0x1ec5: 3640 case 0x1ec7: 3641 regmbc('e'); regmbc(0xe8); regmbc(0xe9); 3642 regmbc(0xea); regmbc(0xeb); regmbc(0x113); 3643 regmbc(0x115); regmbc(0x117); regmbc(0x119); 3644 regmbc(0x11b); regmbc(0x205); regmbc(0x207); 3645 regmbc(0x229); regmbc(0x247); regmbc(0x1d92); 3646 regmbc(0x1e15); regmbc(0x1e17); regmbc(0x1e19); 3647 regmbc(0x1e1b); regmbc(0x1e1d); regmbc(0x1eb9); 3648 regmbc(0x1ebb); regmbc(0x1ebd); regmbc(0x1ebf); 3649 regmbc(0x1ec1); regmbc(0x1ec3); regmbc(0x1ec5); 3650 regmbc(0x1ec7); 3651 return; 3652 case 'f': 3653 case 0x192: 3654 case 0x1d6e: 3655 case 0x1d82: 3656 case 0x1e1f: 3657 case 0xa799: 3658 regmbc('f'); regmbc(0x192); regmbc(0x1d6e); 3659 regmbc(0x1d82); regmbc(0x1e1f); regmbc(0xa799); 3660 return; 3661 case 'g': 3662 case 0x11d: 3663 case 0x11f: 3664 case 0x121: 3665 case 0x123: 3666 case 0x1e5: 3667 case 0x1e7: 3668 case 0x260: 3669 case 0x1f5: 3670 case 0x1d83: 3671 case 0x1e21: 3672 case 0xa7a1: 3673 regmbc('g'); regmbc(0x11d); regmbc(0x11f); 3674 regmbc(0x121); regmbc(0x123); regmbc(0x1e5); 3675 regmbc(0x1e7); regmbc(0x1f5); regmbc(0x260); 3676 regmbc(0x1d83); regmbc(0x1e21); regmbc(0xa7a1); 3677 return; 3678 case 'h': 3679 case 0x125: 3680 case 0x127: 3681 case 0x21f: 3682 case 0x1e23: 3683 case 0x1e25: 
3684 case 0x1e27: 3685 case 0x1e29: 3686 case 0x1e2b: 3687 case 0x1e96: 3688 case 0x2c68: 3689 case 0xa795: 3690 regmbc('h'); regmbc(0x125); regmbc(0x127); 3691 regmbc(0x21f); regmbc(0x1e23); regmbc(0x1e25); 3692 regmbc(0x1e27); regmbc(0x1e29); regmbc(0x1e2b); 3693 regmbc(0x1e96); regmbc(0x2c68); regmbc(0xa795); 3694 return; 3695 case 'i': 3696 case 0xec: 3697 case 0xed: 3698 case 0xee: 3699 case 0xef: 3700 case 0x129: 3701 case 0x12b: 3702 case 0x12d: 3703 case 0x12f: 3704 case 0x1d0: 3705 case 0x209: 3706 case 0x20b: 3707 case 0x268: 3708 case 0x1d96: 3709 case 0x1e2d: 3710 case 0x1e2f: 3711 case 0x1ec9: 3712 case 0x1ecb: 3713 regmbc('i'); regmbc(0xec); regmbc(0xed); 3714 regmbc(0xee); regmbc(0xef); regmbc(0x129); 3715 regmbc(0x12b); regmbc(0x12d); regmbc(0x12f); 3716 regmbc(0x1d0); regmbc(0x209); regmbc(0x20b); 3717 regmbc(0x268); regmbc(0x1d96); regmbc(0x1e2d); 3718 regmbc(0x1e2f); regmbc(0x1ec9); regmbc(0x1ecb); 3719 return; 3720 case 'j': 3721 case 0x135: 3722 case 0x1f0: 3723 case 0x249: 3724 regmbc('j'); regmbc(0x135); regmbc(0x1f0); 3725 regmbc(0x249); 3726 return; 3727 case 'k': 3728 case 0x137: 3729 case 0x199: 3730 case 0x1e9: 3731 case 0x1d84: 3732 case 0x1e31: 3733 case 0x1e33: 3734 case 0x1e35: 3735 case 0x2c6a: 3736 case 0xa741: 3737 regmbc('k'); regmbc(0x137); regmbc(0x199); 3738 regmbc(0x1e9); regmbc(0x1d84); regmbc(0x1e31); 3739 regmbc(0x1e33); regmbc(0x1e35); regmbc(0x2c6a); 3740 regmbc(0xa741); 3741 return; 3742 case 'l': 3743 case 0x13a: 3744 case 0x13c: 3745 case 0x13e: 3746 case 0x140: 3747 case 0x142: 3748 case 0x19a: 3749 case 0x1e37: 3750 case 0x1e39: 3751 case 0x1e3b: 3752 case 0x1e3d: 3753 case 0x2c61: 3754 regmbc('l'); regmbc(0x13a); regmbc(0x13c); 3755 regmbc(0x13e); regmbc(0x140); regmbc(0x142); 3756 regmbc(0x19a); regmbc(0x1e37); regmbc(0x1e39); 3757 regmbc(0x1e3b); regmbc(0x1e3d); regmbc(0x2c61); 3758 return; 3759 case 'm': 3760 case 0x1d6f: 3761 case 0x1e3f: 3762 case 0x1e41: 3763 case 0x1e43: 3764 regmbc('m'); regmbc(0x1d6f); 
regmbc(0x1e3f); 3765 regmbc(0x1e41); regmbc(0x1e43); 3766 return; 3767 case 'n': 3768 case 0xf1: 3769 case 0x144: 3770 case 0x146: 3771 case 0x148: 3772 case 0x149: 3773 case 0x1f9: 3774 case 0x1d70: 3775 case 0x1d87: 3776 case 0x1e45: 3777 case 0x1e47: 3778 case 0x1e49: 3779 case 0x1e4b: 3780 case 0xa7a5: 3781 regmbc('n'); regmbc(0xf1); regmbc(0x144); 3782 regmbc(0x146); regmbc(0x148); regmbc(0x149); 3783 regmbc(0x1f9); regmbc(0x1d70); regmbc(0x1d87); 3784 regmbc(0x1e45); regmbc(0x1e47); regmbc(0x1e49); 3785 regmbc(0x1e4b); regmbc(0xa7a5); 3786 return; 3787 case 'o': 3788 case 0xf2: 3789 case 0xf3: 3790 case 0xf4: 3791 case 0xf5: 3792 case 0xf6: 3793 case 0xf8: 3794 case 0x14d: 3795 case 0x14f: 3796 case 0x151: 3797 case 0x1a1: 3798 case 0x1d2: 3799 case 0x1eb: 3800 case 0x1ed: 3801 case 0x1ff: 3802 case 0x20d: 3803 case 0x20f: 3804 case 0x22b: 3805 case 0x22d: 3806 case 0x22f: 3807 case 0x231: 3808 case 0x275: 3809 case 0x1e4d: 3810 case 0x1e4f: 3811 case 0x1e51: 3812 case 0x1e53: 3813 case 0x1ecd: 3814 case 0x1ecf: 3815 case 0x1ed1: 3816 case 0x1ed3: 3817 case 0x1ed5: 3818 case 0x1ed7: 3819 case 0x1ed9: 3820 case 0x1edb: 3821 case 0x1edd: 3822 case 0x1edf: 3823 case 0x1ee1: 3824 case 0x1ee3: 3825 regmbc('o'); regmbc(0xf2); regmbc(0xf3); 3826 regmbc(0xf4); regmbc(0xf5); regmbc(0xf6); 3827 regmbc(0xf8); regmbc(0x14d); regmbc(0x14f); 3828 regmbc(0x151); regmbc(0x1a1); regmbc(0x1d2); 3829 regmbc(0x1eb); regmbc(0x1ed); regmbc(0x1ff); 3830 regmbc(0x20d); regmbc(0x20f); regmbc(0x22b); 3831 regmbc(0x22d); regmbc(0x22f); regmbc(0x231); 3832 regmbc(0x275); regmbc(0x1e4d); regmbc(0x1e4f); 3833 regmbc(0x1e51); regmbc(0x1e53); regmbc(0x1ecd); 3834 regmbc(0x1ecf); regmbc(0x1ed1); regmbc(0x1ed3); 3835 regmbc(0x1ed5); regmbc(0x1ed7); regmbc(0x1ed9); 3836 regmbc(0x1edb); regmbc(0x1edd); regmbc(0x1edf); 3837 regmbc(0x1ee1); regmbc(0x1ee3); 3838 return; 3839 case 'p': 3840 case 0x1a5: 3841 case 0x1d71: 3842 case 0x1d88: 3843 case 0x1d7d: 3844 case 0x1e55: 3845 case 0x1e57: 3846 
regmbc('p'); regmbc(0x1a5); regmbc(0x1d71); 3847 regmbc(0x1d7d); regmbc(0x1d88); regmbc(0x1e55); 3848 regmbc(0x1e57); 3849 return; 3850 case 'q': 3851 case 0x24b: 3852 case 0x2a0: 3853 regmbc('q'); regmbc(0x24b); regmbc(0x2a0); 3854 return; 3855 case 'r': 3856 case 0x155: 3857 case 0x157: 3858 case 0x159: 3859 case 0x211: 3860 case 0x213: 3861 case 0x24d: 3862 case 0x27d: 3863 case 0x1d72: 3864 case 0x1d73: 3865 case 0x1d89: 3866 case 0x1e59: 3867 case 0x1e5b: 3868 case 0x1e5d: 3869 case 0x1e5f: 3870 case 0xa7a7: 3871 regmbc('r'); regmbc(0x155); regmbc(0x157); 3872 regmbc(0x159); regmbc(0x211); regmbc(0x213); 3873 regmbc(0x24d); regmbc(0x1d72); regmbc(0x1d73); 3874 regmbc(0x1d89); regmbc(0x1e59); regmbc(0x27d); 3875 regmbc(0x1e5b); regmbc(0x1e5d); regmbc(0x1e5f); 3876 regmbc(0xa7a7); 3877 return; 3878 case 's': 3879 case 0x15b: 3880 case 0x15d: 3881 case 0x15f: 3882 case 0x161: 3883 case 0x1e61: 3884 case 0x219: 3885 case 0x23f: 3886 case 0x1d74: 3887 case 0x1d8a: 3888 case 0x1e63: 3889 case 0x1e65: 3890 case 0x1e67: 3891 case 0x1e69: 3892 case 0xa7a9: 3893 regmbc('s'); regmbc(0x15b); regmbc(0x15d); 3894 regmbc(0x15f); regmbc(0x161); regmbc(0x23f); 3895 regmbc(0x219); regmbc(0x1d74); regmbc(0x1d8a); 3896 regmbc(0x1e61); regmbc(0x1e63); regmbc(0x1e65); 3897 regmbc(0x1e67); regmbc(0x1e69); regmbc(0xa7a9); 3898 return; 3899 case 't': 3900 case 0x163: 3901 case 0x165: 3902 case 0x167: 3903 case 0x1ab: 3904 case 0x1ad: 3905 case 0x21b: 3906 case 0x288: 3907 case 0x1d75: 3908 case 0x1e6b: 3909 case 0x1e6d: 3910 case 0x1e6f: 3911 case 0x1e71: 3912 case 0x1e97: 3913 case 0x2c66: 3914 regmbc('t'); regmbc(0x163); regmbc(0x165); 3915 regmbc(0x167); regmbc(0x1ab); regmbc(0x21b); 3916 regmbc(0x1ad); regmbc(0x288); regmbc(0x1d75); 3917 regmbc(0x1e6b); regmbc(0x1e6d); regmbc(0x1e6f); 3918 regmbc(0x1e71); regmbc(0x1e97); regmbc(0x2c66); 3919 return; 3920 case 'u': 3921 case 0xf9: 3922 case 0xfa: 3923 case 0xfb: 3924 case 0xfc: 3925 case 0x169: 3926 case 0x16b: 3927 case 0x16d: 
3928 case 0x16f: 3929 case 0x171: 3930 case 0x173: 3931 case 0x1b0: 3932 case 0x1d4: 3933 case 0x1d6: 3934 case 0x1d8: 3935 case 0x1da: 3936 case 0x1dc: 3937 case 0x215: 3938 case 0x217: 3939 case 0x289: 3940 case 0x1e73: 3941 case 0x1d7e: 3942 case 0x1d99: 3943 case 0x1e75: 3944 case 0x1e77: 3945 case 0x1e79: 3946 case 0x1e7b: 3947 case 0x1ee5: 3948 case 0x1ee7: 3949 case 0x1ee9: 3950 case 0x1eeb: 3951 case 0x1eed: 3952 case 0x1eef: 3953 case 0x1ef1: 3954 regmbc('u'); regmbc(0xf9); regmbc(0xfa); 3955 regmbc(0xfb); regmbc(0xfc); regmbc(0x169); 3956 regmbc(0x16b); regmbc(0x16d); regmbc(0x16f); 3957 regmbc(0x171); regmbc(0x173); regmbc(0x1d6); 3958 regmbc(0x1d8); regmbc(0x1da); regmbc(0x1dc); 3959 regmbc(0x215); regmbc(0x217); regmbc(0x1b0); 3960 regmbc(0x1d4); regmbc(0x289); regmbc(0x1d7e); 3961 regmbc(0x1d99); regmbc(0x1e73); regmbc(0x1e75); 3962 regmbc(0x1e77); regmbc(0x1e79); regmbc(0x1e7b); 3963 regmbc(0x1ee5); regmbc(0x1ee7); regmbc(0x1ee9); 3964 regmbc(0x1eeb); regmbc(0x1eed); regmbc(0x1eef); 3965 regmbc(0x1ef1); 3966 return; 3967 case 'v': 3968 case 0x28b: 3969 case 0x1d8c: 3970 case 0x1e7d: 3971 case 0x1e7f: 3972 regmbc('v'); regmbc(0x28b); regmbc(0x1d8c); 3973 regmbc(0x1e7d); regmbc(0x1e7f); 3974 return; 3975 case 'w': 3976 case 0x175: 3977 case 0x1e81: 3978 case 0x1e83: 3979 case 0x1e85: 3980 case 0x1e87: 3981 case 0x1e89: 3982 case 0x1e98: 3983 regmbc('w'); regmbc(0x175); regmbc(0x1e81); 3984 regmbc(0x1e83); regmbc(0x1e85); regmbc(0x1e87); 3985 regmbc(0x1e89); regmbc(0x1e98); 3986 return; 3987 case 'x': 3988 case 0x1e8b: 3989 case 0x1e8d: 3990 regmbc('x'); regmbc(0x1e8b); regmbc(0x1e8d); 3991 return; 3992 case 'y': 3993 case 0xfd: 3994 case 0xff: 3995 case 0x177: 3996 case 0x1b4: 3997 case 0x233: 3998 case 0x24f: 3999 case 0x1e8f: 4000 case 0x1e99: 4001 case 0x1ef3: 4002 case 0x1ef5: 4003 case 0x1ef7: 4004 case 0x1ef9: 4005 regmbc('y'); regmbc(0xfd); regmbc(0xff); 4006 regmbc(0x177); regmbc(0x1b4); regmbc(0x233); 4007 regmbc(0x24f); regmbc(0x1e8f); 
regmbc(0x1e99); 4008 regmbc(0x1ef3); regmbc(0x1ef5); regmbc(0x1ef7); 4009 regmbc(0x1ef9); 4010 return; 4011 case 'z': 4012 case 0x17a: 4013 case 0x17c: 4014 case 0x17e: 4015 case 0x1b6: 4016 case 0x1d76: 4017 case 0x1d8e: 4018 case 0x1e91: 4019 case 0x1e93: 4020 case 0x1e95: 4021 case 0x2c6c: 4022 regmbc('z'); regmbc(0x17a); regmbc(0x17c); 4023 regmbc(0x17e); regmbc(0x1b6); regmbc(0x1d76); 4024 regmbc(0x1d8e); regmbc(0x1e91); regmbc(0x1e93); 4025 regmbc(0x1e95); regmbc(0x2c6c); 4026 return; 4027 } 4028 } 4029 regmbc(c); 4030 } 4031 4032 // Emit a node. 4033 // Return pointer to generated code. 4034 static uint8_t *regnode(int op) 4035 { 4036 uint8_t *ret; 4037 4038 ret = regcode; 4039 if (ret == JUST_CALC_SIZE) { 4040 regsize += 3; 4041 } else { 4042 *regcode++ = (uint8_t)op; 4043 *regcode++ = NUL; // Null "next" pointer. 4044 *regcode++ = NUL; 4045 } 4046 return ret; 4047 } 4048 4049 // Write a four bytes number at "p" and return pointer to the next char. 4050 static uint8_t *re_put_uint32(uint8_t *p, uint32_t val) 4051 { 4052 *p++ = (uint8_t)((val >> 24) & 0377); 4053 *p++ = (uint8_t)((val >> 16) & 0377); 4054 *p++ = (uint8_t)((val >> 8) & 0377); 4055 *p++ = (uint8_t)(val & 0377); 4056 return p; 4057 } 4058 4059 // regnext - dig the "next" pointer out of a node 4060 // Returns NULL when calculating size, when there is no next item and when 4061 // there is an error. 4062 static uint8_t *regnext(uint8_t *p) 4063 FUNC_ATTR_NONNULL_ALL 4064 { 4065 int offset; 4066 4067 if (p == JUST_CALC_SIZE || reg_toolong) { 4068 return NULL; 4069 } 4070 4071 offset = NEXT(p); 4072 if (offset == 0) { 4073 return NULL; 4074 } 4075 4076 if (OP(p) == BACK) { 4077 return p - offset; 4078 } else { 4079 return p + offset; 4080 } 4081 } 4082 4083 // Set the next-pointer at the end of a node chain. 4084 static void regtail(uint8_t *p, const uint8_t *val) 4085 { 4086 int offset; 4087 4088 if (p == JUST_CALC_SIZE) { 4089 return; 4090 } 4091 4092 // Find last node. 
4093 uint8_t *scan = p; 4094 while (true) { 4095 uint8_t *temp = regnext(scan); 4096 if (temp == NULL) { 4097 break; 4098 } 4099 scan = temp; 4100 } 4101 4102 if (OP(scan) == BACK) { 4103 offset = (int)(scan - val); 4104 } else { 4105 offset = (int)(val - scan); 4106 } 4107 // When the offset uses more than 16 bits it can no longer fit in the two 4108 // bytes available. Use a global flag to avoid having to check return 4109 // values in too many places. 4110 if (offset > 0xffff) { 4111 reg_toolong = true; 4112 } else { 4113 *(scan + 1) = (uint8_t)(((unsigned)offset >> 8) & 0377); 4114 *(scan + 2) = (uint8_t)(offset & 0377); 4115 } 4116 } 4117 4118 // Like regtail, on item after a BRANCH; nop if none. 4119 static void regoptail(uint8_t *p, uint8_t *val) 4120 { 4121 // When op is neither BRANCH nor BRACE_COMPLEX0-9, it is "operandless" 4122 if (p == NULL || p == JUST_CALC_SIZE 4123 || (OP(p) != BRANCH 4124 && (OP(p) < BRACE_COMPLEX || OP(p) > BRACE_COMPLEX + 9))) { 4125 return; 4126 } 4127 regtail(OPERAND(p), val); 4128 } 4129 4130 // Insert an operator in front of already-emitted operand 4131 // 4132 // Means relocating the operand. 4133 static void reginsert(int op, uint8_t *opnd) 4134 { 4135 uint8_t *src; 4136 uint8_t *dst; 4137 uint8_t *place; 4138 4139 if (regcode == JUST_CALC_SIZE) { 4140 regsize += 3; 4141 return; 4142 } 4143 src = regcode; 4144 regcode += 3; 4145 dst = regcode; 4146 while (src > opnd) { 4147 *--dst = *--src; 4148 } 4149 4150 place = opnd; // Op node, where operand used to be. 4151 *place++ = (uint8_t)op; 4152 *place++ = NUL; 4153 *place = NUL; 4154 } 4155 4156 // Insert an operator in front of already-emitted operand. 4157 // Add a number to the operator. 
4158 static void reginsert_nr(int op, int64_t val, uint8_t *opnd) 4159 { 4160 uint8_t *src; 4161 uint8_t *dst; 4162 uint8_t *place; 4163 4164 if (regcode == JUST_CALC_SIZE) { 4165 regsize += 7; 4166 return; 4167 } 4168 src = regcode; 4169 regcode += 7; 4170 dst = regcode; 4171 while (src > opnd) { 4172 *--dst = *--src; 4173 } 4174 4175 place = opnd; // Op node, where operand used to be. 4176 *place++ = (uint8_t)op; 4177 *place++ = NUL; 4178 *place++ = NUL; 4179 assert(val >= 0 && (uintmax_t)val <= UINT32_MAX); 4180 re_put_uint32(place, (uint32_t)val); 4181 } 4182 4183 // Insert an operator in front of already-emitted operand. 4184 // The operator has the given limit values as operands. Also set next pointer. 4185 // 4186 // Means relocating the operand. 4187 static void reginsert_limits(int op, int64_t minval, int64_t maxval, uint8_t *opnd) 4188 { 4189 uint8_t *src; 4190 uint8_t *dst; 4191 uint8_t *place; 4192 4193 if (regcode == JUST_CALC_SIZE) { 4194 regsize += 11; 4195 return; 4196 } 4197 src = regcode; 4198 regcode += 11; 4199 dst = regcode; 4200 while (src > opnd) { 4201 *--dst = *--src; 4202 } 4203 4204 place = opnd; // Op node, where operand used to be. 4205 *place++ = (uint8_t)op; 4206 *place++ = NUL; 4207 *place++ = NUL; 4208 assert(minval >= 0 && (uintmax_t)minval <= UINT32_MAX); 4209 place = re_put_uint32(place, (uint32_t)minval); 4210 assert(maxval >= 0 && (uintmax_t)maxval <= UINT32_MAX); 4211 place = re_put_uint32(place, (uint32_t)maxval); 4212 regtail(opnd, place); 4213 } 4214 4215 /// Return true if the back reference is legal. We must have seen the close 4216 /// brace. 4217 /// TODO(vim): Should also check that we don't refer to something repeated 4218 /// (+*=): what instance of the repetition should we match? 4219 static int seen_endbrace(int refnum) 4220 { 4221 if (!had_endbrace[refnum]) { 4222 uint8_t *p; 4223 4224 // Trick: check if "@<=" or "@<!" follows, in which case 4225 // the \1 can appear before the referenced match. 
4226 for (p = (uint8_t *)regparse; *p != NUL; p++) { 4227 if (p[0] == '@' && p[1] == '<' && (p[2] == '!' || p[2] == '=')) { 4228 break; 4229 } 4230 } 4231 4232 if (*p == NUL) { 4233 emsg(_("E65: Illegal back reference")); 4234 rc_did_emsg = true; 4235 return false; 4236 } 4237 } 4238 return true; 4239 } 4240 4241 // Parse the lowest level. 4242 // 4243 // Optimization: gobbles an entire sequence of ordinary characters so that 4244 // it can turn them into a single node, which is smaller to store and 4245 // faster to run. Don't do this when one_exactly is set. 4246 static uint8_t *regatom(int *flagp) 4247 { 4248 uint8_t *ret; 4249 int flags; 4250 int c; 4251 uint8_t *p; 4252 int extra = 0; 4253 int save_prev_at_start = prev_at_start; 4254 4255 *flagp = WORST; // Tentatively. 4256 4257 c = getchr(); 4258 switch (c) { 4259 case Magic('^'): 4260 ret = regnode(BOL); 4261 break; 4262 4263 case Magic('$'): 4264 ret = regnode(EOL); 4265 had_eol = true; 4266 break; 4267 4268 case Magic('<'): 4269 ret = regnode(BOW); 4270 break; 4271 4272 case Magic('>'): 4273 ret = regnode(EOW); 4274 break; 4275 4276 case Magic('_'): 4277 c = no_Magic(getchr()); 4278 if (c == '^') { // "\_^" is start-of-line 4279 ret = regnode(BOL); 4280 break; 4281 } 4282 if (c == '$') { // "\_$" is end-of-line 4283 ret = regnode(EOL); 4284 had_eol = true; 4285 break; 4286 } 4287 4288 extra = ADD_NL; 4289 *flagp |= HASNL; 4290 4291 // "\_[" is character range plus newline 4292 if (c == '[') { 4293 goto collection; 4294 } 4295 4296 // "\_x" is character class plus newline 4297 FALLTHROUGH; 4298 4299 // Character classes. 
4300 case Magic('.'): 4301 case Magic('i'): 4302 case Magic('I'): 4303 case Magic('k'): 4304 case Magic('K'): 4305 case Magic('f'): 4306 case Magic('F'): 4307 case Magic('p'): 4308 case Magic('P'): 4309 case Magic('s'): 4310 case Magic('S'): 4311 case Magic('d'): 4312 case Magic('D'): 4313 case Magic('x'): 4314 case Magic('X'): 4315 case Magic('o'): 4316 case Magic('O'): 4317 case Magic('w'): 4318 case Magic('W'): 4319 case Magic('h'): 4320 case Magic('H'): 4321 case Magic('a'): 4322 case Magic('A'): 4323 case Magic('l'): 4324 case Magic('L'): 4325 case Magic('u'): 4326 case Magic('U'): 4327 p = (uint8_t *)vim_strchr((char *)classchars, no_Magic(c)); 4328 if (p == NULL) { 4329 EMSG_RET_NULL(_(e_invalid_use_of_underscore)); 4330 } 4331 // When '.' is followed by a composing char ignore the dot, so that 4332 // the composing char is matched here. 4333 if (c == Magic('.') && utf_iscomposing_legacy(peekchr())) { 4334 c = getchr(); 4335 goto do_multibyte; 4336 } 4337 ret = regnode(classcodes[p - classchars] + extra); 4338 *flagp |= HASWIDTH | SIMPLE; 4339 break; 4340 4341 case Magic('n'): 4342 if (reg_string) { 4343 // In a string "\n" matches a newline character. 4344 ret = regnode(EXACTLY); 4345 regc(NL); 4346 regc(NUL); 4347 *flagp |= HASWIDTH | SIMPLE; 4348 } else { 4349 // In buffer text "\n" matches the end of a line. 4350 ret = regnode(NEWL); 4351 *flagp |= HASWIDTH | HASNL; 4352 } 4353 break; 4354 4355 case Magic('('): 4356 if (one_exactly) { 4357 EMSG_ONE_RET_NULL; 4358 } 4359 ret = reg(REG_PAREN, &flags); 4360 if (ret == NULL) { 4361 return NULL; 4362 } 4363 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH); 4364 break; 4365 4366 case NUL: 4367 case Magic('|'): 4368 case Magic('&'): 4369 case Magic(')'): 4370 if (one_exactly) { 4371 EMSG_ONE_RET_NULL; 4372 } 4373 // Supposed to be caught earlier. 
4374 IEMSG_RET_NULL(_(e_internal_error_in_regexp)); 4375 // NOTREACHED 4376 4377 case Magic('='): 4378 case Magic('?'): 4379 case Magic('+'): 4380 case Magic('@'): 4381 case Magic('{'): 4382 case Magic('*'): 4383 c = no_Magic(c); 4384 EMSG3_RET_NULL(_("E64: %s%c follows nothing"), 4385 (c == '*' ? reg_magic >= MAGIC_ON : reg_magic == MAGIC_ALL), c); 4386 // NOTREACHED 4387 4388 case Magic('~'): // previous substitute pattern 4389 if (reg_prev_sub != NULL) { 4390 uint8_t *lp; 4391 4392 ret = regnode(EXACTLY); 4393 lp = (uint8_t *)reg_prev_sub; 4394 while (*lp != NUL) { 4395 regc(*lp++); 4396 } 4397 regc(NUL); 4398 if (*reg_prev_sub != NUL) { 4399 *flagp |= HASWIDTH; 4400 if ((lp - (uint8_t *)reg_prev_sub) == 1) { 4401 *flagp |= SIMPLE; 4402 } 4403 } 4404 } else { 4405 EMSG_RET_NULL(_(e_nopresub)); 4406 } 4407 break; 4408 4409 case Magic('1'): 4410 case Magic('2'): 4411 case Magic('3'): 4412 case Magic('4'): 4413 case Magic('5'): 4414 case Magic('6'): 4415 case Magic('7'): 4416 case Magic('8'): 4417 case Magic('9'): { 4418 int refnum; 4419 4420 refnum = c - Magic('0'); 4421 if (!seen_endbrace(refnum)) { 4422 return NULL; 4423 } 4424 ret = regnode(BACKREF + refnum); 4425 } 4426 break; 4427 4428 case Magic('z'): 4429 c = no_Magic(getchr()); 4430 switch (c) { 4431 case '(': 4432 if ((reg_do_extmatch & REX_SET) == 0) { 4433 EMSG_RET_NULL(_(e_z_not_allowed)); 4434 } 4435 if (one_exactly) { 4436 EMSG_ONE_RET_NULL; 4437 } 4438 ret = reg(REG_ZPAREN, &flags); 4439 if (ret == NULL) { 4440 return NULL; 4441 } 4442 *flagp |= flags & (HASWIDTH|SPSTART|HASNL|HASLOOKBH); 4443 re_has_z = REX_SET; 4444 break; 4445 4446 case '1': 4447 case '2': 4448 case '3': 4449 case '4': 4450 case '5': 4451 case '6': 4452 case '7': 4453 case '8': 4454 case '9': 4455 if ((reg_do_extmatch & REX_USE) == 0) { 4456 EMSG_RET_NULL(_(e_z1_not_allowed)); 4457 } 4458 ret = regnode(ZREF + c - '0'); 4459 re_has_z = REX_USE; 4460 break; 4461 4462 case 's': 4463 ret = regnode(MOPEN + 0); 4464 if 
(!re_mult_next("\\zs")) { 4465 return NULL; 4466 } 4467 break; 4468 4469 case 'e': 4470 ret = regnode(MCLOSE + 0); 4471 if (!re_mult_next("\\ze")) { 4472 return NULL; 4473 } 4474 break; 4475 4476 default: 4477 EMSG_RET_NULL(_("E68: Invalid character after \\z")); 4478 } 4479 break; 4480 4481 case Magic('%'): 4482 c = no_Magic(getchr()); 4483 switch (c) { 4484 // () without a back reference 4485 case '(': 4486 if (one_exactly) { 4487 EMSG_ONE_RET_NULL; 4488 } 4489 ret = reg(REG_NPAREN, &flags); 4490 if (ret == NULL) { 4491 return NULL; 4492 } 4493 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH); 4494 break; 4495 4496 // Catch \%^ and \%$ regardless of where they appear in the 4497 // pattern -- regardless of whether or not it makes sense. 4498 case '^': 4499 ret = regnode(RE_BOF); 4500 break; 4501 4502 case '$': 4503 ret = regnode(RE_EOF); 4504 break; 4505 4506 case '#': 4507 if (regparse[0] == '=' && regparse[1] >= 48 && regparse[1] <= 50) { 4508 // misplaced \%#=1 4509 semsg(_(e_atom_engine_must_be_at_start_of_pattern), regparse[1]); 4510 return FAIL; 4511 } 4512 ret = regnode(CURSOR); 4513 break; 4514 4515 case 'V': 4516 ret = regnode(RE_VISUAL); 4517 break; 4518 4519 case 'C': 4520 ret = regnode(RE_COMPOSING); 4521 break; 4522 4523 // \%[abc]: Emit as a list of branches, all ending at the last 4524 // branch which matches nothing. 
4525 case '[': 4526 if (one_exactly) { // doesn't nest 4527 EMSG_ONE_RET_NULL; 4528 } 4529 { 4530 uint8_t *lastbranch; 4531 uint8_t *lastnode = NULL; 4532 uint8_t *br; 4533 4534 ret = NULL; 4535 while ((c = getchr()) != ']') { 4536 if (c == NUL) { 4537 EMSG2_RET_NULL(_(e_missing_sb), 4538 reg_magic == MAGIC_ALL); 4539 } 4540 br = regnode(BRANCH); 4541 if (ret == NULL) { 4542 ret = br; 4543 } else { 4544 regtail(lastnode, br); 4545 if (reg_toolong) { 4546 return NULL; 4547 } 4548 } 4549 4550 ungetchr(); 4551 one_exactly = true; 4552 lastnode = regatom(flagp); 4553 one_exactly = false; 4554 if (lastnode == NULL) { 4555 return NULL; 4556 } 4557 } 4558 if (ret == NULL) { 4559 EMSG2_RET_NULL(_(e_empty_sb), 4560 reg_magic == MAGIC_ALL); 4561 } 4562 lastbranch = regnode(BRANCH); 4563 br = regnode(NOTHING); 4564 if (ret != JUST_CALC_SIZE) { 4565 regtail(lastnode, br); 4566 regtail(lastbranch, br); 4567 // connect all branches to the NOTHING 4568 // branch at the end 4569 for (br = ret; br != lastnode;) { 4570 if (OP(br) == BRANCH) { 4571 regtail(br, lastbranch); 4572 if (reg_toolong) { 4573 return NULL; 4574 } 4575 br = OPERAND(br); 4576 } else { 4577 br = regnext(br); 4578 } 4579 } 4580 } 4581 *flagp &= ~(HASWIDTH | SIMPLE); 4582 break; 4583 } 4584 4585 case 'd': // %d123 decimal 4586 case 'o': // %o123 octal 4587 case 'x': // %xab hex 2 4588 case 'u': // %uabcd hex 4 4589 case 'U': // %U1234abcd hex 8 4590 { 4591 int64_t i; 4592 4593 switch (c) { 4594 case 'd': 4595 i = getdecchrs(); break; 4596 case 'o': 4597 i = getoctchrs(); break; 4598 case 'x': 4599 i = gethexchrs(2); break; 4600 case 'u': 4601 i = gethexchrs(4); break; 4602 case 'U': 4603 i = gethexchrs(8); break; 4604 default: 4605 i = -1; break; 4606 } 4607 4608 if (i < 0 || i > INT_MAX) { 4609 EMSG2_RET_NULL(_("E678: Invalid character after %s%%[dxouU]"), 4610 reg_magic == MAGIC_ALL); 4611 } 4612 if (use_multibytecode((int)i)) { 4613 ret = regnode(MULTIBYTECODE); 4614 } else { 4615 ret = regnode(EXACTLY); 4616 } 
4617 if (i == 0) { 4618 regc(0x0a); 4619 } else { 4620 regmbc((int)i); 4621 } 4622 regc(NUL); 4623 *flagp |= HASWIDTH; 4624 break; 4625 } 4626 4627 default: 4628 if (ascii_isdigit(c) || c == '<' || c == '>' || c == '\'' || c == '.') { 4629 uint32_t n = 0; 4630 int cmp; 4631 bool cur = false; 4632 bool got_digit = false; 4633 4634 cmp = c; 4635 if (cmp == '<' || cmp == '>') { 4636 c = getchr(); 4637 } 4638 if (no_Magic(c) == '.') { 4639 cur = true; 4640 c = getchr(); 4641 } 4642 while (ascii_isdigit(c)) { 4643 got_digit = true; 4644 n = n * 10 + (uint32_t)(c - '0'); 4645 c = getchr(); 4646 } 4647 if (no_Magic(c) == '\'' && n == 0) { 4648 // "\%'m", "\%<'m" and "\%>'m": Mark 4649 c = getchr(); 4650 ret = regnode(RE_MARK); 4651 if (ret == JUST_CALC_SIZE) { 4652 regsize += 2; 4653 } else { 4654 *regcode++ = (uint8_t)c; 4655 *regcode++ = (uint8_t)cmp; 4656 } 4657 break; 4658 } else if ((c == 'l' || c == 'c' || c == 'v') && (cur || got_digit)) { 4659 if (cur && n) { 4660 semsg(_(e_regexp_number_after_dot_pos_search_chr), no_Magic(c)); 4661 rc_did_emsg = true; 4662 return NULL; 4663 } 4664 if (c == 'l') { 4665 if (cur) { 4666 n = (uint32_t)curwin->w_cursor.lnum; 4667 } 4668 ret = regnode(RE_LNUM); 4669 if (save_prev_at_start) { 4670 at_start = true; 4671 } 4672 } else if (c == 'c') { 4673 if (cur) { 4674 n = (uint32_t)curwin->w_cursor.col; 4675 n++; 4676 } 4677 ret = regnode(RE_COL); 4678 } else { 4679 if (cur) { 4680 colnr_T vcol = 0; 4681 getvvcol(curwin, &curwin->w_cursor, NULL, NULL, &vcol); 4682 n = (uint32_t)(++vcol); 4683 } 4684 ret = regnode(RE_VCOL); 4685 } 4686 if (ret == JUST_CALC_SIZE) { 4687 regsize += 5; 4688 } else { 4689 // put the number and the optional 4690 // comparator after the opcode 4691 regcode = re_put_uint32(regcode, n); 4692 *regcode++ = (uint8_t)cmp; 4693 } 4694 break; 4695 } 4696 } 4697 4698 EMSG2_RET_NULL(_("E71: Invalid character after %s%%"), 4699 reg_magic == MAGIC_ALL); 4700 } 4701 break; 4702 4703 case Magic('['): 4704 collection: 4705 
{ 4706 uint8_t *lp; 4707 4708 // If there is no matching ']', we assume the '[' is a normal 4709 // character. This makes 'incsearch' and ":help [" work. 4710 lp = (uint8_t *)skip_anyof(regparse); 4711 if (*lp == ']') { // there is a matching ']' 4712 int startc = -1; // > 0 when next '-' is a range 4713 int endc; 4714 4715 // In a character class, different parsing rules apply. 4716 // Not even \ is special anymore, nothing is. 4717 if (*regparse == '^') { // Complement of range. 4718 ret = regnode(ANYBUT + extra); 4719 regparse++; 4720 } else { 4721 ret = regnode(ANYOF + extra); 4722 } 4723 4724 // At the start ']' and '-' mean the literal character. 4725 if (*regparse == ']' || *regparse == '-') { 4726 startc = (uint8_t)(*regparse); 4727 regc(*regparse++); 4728 } 4729 4730 while (*regparse != NUL && *regparse != ']') { 4731 if (*regparse == '-') { 4732 regparse++; 4733 // The '-' is not used for a range at the end and 4734 // after or before a '\n'. 4735 if (*regparse == ']' || *regparse == NUL 4736 || startc == -1 4737 || (regparse[0] == '\\' && regparse[1] == 'n')) { 4738 regc('-'); 4739 startc = '-'; // [--x] is a range 4740 } else { 4741 // Also accept "a-[.z.]" 4742 endc = 0; 4743 if (*regparse == '[') { 4744 endc = get_coll_element(®parse); 4745 } 4746 if (endc == 0) { 4747 endc = mb_ptr2char_adv((const char **)®parse); 4748 } 4749 4750 // Handle \o40, \x20 and \u20AC style sequences 4751 if (endc == '\\' && !reg_cpo_lit) { 4752 endc = coll_get_char(); 4753 } 4754 4755 if (startc > endc) { 4756 EMSG_RET_NULL(_(e_reverse_range)); 4757 } 4758 if (utf_char2len(startc) > 1 4759 || utf_char2len(endc) > 1) { 4760 // Limit to a range of 256 chars 4761 if (endc > startc + 256) { 4762 EMSG_RET_NULL(_(e_large_class)); 4763 } 4764 while (++startc <= endc) { 4765 regmbc(startc); 4766 } 4767 } else { 4768 while (++startc <= endc) { 4769 regc(startc); 4770 } 4771 } 4772 startc = -1; 4773 } 4774 } 4775 // Only "\]", "\^", "\]" and "\\" are special in Vi. 
Vim 4776 // accepts "\t", "\e", etc., but only when the 'l' flag in 4777 // 'cpoptions' is not included. 4778 else if (*regparse == '\\' 4779 && (vim_strchr(REGEXP_INRANGE, (uint8_t)regparse[1]) != NULL 4780 || (!reg_cpo_lit 4781 && vim_strchr(REGEXP_ABBR, 4782 (uint8_t)regparse[1]) != NULL))) { 4783 regparse++; 4784 if (*regparse == 'n') { 4785 // '\n' in range: also match NL 4786 if (ret != JUST_CALC_SIZE) { 4787 // Using \n inside [^] does not change what 4788 // matches. "[^\n]" is the same as ".". 4789 if (*ret == ANYOF) { 4790 *ret = ANYOF + ADD_NL; 4791 *flagp |= HASNL; 4792 } 4793 // else: must have had a \n already 4794 } 4795 regparse++; 4796 startc = -1; 4797 } else if (*regparse == 'd' 4798 || *regparse == 'o' 4799 || *regparse == 'x' 4800 || *regparse == 'u' 4801 || *regparse == 'U') { 4802 startc = coll_get_char(); 4803 // max UTF-8 Codepoint is U+10FFFF, 4804 // but allow values until INT_MAX 4805 if (startc == INT_MAX) { 4806 EMSG_RET_NULL(_(e_unicode_val_too_large)); 4807 } 4808 if (startc == 0) { 4809 regc(0x0a); 4810 } else { 4811 regmbc(startc); 4812 } 4813 } else { 4814 startc = backslash_trans(*regparse++); 4815 regc(startc); 4816 } 4817 } else if (*regparse == '[') { 4818 int c_class; 4819 int cu; 4820 4821 c_class = get_char_class(®parse); 4822 startc = -1; 4823 // Characters assumed to be 8 bits! 
4824 switch (c_class) { 4825 case CLASS_NONE: 4826 c_class = get_equi_class(®parse); 4827 if (c_class != 0) { 4828 // produce equivalence class 4829 reg_equi_class(c_class); 4830 } else if ((c_class = get_coll_element(®parse)) != 0) { 4831 // produce a collating element 4832 regmbc(c_class); 4833 } else { 4834 // literal '[', allow [[-x] as a range 4835 startc = (uint8_t)(*regparse++); 4836 regc(startc); 4837 } 4838 break; 4839 case CLASS_ALNUM: 4840 for (cu = 1; cu < 128; cu++) { 4841 if (isalnum(cu)) { 4842 regmbc(cu); 4843 } 4844 } 4845 break; 4846 case CLASS_ALPHA: 4847 for (cu = 1; cu < 128; cu++) { 4848 if (isalpha(cu)) { 4849 regmbc(cu); 4850 } 4851 } 4852 break; 4853 case CLASS_BLANK: 4854 regc(' '); 4855 regc('\t'); 4856 break; 4857 case CLASS_CNTRL: 4858 for (cu = 1; cu <= 127; cu++) { 4859 if (iscntrl(cu)) { 4860 regmbc(cu); 4861 } 4862 } 4863 break; 4864 case CLASS_DIGIT: 4865 for (cu = 1; cu <= 127; cu++) { 4866 if (ascii_isdigit(cu)) { 4867 regmbc(cu); 4868 } 4869 } 4870 break; 4871 case CLASS_GRAPH: 4872 for (cu = 1; cu <= 127; cu++) { 4873 if (isgraph(cu)) { 4874 regmbc(cu); 4875 } 4876 } 4877 break; 4878 case CLASS_LOWER: 4879 for (cu = 1; cu <= 255; cu++) { 4880 if (mb_islower(cu) && cu != 170 && cu != 186) { 4881 regmbc(cu); 4882 } 4883 } 4884 break; 4885 case CLASS_PRINT: 4886 for (cu = 1; cu <= 255; cu++) { 4887 if (vim_isprintc(cu)) { 4888 regmbc(cu); 4889 } 4890 } 4891 break; 4892 case CLASS_PUNCT: 4893 for (cu = 1; cu < 128; cu++) { 4894 if (ispunct(cu)) { 4895 regmbc(cu); 4896 } 4897 } 4898 break; 4899 case CLASS_SPACE: 4900 for (cu = 9; cu <= 13; cu++) { 4901 regc(cu); 4902 } 4903 regc(' '); 4904 break; 4905 case CLASS_UPPER: 4906 for (cu = 1; cu <= 255; cu++) { 4907 if (mb_isupper(cu)) { 4908 regmbc(cu); 4909 } 4910 } 4911 break; 4912 case CLASS_XDIGIT: 4913 for (cu = 1; cu <= 255; cu++) { 4914 if (ascii_isxdigit(cu)) { 4915 regmbc(cu); 4916 } 4917 } 4918 break; 4919 case CLASS_TAB: 4920 regc('\t'); 4921 break; 4922 case CLASS_RETURN: 
4923 regc('\r'); 4924 break; 4925 case CLASS_BACKSPACE: 4926 regc('\b'); 4927 break; 4928 case CLASS_ESCAPE: 4929 regc(ESC); 4930 break; 4931 case CLASS_IDENT: 4932 for (cu = 1; cu <= 255; cu++) { 4933 if (vim_isIDc(cu)) { 4934 regmbc(cu); 4935 } 4936 } 4937 break; 4938 case CLASS_KEYWORD: 4939 for (cu = 1; cu <= 255; cu++) { 4940 if (reg_iswordc(cu)) { 4941 regmbc(cu); 4942 } 4943 } 4944 break; 4945 case CLASS_FNAME: 4946 for (cu = 1; cu <= 255; cu++) { 4947 if (vim_isfilec(cu)) { 4948 regmbc(cu); 4949 } 4950 } 4951 break; 4952 } 4953 } else { 4954 // produce a multibyte character, including any 4955 // following composing characters. 4956 startc = utf_ptr2char(regparse); 4957 int len = utfc_ptr2len(regparse); 4958 if (utf_char2len(startc) != len) { 4959 // composing chars 4960 startc = -1; 4961 } 4962 while (--len >= 0) { 4963 regc(*regparse++); 4964 } 4965 } 4966 } 4967 regc(NUL); 4968 prevchr_len = 1; // last char was the ']' 4969 if (*regparse != ']') { 4970 EMSG_RET_NULL(_(e_toomsbra)); // Cannot happen? 4971 } 4972 skipchr(); // let's be friends with the lexer again 4973 *flagp |= HASWIDTH | SIMPLE; 4974 break; 4975 } else if (reg_strict) { 4976 EMSG2_RET_NULL(_(e_missingbracket), reg_magic > MAGIC_OFF); 4977 } 4978 } 4979 FALLTHROUGH; 4980 4981 default: { 4982 int len; 4983 4984 // A multi-byte character is handled as a separate atom if it's 4985 // before a multi and when it's a composing char. 4986 if (use_multibytecode(c)) { 4987 do_multibyte: 4988 ret = regnode(MULTIBYTECODE); 4989 regmbc(c); 4990 *flagp |= HASWIDTH | SIMPLE; 4991 break; 4992 } 4993 4994 ret = regnode(EXACTLY); 4995 4996 // Append characters as long as: 4997 // - there is no following multi, we then need the character in 4998 // front of it as a single character operand 4999 // - not running into a Magic character 5000 // - "one_exactly" is not set 5001 // But always emit at least one character. Might be a Multi, 5002 // e.g., a "[" without matching "]". 
    for (len = 0; c != NUL && (len == 0
                               || (re_multi_type(peekchr()) == NOT_MULTI
                                   && !one_exactly
                                   && !is_Magic(c))); len++) {
      c = no_Magic(c);
      {
        regmbc(c);
        {
          int l;

          // Need to get composing character too.
          GraphemeState state = GRAPHEME_STATE_INIT;
          while (true) {
            l = utf_ptr2len(regparse);
            if (!utf_composinglike(regparse, regparse + l, &state)) {
              break;
            }
            regmbc(utf_ptr2char(regparse));
            skipchr();
          }
        }
      }
      c = getchr();
    }
    ungetchr();

    regc(NUL);
    *flagp |= HASWIDTH;
    if (len == 1) {
      *flagp |= SIMPLE;
    }
  }
  break;
  }

  return ret;
}

// Parse something followed by possible [*+=].
//
// Note that the branching code sequences used for = and the general cases
// of * and + are somewhat optimized: they use the same NOTHING node as
// both the endmarker for their branch list and the body of the last branch.
// It might seem that this node could be dispensed with entirely, but the
// endmarker role is not redundant.
//
// The atom is parsed with regatom(); when the next char is not a multi the
// atom and its flags are passed through unchanged.  Otherwise the multi is
// consumed and the atom is wrapped by inserting/appending nodes.
//
// "flagp": receives the flags of the piece.
// Returns the node returned by regatom() (the piece starts there), or NULL
// when regatom() or read_limits() fails or an error macro bails out.
static uint8_t *regpiece(int *flagp)
{
  uint8_t *ret;
  int op;
  uint8_t *next;
  int flags;
  int minval;
  int maxval;

  ret = regatom(&flags);
  if (ret == NULL) {
    return NULL;
  }

  op = peekchr();
  if (re_multi_type(op) == NOT_MULTI) {
    // No multi follows: the piece is just the atom.
    *flagp = flags;
    return ret;
  }
  // default flags
  *flagp = (WORST | SPSTART | (flags & (HASNL | HASLOOKBH)));

  skipchr();
  switch (op) {
  case Magic('*'):
    if (flags & SIMPLE) {
      reginsert(STAR, ret);
    } else {
      // Emit x* as (x&|), where & means "self".
      reginsert(BRANCH, ret);  // Either x
      regoptail(ret, regnode(BACK));  // and loop
      regoptail(ret, ret);  // back
      regtail(ret, regnode(BRANCH));  // or
      regtail(ret, regnode(NOTHING));  // null.
    }
    break;

  case Magic('+'):
    if (flags & SIMPLE) {
      reginsert(PLUS, ret);
    } else {
      // Emit x+ as x(&|), where & means "self".
      next = regnode(BRANCH);  // Either
      regtail(ret, next);
      regtail(regnode(BACK), ret);  // loop back
      regtail(next, regnode(BRANCH));  // or
      regtail(ret, regnode(NOTHING));  // null.
    }
    // Unlike the default flags: HASWIDTH is set and SPSTART is not.
    *flagp = (WORST | HASWIDTH | (flags & (HASNL | HASLOOKBH)));
    break;

  case Magic('@'): {
    // "lop" stays END until a valid operator char has been seen.
    int lop = END;
    // Number read in front of the operator char; only used for
    // BEHIND/NOBEHIND below.
    int64_t nr = getdecchrs();

    switch (no_Magic(getchr())) {
    case '=':
      lop = MATCH; break;  // \@=
    case '!':
      lop = NOMATCH; break;  // \@!
    case '>':
      lop = SUBPAT; break;  // \@>
    case '<':
      switch (no_Magic(getchr())) {
      case '=':
        lop = BEHIND; break;  // \@<=
      case '!':
        lop = NOBEHIND; break;  // \@<!
      }
    }
    if (lop == END) {
      EMSG2_RET_NULL(_(e_invalid_character_after_str_at),
                     reg_magic == MAGIC_ALL);
    }
    // Look behind must match with behind_pos.
    if (lop == BEHIND || lop == NOBEHIND) {
      regtail(ret, regnode(BHPOS));
      *flagp |= HASLOOKBH;
    }
    regtail(ret, regnode(END));  // operand ends
    if (lop == BEHIND || lop == NOBEHIND) {
      if (nr < 0) {
        nr = 0;  // no limit is same as zero limit
      }
      reginsert_nr(lop, (uint32_t)nr, ret);
    } else {
      reginsert(lop, ret);
    }
    break;
  }

  case Magic('?'):
  case Magic('='):
    // Emit x= as (x|)
    reginsert(BRANCH, ret);  // Either x
    regtail(ret, regnode(BRANCH));  // or
    next = regnode(NOTHING);  // null.
    regtail(ret, next);
    regoptail(ret, next);
    break;

  case Magic('{'):
    if (!read_limits(&minval, &maxval)) {
      return NULL;
    }
    if (flags & SIMPLE) {
      reginsert(BRACE_SIMPLE, ret);
      reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
    } else {
      // Each complex brace gets its own opcode BRACE_COMPLEX + index;
      // at most 10 of them are accepted.
      if (num_complex_braces >= 10) {
        EMSG2_RET_NULL(_("E60: Too many complex %s{...}s"),
                       reg_magic == MAGIC_ALL);
      }
      reginsert(BRACE_COMPLEX + num_complex_braces, ret);
      regoptail(ret, regnode(BACK));
      regoptail(ret, ret);
      reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
      num_complex_braces++;
    }
    // Only when both limits are positive is the piece known to have width.
    if (minval > 0 && maxval > 0) {
      *flagp = (HASWIDTH | (flags & (HASNL | HASLOOKBH)));
    }
    break;
  }
  if (re_multi_type(peekchr()) != NOT_MULTI) {
    // Can't have a multi follow a multi.
    if (peekchr() == Magic('*')) {
      EMSG2_RET_NULL(_("E61: Nested %s*"), reg_magic >= MAGIC_ON);
    }
    EMSG3_RET_NULL(_("E62: Nested %s%c"), reg_magic == MAGIC_ALL, no_Magic(peekchr()));
  }

  return ret;
}

// Parse one alternative of an | or & operator.
// Implements the concatenation operator.
//
// "flagp": receives the combined flags of the parsed pieces.
// Returns the first piece, a NOTHING node when no piece was parsed, or NULL
// when regpiece() fails or "reg_toolong" is set.
static uint8_t *regconcat(int *flagp)
{
  uint8_t *first = NULL;
  uint8_t *chain = NULL;
  uint8_t *latest;
  int flags;
  int cont = true;

  *flagp = WORST;  // Tentatively.

  while (cont) {
    switch (peekchr()) {
    // These items end the concatenation; they are left for the caller.
    case NUL:
    case Magic('|'):
    case Magic('&'):
    case Magic(')'):
      cont = false;
      break;
    // The following items emit no node: they only set a bit in "regflags"
    // and are skipped.
    case Magic('Z'):
      regflags |= RF_ICOMBINE;
      skipchr_keepstart();
      break;
    case Magic('c'):
      regflags |= RF_ICASE;
      skipchr_keepstart();
      break;
    case Magic('C'):
      regflags |= RF_NOICASE;
      skipchr_keepstart();
      break;
    // The following items change "reg_magic" for the rest of the pattern.
    // NOTE(review): "curchr" is reset to -1, presumably so that the next
    // peekchr() re-reads the char with the new magicness -- confirm.
    case Magic('v'):
      reg_magic = MAGIC_ALL;
      skipchr_keepstart();
      curchr = -1;
      break;
    case Magic('m'):
      reg_magic = MAGIC_ON;
      skipchr_keepstart();
      curchr = -1;
      break;
    case Magic('M'):
      reg_magic = MAGIC_OFF;
      skipchr_keepstart();
      curchr = -1;
      break;
    case Magic('V'):
      reg_magic = MAGIC_NONE;
      skipchr_keepstart();
      curchr = -1;
      break;
    default:
      latest = regpiece(&flags);
      if (latest == NULL || reg_toolong) {
        return NULL;
      }
      *flagp |= flags & (HASWIDTH | HASNL | HASLOOKBH);
      if (chain == NULL) {  // First piece.
        *flagp |= flags & SPSTART;
      } else {
        regtail(chain, latest);
      }
      chain = latest;
      if (first == NULL) {
        first = latest;
      }
      break;
    }
  }
  if (first == NULL) {  // Loop ran zero times.
    first = regnode(NOTHING);
  }
  return first;
}

// Parse one alternative of an | operator.
// Implements the & operator.
//
// "flagp": receives the combined flags of the concatenations.
// Returns the BRANCH node emitted in front of the alternative, or NULL when
// regconcat() fails.
static uint8_t *regbranch(int *flagp)
{
  uint8_t *ret;
  uint8_t *chain = NULL;
  uint8_t *latest;
  int flags;

  *flagp = WORST | HASNL;  // Tentatively.

  ret = regnode(BRANCH);
  while (true) {
    latest = regconcat(&flags);
    if (latest == NULL) {
      return NULL;
    }
    // If one of the branches has width, the whole thing has.  If one of
    // the branches anchors at start-of-line, the whole thing does.
    // If one of the branches uses look-behind, the whole thing does.
    *flagp |= flags & (HASWIDTH | SPSTART | HASLOOKBH);
    // If one of the branches doesn't match a line-break, the whole thing
    // doesn't.
    *flagp &= ~HASNL | (flags & HASNL);
    if (chain != NULL) {
      regtail(chain, latest);
    }
    if (peekchr() != Magic('&')) {
      break;
    }
    skipchr();
    regtail(latest, regnode(END));  // operand ends
    // Stop before reginsert() when the pattern got too long; the BRANCH
    // node is still returned, callers check "reg_toolong" themselves.
    if (reg_toolong) {
      break;
    }
    reginsert(MATCH, latest);
    chain = latest;
  }

  return ret;
}

/// Parse regular expression, i.e. main body or parenthesized thing.
///
/// Caller must absorb opening parenthesis.
///
/// Combining parenthesis handling with the base level of regular expression
/// is a trifle forced, but the need to tie the tails of the branches to what
/// follows makes it hard to avoid.
///
/// @param paren  REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN
/// @param[out] flagp  receives the combined flags of all branches
///
/// @return  the open node (for REG_PAREN, REG_NPAREN and REG_ZPAREN) or the
///          first branch (for REG_NOPAREN); NULL on error.
static uint8_t *reg(int paren, int *flagp)
{
  uint8_t *ret;
  uint8_t *br;
  uint8_t *ender;
  int parno = 0;
  int flags;

  *flagp = HASWIDTH;  // Tentatively.

  if (paren == REG_ZPAREN) {
    // Make a ZOPEN node.
    if (regnzpar >= NSUBEXP) {
      EMSG_RET_NULL(_("E50: Too many \\z("));
    }
    parno = regnzpar;
    regnzpar++;
    ret = regnode(ZOPEN + parno);
  } else if (paren == REG_PAREN) {
    // Make a MOPEN node.
    if (regnpar >= NSUBEXP) {
      EMSG2_RET_NULL(_("E51: Too many %s("), reg_magic == MAGIC_ALL);
    }
    parno = regnpar;
    regnpar++;
    ret = regnode(MOPEN + parno);
  } else if (paren == REG_NPAREN) {
    // Make a NOPEN node.
    ret = regnode(NOPEN);
  } else {
    ret = NULL;
  }

  // Pick up the branches, linking them together.
  br = regbranch(&flags);
  if (br == NULL) {
    return NULL;
  }
  if (ret != NULL) {
    regtail(ret, br);  // [MZ]OPEN -> first.
  } else {
    ret = br;
  }
  // If one of the branches can be zero-width, the whole thing can.
  // If one of the branches has * at start or matches a line-break, the
  // whole thing can.
  if (!(flags & HASWIDTH)) {
    *flagp &= ~HASWIDTH;
  }
  *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
  while (peekchr() == Magic('|')) {
    skipchr();
    br = regbranch(&flags);
    if (br == NULL || reg_toolong) {
      return NULL;
    }
    regtail(ret, br);  // BRANCH -> BRANCH.
    if (!(flags & HASWIDTH)) {
      *flagp &= ~HASWIDTH;
    }
    *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
  }

  // Make a closing node, and hook it on the end.
  ender = regnode(paren == REG_ZPAREN ? ZCLOSE + parno
                  : paren == REG_PAREN ? MCLOSE + parno
                  : paren == REG_NPAREN ? NCLOSE : END);
  regtail(ret, ender);

  // Hook the tails of the branches to the closing node.
  for (br = ret; br != NULL; br = regnext(br)) {
    regoptail(br, ender);
  }

  // Check for proper termination.
  if (paren != REG_NOPAREN && getchr() != Magic(')')) {
    if (paren == REG_ZPAREN) {
      EMSG_RET_NULL(_("E52: Unmatched \\z("));
    } else if (paren == REG_NPAREN) {
      EMSG2_RET_NULL(_(e_unmatchedpp), reg_magic == MAGIC_ALL);
    } else {
      EMSG2_RET_NULL(_(e_unmatchedp), reg_magic == MAGIC_ALL);
    }
  } else if (paren == REG_NOPAREN && peekchr() != NUL) {
    if (curchr == Magic(')')) {
      EMSG2_RET_NULL(_(e_unmatchedpar), reg_magic == MAGIC_ALL);
    } else {
      EMSG_RET_NULL(_(e_trailing));  // "Can't happen".
    }
    // NOTREACHED
  }
  // Here we set the flag allowing back references to this set of
  // parentheses.
  if (paren == REG_PAREN) {
    had_endbrace[parno] = true;  // have seen the close paren
  }
  return ret;
}

// bt_regcomp() - compile a regular expression into internal code for the
// traditional back track matcher.
// Returns the program in allocated space.  Returns NULL for an error.
//
// We can't allocate space until we know how big the compiled form will be,
// but we can't compile it (and thus know how big it is) until we've got a
// place to put the code.  So we cheat:  we compile it twice, once with code
// generation turned off and size counting turned on, and once "for real".
// This also means that we don't allocate space until we are sure that the
// thing really will compile successfully, and we never have to move the
// code and thus invalidate pointers into it.  (Note that it has to be in
// one piece because free() must be able to free it all.)
//
// Whether upper/lower case is to be ignored is decided when executing the
// program, it does not matter here.
//
// Beware that the optimization-preparation code in here knows about some
// of the structure of the compiled regexp.
// "re_flags": RE_MAGIC and/or RE_STRING.
// "expr": the pattern; NULL is reported as an internal error.
//
// NOTE(review): the "re_engine" and "re_flags" members of the returned
// program are not assigned here; presumably the caller fills them in.
static regprog_T *bt_regcomp(uint8_t *expr, int re_flags)
{
  uint8_t *scan;
  uint8_t *longest;
  int len;
  int flags;

  if (expr == NULL) {
    IEMSG_RET_NULL(_(e_null));
  }

  init_class_tab();

  // First pass: determine size, legality.
  regcomp_start(expr, re_flags);
  regcode = JUST_CALC_SIZE;
  regc(REGMAGIC);
  if (reg(REG_NOPAREN, &flags) == NULL) {
    return NULL;
  }

  // Allocate space.
  bt_regprog_T *r = xmalloc(offsetof(bt_regprog_T, program) + (size_t)regsize);
  r->re_in_use = false;

  // Second pass: emit code.
  regcomp_start(expr, re_flags);
  regcode = r->program;
  regc(REGMAGIC);
  if (reg(REG_NOPAREN, &flags) == NULL || reg_toolong) {
    // The program is only handed out when the second pass succeeded.
    xfree(r);
    if (reg_toolong) {
      EMSG_RET_NULL(_("E339: Pattern too long"));
    }
    return NULL;
  }

  // Dig out information for optimizations.
  r->regstart = NUL;  // Worst-case defaults.
  r->reganch = 0;
  r->regmust = NULL;
  r->regmlen = 0;
  r->regflags = regflags;
  if (flags & HASNL) {
    r->regflags |= RF_HASNL;
  }
  if (flags & HASLOOKBH) {
    r->regflags |= RF_LOOKBH;
  }
  // Remember whether this pattern has any \z specials in it.
  r->reghasz = (uint8_t)re_has_z;
  scan = &r->program[1];  // First BRANCH.
  if (OP(regnext(scan)) == END) {  // Only one top-level choice.
    scan = OPERAND(scan);

    // Starting-point info.
    if (OP(scan) == BOL || OP(scan) == RE_BOF) {
      r->reganch++;
      scan = regnext(scan);
    }

    if (OP(scan) == EXACTLY) {
      r->regstart = utf_ptr2char((char *)OPERAND(scan));
    } else if (OP(scan) == BOW
               || OP(scan) == EOW
               || OP(scan) == NOTHING
               || OP(scan) == MOPEN + 0 || OP(scan) == NOPEN
               || OP(scan) == MCLOSE + 0 || OP(scan) == NCLOSE) {
      // Look one node past the listed node for a literal start char.
      uint8_t *regnext_scan = regnext(scan);
      if (OP(regnext_scan) == EXACTLY) {
        r->regstart = utf_ptr2char((char *)OPERAND(regnext_scan));
      }
    }

    // If there's something expensive in the r.e., find the longest
    // literal string that must appear and make it the regmust.  Resolve
    // ties in favor of later strings, since the regstart check works
    // with the beginning of the r.e. and avoiding duplication
    // strengthens checking.  Not a strong reason, but sufficient in the
    // absence of others.

    // When the r.e. starts with BOW, it is faster to look for a regmust
    // first. Used a lot for "#" and "*" commands. (Added by mool).
    if ((flags & SPSTART || OP(scan) == BOW || OP(scan) == EOW)
        && !(flags & HASNL)) {
      longest = NULL;
      len = 0;
      for (; scan != NULL; scan = regnext(scan)) {
        if (OP(scan) == EXACTLY) {
          size_t scanlen = strlen((char *)OPERAND(scan));
          // ">=" makes a later string of equal length win.
          if (scanlen >= (size_t)len) {
            longest = OPERAND(scan);
            len = (int)scanlen;
          }
        }
      }
      r->regmust = longest;
      r->regmlen = len;
    }
  }
#ifdef BT_REGEXP_DUMP
  regdump(expr, r);
#endif
  r->engine = &bt_regengine;
  return (regprog_T *)r;
}

// Check if during the previous call to vim_regcomp the EOL item "$" has been
// found.  This is messy, but it works fine.
// Returns the current value of "had_eol".
int vim_regcomp_had_eol(void)
{
  return had_eol;
}

// Get a number after a backslash that is inside [].
// When nothing is recognized return a backslash.
// The char at "regparse" selects the number format and is consumed.
// Values above INT_MAX are returned as INT_MAX.
static int coll_get_char(void)
{
  int64_t nr = -1;

  switch (*regparse++) {
  case 'd':
    nr = getdecchrs(); break;
  case 'o':
    nr = getoctchrs(); break;
  case 'x':
    nr = gethexchrs(2); break;
  case 'u':
    nr = gethexchrs(4); break;
  case 'U':
    nr = gethexchrs(8); break;
  }
  if (nr < 0) {
    // If getting the number fails be backwards compatible: the character
    // is a backslash.
    regparse--;  // undo the increment done in the switch expression
    nr = '\\';
  }
  if (nr > INT_MAX) {
    nr = INT_MAX;  // clamp so that the cast below cannot overflow
  }
  return (int)nr;
}

// Free a compiled regexp program, returned by bt_regcomp().
static void bt_regfree(regprog_T *prog)
{
  xfree(prog);
}

#define ADVANCE_REGINPUT() MB_PTR_ADV(rex.input)

// The arguments from BRACE_LIMITS are stored here.  They are actually local
// to regmatch(), but they are here to reduce the amount of stack space used
// (it can be called recursively many times).
5583 static int64_t bl_minval; 5584 static int64_t bl_maxval; 5585 5586 // Save the input line and position in a regsave_T. 5587 static void reg_save(regsave_T *save, garray_T *gap) 5588 FUNC_ATTR_NONNULL_ALL 5589 { 5590 if (REG_MULTI) { 5591 save->rs_u.pos.col = (colnr_T)(rex.input - rex.line); 5592 save->rs_u.pos.lnum = rex.lnum; 5593 } else { 5594 save->rs_u.ptr = rex.input; 5595 } 5596 save->rs_len = gap->ga_len; 5597 } 5598 5599 // Restore the input line and position from a regsave_T. 5600 static void reg_restore(regsave_T *save, garray_T *gap) 5601 FUNC_ATTR_NONNULL_ALL 5602 { 5603 if (REG_MULTI) { 5604 if (rex.lnum != save->rs_u.pos.lnum) { 5605 // only call reg_getline() when the line number changed to save 5606 // a bit of time 5607 rex.lnum = save->rs_u.pos.lnum; 5608 rex.line = (uint8_t *)reg_getline(rex.lnum); 5609 } 5610 rex.input = rex.line + save->rs_u.pos.col; 5611 } else { 5612 rex.input = save->rs_u.ptr; 5613 } 5614 gap->ga_len = save->rs_len; 5615 } 5616 5617 // Return true if current position is equal to saved position. 5618 static bool reg_save_equal(const regsave_T *save) 5619 FUNC_ATTR_NONNULL_ALL 5620 { 5621 if (REG_MULTI) { 5622 return rex.lnum == save->rs_u.pos.lnum 5623 && rex.input == rex.line + save->rs_u.pos.col; 5624 } 5625 return rex.input == save->rs_u.ptr; 5626 } 5627 5628 // Save the sub-expressions before attempting a match. 5629 #define save_se(savep, posp, pp) \ 5630 REG_MULTI ? save_se_multi((savep), (posp)) : save_se_one((savep), (pp)) 5631 5632 // After a failed match restore the sub-expressions. 5633 #define restore_se(savep, posp, pp) { \ 5634 if (REG_MULTI) \ 5635 *(posp) = (savep)->se_u.pos; \ 5636 else \ 5637 *(pp) = (savep)->se_u.ptr; } 5638 5639 // Tentatively set the sub-expression start to the current position (after 5640 // calling regmatch() they will have changed). Need to save the existing 5641 // values for when there is no match. 
// Use save_se() to pick the position variant (save_se_multi()) or the
// pointer variant (save_se_one()), depending on REG_MULTI.
//
// save_se_multi(): store the old "*posp" in "savep", then set "*posp" to the
// current line number and column.
static void save_se_multi(save_se_T *savep, lpos_T *posp)
{
  savep->se_u.pos = *posp;
  posp->lnum = rex.lnum;
  posp->col = (colnr_T)(rex.input - rex.line);
}

// save_se_one(): store the old "*pp" in "savep", then set "*pp" to the
// current input pointer.
static void save_se_one(save_se_T *savep, uint8_t **pp)
{
  savep->se_u.ptr = *pp;
  *pp = rex.input;
}

/// regrepeat - repeatedly match something simple, return how many.
/// Advances rex.input (and rex.lnum) to just after the matched chars.
///
/// @param maxcount  maximum number of matches allowed
static int regrepeat(uint8_t *p, int64_t maxcount)
{
  int64_t count = 0;
  uint8_t *opnd;
  int mask;
  int testval = 0;

  uint8_t *scan = rex.input;  // Make local copy of rex.input for speed.
  opnd = OPERAND(p);
  switch (OP(p)) {
  case ANY:
  case ANY + ADD_NL:
    while (count < maxcount) {
      // Matching anything means we continue until end-of-line (or
      // end-of-file for ANY + ADD_NL), only limited by maxcount.
5676 while (*scan != NUL && count < maxcount) { 5677 count++; 5678 MB_PTR_ADV(scan); 5679 } 5680 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline 5681 || rex.reg_line_lbr || count == maxcount) { 5682 break; 5683 } 5684 count++; // count the line-break 5685 reg_nextline(); 5686 scan = rex.input; 5687 if (got_int) { 5688 break; 5689 } 5690 } 5691 break; 5692 5693 case IDENT: 5694 case IDENT + ADD_NL: 5695 testval = 1; 5696 FALLTHROUGH; 5697 case SIDENT: 5698 case SIDENT + ADD_NL: 5699 while (count < maxcount) { 5700 if (vim_isIDc(utf_ptr2char((char *)scan)) && (testval || !ascii_isdigit(*scan))) { 5701 MB_PTR_ADV(scan); 5702 } else if (*scan == NUL) { 5703 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline 5704 || rex.reg_line_lbr) { 5705 break; 5706 } 5707 reg_nextline(); 5708 scan = rex.input; 5709 if (got_int) { 5710 break; 5711 } 5712 } else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p))) { 5713 scan++; 5714 } else { 5715 break; 5716 } 5717 count++; 5718 } 5719 break; 5720 5721 case KWORD: 5722 case KWORD + ADD_NL: 5723 testval = 1; 5724 FALLTHROUGH; 5725 case SKWORD: 5726 case SKWORD + ADD_NL: 5727 while (count < maxcount) { 5728 if (vim_iswordp_buf((char *)scan, rex.reg_buf) 5729 && (testval || !ascii_isdigit(*scan))) { 5730 MB_PTR_ADV(scan); 5731 } else if (*scan == NUL) { 5732 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline 5733 || rex.reg_line_lbr) { 5734 break; 5735 } 5736 reg_nextline(); 5737 scan = rex.input; 5738 if (got_int) { 5739 break; 5740 } 5741 } else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p))) { 5742 scan++; 5743 } else { 5744 break; 5745 } 5746 count++; 5747 } 5748 break; 5749 5750 case FNAME: 5751 case FNAME + ADD_NL: 5752 testval = 1; 5753 FALLTHROUGH; 5754 case SFNAME: 5755 case SFNAME + ADD_NL: 5756 while (count < maxcount) { 5757 if (vim_isfilec(utf_ptr2char((char *)scan)) && (testval || !ascii_isdigit(*scan))) { 5758 MB_PTR_ADV(scan); 5759 } else if (*scan == NUL) { 5760 if 
(!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline 5761 || rex.reg_line_lbr) { 5762 break; 5763 } 5764 reg_nextline(); 5765 scan = rex.input; 5766 if (got_int) { 5767 break; 5768 } 5769 } else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p))) { 5770 scan++; 5771 } else { 5772 break; 5773 } 5774 count++; 5775 } 5776 break; 5777 5778 case PRINT: 5779 case PRINT + ADD_NL: 5780 testval = 1; 5781 FALLTHROUGH; 5782 case SPRINT: 5783 case SPRINT + ADD_NL: 5784 while (count < maxcount) { 5785 if (*scan == NUL) { 5786 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline 5787 || rex.reg_line_lbr) { 5788 break; 5789 } 5790 reg_nextline(); 5791 scan = rex.input; 5792 if (got_int) { 5793 break; 5794 } 5795 } else if (vim_isprintc(utf_ptr2char((char *)scan)) == 1 5796 && (testval || !ascii_isdigit(*scan))) { 5797 MB_PTR_ADV(scan); 5798 } else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p))) { 5799 scan++; 5800 } else { 5801 break; 5802 } 5803 count++; 5804 } 5805 break; 5806 5807 case WHITE: 5808 case WHITE + ADD_NL: 5809 testval = mask = RI_WHITE; 5810 do_class: 5811 while (count < maxcount) { 5812 int l; 5813 if (*scan == NUL) { 5814 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline 5815 || rex.reg_line_lbr) { 5816 break; 5817 } 5818 reg_nextline(); 5819 scan = rex.input; 5820 if (got_int) { 5821 break; 5822 } 5823 } else if ((l = utfc_ptr2len((char *)scan)) > 1) { 5824 if (testval != 0) { 5825 break; 5826 } 5827 scan += l; 5828 } else if ((class_tab[*scan] & mask) == testval) { 5829 scan++; 5830 } else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p))) { 5831 scan++; 5832 } else { 5833 break; 5834 } 5835 count++; 5836 } 5837 break; 5838 5839 case NWHITE: 5840 case NWHITE + ADD_NL: 5841 mask = RI_WHITE; 5842 goto do_class; 5843 case DIGIT: 5844 case DIGIT + ADD_NL: 5845 testval = mask = RI_DIGIT; 5846 goto do_class; 5847 case NDIGIT: 5848 case NDIGIT + ADD_NL: 5849 mask = RI_DIGIT; 5850 goto do_class; 5851 case HEX: 5852 
case HEX + ADD_NL: 5853 testval = mask = RI_HEX; 5854 goto do_class; 5855 case NHEX: 5856 case NHEX + ADD_NL: 5857 mask = RI_HEX; 5858 goto do_class; 5859 case OCTAL: 5860 case OCTAL + ADD_NL: 5861 testval = mask = RI_OCTAL; 5862 goto do_class; 5863 case NOCTAL: 5864 case NOCTAL + ADD_NL: 5865 mask = RI_OCTAL; 5866 goto do_class; 5867 case WORD: 5868 case WORD + ADD_NL: 5869 testval = mask = RI_WORD; 5870 goto do_class; 5871 case NWORD: 5872 case NWORD + ADD_NL: 5873 mask = RI_WORD; 5874 goto do_class; 5875 case HEAD: 5876 case HEAD + ADD_NL: 5877 testval = mask = RI_HEAD; 5878 goto do_class; 5879 case NHEAD: 5880 case NHEAD + ADD_NL: 5881 mask = RI_HEAD; 5882 goto do_class; 5883 case ALPHA: 5884 case ALPHA + ADD_NL: 5885 testval = mask = RI_ALPHA; 5886 goto do_class; 5887 case NALPHA: 5888 case NALPHA + ADD_NL: 5889 mask = RI_ALPHA; 5890 goto do_class; 5891 case LOWER: 5892 case LOWER + ADD_NL: 5893 testval = mask = RI_LOWER; 5894 goto do_class; 5895 case NLOWER: 5896 case NLOWER + ADD_NL: 5897 mask = RI_LOWER; 5898 goto do_class; 5899 case UPPER: 5900 case UPPER + ADD_NL: 5901 testval = mask = RI_UPPER; 5902 goto do_class; 5903 case NUPPER: 5904 case NUPPER + ADD_NL: 5905 mask = RI_UPPER; 5906 goto do_class; 5907 5908 case EXACTLY: { 5909 int cu, cl; 5910 5911 // This doesn't do a multi-byte character, because a MULTIBYTECODE 5912 // would have been used for it. It does handle single-byte 5913 // characters, such as latin1. 5914 if (rex.reg_ic) { 5915 cu = mb_toupper(*opnd); 5916 cl = mb_tolower(*opnd); 5917 while (count < maxcount && (*scan == cu || *scan == cl)) { 5918 count++; 5919 scan++; 5920 } 5921 } else { 5922 cu = *opnd; 5923 while (count < maxcount && *scan == cu) { 5924 count++; 5925 scan++; 5926 } 5927 } 5928 break; 5929 } 5930 5931 case MULTIBYTECODE: { 5932 int i, len, cf = 0; 5933 5934 // Safety check (just in case 'encoding' was changed since 5935 // compiling the program). 
5936 if ((len = utfc_ptr2len((char *)opnd)) > 1) { 5937 if (rex.reg_ic) { 5938 cf = utf_fold(utf_ptr2char((char *)opnd)); 5939 } 5940 while (count < maxcount && utfc_ptr2len((char *)scan) >= len) { 5941 for (i = 0; i < len; i++) { 5942 if (opnd[i] != scan[i]) { 5943 break; 5944 } 5945 } 5946 if (i < len && (!rex.reg_ic 5947 || utf_fold(utf_ptr2char((char *)scan)) != cf)) { 5948 break; 5949 } 5950 scan += len; 5951 count++; 5952 } 5953 } 5954 } 5955 break; 5956 5957 case ANYOF: 5958 case ANYOF + ADD_NL: 5959 testval = 1; 5960 FALLTHROUGH; 5961 5962 case ANYBUT: 5963 case ANYBUT + ADD_NL: 5964 while (count < maxcount) { 5965 int len; 5966 if (*scan == NUL) { 5967 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline 5968 || rex.reg_line_lbr) { 5969 break; 5970 } 5971 reg_nextline(); 5972 scan = rex.input; 5973 if (got_int) { 5974 break; 5975 } 5976 } else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p))) { 5977 scan++; 5978 } else if ((len = utfc_ptr2len((char *)scan)) > 1) { 5979 if ((cstrchr((char *)opnd, utf_ptr2char((char *)scan)) == NULL) == testval) { 5980 break; 5981 } 5982 scan += len; 5983 } else { 5984 if ((cstrchr((char *)opnd, *scan) == NULL) == testval) { 5985 break; 5986 } 5987 scan++; 5988 } 5989 count++; 5990 } 5991 break; 5992 5993 case NEWL: 5994 while (count < maxcount 5995 && ((*scan == NUL && rex.lnum <= rex.reg_maxline && !rex.reg_line_lbr 5996 && REG_MULTI) || (*scan == '\n' && rex.reg_line_lbr))) { 5997 count++; 5998 if (rex.reg_line_lbr) { 5999 ADVANCE_REGINPUT(); 6000 } else { 6001 reg_nextline(); 6002 } 6003 scan = rex.input; 6004 if (got_int) { 6005 break; 6006 } 6007 } 6008 break; 6009 6010 default: // Oh dear. Called inappropriately. 6011 iemsg(_(e_re_corr)); 6012 #ifdef REGEXP_DEBUG 6013 printf("Called regrepeat with op code %d\n", OP(p)); 6014 #endif 6015 break; 6016 } 6017 6018 rex.input = scan; 6019 6020 return (int)count; 6021 } 6022 6023 // Push an item onto the regstack. 6024 // Returns pointer to new item. 
Returns NULL when out of memory. 6025 static regitem_T *regstack_push(regstate_T state, uint8_t *scan) 6026 { 6027 regitem_T *rp; 6028 6029 if ((int64_t)((unsigned)regstack.ga_len >> 10) >= p_mmp) { 6030 emsg(_(e_pattern_uses_more_memory_than_maxmempattern)); 6031 return NULL; 6032 } 6033 ga_grow(®stack, sizeof(regitem_T)); 6034 6035 rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len); 6036 rp->rs_state = state; 6037 rp->rs_scan = scan; 6038 6039 regstack.ga_len += (int)sizeof(regitem_T); 6040 return rp; 6041 } 6042 6043 // Pop an item from the regstack. 6044 static void regstack_pop(uint8_t **scan) 6045 { 6046 regitem_T *rp; 6047 6048 rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len) - 1; 6049 *scan = rp->rs_scan; 6050 6051 regstack.ga_len -= (int)sizeof(regitem_T); 6052 } 6053 6054 // Save the current subexpr to "bp", so that they can be restored 6055 // later by restore_subexpr(). 6056 static void save_subexpr(regbehind_T *bp) 6057 FUNC_ATTR_NONNULL_ALL 6058 { 6059 // When "rex.need_clear_subexpr" is set we don't need to save the values, only 6060 // remember that this flag needs to be set again when restoring. 6061 bp->save_need_clear_subexpr = rex.need_clear_subexpr; 6062 if (rex.need_clear_subexpr) { 6063 return; 6064 } 6065 6066 for (int i = 0; i < NSUBEXP; i++) { 6067 if (REG_MULTI) { 6068 bp->save_start[i].se_u.pos = rex.reg_startpos[i]; 6069 bp->save_end[i].se_u.pos = rex.reg_endpos[i]; 6070 } else { 6071 bp->save_start[i].se_u.ptr = rex.reg_startp[i]; 6072 bp->save_end[i].se_u.ptr = rex.reg_endp[i]; 6073 } 6074 } 6075 } 6076 6077 // Restore the subexpr from "bp". 6078 static void restore_subexpr(regbehind_T *bp) 6079 FUNC_ATTR_NONNULL_ALL 6080 { 6081 // Only need to restore saved values when they are not to be cleared. 
6082 rex.need_clear_subexpr = bp->save_need_clear_subexpr; 6083 if (rex.need_clear_subexpr) { 6084 return; 6085 } 6086 6087 for (int i = 0; i < NSUBEXP; i++) { 6088 if (REG_MULTI) { 6089 rex.reg_startpos[i] = bp->save_start[i].se_u.pos; 6090 rex.reg_endpos[i] = bp->save_end[i].se_u.pos; 6091 } else { 6092 rex.reg_startp[i] = bp->save_start[i].se_u.ptr; 6093 rex.reg_endp[i] = bp->save_end[i].se_u.ptr; 6094 } 6095 } 6096 } 6097 /// Main matching routine 6098 /// 6099 /// Conceptually the strategy is simple: Check to see whether the current node 6100 /// matches, push an item onto the regstack and loop to see whether the rest 6101 /// matches, and then act accordingly. In practice we make some effort to 6102 /// avoid using the regstack, in particular by going through "ordinary" nodes 6103 /// (that don't need to know whether the rest of the match failed) by a nested 6104 /// loop. 6105 /// 6106 /// @param scan Current node. 6107 /// @param tm timeout limit or NULL 6108 /// @param timed_out flag set on timeout or NULL 6109 /// 6110 /// @return - true when there is a match. Leaves rex.input and rex.lnum 6111 /// just after the last matched character. 6112 /// - false when there is no match. Leaves rex.input and rex.lnum in an 6113 /// undefined state! 6114 static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out) 6115 { 6116 uint8_t *next; // Next node. 6117 int op; 6118 int c; 6119 regitem_T *rp; 6120 int no; 6121 int status; // one of the RA_ values: 6122 int tm_count = 0; 6123 6124 // Make "regstack" and "backpos" empty. They are allocated and freed in 6125 // bt_regexec_both() to reduce malloc()/free() calls. 6126 regstack.ga_len = 0; 6127 backpos.ga_len = 0; 6128 6129 // Repeat until "regstack" is empty. 6130 while (true) { 6131 // Some patterns may take a long time to match, e.g., "\([a-z]\+\)\+Q". 6132 // Allow interrupting them with CTRL-C. 
6133 reg_breakcheck(); 6134 6135 #ifdef REGEXP_DEBUG 6136 if (scan != NULL && regnarrate) { 6137 fprintf(stderr, "%s", (char *)regprop(scan)); 6138 fprintf(stderr, "%s", "(\n"); 6139 } 6140 #endif 6141 6142 // Repeat for items that can be matched sequentially, without using the 6143 // regstack. 6144 while (true) { 6145 if (got_int || scan == NULL) { 6146 status = RA_FAIL; 6147 break; 6148 } 6149 // Check for timeout once in a 100 times to avoid overhead. 6150 if (tm != NULL && ++tm_count == 100) { 6151 tm_count = 0; 6152 if (profile_passed_limit(*tm)) { 6153 if (timed_out != NULL) { 6154 *timed_out = true; 6155 } 6156 status = RA_FAIL; 6157 break; 6158 } 6159 } 6160 status = RA_CONT; 6161 6162 #ifdef REGEXP_DEBUG 6163 if (regnarrate) { 6164 fprintf(stderr, "%s", (char *)regprop(scan)); 6165 fprintf(stderr, "%s", "...\n"); 6166 if (re_extmatch_in != NULL) { 6167 int i; 6168 6169 fprintf(stderr, _("External submatches:\n")); 6170 for (i = 0; i < NSUBEXP; i++) { 6171 fprintf(stderr, "%s", " \""); 6172 if (re_extmatch_in->matches[i] != NULL) { 6173 fprintf(stderr, "%s", (char *)re_extmatch_in->matches[i]); 6174 } 6175 fprintf(stderr, "%s", "\"\n"); 6176 } 6177 } 6178 } 6179 #endif 6180 next = regnext(scan); 6181 6182 op = OP(scan); 6183 // Check for character class with NL added. 
6184 if (!rex.reg_line_lbr && WITH_NL(op) && REG_MULTI 6185 && *rex.input == NUL && rex.lnum <= rex.reg_maxline) { 6186 reg_nextline(); 6187 } else if (rex.reg_line_lbr && WITH_NL(op) && *rex.input == '\n') { 6188 ADVANCE_REGINPUT(); 6189 } else { 6190 if (WITH_NL(op)) { 6191 op -= ADD_NL; 6192 } 6193 c = utf_ptr2char((char *)rex.input); 6194 switch (op) { 6195 case BOL: 6196 if (rex.input != rex.line) { 6197 status = RA_NOMATCH; 6198 } 6199 break; 6200 6201 case EOL: 6202 if (c != NUL) { 6203 status = RA_NOMATCH; 6204 } 6205 break; 6206 6207 case RE_BOF: 6208 // We're not at the beginning of the file when below the first 6209 // line where we started, not at the start of the line or we 6210 // didn't start at the first line of the buffer. 6211 if (rex.lnum != 0 || rex.input != rex.line 6212 || (REG_MULTI && rex.reg_firstlnum > 1)) { 6213 status = RA_NOMATCH; 6214 } 6215 break; 6216 6217 case RE_EOF: 6218 if (rex.lnum != rex.reg_maxline || c != NUL) { 6219 status = RA_NOMATCH; 6220 } 6221 break; 6222 6223 case CURSOR: 6224 // Check if the buffer is in a window and compare the 6225 // rex.reg_win->w_cursor position to the match position. 6226 if (rex.reg_win == NULL 6227 || (rex.lnum + rex.reg_firstlnum != rex.reg_win->w_cursor.lnum) 6228 || ((colnr_T)(rex.input - rex.line) != 6229 rex.reg_win->w_cursor.col)) { 6230 status = RA_NOMATCH; 6231 } 6232 break; 6233 6234 case RE_MARK: 6235 // Compare the mark position to the match position. 6236 { 6237 int mark = OPERAND(scan)[0]; 6238 int cmp = OPERAND(scan)[1]; 6239 pos_T *pos; 6240 size_t col = REG_MULTI ? (size_t)(rex.input - rex.line) : 0; 6241 fmark_T *fm = mark_get(rex.reg_buf, curwin, NULL, kMarkBufLocal, mark); 6242 6243 // Line may have been freed, get it again. 
6244 if (REG_MULTI) { 6245 rex.line = (uint8_t *)reg_getline(rex.lnum); 6246 rex.input = rex.line + col; 6247 } 6248 6249 if (fm == NULL // mark doesn't exist 6250 || fm->mark.lnum <= 0) { // mark isn't set in reg_buf 6251 status = RA_NOMATCH; 6252 } else { 6253 pos = &fm->mark; 6254 const colnr_T pos_col = pos->lnum == rex.lnum + rex.reg_firstlnum 6255 && pos->col == MAXCOL 6256 ? reg_getline_len(pos->lnum - rex.reg_firstlnum) 6257 : pos->col; 6258 6259 if (pos->lnum == rex.lnum + rex.reg_firstlnum 6260 ? (pos_col == (colnr_T)(rex.input - rex.line) 6261 ? (cmp == '<' || cmp == '>') 6262 : (pos_col < (colnr_T)(rex.input - rex.line) 6263 ? cmp != '>' 6264 : cmp != '<')) 6265 : (pos->lnum < rex.lnum + rex.reg_firstlnum 6266 ? cmp != '>' 6267 : cmp != '<')) { 6268 status = RA_NOMATCH; 6269 } 6270 } 6271 } 6272 break; 6273 6274 case RE_VISUAL: 6275 if (!reg_match_visual()) { 6276 status = RA_NOMATCH; 6277 } 6278 break; 6279 6280 case RE_LNUM: 6281 assert(rex.lnum + rex.reg_firstlnum >= 0 6282 && (uintmax_t)(rex.lnum + rex.reg_firstlnum) <= UINT32_MAX); 6283 if (!REG_MULTI 6284 || !re_num_cmp((uint32_t)(rex.lnum + rex.reg_firstlnum), scan)) { 6285 status = RA_NOMATCH; 6286 } 6287 break; 6288 6289 case RE_COL: 6290 assert(rex.input - rex.line + 1 >= 0 6291 && (uintmax_t)(rex.input - rex.line + 1) <= UINT32_MAX); 6292 if (!re_num_cmp((uint32_t)(rex.input - rex.line + 1), scan)) { 6293 status = RA_NOMATCH; 6294 } 6295 break; 6296 6297 case RE_VCOL: { 6298 win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win; 6299 linenr_T lnum = REG_MULTI ? 
rex.reg_firstlnum + rex.lnum : 1; 6300 if (REG_MULTI && (lnum <= 0 || lnum > wp->w_buffer->b_ml.ml_line_count)) { 6301 lnum = 1; 6302 } 6303 int vcol = win_linetabsize(wp, lnum, (char *)rex.line, 6304 (colnr_T)(rex.input - rex.line)); 6305 if (!re_num_cmp((uint32_t)vcol + 1, scan)) { 6306 status = RA_NOMATCH; 6307 } 6308 break; 6309 } 6310 break; 6311 6312 case BOW: // \<word; rex.input points to w 6313 if (c == NUL) { // Can't match at end of line 6314 status = RA_NOMATCH; 6315 } else { 6316 // Get class of current and previous char (if it exists). 6317 const int this_class = 6318 mb_get_class_tab((char *)rex.input, rex.reg_buf->b_chartab); 6319 if (this_class <= 1) { 6320 status = RA_NOMATCH; // Not on a word at all. 6321 } else if (reg_prev_class() == this_class) { 6322 status = RA_NOMATCH; // Previous char is in same word. 6323 } 6324 } 6325 break; 6326 6327 case EOW: // word\>; rex.input points after d 6328 if (rex.input == rex.line) { // Can't match at start of line 6329 status = RA_NOMATCH; 6330 } else { 6331 int this_class, prev_class; 6332 6333 // Get class of current and previous char (if it exists). 6334 this_class = mb_get_class_tab((char *)rex.input, rex.reg_buf->b_chartab); 6335 prev_class = reg_prev_class(); 6336 if (this_class == prev_class 6337 || prev_class == 0 || prev_class == 1) { 6338 status = RA_NOMATCH; 6339 } 6340 } 6341 break; // Matched with EOW 6342 6343 case ANY: 6344 // ANY does not match new lines. 
6345 if (c == NUL) { 6346 status = RA_NOMATCH; 6347 } else { 6348 ADVANCE_REGINPUT(); 6349 } 6350 break; 6351 6352 case IDENT: 6353 if (!vim_isIDc(c)) { 6354 status = RA_NOMATCH; 6355 } else { 6356 ADVANCE_REGINPUT(); 6357 } 6358 break; 6359 6360 case SIDENT: 6361 if (ascii_isdigit(*rex.input) || !vim_isIDc(c)) { 6362 status = RA_NOMATCH; 6363 } else { 6364 ADVANCE_REGINPUT(); 6365 } 6366 break; 6367 6368 case KWORD: 6369 if (!vim_iswordp_buf((char *)rex.input, rex.reg_buf)) { 6370 status = RA_NOMATCH; 6371 } else { 6372 ADVANCE_REGINPUT(); 6373 } 6374 break; 6375 6376 case SKWORD: 6377 if (ascii_isdigit(*rex.input) 6378 || !vim_iswordp_buf((char *)rex.input, rex.reg_buf)) { 6379 status = RA_NOMATCH; 6380 } else { 6381 ADVANCE_REGINPUT(); 6382 } 6383 break; 6384 6385 case FNAME: 6386 if (!vim_isfilec(c)) { 6387 status = RA_NOMATCH; 6388 } else { 6389 ADVANCE_REGINPUT(); 6390 } 6391 break; 6392 6393 case SFNAME: 6394 if (ascii_isdigit(*rex.input) || !vim_isfilec(c)) { 6395 status = RA_NOMATCH; 6396 } else { 6397 ADVANCE_REGINPUT(); 6398 } 6399 break; 6400 6401 case PRINT: 6402 if (!vim_isprintc(utf_ptr2char((char *)rex.input))) { 6403 status = RA_NOMATCH; 6404 } else { 6405 ADVANCE_REGINPUT(); 6406 } 6407 break; 6408 6409 case SPRINT: 6410 if (ascii_isdigit(*rex.input) || !vim_isprintc(utf_ptr2char((char *)rex.input))) { 6411 status = RA_NOMATCH; 6412 } else { 6413 ADVANCE_REGINPUT(); 6414 } 6415 break; 6416 6417 case WHITE: 6418 if (!ascii_iswhite(c)) { 6419 status = RA_NOMATCH; 6420 } else { 6421 ADVANCE_REGINPUT(); 6422 } 6423 break; 6424 6425 case NWHITE: 6426 if (c == NUL || ascii_iswhite(c)) { 6427 status = RA_NOMATCH; 6428 } else { 6429 ADVANCE_REGINPUT(); 6430 } 6431 break; 6432 6433 case DIGIT: 6434 if (!ri_digit(c)) { 6435 status = RA_NOMATCH; 6436 } else { 6437 ADVANCE_REGINPUT(); 6438 } 6439 break; 6440 6441 case NDIGIT: 6442 if (c == NUL || ri_digit(c)) { 6443 status = RA_NOMATCH; 6444 } else { 6445 ADVANCE_REGINPUT(); 6446 } 6447 break; 6448 6449 case 
HEX: 6450 if (!ri_hex(c)) { 6451 status = RA_NOMATCH; 6452 } else { 6453 ADVANCE_REGINPUT(); 6454 } 6455 break; 6456 6457 case NHEX: 6458 if (c == NUL || ri_hex(c)) { 6459 status = RA_NOMATCH; 6460 } else { 6461 ADVANCE_REGINPUT(); 6462 } 6463 break; 6464 6465 case OCTAL: 6466 if (!ri_octal(c)) { 6467 status = RA_NOMATCH; 6468 } else { 6469 ADVANCE_REGINPUT(); 6470 } 6471 break; 6472 6473 case NOCTAL: 6474 if (c == NUL || ri_octal(c)) { 6475 status = RA_NOMATCH; 6476 } else { 6477 ADVANCE_REGINPUT(); 6478 } 6479 break; 6480 6481 case WORD: 6482 if (!ri_word(c)) { 6483 status = RA_NOMATCH; 6484 } else { 6485 ADVANCE_REGINPUT(); 6486 } 6487 break; 6488 6489 case NWORD: 6490 if (c == NUL || ri_word(c)) { 6491 status = RA_NOMATCH; 6492 } else { 6493 ADVANCE_REGINPUT(); 6494 } 6495 break; 6496 6497 case HEAD: 6498 if (!ri_head(c)) { 6499 status = RA_NOMATCH; 6500 } else { 6501 ADVANCE_REGINPUT(); 6502 } 6503 break; 6504 6505 case NHEAD: 6506 if (c == NUL || ri_head(c)) { 6507 status = RA_NOMATCH; 6508 } else { 6509 ADVANCE_REGINPUT(); 6510 } 6511 break; 6512 6513 case ALPHA: 6514 if (!ri_alpha(c)) { 6515 status = RA_NOMATCH; 6516 } else { 6517 ADVANCE_REGINPUT(); 6518 } 6519 break; 6520 6521 case NALPHA: 6522 if (c == NUL || ri_alpha(c)) { 6523 status = RA_NOMATCH; 6524 } else { 6525 ADVANCE_REGINPUT(); 6526 } 6527 break; 6528 6529 case LOWER: 6530 if (!ri_lower(c)) { 6531 status = RA_NOMATCH; 6532 } else { 6533 ADVANCE_REGINPUT(); 6534 } 6535 break; 6536 6537 case NLOWER: 6538 if (c == NUL || ri_lower(c)) { 6539 status = RA_NOMATCH; 6540 } else { 6541 ADVANCE_REGINPUT(); 6542 } 6543 break; 6544 6545 case UPPER: 6546 if (!ri_upper(c)) { 6547 status = RA_NOMATCH; 6548 } else { 6549 ADVANCE_REGINPUT(); 6550 } 6551 break; 6552 6553 case NUPPER: 6554 if (c == NUL || ri_upper(c)) { 6555 status = RA_NOMATCH; 6556 } else { 6557 ADVANCE_REGINPUT(); 6558 } 6559 break; 6560 6561 case EXACTLY: { 6562 int len; 6563 uint8_t *opnd; 6564 6565 opnd = OPERAND(scan); 6566 // Inline the 
first byte, for speed. 6567 if (*opnd != *rex.input 6568 && (!rex.reg_ic)) { 6569 status = RA_NOMATCH; 6570 } else if (*opnd == NUL) { 6571 // match empty string always works; happens when "~" is 6572 // empty. 6573 } else { 6574 if (opnd[1] == NUL && !rex.reg_ic) { 6575 len = 1; // matched a single byte above 6576 } else { 6577 // Need to match first byte again for multi-byte. 6578 len = (int)strlen((char *)opnd); 6579 if (cstrncmp((char *)opnd, (char *)rex.input, &len) != 0) { 6580 status = RA_NOMATCH; 6581 } 6582 } 6583 // Check for following composing character, unless %C 6584 // follows (skips over all composing chars). 6585 if (status != RA_NOMATCH 6586 && utf_composinglike((char *)rex.input, (char *)rex.input + len, NULL) 6587 && !rex.reg_icombine 6588 && OP(next) != RE_COMPOSING) { 6589 // raaron: This code makes a composing character get 6590 // ignored, which is the correct behavior (sometimes) 6591 // for voweled Hebrew texts. 6592 status = RA_NOMATCH; 6593 } 6594 if (status != RA_NOMATCH) { 6595 rex.input += len; 6596 } 6597 } 6598 } 6599 break; 6600 6601 case ANYOF: 6602 case ANYBUT: { 6603 uint8_t *q = OPERAND(scan); 6604 6605 if (c == NUL) { 6606 status = RA_NOMATCH; 6607 } else if ((cstrchr((char *)q, c) == NULL) == (op == ANYOF)) { 6608 status = RA_NOMATCH; 6609 } else { // Check following combining characters 6610 int len = utfc_ptr2len((char *)q) - utf_ptr2len((char *)q); 6611 6612 rex.input += utf_ptr2len((char *)rex.input); 6613 q += utf_ptr2len((char *)q); 6614 6615 if (len == 0) { 6616 break; 6617 } 6618 6619 for (int i = 0; i < len; i++) { 6620 if (q[i] != rex.input[i]) { 6621 status = RA_NOMATCH; 6622 break; 6623 } 6624 } 6625 rex.input += len; 6626 } 6627 break; 6628 } 6629 6630 case MULTIBYTECODE: { 6631 int i, len; 6632 6633 const uint8_t *opnd = OPERAND(scan); 6634 // Safety check (just in case 'encoding' was changed since 6635 // compiling the program). 
6636 if ((len = utfc_ptr2len((char *)opnd)) < 2) { 6637 status = RA_NOMATCH; 6638 break; 6639 } 6640 const int opndc = utf_ptr2char((char *)opnd); 6641 if (utf_iscomposing_legacy(opndc)) { 6642 // When only a composing char is given match at any 6643 // position where that composing char appears. 6644 status = RA_NOMATCH; 6645 for (i = 0; rex.input[i] != NUL; 6646 i += utf_ptr2len((char *)rex.input + i)) { 6647 const int inpc = utf_ptr2char((char *)rex.input + i); 6648 if (!utf_iscomposing_legacy(inpc)) { 6649 if (i > 0) { 6650 break; 6651 } 6652 } else if (opndc == inpc) { 6653 // Include all following composing chars. 6654 len = i + utfc_ptr2len((char *)rex.input + i); 6655 status = RA_MATCH; 6656 break; 6657 } 6658 } 6659 } else { 6660 if (cstrncmp((char *)opnd, (char *)rex.input, &len) != 0) { 6661 status = RA_NOMATCH; 6662 break; 6663 } 6664 } 6665 rex.input += len; 6666 } 6667 break; 6668 6669 case RE_COMPOSING: 6670 // Skip composing characters. 6671 while (utf_iscomposing_legacy(utf_ptr2char((char *)rex.input))) { 6672 rex.input += utf_ptr2len((char *)rex.input); 6673 } 6674 break; 6675 6676 case NOTHING: 6677 break; 6678 6679 case BACK: { 6680 int i; 6681 6682 // When we run into BACK we need to check if we don't keep 6683 // looping without matching any input. The second and later 6684 // times a BACK is encountered it fails if the input is still 6685 // at the same position as the previous time. 6686 // The positions are stored in "backpos" and found by the 6687 // current value of "scan", the position in the RE program. 6688 backpos_T *bp = (backpos_T *)backpos.ga_data; 6689 for (i = 0; i < backpos.ga_len; i++) { 6690 if (bp[i].bp_scan == scan) { 6691 break; 6692 } 6693 } 6694 if (i == backpos.ga_len) { 6695 backpos_T *p = GA_APPEND_VIA_PTR(backpos_T, &backpos); 6696 p->bp_scan = scan; 6697 } else if (reg_save_equal(&bp[i].bp_pos)) { 6698 // Still at same position as last time, fail. 
6699 status = RA_NOMATCH; 6700 } 6701 6702 assert(status != RA_FAIL); 6703 if (status != RA_NOMATCH) { 6704 reg_save(&bp[i].bp_pos, &backpos); 6705 } 6706 } 6707 break; 6708 6709 case MOPEN + 0: // Match start: \zs 6710 case MOPEN + 1: // \( 6711 case MOPEN + 2: 6712 case MOPEN + 3: 6713 case MOPEN + 4: 6714 case MOPEN + 5: 6715 case MOPEN + 6: 6716 case MOPEN + 7: 6717 case MOPEN + 8: 6718 case MOPEN + 9: 6719 no = op - MOPEN; 6720 cleanup_subexpr(); 6721 rp = regstack_push(RS_MOPEN, scan); 6722 if (rp == NULL) { 6723 status = RA_FAIL; 6724 } else { 6725 rp->rs_no = (int16_t)no; 6726 save_se(&rp->rs_un.sesave, &rex.reg_startpos[no], 6727 &rex.reg_startp[no]); 6728 // We simply continue and handle the result when done. 6729 } 6730 break; 6731 6732 case NOPEN: // \%( 6733 case NCLOSE: // \) after \%( 6734 if (regstack_push(RS_NOPEN, scan) == NULL) { 6735 status = RA_FAIL; 6736 } 6737 // We simply continue and handle the result when done. 6738 break; 6739 6740 case ZOPEN + 1: 6741 case ZOPEN + 2: 6742 case ZOPEN + 3: 6743 case ZOPEN + 4: 6744 case ZOPEN + 5: 6745 case ZOPEN + 6: 6746 case ZOPEN + 7: 6747 case ZOPEN + 8: 6748 case ZOPEN + 9: 6749 no = op - ZOPEN; 6750 cleanup_zsubexpr(); 6751 rp = regstack_push(RS_ZOPEN, scan); 6752 if (rp == NULL) { 6753 status = RA_FAIL; 6754 } else { 6755 rp->rs_no = (int16_t)no; 6756 save_se(&rp->rs_un.sesave, ®_startzpos[no], 6757 ®_startzp[no]); 6758 // We simply continue and handle the result when done. 
6759 } 6760 break; 6761 6762 case MCLOSE + 0: // Match end: \ze 6763 case MCLOSE + 1: // \) 6764 case MCLOSE + 2: 6765 case MCLOSE + 3: 6766 case MCLOSE + 4: 6767 case MCLOSE + 5: 6768 case MCLOSE + 6: 6769 case MCLOSE + 7: 6770 case MCLOSE + 8: 6771 case MCLOSE + 9: 6772 no = op - MCLOSE; 6773 cleanup_subexpr(); 6774 rp = regstack_push(RS_MCLOSE, scan); 6775 if (rp == NULL) { 6776 status = RA_FAIL; 6777 } else { 6778 rp->rs_no = (int16_t)no; 6779 save_se(&rp->rs_un.sesave, &rex.reg_endpos[no], &rex.reg_endp[no]); 6780 // We simply continue and handle the result when done. 6781 } 6782 break; 6783 6784 case ZCLOSE + 1: // \) after \z( 6785 case ZCLOSE + 2: 6786 case ZCLOSE + 3: 6787 case ZCLOSE + 4: 6788 case ZCLOSE + 5: 6789 case ZCLOSE + 6: 6790 case ZCLOSE + 7: 6791 case ZCLOSE + 8: 6792 case ZCLOSE + 9: 6793 no = op - ZCLOSE; 6794 cleanup_zsubexpr(); 6795 rp = regstack_push(RS_ZCLOSE, scan); 6796 if (rp == NULL) { 6797 status = RA_FAIL; 6798 } else { 6799 rp->rs_no = (int16_t)no; 6800 save_se(&rp->rs_un.sesave, ®_endzpos[no], 6801 ®_endzp[no]); 6802 // We simply continue and handle the result when done. 6803 } 6804 break; 6805 6806 case BACKREF + 1: 6807 case BACKREF + 2: 6808 case BACKREF + 3: 6809 case BACKREF + 4: 6810 case BACKREF + 5: 6811 case BACKREF + 6: 6812 case BACKREF + 7: 6813 case BACKREF + 8: 6814 case BACKREF + 9: { 6815 int len; 6816 6817 no = op - BACKREF; 6818 cleanup_subexpr(); 6819 if (!REG_MULTI) { // Single-line regexp 6820 if (rex.reg_startp[no] == NULL || rex.reg_endp[no] == NULL) { 6821 // Backref was not set: Match an empty string. 6822 len = 0; 6823 } else { 6824 // Compare current input with back-ref in the same line. 
6825 len = (int)(rex.reg_endp[no] - rex.reg_startp[no]); 6826 if (cstrncmp((char *)rex.reg_startp[no], (char *)rex.input, &len) != 0) { 6827 status = RA_NOMATCH; 6828 } 6829 } 6830 } else { // Multi-line regexp 6831 if (rex.reg_startpos[no].lnum < 0 || rex.reg_endpos[no].lnum < 0) { 6832 // Backref was not set: Match an empty string. 6833 len = 0; 6834 } else { 6835 if (rex.reg_startpos[no].lnum == rex.lnum 6836 && rex.reg_endpos[no].lnum == rex.lnum) { 6837 // Compare back-ref within the current line. 6838 len = rex.reg_endpos[no].col - rex.reg_startpos[no].col; 6839 if (cstrncmp((char *)rex.line + rex.reg_startpos[no].col, 6840 (char *)rex.input, &len) != 0) { 6841 status = RA_NOMATCH; 6842 } 6843 } else { 6844 // Messy situation: Need to compare between two lines. 6845 int r = match_with_backref(rex.reg_startpos[no].lnum, 6846 rex.reg_startpos[no].col, 6847 rex.reg_endpos[no].lnum, 6848 rex.reg_endpos[no].col, 6849 &len); 6850 if (r != RA_MATCH) { 6851 status = r; 6852 } 6853 } 6854 } 6855 } 6856 6857 // Matched the backref, skip over it. 6858 rex.input += len; 6859 } 6860 break; 6861 6862 case ZREF + 1: 6863 case ZREF + 2: 6864 case ZREF + 3: 6865 case ZREF + 4: 6866 case ZREF + 5: 6867 case ZREF + 6: 6868 case ZREF + 7: 6869 case ZREF + 8: 6870 case ZREF + 9: 6871 cleanup_zsubexpr(); 6872 no = op - ZREF; 6873 if (re_extmatch_in != NULL 6874 && re_extmatch_in->matches[no] != NULL) { 6875 int len = (int)strlen((char *)re_extmatch_in->matches[no]); 6876 if (cstrncmp((char *)re_extmatch_in->matches[no], (char *)rex.input, &len) != 0) { 6877 status = RA_NOMATCH; 6878 } else { 6879 rex.input += len; 6880 } 6881 } else { 6882 // Backref was not set: Match an empty string. 6883 } 6884 break; 6885 6886 case BRANCH: 6887 if (OP(next) != BRANCH) { // No choice. 6888 next = OPERAND(scan); // Avoid recursion. 
6889 } else { 6890 rp = regstack_push(RS_BRANCH, scan); 6891 if (rp == NULL) { 6892 status = RA_FAIL; 6893 } else { 6894 status = RA_BREAK; // rest is below 6895 } 6896 } 6897 break; 6898 6899 case BRACE_LIMITS: 6900 if (OP(next) == BRACE_SIMPLE) { 6901 bl_minval = OPERAND_MIN(scan); 6902 bl_maxval = OPERAND_MAX(scan); 6903 } else if (OP(next) >= BRACE_COMPLEX 6904 && OP(next) < BRACE_COMPLEX + 10) { 6905 no = OP(next) - BRACE_COMPLEX; 6906 brace_min[no] = OPERAND_MIN(scan); 6907 brace_max[no] = OPERAND_MAX(scan); 6908 brace_count[no] = 0; 6909 } else { 6910 internal_error("BRACE_LIMITS"); 6911 status = RA_FAIL; 6912 } 6913 break; 6914 6915 case BRACE_COMPLEX + 0: 6916 case BRACE_COMPLEX + 1: 6917 case BRACE_COMPLEX + 2: 6918 case BRACE_COMPLEX + 3: 6919 case BRACE_COMPLEX + 4: 6920 case BRACE_COMPLEX + 5: 6921 case BRACE_COMPLEX + 6: 6922 case BRACE_COMPLEX + 7: 6923 case BRACE_COMPLEX + 8: 6924 case BRACE_COMPLEX + 9: 6925 no = op - BRACE_COMPLEX; 6926 brace_count[no]++; 6927 6928 // If not matched enough times yet, try one more 6929 if (brace_count[no] <= (brace_min[no] <= brace_max[no] 6930 ? brace_min[no] : brace_max[no])) { 6931 rp = regstack_push(RS_BRCPLX_MORE, scan); 6932 if (rp == NULL) { 6933 status = RA_FAIL; 6934 } else { 6935 rp->rs_no = (int16_t)no; 6936 reg_save(&rp->rs_un.regsave, &backpos); 6937 next = OPERAND(scan); 6938 // We continue and handle the result when done. 6939 } 6940 break; 6941 } 6942 6943 // If matched enough times, may try matching some more 6944 if (brace_min[no] <= brace_max[no]) { 6945 // Range is the normal way around, use longest match 6946 if (brace_count[no] <= brace_max[no]) { 6947 rp = regstack_push(RS_BRCPLX_LONG, scan); 6948 if (rp == NULL) { 6949 status = RA_FAIL; 6950 } else { 6951 rp->rs_no = (int16_t)no; 6952 reg_save(&rp->rs_un.regsave, &backpos); 6953 next = OPERAND(scan); 6954 // We continue and handle the result when done. 
6955 } 6956 } 6957 } else { 6958 // Range is backwards, use shortest match first 6959 if (brace_count[no] <= brace_min[no]) { 6960 rp = regstack_push(RS_BRCPLX_SHORT, scan); 6961 if (rp == NULL) { 6962 status = RA_FAIL; 6963 } else { 6964 reg_save(&rp->rs_un.regsave, &backpos); 6965 // We continue and handle the result when done. 6966 } 6967 } 6968 } 6969 break; 6970 6971 case BRACE_SIMPLE: 6972 case STAR: 6973 case PLUS: { 6974 regstar_T rst; 6975 6976 // Lookahead to avoid useless match attempts when we know 6977 // what character comes next. 6978 if (OP(next) == EXACTLY) { 6979 rst.nextb = *OPERAND(next); 6980 if (rex.reg_ic) { 6981 if (mb_isupper(rst.nextb)) { 6982 rst.nextb_ic = mb_tolower(rst.nextb); 6983 } else { 6984 rst.nextb_ic = mb_toupper(rst.nextb); 6985 } 6986 } else { 6987 rst.nextb_ic = rst.nextb; 6988 } 6989 } else { 6990 rst.nextb = NUL; 6991 rst.nextb_ic = NUL; 6992 } 6993 if (op != BRACE_SIMPLE) { 6994 rst.minval = (op == STAR) ? 0 : 1; 6995 rst.maxval = MAX_LIMIT; 6996 } else { 6997 rst.minval = bl_minval; 6998 rst.maxval = bl_maxval; 6999 } 7000 7001 // When maxval > minval, try matching as much as possible, up 7002 // to maxval. When maxval < minval, try matching at least the 7003 // minimal number (since the range is backwards, that's also 7004 // maxval!). 7005 rst.count = regrepeat(OPERAND(scan), rst.maxval); 7006 if (got_int) { 7007 status = RA_FAIL; 7008 break; 7009 } 7010 if (rst.minval <= rst.maxval 7011 ? rst.count >= rst.minval : rst.count >= rst.maxval) { 7012 // It could match. Prepare for trying to match what 7013 // follows. The code is below. Parameters are stored in 7014 // a regstar_T on the regstack. 7015 if ((int64_t)((unsigned)regstack.ga_len >> 10) >= p_mmp) { 7016 emsg(_(e_pattern_uses_more_memory_than_maxmempattern)); 7017 status = RA_FAIL; 7018 } else { 7019 ga_grow(®stack, sizeof(regstar_T)); 7020 regstack.ga_len += (int)sizeof(regstar_T); 7021 rp = regstack_push(rst.minval <= rst.maxval ? 
RS_STAR_LONG : RS_STAR_SHORT, scan); 7022 if (rp == NULL) { 7023 status = RA_FAIL; 7024 } else { 7025 *(((regstar_T *)rp) - 1) = rst; 7026 status = RA_BREAK; // skip the restore bits 7027 } 7028 } 7029 } else { 7030 status = RA_NOMATCH; 7031 } 7032 } 7033 break; 7034 7035 case NOMATCH: 7036 case MATCH: 7037 case SUBPAT: 7038 rp = regstack_push(RS_NOMATCH, scan); 7039 if (rp == NULL) { 7040 status = RA_FAIL; 7041 } else { 7042 rp->rs_no = (int16_t)op; 7043 reg_save(&rp->rs_un.regsave, &backpos); 7044 next = OPERAND(scan); 7045 // We continue and handle the result when done. 7046 } 7047 break; 7048 7049 case BEHIND: 7050 case NOBEHIND: 7051 // Need a bit of room to store extra positions. 7052 if ((int64_t)((unsigned)regstack.ga_len >> 10) >= p_mmp) { 7053 emsg(_(e_pattern_uses_more_memory_than_maxmempattern)); 7054 status = RA_FAIL; 7055 } else { 7056 ga_grow(®stack, sizeof(regbehind_T)); 7057 regstack.ga_len += (int)sizeof(regbehind_T); 7058 rp = regstack_push(RS_BEHIND1, scan); 7059 if (rp == NULL) { 7060 status = RA_FAIL; 7061 } else { 7062 // Need to save the subexpr to be able to restore them 7063 // when there is a match but we don't use it. 7064 save_subexpr(((regbehind_T *)rp) - 1); 7065 7066 rp->rs_no = (int16_t)op; 7067 reg_save(&rp->rs_un.regsave, &backpos); 7068 // First try if what follows matches. If it does then we 7069 // check the behind match by looping. 
7070 } 7071 } 7072 break; 7073 7074 case BHPOS: 7075 if (REG_MULTI) { 7076 if (behind_pos.rs_u.pos.col != (colnr_T)(rex.input - rex.line) 7077 || behind_pos.rs_u.pos.lnum != rex.lnum) { 7078 status = RA_NOMATCH; 7079 } 7080 } else if (behind_pos.rs_u.ptr != rex.input) { 7081 status = RA_NOMATCH; 7082 } 7083 break; 7084 7085 case NEWL: 7086 if ((c != NUL || !REG_MULTI || rex.lnum > rex.reg_maxline 7087 || rex.reg_line_lbr) && (c != '\n' || !rex.reg_line_lbr)) { 7088 status = RA_NOMATCH; 7089 } else if (rex.reg_line_lbr) { 7090 ADVANCE_REGINPUT(); 7091 } else { 7092 reg_nextline(); 7093 } 7094 break; 7095 7096 case END: 7097 status = RA_MATCH; // Success! 7098 break; 7099 7100 default: 7101 iemsg(_(e_re_corr)); 7102 #ifdef REGEXP_DEBUG 7103 printf("Illegal op code %d\n", op); 7104 #endif 7105 status = RA_FAIL; 7106 break; 7107 } 7108 } 7109 7110 // If we can't continue sequentially, break the inner loop. 7111 if (status != RA_CONT) { 7112 break; 7113 } 7114 7115 // Continue in inner loop, advance to next item. 7116 scan = next; 7117 } // end of inner loop 7118 7119 // If there is something on the regstack execute the code for the state. 7120 // If the state is popped then loop and use the older state. 7121 while (!GA_EMPTY(®stack) && status != RA_FAIL) { 7122 rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len) - 1; 7123 switch (rp->rs_state) { 7124 case RS_NOPEN: 7125 // Result is passed on as-is, simply pop the state. 7126 regstack_pop(&scan); 7127 break; 7128 7129 case RS_MOPEN: 7130 // Pop the state. Restore pointers when there is no match. 7131 if (status == RA_NOMATCH) { 7132 restore_se(&rp->rs_un.sesave, &rex.reg_startpos[rp->rs_no], 7133 &rex.reg_startp[rp->rs_no]); 7134 } 7135 regstack_pop(&scan); 7136 break; 7137 7138 case RS_ZOPEN: 7139 // Pop the state. Restore pointers when there is no match. 
7140 if (status == RA_NOMATCH) { 7141 restore_se(&rp->rs_un.sesave, ®_startzpos[rp->rs_no], 7142 ®_startzp[rp->rs_no]); 7143 } 7144 regstack_pop(&scan); 7145 break; 7146 7147 case RS_MCLOSE: 7148 // Pop the state. Restore pointers when there is no match. 7149 if (status == RA_NOMATCH) { 7150 restore_se(&rp->rs_un.sesave, &rex.reg_endpos[rp->rs_no], 7151 &rex.reg_endp[rp->rs_no]); 7152 } 7153 regstack_pop(&scan); 7154 break; 7155 7156 case RS_ZCLOSE: 7157 // Pop the state. Restore pointers when there is no match. 7158 if (status == RA_NOMATCH) { 7159 restore_se(&rp->rs_un.sesave, ®_endzpos[rp->rs_no], 7160 ®_endzp[rp->rs_no]); 7161 } 7162 regstack_pop(&scan); 7163 break; 7164 7165 case RS_BRANCH: 7166 if (status == RA_MATCH) { 7167 // this branch matched, use it 7168 regstack_pop(&scan); 7169 } else { 7170 if (status != RA_BREAK) { 7171 // After a non-matching branch: try next one. 7172 reg_restore(&rp->rs_un.regsave, &backpos); 7173 scan = rp->rs_scan; 7174 } 7175 if (scan == NULL || OP(scan) != BRANCH) { 7176 // no more branches, didn't find a match 7177 status = RA_NOMATCH; 7178 regstack_pop(&scan); 7179 } else { 7180 // Prepare to try a branch. 7181 rp->rs_scan = regnext(scan); 7182 reg_save(&rp->rs_un.regsave, &backpos); 7183 scan = OPERAND(scan); 7184 } 7185 } 7186 break; 7187 7188 case RS_BRCPLX_MORE: 7189 // Pop the state. Restore pointers when there is no match. 7190 if (status == RA_NOMATCH) { 7191 reg_restore(&rp->rs_un.regsave, &backpos); 7192 brace_count[rp->rs_no]--; // decrement match count 7193 } 7194 regstack_pop(&scan); 7195 break; 7196 7197 case RS_BRCPLX_LONG: 7198 // Pop the state. Restore pointers when there is no match. 7199 if (status == RA_NOMATCH) { 7200 // There was no match, but we did find enough matches. 
7201 reg_restore(&rp->rs_un.regsave, &backpos); 7202 brace_count[rp->rs_no]--; 7203 // continue with the items after "\{}" 7204 status = RA_CONT; 7205 } 7206 regstack_pop(&scan); 7207 if (status == RA_CONT) { 7208 scan = regnext(scan); 7209 } 7210 break; 7211 7212 case RS_BRCPLX_SHORT: 7213 // Pop the state. Restore pointers when there is no match. 7214 if (status == RA_NOMATCH) { 7215 // There was no match, try to match one more item. 7216 reg_restore(&rp->rs_un.regsave, &backpos); 7217 } 7218 regstack_pop(&scan); 7219 if (status == RA_NOMATCH) { 7220 scan = OPERAND(scan); 7221 status = RA_CONT; 7222 } 7223 break; 7224 7225 case RS_NOMATCH: 7226 // Pop the state. If the operand matches for NOMATCH or 7227 // doesn't match for MATCH/SUBPAT, we fail. Otherwise backup, 7228 // except for SUBPAT, and continue with the next item. 7229 if (status == (rp->rs_no == NOMATCH ? RA_MATCH : RA_NOMATCH)) { 7230 status = RA_NOMATCH; 7231 } else { 7232 status = RA_CONT; 7233 if (rp->rs_no != SUBPAT) { // zero-width 7234 reg_restore(&rp->rs_un.regsave, &backpos); 7235 } 7236 } 7237 regstack_pop(&scan); 7238 if (status == RA_CONT) { 7239 scan = regnext(scan); 7240 } 7241 break; 7242 7243 case RS_BEHIND1: 7244 if (status == RA_NOMATCH) { 7245 regstack_pop(&scan); 7246 regstack.ga_len -= (int)sizeof(regbehind_T); 7247 } else { 7248 // The stuff after BEHIND/NOBEHIND matches. Now try if 7249 // the behind part does (not) match before the current 7250 // position in the input. This must be done at every 7251 // position in the input and checking if the match ends at 7252 // the current position. 7253 7254 // save the position after the found match for next 7255 reg_save(&(((regbehind_T *)rp) - 1)->save_after, &backpos); 7256 7257 // Start looking for a match with operand at the current 7258 // position. Go back one character until we find the 7259 // result, hitting the start of the line or the previous 7260 // line (for multi-line matching). 
7261 // Set behind_pos to where the match should end, BHPOS 7262 // will match it. Save the current value. 7263 (((regbehind_T *)rp) - 1)->save_behind = behind_pos; 7264 behind_pos = rp->rs_un.regsave; 7265 7266 rp->rs_state = RS_BEHIND2; 7267 7268 reg_restore(&rp->rs_un.regsave, &backpos); 7269 scan = OPERAND(rp->rs_scan) + 4; 7270 } 7271 break; 7272 7273 case RS_BEHIND2: 7274 // Looping for BEHIND / NOBEHIND match. 7275 if (status == RA_MATCH && reg_save_equal(&behind_pos)) { 7276 // found a match that ends where "next" started 7277 behind_pos = (((regbehind_T *)rp) - 1)->save_behind; 7278 if (rp->rs_no == BEHIND) { 7279 reg_restore(&(((regbehind_T *)rp) - 1)->save_after, 7280 &backpos); 7281 } else { 7282 // But we didn't want a match. Need to restore the 7283 // subexpr, because what follows matched, so they have 7284 // been set. 7285 status = RA_NOMATCH; 7286 restore_subexpr(((regbehind_T *)rp) - 1); 7287 } 7288 regstack_pop(&scan); 7289 regstack.ga_len -= (int)sizeof(regbehind_T); 7290 } else { 7291 int64_t limit; 7292 7293 // No match or a match that doesn't end where we want it: Go 7294 // back one character. May go to previous line once. 7295 no = OK; 7296 limit = OPERAND_MIN(rp->rs_scan); 7297 if (REG_MULTI) { 7298 if (limit > 0 7299 && ((rp->rs_un.regsave.rs_u.pos.lnum 7300 < behind_pos.rs_u.pos.lnum 7301 ? 
(colnr_T)strlen((char *)rex.line) 7302 : behind_pos.rs_u.pos.col) 7303 - rp->rs_un.regsave.rs_u.pos.col >= limit)) { 7304 no = FAIL; 7305 } else if (rp->rs_un.regsave.rs_u.pos.col == 0) { 7306 if (rp->rs_un.regsave.rs_u.pos.lnum 7307 < behind_pos.rs_u.pos.lnum 7308 || reg_getline(--rp->rs_un.regsave.rs_u.pos.lnum) 7309 == NULL) { 7310 no = FAIL; 7311 } else { 7312 reg_restore(&rp->rs_un.regsave, &backpos); 7313 rp->rs_un.regsave.rs_u.pos.col = 7314 (colnr_T)strlen((char *)rex.line); 7315 } 7316 } else { 7317 const uint8_t *const line = 7318 (uint8_t *)reg_getline(rp->rs_un.regsave.rs_u.pos.lnum); 7319 7320 rp->rs_un.regsave.rs_u.pos.col -= 7321 utf_head_off((char *)line, 7322 (char *)line + rp->rs_un.regsave.rs_u.pos.col - 1) 7323 + 1; 7324 } 7325 } else { 7326 if (rp->rs_un.regsave.rs_u.ptr == rex.line) { 7327 no = FAIL; 7328 } else { 7329 MB_PTR_BACK(rex.line, rp->rs_un.regsave.rs_u.ptr); 7330 if (limit > 0 7331 && (behind_pos.rs_u.ptr - rp->rs_un.regsave.rs_u.ptr) > (ptrdiff_t)limit) { 7332 no = FAIL; 7333 } 7334 } 7335 } 7336 if (no == OK) { 7337 // Advanced, prepare for finding match again. 7338 reg_restore(&rp->rs_un.regsave, &backpos); 7339 scan = OPERAND(rp->rs_scan) + 4; 7340 if (status == RA_MATCH) { 7341 // We did match, so subexpr may have been changed, 7342 // need to restore them for the next try. 7343 status = RA_NOMATCH; 7344 restore_subexpr(((regbehind_T *)rp) - 1); 7345 } 7346 } else { 7347 // Can't advance. For NOBEHIND that's a match. 7348 behind_pos = (((regbehind_T *)rp) - 1)->save_behind; 7349 if (rp->rs_no == NOBEHIND) { 7350 reg_restore(&(((regbehind_T *)rp) - 1)->save_after, 7351 &backpos); 7352 status = RA_MATCH; 7353 } else { 7354 // We do want a proper match. Need to restore the 7355 // subexpr if we had a match, because they may have 7356 // been set. 
7357 if (status == RA_MATCH) { 7358 status = RA_NOMATCH; 7359 restore_subexpr(((regbehind_T *)rp) - 1); 7360 } 7361 } 7362 regstack_pop(&scan); 7363 regstack.ga_len -= (int)sizeof(regbehind_T); 7364 } 7365 } 7366 break; 7367 7368 case RS_STAR_LONG: 7369 case RS_STAR_SHORT: { 7370 regstar_T *rst = ((regstar_T *)rp) - 1; 7371 7372 if (status == RA_MATCH) { 7373 regstack_pop(&scan); 7374 regstack.ga_len -= (int)sizeof(regstar_T); 7375 break; 7376 } 7377 7378 // Tried once already, restore input pointers. 7379 if (status != RA_BREAK) { 7380 reg_restore(&rp->rs_un.regsave, &backpos); 7381 } 7382 7383 // Repeat until we found a position where it could match. 7384 while (true) { 7385 if (status != RA_BREAK) { 7386 // Tried first position already, advance. 7387 if (rp->rs_state == RS_STAR_LONG) { 7388 // Trying for longest match, but couldn't or 7389 // didn't match -- back up one char. 7390 if (--rst->count < rst->minval) { 7391 break; 7392 } 7393 if (rex.input == rex.line) { 7394 // backup to last char of previous line 7395 if (rex.lnum == 0) { 7396 status = RA_NOMATCH; 7397 break; 7398 } 7399 rex.lnum--; 7400 rex.line = (uint8_t *)reg_getline(rex.lnum); 7401 // Just in case regrepeat() didn't count right. 7402 if (rex.line == NULL) { 7403 break; 7404 } 7405 rex.input = rex.line + reg_getline_len(rex.lnum); 7406 reg_breakcheck(); 7407 } else { 7408 MB_PTR_BACK(rex.line, rex.input); 7409 } 7410 } else { 7411 // Range is backwards, use shortest match first. 7412 // Careful: maxval and minval are exchanged! 7413 // Couldn't or didn't match: try advancing one 7414 // char. 7415 if (rst->count == rst->minval 7416 || regrepeat(OPERAND(rp->rs_scan), 1L) == 0) { 7417 break; 7418 } 7419 rst->count++; 7420 } 7421 if (got_int) { 7422 break; 7423 } 7424 } else { 7425 status = RA_NOMATCH; 7426 } 7427 7428 // If it could match, try it. 
7429 if (rst->nextb == NUL || *rex.input == rst->nextb 7430 || *rex.input == rst->nextb_ic) { 7431 reg_save(&rp->rs_un.regsave, &backpos); 7432 scan = regnext(rp->rs_scan); 7433 status = RA_CONT; 7434 break; 7435 } 7436 } 7437 if (status != RA_CONT) { 7438 // Failed. 7439 regstack_pop(&scan); 7440 regstack.ga_len -= (int)sizeof(regstar_T); 7441 status = RA_NOMATCH; 7442 } 7443 } 7444 break; 7445 } 7446 7447 // If we want to continue the inner loop or didn't pop a state 7448 // continue matching loop 7449 if (status == RA_CONT || rp == (regitem_T *) 7450 ((char *)regstack.ga_data + regstack.ga_len) - 1) { 7451 break; 7452 } 7453 } 7454 7455 // May need to continue with the inner loop, starting at "scan". 7456 if (status == RA_CONT) { 7457 continue; 7458 } 7459 7460 // If the regstack is empty or something failed we are done. 7461 if (GA_EMPTY(®stack) || status == RA_FAIL) { 7462 if (scan == NULL) { 7463 // We get here only if there's trouble -- normally "case END" is 7464 // the terminating point. 7465 iemsg(_(e_re_corr)); 7466 #ifdef REGEXP_DEBUG 7467 printf("Premature EOL\n"); 7468 #endif 7469 } 7470 return status == RA_MATCH; 7471 } 7472 } // End of loop until the regstack is empty. 7473 7474 // NOTREACHED 7475 } 7476 7477 /// Try match of "prog" with at rex.line["col"]. 7478 /// 7479 /// @param tm timeout limit or NULL 7480 /// @param timed_out flag set on timeout or NULL 7481 /// 7482 /// @return 0 for failure, or number of lines contained in the match. 7483 static int regtry(bt_regprog_T *prog, colnr_T col, proftime_T *tm, int *timed_out) 7484 { 7485 rex.input = rex.line + col; 7486 rex.need_clear_subexpr = true; 7487 // Clear the external match subpointers if necessaey. 
7488 rex.need_clear_zsubexpr = (prog->reghasz == REX_SET); 7489 7490 if (regmatch(&prog->program[1], tm, timed_out) == 0) { 7491 return 0; 7492 } 7493 7494 cleanup_subexpr(); 7495 if (REG_MULTI) { 7496 if (rex.reg_startpos[0].lnum < 0) { 7497 rex.reg_startpos[0].lnum = 0; 7498 rex.reg_startpos[0].col = col; 7499 } 7500 if (rex.reg_endpos[0].lnum < 0) { 7501 rex.reg_endpos[0].lnum = rex.lnum; 7502 rex.reg_endpos[0].col = (int)(rex.input - rex.line); 7503 } else { 7504 // Use line number of "\ze". 7505 rex.lnum = rex.reg_endpos[0].lnum; 7506 } 7507 } else { 7508 if (rex.reg_startp[0] == NULL) { 7509 rex.reg_startp[0] = rex.line + col; 7510 } 7511 if (rex.reg_endp[0] == NULL) { 7512 rex.reg_endp[0] = rex.input; 7513 } 7514 } 7515 // Package any found \z(...\) matches for export. Default is none. 7516 unref_extmatch(re_extmatch_out); 7517 re_extmatch_out = NULL; 7518 7519 if (prog->reghasz == REX_SET) { 7520 int i; 7521 7522 cleanup_zsubexpr(); 7523 re_extmatch_out = make_extmatch(); 7524 for (i = 0; i < NSUBEXP; i++) { 7525 if (REG_MULTI) { 7526 // Only accept single line matches. 7527 if (reg_startzpos[i].lnum >= 0 7528 && reg_endzpos[i].lnum == reg_startzpos[i].lnum 7529 && reg_endzpos[i].col >= reg_startzpos[i].col) { 7530 re_extmatch_out->matches[i] = 7531 (uint8_t *)xstrnsave(reg_getline(reg_startzpos[i].lnum) + reg_startzpos[i].col, 7532 (size_t)(reg_endzpos[i].col - reg_startzpos[i].col)); 7533 } 7534 } else { 7535 if (reg_startzp[i] != NULL && reg_endzp[i] != NULL) { 7536 re_extmatch_out->matches[i] = 7537 (uint8_t *)xstrnsave((char *)reg_startzp[i], (size_t)(reg_endzp[i] - reg_startzp[i])); 7538 } 7539 } 7540 } 7541 } 7542 return 1 + rex.lnum; 7543 } 7544 7545 /// Match a regexp against a string ("line" points to the string) or multiple 7546 /// lines (if "line" is NULL, use reg_getline()). 
7547 /// 7548 /// @param startcol column to start looking for match 7549 /// @param tm timeout limit or NULL 7550 /// @param timed_out flag set on timeout or NULL 7551 /// 7552 /// @return 0 for failure, or number of lines contained in the match. 7553 static int bt_regexec_both(uint8_t *line, colnr_T startcol, proftime_T *tm, int *timed_out) 7554 { 7555 bt_regprog_T *prog; 7556 uint8_t *s; 7557 colnr_T col = startcol; 7558 int retval = 0; 7559 7560 // Create "regstack" and "backpos" if they are not allocated yet. 7561 // We allocate *_INITIAL amount of bytes first and then set the grow size 7562 // to much bigger value to avoid many malloc calls in case of deep regular 7563 // expressions. 7564 if (regstack.ga_data == NULL) { 7565 // Use an item size of 1 byte, since we push different things 7566 // onto the regstack. 7567 ga_init(®stack, 1, REGSTACK_INITIAL); 7568 ga_grow(®stack, REGSTACK_INITIAL); 7569 ga_set_growsize(®stack, REGSTACK_INITIAL * 8); 7570 } 7571 7572 if (backpos.ga_data == NULL) { 7573 ga_init(&backpos, sizeof(backpos_T), BACKPOS_INITIAL); 7574 ga_grow(&backpos, BACKPOS_INITIAL); 7575 ga_set_growsize(&backpos, BACKPOS_INITIAL * 8); 7576 } 7577 7578 if (REG_MULTI) { 7579 prog = (bt_regprog_T *)rex.reg_mmatch->regprog; 7580 line = (uint8_t *)reg_getline(0); 7581 rex.reg_startpos = rex.reg_mmatch->startpos; 7582 rex.reg_endpos = rex.reg_mmatch->endpos; 7583 } else { 7584 prog = (bt_regprog_T *)rex.reg_match->regprog; 7585 rex.reg_startp = (uint8_t **)rex.reg_match->startp; 7586 rex.reg_endp = (uint8_t **)rex.reg_match->endp; 7587 } 7588 7589 // Be paranoid... 7590 if (prog == NULL || line == NULL) { 7591 iemsg(_(e_null)); 7592 goto theend; 7593 } 7594 7595 // Check validity of program. 7596 if (prog_magic_wrong()) { 7597 goto theend; 7598 } 7599 7600 // If the start column is past the maximum column: no need to try. 
7601 if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol) { 7602 goto theend; 7603 } 7604 7605 // If pattern contains "\c" or "\C": overrule value of rex.reg_ic 7606 if (prog->regflags & RF_ICASE) { 7607 rex.reg_ic = true; 7608 } else if (prog->regflags & RF_NOICASE) { 7609 rex.reg_ic = false; 7610 } 7611 7612 // If pattern contains "\Z" overrule value of rex.reg_icombine 7613 if (prog->regflags & RF_ICOMBINE) { 7614 rex.reg_icombine = true; 7615 } 7616 7617 // If there is a "must appear" string, look for it. 7618 if (prog->regmust != NULL) { 7619 int c = utf_ptr2char((char *)prog->regmust); 7620 s = line + col; 7621 7622 // This is used very often, esp. for ":global". Use two versions of 7623 // the loop to avoid overhead of conditions. 7624 if (!rex.reg_ic) { 7625 while ((s = (uint8_t *)vim_strchr((char *)s, c)) != NULL) { 7626 if (cstrncmp((char *)s, (char *)prog->regmust, &prog->regmlen) == 0) { 7627 break; // Found it. 7628 } 7629 MB_PTR_ADV(s); 7630 } 7631 } else { 7632 while ((s = (uint8_t *)cstrchr((char *)s, c)) != NULL) { 7633 if (cstrncmp((char *)s, (char *)prog->regmust, &prog->regmlen) == 0) { 7634 break; // Found it. 7635 } 7636 MB_PTR_ADV(s); 7637 } 7638 } 7639 if (s == NULL) { // Not present. 7640 goto theend; 7641 } 7642 } 7643 7644 rex.line = line; 7645 rex.lnum = 0; 7646 reg_toolong = false; 7647 7648 // Simplest case: Anchored match need be tried only once. 7649 if (prog->reganch) { 7650 int c = utf_ptr2char((char *)rex.line + col); 7651 if (prog->regstart == NUL 7652 || prog->regstart == c 7653 || (rex.reg_ic 7654 && (utf_fold(prog->regstart) == utf_fold(c) 7655 || (c < 255 && prog->regstart < 255 7656 && mb_tolower(prog->regstart) == mb_tolower(c))))) { 7657 retval = regtry(prog, col, tm, timed_out); 7658 } else { 7659 retval = 0; 7660 } 7661 } else { 7662 int tm_count = 0; 7663 // Messy cases: unanchored match. 7664 while (!got_int) { 7665 if (prog->regstart != NUL) { 7666 // Skip until the char we know it must start with. 
7667 s = (uint8_t *)cstrchr((char *)rex.line + col, prog->regstart); 7668 if (s == NULL) { 7669 retval = 0; 7670 break; 7671 } 7672 col = (int)(s - rex.line); 7673 } 7674 7675 // Check for maximum column to try. 7676 if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol) { 7677 retval = 0; 7678 break; 7679 } 7680 7681 retval = regtry(prog, col, tm, timed_out); 7682 if (retval > 0) { 7683 break; 7684 } 7685 7686 // if not currently on the first line, get it again 7687 if (rex.lnum != 0) { 7688 rex.lnum = 0; 7689 rex.line = (uint8_t *)reg_getline(0); 7690 } 7691 if (rex.line[col] == NUL) { 7692 break; 7693 } 7694 col += utfc_ptr2len((char *)rex.line + col); 7695 // Check for timeout once in a twenty times to avoid overhead. 7696 if (tm != NULL && ++tm_count == 20) { 7697 tm_count = 0; 7698 if (profile_passed_limit(*tm)) { 7699 if (timed_out != NULL) { 7700 *timed_out = true; 7701 } 7702 break; 7703 } 7704 } 7705 } 7706 } 7707 7708 theend: 7709 // Free "reg_tofree" when it's a bit big. 7710 // Free regstack and backpos if they are bigger than their initial size. 7711 if (reg_tofreelen > 400) { 7712 XFREE_CLEAR(reg_tofree); 7713 } 7714 if (regstack.ga_maxlen > REGSTACK_INITIAL) { 7715 ga_clear(®stack); 7716 } 7717 if (backpos.ga_maxlen > BACKPOS_INITIAL) { 7718 ga_clear(&backpos); 7719 } 7720 7721 if (retval > 0) { 7722 // Make sure the end is never before the start. Can happen when \zs 7723 // and \ze are used. 7724 if (REG_MULTI) { 7725 const lpos_T *const start = &rex.reg_mmatch->startpos[0]; 7726 const lpos_T *const end = &rex.reg_mmatch->endpos[0]; 7727 7728 if (end->lnum < start->lnum 7729 || (end->lnum == start->lnum && end->col < start->col)) { 7730 rex.reg_mmatch->endpos[0] = rex.reg_mmatch->startpos[0]; 7731 } 7732 7733 // startpos[0] may be set by "\zs", also return the column where 7734 // the whole pattern matched. 
7735 rex.reg_mmatch->rmm_matchcol = col; 7736 } else { 7737 if (rex.reg_match->endp[0] < rex.reg_match->startp[0]) { 7738 rex.reg_match->endp[0] = rex.reg_match->startp[0]; 7739 } 7740 7741 // startpos[0] may be set by "\zs", also return the column where 7742 // the whole pattern matched. 7743 rex.reg_match->rm_matchcol = col; 7744 } 7745 } 7746 7747 return retval; 7748 } 7749 7750 /// Match a regexp against a string. 7751 /// "rmp->regprog" is a compiled regexp as returned by vim_regcomp(). 7752 /// Uses curbuf for line count and 'iskeyword'. 7753 /// If "line_lbr" is true, consider a "\n" in "line" to be a line break. 7754 /// 7755 /// @param line string to match against 7756 /// @param col column to start looking for match 7757 /// 7758 /// @return 0 for failure, number of lines contained in the match otherwise. 7759 static int bt_regexec_nl(regmatch_T *rmp, uint8_t *line, colnr_T col, bool line_lbr) 7760 { 7761 rex.reg_match = rmp; 7762 rex.reg_mmatch = NULL; 7763 rex.reg_maxline = 0; 7764 rex.reg_line_lbr = line_lbr; 7765 rex.reg_buf = curbuf; 7766 rex.reg_win = NULL; 7767 rex.reg_ic = rmp->rm_ic; 7768 rex.reg_icombine = false; 7769 rex.reg_nobreak = rmp->regprog->re_flags & RE_NOBREAK; 7770 rex.reg_maxcol = 0; 7771 7772 int64_t r = bt_regexec_both(line, col, NULL, NULL); 7773 assert(r <= INT_MAX); 7774 return (int)r; 7775 } 7776 7777 /// Matches a regexp against multiple lines. 7778 /// "rmp->regprog" is a compiled regexp as returned by vim_regcomp(). 7779 /// Uses curbuf for line count and 'iskeyword'. 7780 /// 7781 /// @param win Window in which to search or NULL 7782 /// @param buf Buffer in which to search 7783 /// @param lnum Number of line to start looking for match 7784 /// @param col Column to start looking for match 7785 /// @param tm Timeout limit or NULL 7786 /// 7787 /// @return zero if there is no match and number of lines contained in the match 7788 /// otherwise. 
7789 static int bt_regexec_multi(regmmatch_T *rmp, win_T *win, buf_T *buf, linenr_T lnum, colnr_T col, 7790 proftime_T *tm, int *timed_out) 7791 { 7792 init_regexec_multi(rmp, win, buf, lnum); 7793 return bt_regexec_both(NULL, col, tm, timed_out); 7794 } 7795 7796 // Compare a number with the operand of RE_LNUM, RE_COL or RE_VCOL. 7797 static int re_num_cmp(uint32_t val, const uint8_t *scan) 7798 { 7799 uint32_t n = (uint32_t)OPERAND_MIN(scan); 7800 7801 if (OPERAND_CMP(scan) == '>') { 7802 return val > n; 7803 } 7804 if (OPERAND_CMP(scan) == '<') { 7805 return val < n; 7806 } 7807 return val == n; 7808 } 7809 7810 #ifdef BT_REGEXP_DUMP 7811 7812 // regdump - dump a regexp onto stdout in vaguely comprehensible form 7813 static void regdump(uint8_t *pattern, bt_regprog_T *r) 7814 { 7815 uint8_t *s; 7816 int op = EXACTLY; // Arbitrary non-END op. 7817 uint8_t *next; 7818 uint8_t *end = NULL; 7819 FILE *f; 7820 7821 # ifdef BT_REGEXP_LOG 7822 f = fopen("bt_regexp_log.log", "a"); 7823 # else 7824 f = stdout; 7825 # endif 7826 if (f == NULL) { 7827 return; 7828 } 7829 fprintf(f, "-------------------------------------\n\r\nregcomp(%s):\r\n", 7830 pattern); 7831 7832 s = &r->program[1]; 7833 // Loop until we find the END that isn't before a referred next (an END 7834 // can also appear in a NOMATCH operand). 7835 while (op != END || s <= end) { 7836 op = OP(s); 7837 fprintf(f, "%2d%s", (int)(s - r->program), regprop(s)); // Where, what. 7838 next = regnext(s); 7839 if (next == NULL) { // Next ptr. 
7840 fprintf(f, "(0)"); 7841 } else { 7842 fprintf(f, "(%d)", (int)((s - r->program) + (next - s))); 7843 } 7844 if (end < next) { 7845 end = next; 7846 } 7847 if (op == BRACE_LIMITS) { 7848 // Two ints 7849 fprintf(f, " minval %" PRId64 ", maxval %" PRId64, 7850 (int64_t)OPERAND_MIN(s), (int64_t)OPERAND_MAX(s)); 7851 s += 8; 7852 } else if (op == BEHIND || op == NOBEHIND) { 7853 // one int 7854 fprintf(f, " count %" PRId64, (int64_t)OPERAND_MIN(s)); 7855 s += 4; 7856 } else if (op == RE_LNUM || op == RE_COL || op == RE_VCOL) { 7857 // one int plus comparator 7858 fprintf(f, " count %" PRId64, (int64_t)OPERAND_MIN(s)); 7859 s += 5; 7860 } 7861 s += 3; 7862 if (op == ANYOF || op == ANYOF + ADD_NL 7863 || op == ANYBUT || op == ANYBUT + ADD_NL 7864 || op == EXACTLY) { 7865 // Literal string, where present. 7866 fprintf(f, "\nxxxxxxxxx\n"); 7867 while (*s != NUL) { 7868 fprintf(f, "%c", *s++); 7869 } 7870 fprintf(f, "\nxxxxxxxxx\n"); 7871 s++; 7872 } 7873 fprintf(f, "\r\n"); 7874 } 7875 7876 // Header fields of interest. 7877 if (r->regstart != NUL) { 7878 fprintf(f, "start `%s' 0x%x; ", r->regstart < 256 7879 ? 
(char *)transchar(r->regstart) 7880 : "multibyte", r->regstart); 7881 } 7882 if (r->reganch) { 7883 fprintf(f, "anchored; "); 7884 } 7885 if (r->regmust != NULL) { 7886 fprintf(f, "must have \"%s\"", r->regmust); 7887 } 7888 fprintf(f, "\r\n"); 7889 7890 # ifdef BT_REGEXP_LOG 7891 fclose(f); 7892 # endif 7893 } 7894 #endif // BT_REGEXP_DUMP 7895 7896 #ifdef REGEXP_DEBUG 7897 7898 // regprop - printable representation of opcode 7899 static uint8_t *regprop(uint8_t *op) 7900 { 7901 char *p; 7902 static char buf[50]; 7903 static size_t buflen = 0; 7904 7905 STRCPY(buf, ":"); 7906 buflen = 1; 7907 7908 switch ((int)OP(op)) { 7909 case BOL: 7910 p = "BOL"; 7911 break; 7912 case EOL: 7913 p = "EOL"; 7914 break; 7915 case RE_BOF: 7916 p = "BOF"; 7917 break; 7918 case RE_EOF: 7919 p = "EOF"; 7920 break; 7921 case CURSOR: 7922 p = "CURSOR"; 7923 break; 7924 case RE_VISUAL: 7925 p = "RE_VISUAL"; 7926 break; 7927 case RE_LNUM: 7928 p = "RE_LNUM"; 7929 break; 7930 case RE_MARK: 7931 p = "RE_MARK"; 7932 break; 7933 case RE_COL: 7934 p = "RE_COL"; 7935 break; 7936 case RE_VCOL: 7937 p = "RE_VCOL"; 7938 break; 7939 case BOW: 7940 p = "BOW"; 7941 break; 7942 case EOW: 7943 p = "EOW"; 7944 break; 7945 case ANY: 7946 p = "ANY"; 7947 break; 7948 case ANY + ADD_NL: 7949 p = "ANY+NL"; 7950 break; 7951 case ANYOF: 7952 p = "ANYOF"; 7953 break; 7954 case ANYOF + ADD_NL: 7955 p = "ANYOF+NL"; 7956 break; 7957 case ANYBUT: 7958 p = "ANYBUT"; 7959 break; 7960 case ANYBUT + ADD_NL: 7961 p = "ANYBUT+NL"; 7962 break; 7963 case IDENT: 7964 p = "IDENT"; 7965 break; 7966 case IDENT + ADD_NL: 7967 p = "IDENT+NL"; 7968 break; 7969 case SIDENT: 7970 p = "SIDENT"; 7971 break; 7972 case SIDENT + ADD_NL: 7973 p = "SIDENT+NL"; 7974 break; 7975 case KWORD: 7976 p = "KWORD"; 7977 break; 7978 case KWORD + ADD_NL: 7979 p = "KWORD+NL"; 7980 break; 7981 case SKWORD: 7982 p = "SKWORD"; 7983 break; 7984 case SKWORD + ADD_NL: 7985 p = "SKWORD+NL"; 7986 break; 7987 case FNAME: 7988 p = "FNAME"; 7989 break; 7990 
case FNAME + ADD_NL: 7991 p = "FNAME+NL"; 7992 break; 7993 case SFNAME: 7994 p = "SFNAME"; 7995 break; 7996 case SFNAME + ADD_NL: 7997 p = "SFNAME+NL"; 7998 break; 7999 case PRINT: 8000 p = "PRINT"; 8001 break; 8002 case PRINT + ADD_NL: 8003 p = "PRINT+NL"; 8004 break; 8005 case SPRINT: 8006 p = "SPRINT"; 8007 break; 8008 case SPRINT + ADD_NL: 8009 p = "SPRINT+NL"; 8010 break; 8011 case WHITE: 8012 p = "WHITE"; 8013 break; 8014 case WHITE + ADD_NL: 8015 p = "WHITE+NL"; 8016 break; 8017 case NWHITE: 8018 p = "NWHITE"; 8019 break; 8020 case NWHITE + ADD_NL: 8021 p = "NWHITE+NL"; 8022 break; 8023 case DIGIT: 8024 p = "DIGIT"; 8025 break; 8026 case DIGIT + ADD_NL: 8027 p = "DIGIT+NL"; 8028 break; 8029 case NDIGIT: 8030 p = "NDIGIT"; 8031 break; 8032 case NDIGIT + ADD_NL: 8033 p = "NDIGIT+NL"; 8034 break; 8035 case HEX: 8036 p = "HEX"; 8037 break; 8038 case HEX + ADD_NL: 8039 p = "HEX+NL"; 8040 break; 8041 case NHEX: 8042 p = "NHEX"; 8043 break; 8044 case NHEX + ADD_NL: 8045 p = "NHEX+NL"; 8046 break; 8047 case OCTAL: 8048 p = "OCTAL"; 8049 break; 8050 case OCTAL + ADD_NL: 8051 p = "OCTAL+NL"; 8052 break; 8053 case NOCTAL: 8054 p = "NOCTAL"; 8055 break; 8056 case NOCTAL + ADD_NL: 8057 p = "NOCTAL+NL"; 8058 break; 8059 case WORD: 8060 p = "WORD"; 8061 break; 8062 case WORD + ADD_NL: 8063 p = "WORD+NL"; 8064 break; 8065 case NWORD: 8066 p = "NWORD"; 8067 break; 8068 case NWORD + ADD_NL: 8069 p = "NWORD+NL"; 8070 break; 8071 case HEAD: 8072 p = "HEAD"; 8073 break; 8074 case HEAD + ADD_NL: 8075 p = "HEAD+NL"; 8076 break; 8077 case NHEAD: 8078 p = "NHEAD"; 8079 break; 8080 case NHEAD + ADD_NL: 8081 p = "NHEAD+NL"; 8082 break; 8083 case ALPHA: 8084 p = "ALPHA"; 8085 break; 8086 case ALPHA + ADD_NL: 8087 p = "ALPHA+NL"; 8088 break; 8089 case NALPHA: 8090 p = "NALPHA"; 8091 break; 8092 case NALPHA + ADD_NL: 8093 p = "NALPHA+NL"; 8094 break; 8095 case LOWER: 8096 p = "LOWER"; 8097 break; 8098 case LOWER + ADD_NL: 8099 p = "LOWER+NL"; 8100 break; 8101 case NLOWER: 8102 p = 
"NLOWER"; 8103 break; 8104 case NLOWER + ADD_NL: 8105 p = "NLOWER+NL"; 8106 break; 8107 case UPPER: 8108 p = "UPPER"; 8109 break; 8110 case UPPER + ADD_NL: 8111 p = "UPPER+NL"; 8112 break; 8113 case NUPPER: 8114 p = "NUPPER"; 8115 break; 8116 case NUPPER + ADD_NL: 8117 p = "NUPPER+NL"; 8118 break; 8119 case BRANCH: 8120 p = "BRANCH"; 8121 break; 8122 case EXACTLY: 8123 p = "EXACTLY"; 8124 break; 8125 case NOTHING: 8126 p = "NOTHING"; 8127 break; 8128 case BACK: 8129 p = "BACK"; 8130 break; 8131 case END: 8132 p = "END"; 8133 break; 8134 case MOPEN + 0: 8135 p = "MATCH START"; 8136 break; 8137 case MOPEN + 1: 8138 case MOPEN + 2: 8139 case MOPEN + 3: 8140 case MOPEN + 4: 8141 case MOPEN + 5: 8142 case MOPEN + 6: 8143 case MOPEN + 7: 8144 case MOPEN + 8: 8145 case MOPEN + 9: 8146 buflen += (size_t)snprintf(buf + buflen, sizeof(buf) - buflen, 8147 "MOPEN%d", OP(op) - MOPEN); 8148 p = NULL; 8149 break; 8150 case MCLOSE + 0: 8151 p = "MATCH END"; 8152 break; 8153 case MCLOSE + 1: 8154 case MCLOSE + 2: 8155 case MCLOSE + 3: 8156 case MCLOSE + 4: 8157 case MCLOSE + 5: 8158 case MCLOSE + 6: 8159 case MCLOSE + 7: 8160 case MCLOSE + 8: 8161 case MCLOSE + 9: 8162 buflen += (size_t)snprintf(buf + buflen, sizeof(buf) - buflen, 8163 "MCLOSE%d", OP(op) - MCLOSE); 8164 p = NULL; 8165 break; 8166 case BACKREF + 1: 8167 case BACKREF + 2: 8168 case BACKREF + 3: 8169 case BACKREF + 4: 8170 case BACKREF + 5: 8171 case BACKREF + 6: 8172 case BACKREF + 7: 8173 case BACKREF + 8: 8174 case BACKREF + 9: 8175 buflen += (size_t)snprintf(buf + buflen, sizeof(buf) - buflen, 8176 "BACKREF%d", OP(op) - BACKREF); 8177 p = NULL; 8178 break; 8179 case NOPEN: 8180 p = "NOPEN"; 8181 break; 8182 case NCLOSE: 8183 p = "NCLOSE"; 8184 break; 8185 case ZOPEN + 1: 8186 case ZOPEN + 2: 8187 case ZOPEN + 3: 8188 case ZOPEN + 4: 8189 case ZOPEN + 5: 8190 case ZOPEN + 6: 8191 case ZOPEN + 7: 8192 case ZOPEN + 8: 8193 case ZOPEN + 9: 8194 buflen += (size_t)snprintf(buf + buflen, sizeof(buf) - buflen, 8195 
"ZOPEN%d", OP(op) - ZOPEN); 8196 p = NULL; 8197 break; 8198 case ZCLOSE + 1: 8199 case ZCLOSE + 2: 8200 case ZCLOSE + 3: 8201 case ZCLOSE + 4: 8202 case ZCLOSE + 5: 8203 case ZCLOSE + 6: 8204 case ZCLOSE + 7: 8205 case ZCLOSE + 8: 8206 case ZCLOSE + 9: 8207 buflen += (size_t)snprintf(buf + buflen, sizeof(buf) - buflen, 8208 "ZCLOSE%d", OP(op) - ZCLOSE); 8209 p = NULL; 8210 break; 8211 case ZREF + 1: 8212 case ZREF + 2: 8213 case ZREF + 3: 8214 case ZREF + 4: 8215 case ZREF + 5: 8216 case ZREF + 6: 8217 case ZREF + 7: 8218 case ZREF + 8: 8219 case ZREF + 9: 8220 buflen += (size_t)snprintf(buf + buflen, sizeof(buf) - buflen, 8221 "ZREF%d", OP(op) - ZREF); 8222 p = NULL; 8223 break; 8224 case STAR: 8225 p = "STAR"; 8226 break; 8227 case PLUS: 8228 p = "PLUS"; 8229 break; 8230 case NOMATCH: 8231 p = "NOMATCH"; 8232 break; 8233 case MATCH: 8234 p = "MATCH"; 8235 break; 8236 case BEHIND: 8237 p = "BEHIND"; 8238 break; 8239 case NOBEHIND: 8240 p = "NOBEHIND"; 8241 break; 8242 case SUBPAT: 8243 p = "SUBPAT"; 8244 break; 8245 case BRACE_LIMITS: 8246 p = "BRACE_LIMITS"; 8247 break; 8248 case BRACE_SIMPLE: 8249 p = "BRACE_SIMPLE"; 8250 break; 8251 case BRACE_COMPLEX + 0: 8252 case BRACE_COMPLEX + 1: 8253 case BRACE_COMPLEX + 2: 8254 case BRACE_COMPLEX + 3: 8255 case BRACE_COMPLEX + 4: 8256 case BRACE_COMPLEX + 5: 8257 case BRACE_COMPLEX + 6: 8258 case BRACE_COMPLEX + 7: 8259 case BRACE_COMPLEX + 8: 8260 case BRACE_COMPLEX + 9: 8261 buflen += (size_t)snprintf(buf + buflen, sizeof(buf) - buflen, 8262 "BRACE_COMPLEX%d", OP(op) - BRACE_COMPLEX); 8263 p = NULL; 8264 break; 8265 case MULTIBYTECODE: 8266 p = "MULTIBYTECODE"; 8267 break; 8268 case NEWL: 8269 p = "NEWL"; 8270 break; 8271 default: 8272 buflen += (size_t)snprintf(buf + buflen, sizeof(buf) - buflen, 8273 "corrupt %d", OP(op)); 8274 p = NULL; 8275 break; 8276 } 8277 if (p != NULL) { 8278 STRCPY(buf + buflen, p); 8279 } 8280 return (uint8_t *)buf; 8281 } 8282 #endif // REGEXP_DEBUG 8283 8284 // }}}1 8285 8286 // 
// regexp_nfa.c {{{1
// NFA regular expression implementation.

// Logging of NFA engine.
//
// The NFA engine can write four log files:
// - Error log: Contains NFA engine's fatal errors.
// - Dump log: Contains compiled NFA state machine's information.
// - Run log: Contains information of matching procedure.
// - Debug log: Contains detailed information of matching procedure. Can be
//              disabled by undefining NFA_REGEXP_DEBUG_LOG.
// The first one can also be used without debug mode.
// The last three are enabled when compiled as debug mode and individually
// disabled by commenting them out.
// The log files can get quite big!
// To disable all of this when compiling Vim for debugging, undefine REGEXP_DEBUG in
// regexp.c
//
// NOTE(review): all four file names below, including the error log, are only
// defined when REGEXP_DEBUG is defined, which does not agree with "The first
// one can also be used without debug mode" above -- confirm which is intended.
#ifdef REGEXP_DEBUG
# define NFA_REGEXP_ERROR_LOG "nfa_regexp_error.log"
# define NFA_REGEXP_DUMP_LOG "nfa_regexp_dump.log"
# define NFA_REGEXP_RUN_LOG "nfa_regexp_run.log"
# define NFA_REGEXP_DEBUG_LOG "nfa_regexp_debug.log"
#endif

// Added to NFA_ANY - NFA_NUPPER_IC to include a NL.
// 31 is the number of enumerators from NFA_ANY up to and including
// NFA_NUPPER_IC, so the "+NL" codes occupy the 31 values that directly follow
// NFA_NUPPER_IC (see NFA_FIRST_NL and NFA_LAST_NL below).
#define NFA_ADD_NL 31

// Codes for the special (non-character) items, used in the postfix form and/or
// in the NFA states.  Numbering starts at -1024, which keeps every code in
// this enum negative; the code in this file treats a state value above zero as
// a literal character.
// The numeric order matters: NFA_ADD_NL, NFA_FIRST_NL and NFA_LAST_NL depend
// on NFA_ANY .. NFA_NUPPER_IC being consecutive.
enum {
  NFA_SPLIT = -1024,
  NFA_MATCH,
  NFA_EMPTY,                          // matches 0-length

  NFA_START_COLL,                     // [abc] start
  NFA_END_COLL,                       // [abc] end
  NFA_START_NEG_COLL,                 // [^abc] start
  NFA_END_NEG_COLL,                   // [^abc] end (postfix only)
  NFA_RANGE,                          // range of the two previous items
                                      // (postfix only)
  NFA_RANGE_MIN,                      // low end of a range
  NFA_RANGE_MAX,                      // high end of a range

  NFA_CONCAT,                         // concatenate two previous items (postfix
                                      // only)
  NFA_OR,                             // \| (postfix only)
  NFA_STAR,                           // greedy * (postfix only)
  NFA_STAR_NONGREEDY,                 // non-greedy * (postfix only)
  NFA_QUEST,                          // greedy \? (postfix only)
  NFA_QUEST_NONGREEDY,                // non-greedy \? (postfix only)

  NFA_BOL,                            // ^    Begin line
  NFA_EOL,                            // $    End line
  NFA_BOW,                            // \<   Begin word
  NFA_EOW,                            // \>   End word
  NFA_BOF,                            // \%^  Begin file
  NFA_EOF,                            // \%$  End file
  NFA_NEWL,
  NFA_ZSTART,                         // Used for \zs
  NFA_ZEND,                           // Used for \ze
  NFA_NOPEN,                          // Start of subexpression marked with \%(
  NFA_NCLOSE,                         // End of subexpr. marked with \%( ... \)
  NFA_START_INVISIBLE,
  NFA_START_INVISIBLE_FIRST,
  NFA_START_INVISIBLE_NEG,
  NFA_START_INVISIBLE_NEG_FIRST,
  NFA_START_INVISIBLE_BEFORE,
  NFA_START_INVISIBLE_BEFORE_FIRST,
  NFA_START_INVISIBLE_BEFORE_NEG,
  NFA_START_INVISIBLE_BEFORE_NEG_FIRST,
  NFA_START_PATTERN,
  NFA_END_INVISIBLE,
  NFA_END_INVISIBLE_NEG,
  NFA_END_PATTERN,
  NFA_COMPOSING,                      // Next nodes in NFA are part of the
                                      // composing multibyte char
  NFA_END_COMPOSING,                  // End of a composing char in the NFA
  NFA_ANY_COMPOSING,                  // \%C: Any composing characters.
  NFA_OPT_CHARS,                      // \%[abc]

  // The following are used only in the postfix form, not in the NFA
  NFA_PREV_ATOM_NO_WIDTH,             // Used for \@=
  NFA_PREV_ATOM_NO_WIDTH_NEG,         // Used for \@!
  NFA_PREV_ATOM_JUST_BEFORE,          // Used for \@<=
  NFA_PREV_ATOM_JUST_BEFORE_NEG,      // Used for \@<!
  NFA_PREV_ATOM_LIKE_PATTERN,         // Used for \@>

  NFA_BACKREF1,                       // \1
  NFA_BACKREF2,                       // \2
  NFA_BACKREF3,                       // \3
  NFA_BACKREF4,                       // \4
  NFA_BACKREF5,                       // \5
  NFA_BACKREF6,                       // \6
  NFA_BACKREF7,                       // \7
  NFA_BACKREF8,                       // \8
  NFA_BACKREF9,                       // \9
  NFA_ZREF1,                          // \z1
  NFA_ZREF2,                          // \z2
  NFA_ZREF3,                          // \z3
  NFA_ZREF4,                          // \z4
  NFA_ZREF5,                          // \z5
  NFA_ZREF6,                          // \z6
  NFA_ZREF7,                          // \z7
  NFA_ZREF8,                          // \z8
  NFA_ZREF9,                          // \z9
  NFA_SKIP,                           // Skip characters

  // The four groups below each hold ten consecutive codes (base plus 1..9).
  // nfa_get_match_text() expects NFA_MOPEN as the first state and NFA_MCLOSE
  // directly before NFA_MATCH, so the unnumbered pair appears to bracket the
  // whole pattern -- presumably the numbered ones are for the sub-matches.
  NFA_MOPEN,
  NFA_MOPEN1,
  NFA_MOPEN2,
  NFA_MOPEN3,
  NFA_MOPEN4,
  NFA_MOPEN5,
  NFA_MOPEN6,
  NFA_MOPEN7,
  NFA_MOPEN8,
  NFA_MOPEN9,

  NFA_MCLOSE,
  NFA_MCLOSE1,
  NFA_MCLOSE2,
  NFA_MCLOSE3,
  NFA_MCLOSE4,
  NFA_MCLOSE5,
  NFA_MCLOSE6,
  NFA_MCLOSE7,
  NFA_MCLOSE8,
  NFA_MCLOSE9,

  NFA_ZOPEN,
  NFA_ZOPEN1,
  NFA_ZOPEN2,
  NFA_ZOPEN3,
  NFA_ZOPEN4,
  NFA_ZOPEN5,
  NFA_ZOPEN6,
  NFA_ZOPEN7,
  NFA_ZOPEN8,
  NFA_ZOPEN9,

  NFA_ZCLOSE,
  NFA_ZCLOSE1,
  NFA_ZCLOSE2,
  NFA_ZCLOSE3,
  NFA_ZCLOSE4,
  NFA_ZCLOSE5,
  NFA_ZCLOSE6,
  NFA_ZCLOSE7,
  NFA_ZCLOSE8,
  NFA_ZCLOSE9,

  // NFA_FIRST_NL
  NFA_ANY,                            // Match any one character.
  NFA_IDENT,                          // Match identifier char
  NFA_SIDENT,                         // Match identifier char but no digit
  NFA_KWORD,                          // Match keyword char
  NFA_SKWORD,                         // Match word char but no digit
  NFA_FNAME,                          // Match file name char
  NFA_SFNAME,                         // Match file name char but no digit
  NFA_PRINT,                          // Match printable char
  NFA_SPRINT,                         // Match printable char but no digit
  NFA_WHITE,                          // Match whitespace char
  NFA_NWHITE,                         // Match non-whitespace char
  NFA_DIGIT,                          // Match digit char
  NFA_NDIGIT,                         // Match non-digit char
  NFA_HEX,                            // Match hex char
  NFA_NHEX,                           // Match non-hex char
  NFA_OCTAL,                          // Match octal char
  NFA_NOCTAL,                         // Match non-octal char
  NFA_WORD,                           // Match word char
  NFA_NWORD,                          // Match non-word char
  NFA_HEAD,                           // Match head char
  NFA_NHEAD,                          // Match non-head char
  NFA_ALPHA,                          // Match alpha char
  NFA_NALPHA,                         // Match non-alpha char
  NFA_LOWER,                          // Match lowercase char
  NFA_NLOWER,                         // Match non-lowercase char
  NFA_UPPER,                          // Match uppercase char
  NFA_NUPPER,                         // Match non-uppercase char
  NFA_LOWER_IC,                       // Match [a-z]
  NFA_NLOWER_IC,                      // Match [^a-z]
  NFA_UPPER_IC,                       // Match [A-Z]
  NFA_NUPPER_IC,                      // Match [^A-Z]

  // First and last code of the "also matches NL" variants.  These variants
  // have no names of their own: a variant is the class code plus NFA_ADD_NL.
  // The enumerator after NFA_LAST_NL continues with NFA_LAST_NL + 1, so the
  // whole range is reserved.
  NFA_FIRST_NL = NFA_ANY + NFA_ADD_NL,
  NFA_LAST_NL = NFA_NUPPER_IC + NFA_ADD_NL,

  NFA_CURSOR,                         // Match cursor pos
  NFA_LNUM,                           // Match line number
  NFA_LNUM_GT,                        // Match > line number
  NFA_LNUM_LT,                        // Match < line number
  NFA_COL,                            // Match cursor column
  NFA_COL_GT,                         // Match > cursor column
  NFA_COL_LT,                         // Match < cursor column
  NFA_VCOL,                           // Match cursor virtual column
  NFA_VCOL_GT,                        // Match > cursor virtual column
  NFA_VCOL_LT,                        // Match < cursor virtual column
  NFA_MARK,                           // Match mark
  NFA_MARK_GT,                        // Match > mark
  NFA_MARK_LT,                        // Match < mark
  NFA_VISUAL,                         // Match Visual area

  // Character classes [:alnum:] etc
  NFA_CLASS_ALNUM,
  NFA_CLASS_ALPHA,
  NFA_CLASS_BLANK,
  NFA_CLASS_CNTRL,
  NFA_CLASS_DIGIT,
  NFA_CLASS_GRAPH,
  NFA_CLASS_LOWER,
  NFA_CLASS_PRINT,
  NFA_CLASS_PUNCT,
  NFA_CLASS_SPACE,
  NFA_CLASS_UPPER,
  NFA_CLASS_XDIGIT,
  NFA_CLASS_TAB,
  NFA_CLASS_RETURN,
  NFA_CLASS_BACKSPACE,
  NFA_CLASS_ESCAPE,
  NFA_CLASS_IDENT,
  NFA_CLASS_KEYWORD,
  NFA_CLASS_FNAME,
};

// Keep in sync with classchars.
// Lists the class codes NFA_ANY .. NFA_NUPPER in enum order.  classchars is
// not defined in this part of the file; presumably the two arrays are indexed
// in parallel -- verify before reordering either one.
static int nfa_classcodes[] = {
  NFA_ANY, NFA_IDENT, NFA_SIDENT, NFA_KWORD, NFA_SKWORD,
  NFA_FNAME, NFA_SFNAME, NFA_PRINT, NFA_SPRINT,
  NFA_WHITE, NFA_NWHITE, NFA_DIGIT, NFA_NDIGIT,
  NFA_HEX, NFA_NHEX, NFA_OCTAL, NFA_NOCTAL,
  NFA_WORD, NFA_NWORD, NFA_HEAD, NFA_NHEAD,
  NFA_ALPHA, NFA_NALPHA, NFA_LOWER, NFA_NLOWER,
  NFA_UPPER, NFA_NUPPER
};

// Error messages specific to the NFA engine.
static const char e_nul_found[] = N_("E865: (NFA) Regexp end encountered prematurely");
static const char e_misplaced[] = N_("E866: (NFA regexp) Misplaced %c");
static const char e_ill_char_class[] = N_("E877: (NFA regexp) Invalid character class: %" PRId64);
static const char e_value_too_large[] = N_("E951: \\% value too large");

// Variables only used in nfa_regcomp() and descendants.
static int nfa_re_flags;  ///< re_flags passed to nfa_regcomp().
static int *post_start;   ///< holds the postfix form of r.e.
static int *post_end;     // one past the last allocated slot of the postfix list
static int *post_ptr;     // next free slot in the postfix list, see EMIT()

// Set when the pattern should use the NFA engine.
// E.g. [[:upper:]] only allows 8bit characters for BT engine,
// while NFA engine handles multibyte characters correctly.
static bool wants_nfa;

static int nstate;  ///< Number of states in the NFA. Also used when executing.
static int istate;  ///< Index in the state vector, used in alloc_state()

// If not NULL match must end at this position
static save_se_T *nfa_endp = NULL;

// 0 for first call to nfa_regmatch(), 1 for recursive call.
8542 static int nfa_ll_index = 0; 8543 8544 // Helper functions used when doing re2post() ... regatom() parsing 8545 #define EMIT(c) \ 8546 do { \ 8547 if (post_ptr >= post_end) { \ 8548 realloc_post_list(); \ 8549 } \ 8550 *post_ptr++ = c; \ 8551 } while (0) 8552 8553 /// Initialize internal variables before NFA compilation. 8554 /// 8555 /// @param re_flags @see vim_regcomp() 8556 static void nfa_regcomp_start(uint8_t *expr, int re_flags) 8557 { 8558 size_t postfix_size; 8559 size_t nstate_max; 8560 8561 nstate = 0; 8562 istate = 0; 8563 // A reasonable estimation for maximum size 8564 nstate_max = (strlen((char *)expr) + 1) * 25; 8565 8566 // Some items blow up in size, such as [A-z]. Add more space for that. 8567 // When it is still not enough realloc_post_list() will be used. 8568 nstate_max += 1000; 8569 8570 // Size for postfix representation of expr. 8571 postfix_size = sizeof(int) * nstate_max; 8572 8573 post_start = (int *)xmalloc(postfix_size); 8574 post_ptr = post_start; 8575 post_end = post_start + nstate_max; 8576 wants_nfa = false; 8577 rex.nfa_has_zend = false; 8578 rex.nfa_has_backref = false; 8579 8580 // shared with BT engine 8581 regcomp_start(expr, re_flags); 8582 } 8583 8584 // Figure out if the NFA state list starts with an anchor, must match at start 8585 // of the line. 8586 static int nfa_get_reganch(nfa_state_T *start, int depth) 8587 { 8588 nfa_state_T *p = start; 8589 8590 if (depth > 4) { 8591 return 0; 8592 } 8593 8594 while (p != NULL) { 8595 switch (p->c) { 8596 case NFA_BOL: 8597 case NFA_BOF: 8598 return 1; // yes! 
8599 8600 case NFA_ZSTART: 8601 case NFA_ZEND: 8602 case NFA_CURSOR: 8603 case NFA_VISUAL: 8604 8605 case NFA_MOPEN: 8606 case NFA_MOPEN1: 8607 case NFA_MOPEN2: 8608 case NFA_MOPEN3: 8609 case NFA_MOPEN4: 8610 case NFA_MOPEN5: 8611 case NFA_MOPEN6: 8612 case NFA_MOPEN7: 8613 case NFA_MOPEN8: 8614 case NFA_MOPEN9: 8615 case NFA_NOPEN: 8616 case NFA_ZOPEN: 8617 case NFA_ZOPEN1: 8618 case NFA_ZOPEN2: 8619 case NFA_ZOPEN3: 8620 case NFA_ZOPEN4: 8621 case NFA_ZOPEN5: 8622 case NFA_ZOPEN6: 8623 case NFA_ZOPEN7: 8624 case NFA_ZOPEN8: 8625 case NFA_ZOPEN9: 8626 p = p->out; 8627 break; 8628 8629 case NFA_SPLIT: 8630 return nfa_get_reganch(p->out, depth + 1) 8631 && nfa_get_reganch(p->out1, depth + 1); 8632 8633 default: 8634 return 0; // noooo 8635 } 8636 } 8637 return 0; 8638 } 8639 8640 // Figure out if the NFA state list starts with a character which must match 8641 // at start of the match. 8642 static int nfa_get_regstart(nfa_state_T *start, int depth) 8643 { 8644 nfa_state_T *p = start; 8645 8646 if (depth > 4) { 8647 return 0; 8648 } 8649 8650 while (p != NULL) { 8651 switch (p->c) { 8652 // all kinds of zero-width matches 8653 case NFA_BOL: 8654 case NFA_BOF: 8655 case NFA_BOW: 8656 case NFA_EOW: 8657 case NFA_ZSTART: 8658 case NFA_ZEND: 8659 case NFA_CURSOR: 8660 case NFA_VISUAL: 8661 case NFA_LNUM: 8662 case NFA_LNUM_GT: 8663 case NFA_LNUM_LT: 8664 case NFA_COL: 8665 case NFA_COL_GT: 8666 case NFA_COL_LT: 8667 case NFA_VCOL: 8668 case NFA_VCOL_GT: 8669 case NFA_VCOL_LT: 8670 case NFA_MARK: 8671 case NFA_MARK_GT: 8672 case NFA_MARK_LT: 8673 8674 case NFA_MOPEN: 8675 case NFA_MOPEN1: 8676 case NFA_MOPEN2: 8677 case NFA_MOPEN3: 8678 case NFA_MOPEN4: 8679 case NFA_MOPEN5: 8680 case NFA_MOPEN6: 8681 case NFA_MOPEN7: 8682 case NFA_MOPEN8: 8683 case NFA_MOPEN9: 8684 case NFA_NOPEN: 8685 case NFA_ZOPEN: 8686 case NFA_ZOPEN1: 8687 case NFA_ZOPEN2: 8688 case NFA_ZOPEN3: 8689 case NFA_ZOPEN4: 8690 case NFA_ZOPEN5: 8691 case NFA_ZOPEN6: 8692 case NFA_ZOPEN7: 8693 case 
NFA_ZOPEN8: 8694 case NFA_ZOPEN9: 8695 p = p->out; 8696 break; 8697 8698 case NFA_SPLIT: { 8699 int c1 = nfa_get_regstart(p->out, depth + 1); 8700 int c2 = nfa_get_regstart(p->out1, depth + 1); 8701 8702 if (c1 == c2) { 8703 return c1; // yes! 8704 } 8705 return 0; 8706 } 8707 8708 default: 8709 if (p->c > 0) { 8710 return p->c; // yes! 8711 } 8712 return 0; 8713 } 8714 } 8715 return 0; 8716 } 8717 8718 // Figure out if the NFA state list contains just literal text and nothing 8719 // else. If so return a string in allocated memory with what must match after 8720 // regstart. Otherwise return NULL. 8721 static uint8_t *nfa_get_match_text(nfa_state_T *start) 8722 { 8723 nfa_state_T *p = start; 8724 int len = 0; 8725 uint8_t *ret; 8726 uint8_t *s; 8727 8728 if (p->c != NFA_MOPEN) { 8729 return NULL; // just in case 8730 } 8731 p = p->out; 8732 while (p->c > 0) { 8733 len += utf_char2len(p->c); 8734 p = p->out; 8735 } 8736 if (p->c != NFA_MCLOSE || p->out->c != NFA_MATCH) { 8737 return NULL; 8738 } 8739 8740 ret = xmalloc((size_t)len); 8741 p = start->out->out; // skip first char, it goes into regstart 8742 s = ret; 8743 while (p->c > 0) { 8744 s += utf_char2bytes(p->c, (char *)s); 8745 p = p->out; 8746 } 8747 *s = NUL; 8748 8749 return ret; 8750 } 8751 8752 // Allocate more space for post_start. Called when 8753 // running above the estimated number of states. 8754 static void realloc_post_list(void) 8755 { 8756 // For weird patterns the number of states can be very high. Increasing by 8757 // 50% seems a reasonable compromise between memory use and speed. 8758 const size_t new_max = (size_t)(post_end - post_start) * 3 / 2; 8759 int *new_start = xrealloc(post_start, new_max * sizeof(int)); 8760 post_ptr = new_start + (post_ptr - post_start); 8761 post_end = new_start + new_max; 8762 post_start = new_start; 8763 } 8764 8765 // Search between "start" and "end" and try to recognize a 8766 // character class in expanded form. For example [0-9]. 
8767 // On success, return the id the character class to be emitted. 8768 // On failure, return 0 (=FAIL) 8769 // Start points to the first char of the range, while end should point 8770 // to the closing brace. 8771 // Keep in mind that 'ignorecase' applies at execution time, thus [a-z] may 8772 // need to be interpreted as [a-zA-Z]. 8773 static int nfa_recognize_char_class(uint8_t *start, const uint8_t *end, int extra_newl) 8774 { 8775 #define CLASS_not 0x80 8776 #define CLASS_af 0x40 8777 #define CLASS_AF 0x20 8778 #define CLASS_az 0x10 8779 #define CLASS_AZ 0x08 8780 #define CLASS_o7 0x04 8781 #define CLASS_o9 0x02 8782 #define CLASS_underscore 0x01 8783 8784 uint8_t *p; 8785 int config = 0; 8786 8787 bool newl = extra_newl == true; 8788 8789 if (*end != ']') { 8790 return FAIL; 8791 } 8792 p = start; 8793 if (*p == '^') { 8794 config |= CLASS_not; 8795 p++; 8796 } 8797 8798 while (p < end) { 8799 if (p + 2 < end && *(p + 1) == '-') { 8800 switch (*p) { 8801 case '0': 8802 if (*(p + 2) == '9') { 8803 config |= CLASS_o9; 8804 break; 8805 } else if (*(p + 2) == '7') { 8806 config |= CLASS_o7; 8807 break; 8808 } 8809 return FAIL; 8810 case 'a': 8811 if (*(p + 2) == 'z') { 8812 config |= CLASS_az; 8813 break; 8814 } else if (*(p + 2) == 'f') { 8815 config |= CLASS_af; 8816 break; 8817 } 8818 return FAIL; 8819 case 'A': 8820 if (*(p + 2) == 'Z') { 8821 config |= CLASS_AZ; 8822 break; 8823 } else if (*(p + 2) == 'F') { 8824 config |= CLASS_AF; 8825 break; 8826 } 8827 return FAIL; 8828 default: 8829 return FAIL; 8830 } 8831 p += 3; 8832 } else if (p + 1 < end && *p == '\\' && *(p + 1) == 'n') { 8833 newl = true; 8834 p += 2; 8835 } else if (*p == '_') { 8836 config |= CLASS_underscore; 8837 p++; 8838 } else if (*p == '\n') { 8839 newl = true; 8840 p++; 8841 } else { 8842 return FAIL; 8843 } 8844 } // while (p < end) 8845 8846 if (p != end) { 8847 return FAIL; 8848 } 8849 8850 if (newl == true) { 8851 extra_newl = NFA_ADD_NL; 8852 } 8853 8854 switch (config) { 8855 
case CLASS_o9: 8856 return extra_newl + NFA_DIGIT; 8857 case CLASS_not | CLASS_o9: 8858 return extra_newl + NFA_NDIGIT; 8859 case CLASS_af | CLASS_AF | CLASS_o9: 8860 return extra_newl + NFA_HEX; 8861 case CLASS_not | CLASS_af | CLASS_AF | CLASS_o9: 8862 return extra_newl + NFA_NHEX; 8863 case CLASS_o7: 8864 return extra_newl + NFA_OCTAL; 8865 case CLASS_not | CLASS_o7: 8866 return extra_newl + NFA_NOCTAL; 8867 case CLASS_az | CLASS_AZ | CLASS_o9 | CLASS_underscore: 8868 return extra_newl + NFA_WORD; 8869 case CLASS_not | CLASS_az | CLASS_AZ | CLASS_o9 | CLASS_underscore: 8870 return extra_newl + NFA_NWORD; 8871 case CLASS_az | CLASS_AZ | CLASS_underscore: 8872 return extra_newl + NFA_HEAD; 8873 case CLASS_not | CLASS_az | CLASS_AZ | CLASS_underscore: 8874 return extra_newl + NFA_NHEAD; 8875 case CLASS_az | CLASS_AZ: 8876 return extra_newl + NFA_ALPHA; 8877 case CLASS_not | CLASS_az | CLASS_AZ: 8878 return extra_newl + NFA_NALPHA; 8879 case CLASS_az: 8880 return extra_newl + NFA_LOWER_IC; 8881 case CLASS_not | CLASS_az: 8882 return extra_newl + NFA_NLOWER_IC; 8883 case CLASS_AZ: 8884 return extra_newl + NFA_UPPER_IC; 8885 case CLASS_not | CLASS_AZ: 8886 return extra_newl + NFA_NUPPER_IC; 8887 } 8888 return FAIL; 8889 } 8890 8891 // Produce the bytes for equivalence class "c". 8892 // Currently only handles latin1, latin9 and utf-8. 8893 // Emits bytes in postfix notation: 'a,b,NFA_OR,c,NFA_OR' is 8894 // equivalent to 'a OR b OR c' 8895 // 8896 // NOTE! 
When changing this function, also update reg_equi_class() 8897 static void nfa_emit_equi_class(int c) 8898 { 8899 #define EMIT2(c) EMIT(c); EMIT(NFA_CONCAT); 8900 8901 { 8902 #define A_grave 0xc0 8903 #define A_acute 0xc1 8904 #define A_circumflex 0xc2 8905 #define A_virguilla 0xc3 8906 #define A_diaeresis 0xc4 8907 #define A_ring 0xc5 8908 #define C_cedilla 0xc7 8909 #define E_grave 0xc8 8910 #define E_acute 0xc9 8911 #define E_circumflex 0xca 8912 #define E_diaeresis 0xcb 8913 #define I_grave 0xcc 8914 #define I_acute 0xcd 8915 #define I_circumflex 0xce 8916 #define I_diaeresis 0xcf 8917 #define N_virguilla 0xd1 8918 #define O_grave 0xd2 8919 #define O_acute 0xd3 8920 #define O_circumflex 0xd4 8921 #define O_virguilla 0xd5 8922 #define O_diaeresis 0xd6 8923 #define O_slash 0xd8 8924 #define U_grave 0xd9 8925 #define U_acute 0xda 8926 #define U_circumflex 0xdb 8927 #define U_diaeresis 0xdc 8928 #define Y_acute 0xdd 8929 #define a_grave 0xe0 8930 #define a_acute 0xe1 8931 #define a_circumflex 0xe2 8932 #define a_virguilla 0xe3 8933 #define a_diaeresis 0xe4 8934 #define a_ring 0xe5 8935 #define c_cedilla 0xe7 8936 #define e_grave 0xe8 8937 #define e_acute 0xe9 8938 #define e_circumflex 0xea 8939 #define e_diaeresis 0xeb 8940 #define i_grave 0xec 8941 #define i_acute 0xed 8942 #define i_circumflex 0xee 8943 #define i_diaeresis 0xef 8944 #define n_virguilla 0xf1 8945 #define o_grave 0xf2 8946 #define o_acute 0xf3 8947 #define o_circumflex 0xf4 8948 #define o_virguilla 0xf5 8949 #define o_diaeresis 0xf6 8950 #define o_slash 0xf8 8951 #define u_grave 0xf9 8952 #define u_acute 0xfa 8953 #define u_circumflex 0xfb 8954 #define u_diaeresis 0xfc 8955 #define y_acute 0xfd 8956 #define y_diaeresis 0xff 8957 switch (c) { 8958 case 'A': 8959 case A_grave: 8960 case A_acute: 8961 case A_circumflex: 8962 case A_virguilla: 8963 case A_diaeresis: 8964 case A_ring: 8965 case 0x100: 8966 case 0x102: 8967 case 0x104: 8968 case 0x1cd: 8969 case 0x1de: 8970 case 0x1e0: 8971 case 0x1fa: 
8972 case 0x200: 8973 case 0x202: 8974 case 0x226: 8975 case 0x23a: 8976 case 0x1e00: 8977 case 0x1ea0: 8978 case 0x1ea2: 8979 case 0x1ea4: 8980 case 0x1ea6: 8981 case 0x1ea8: 8982 case 0x1eaa: 8983 case 0x1eac: 8984 case 0x1eae: 8985 case 0x1eb0: 8986 case 0x1eb2: 8987 case 0x1eb4: 8988 case 0x1eb6: 8989 EMIT2('A') EMIT2(A_grave) EMIT2(A_acute) 8990 EMIT2(A_circumflex) EMIT2(A_virguilla) 8991 EMIT2(A_diaeresis) EMIT2(A_ring) 8992 EMIT2(0x100) EMIT2(0x102) EMIT2(0x104) 8993 EMIT2(0x1cd) EMIT2(0x1de) EMIT2(0x1e0) 8994 EMIT2(0x1fa) EMIT2(0x200) EMIT2(0x202) 8995 EMIT2(0x226) EMIT2(0x23a) EMIT2(0x1e00) 8996 EMIT2(0x1ea0) EMIT2(0x1ea2) EMIT2(0x1ea4) 8997 EMIT2(0x1ea6) EMIT2(0x1ea8) EMIT2(0x1eaa) 8998 EMIT2(0x1eac) EMIT2(0x1eae) EMIT2(0x1eb0) 8999 EMIT2(0x1eb2) EMIT2(0x1eb6) EMIT2(0x1eb4) 9000 return; 9001 9002 case 'B': 9003 case 0x181: 9004 case 0x243: 9005 case 0x1e02: 9006 case 0x1e04: 9007 case 0x1e06: 9008 EMIT2('B') 9009 EMIT2(0x181) EMIT2(0x243) EMIT2(0x1e02) 9010 EMIT2(0x1e04) EMIT2(0x1e06) 9011 return; 9012 9013 case 'C': 9014 case C_cedilla: 9015 case 0x106: 9016 case 0x108: 9017 case 0x10a: 9018 case 0x10c: 9019 case 0x187: 9020 case 0x23b: 9021 case 0x1e08: 9022 case 0xa792: 9023 EMIT2('C') EMIT2(C_cedilla) 9024 EMIT2(0x106) EMIT2(0x108) EMIT2(0x10a) 9025 EMIT2(0x10c) EMIT2(0x187) EMIT2(0x23b) 9026 EMIT2(0x1e08) EMIT2(0xa792) 9027 return; 9028 9029 case 'D': 9030 case 0x10e: 9031 case 0x110: 9032 case 0x18a: 9033 case 0x1e0a: 9034 case 0x1e0c: 9035 case 0x1e0e: 9036 case 0x1e10: 9037 case 0x1e12: 9038 EMIT2('D') EMIT2(0x10e) EMIT2(0x110) EMIT2(0x18a) 9039 EMIT2(0x1e0a) EMIT2(0x1e0c) EMIT2(0x1e0e) 9040 EMIT2(0x1e10) EMIT2(0x1e12) 9041 return; 9042 9043 case 'E': 9044 case E_grave: 9045 case E_acute: 9046 case E_circumflex: 9047 case E_diaeresis: 9048 case 0x112: 9049 case 0x114: 9050 case 0x116: 9051 case 0x118: 9052 case 0x11a: 9053 case 0x204: 9054 case 0x206: 9055 case 0x228: 9056 case 0x246: 9057 case 0x1e14: 9058 case 0x1e16: 9059 case 0x1e18: 9060 case 
0x1e1a: 9061 case 0x1e1c: 9062 case 0x1eb8: 9063 case 0x1eba: 9064 case 0x1ebc: 9065 case 0x1ebe: 9066 case 0x1ec0: 9067 case 0x1ec2: 9068 case 0x1ec4: 9069 case 0x1ec6: 9070 EMIT2('E') EMIT2(E_grave) EMIT2(E_acute) 9071 EMIT2(E_circumflex) EMIT2(E_diaeresis) 9072 EMIT2(0x112) EMIT2(0x114) EMIT2(0x116) 9073 EMIT2(0x118) EMIT2(0x11a) EMIT2(0x204) 9074 EMIT2(0x206) EMIT2(0x228) EMIT2(0x246) 9075 EMIT2(0x1e14) EMIT2(0x1e16) EMIT2(0x1e18) 9076 EMIT2(0x1e1a) EMIT2(0x1e1c) EMIT2(0x1eb8) 9077 EMIT2(0x1eba) EMIT2(0x1ebc) EMIT2(0x1ebe) 9078 EMIT2(0x1ec0) EMIT2(0x1ec2) EMIT2(0x1ec4) 9079 EMIT2(0x1ec6) 9080 return; 9081 9082 case 'F': 9083 case 0x191: 9084 case 0x1e1e: 9085 case 0xa798: 9086 EMIT2('F') EMIT2(0x191) EMIT2(0x1e1e) EMIT2(0xa798) 9087 return; 9088 9089 case 'G': 9090 case 0x11c: 9091 case 0x11e: 9092 case 0x120: 9093 case 0x122: 9094 case 0x193: 9095 case 0x1e4: 9096 case 0x1e6: 9097 case 0x1f4: 9098 case 0x1e20: 9099 case 0xa7a0: 9100 EMIT2('G') EMIT2(0x11c) EMIT2(0x11e) EMIT2(0x120) 9101 EMIT2(0x122) EMIT2(0x193) EMIT2(0x1e4) 9102 EMIT2(0x1e6) EMIT2(0x1f4) EMIT2(0x1e20) 9103 EMIT2(0xa7a0) 9104 return; 9105 9106 case 'H': 9107 case 0x124: 9108 case 0x126: 9109 case 0x21e: 9110 case 0x1e22: 9111 case 0x1e24: 9112 case 0x1e26: 9113 case 0x1e28: 9114 case 0x1e2a: 9115 case 0x2c67: 9116 EMIT2('H') EMIT2(0x124) EMIT2(0x126) EMIT2(0x21e) 9117 EMIT2(0x1e22) EMIT2(0x1e24) EMIT2(0x1e26) 9118 EMIT2(0x1e28) EMIT2(0x1e2a) EMIT2(0x2c67) 9119 return; 9120 9121 case 'I': 9122 case I_grave: 9123 case I_acute: 9124 case I_circumflex: 9125 case I_diaeresis: 9126 case 0x128: 9127 case 0x12a: 9128 case 0x12c: 9129 case 0x12e: 9130 case 0x130: 9131 case 0x197: 9132 case 0x1cf: 9133 case 0x208: 9134 case 0x20a: 9135 case 0x1e2c: 9136 case 0x1e2e: 9137 case 0x1ec8: 9138 case 0x1eca: 9139 EMIT2('I') EMIT2(I_grave) EMIT2(I_acute) 9140 EMIT2(I_circumflex) EMIT2(I_diaeresis) 9141 EMIT2(0x128) EMIT2(0x12a) EMIT2(0x12c) 9142 EMIT2(0x12e) EMIT2(0x130) EMIT2(0x197) 9143 EMIT2(0x1cf) 
EMIT2(0x208) EMIT2(0x20a) 9144 EMIT2(0x1e2c) EMIT2(0x1e2e) EMIT2(0x1ec8) 9145 EMIT2(0x1eca) 9146 return; 9147 9148 case 'J': 9149 case 0x134: 9150 case 0x248: 9151 EMIT2('J') EMIT2(0x134) EMIT2(0x248) 9152 return; 9153 9154 case 'K': 9155 case 0x136: 9156 case 0x198: 9157 case 0x1e8: 9158 case 0x1e30: 9159 case 0x1e32: 9160 case 0x1e34: 9161 case 0x2c69: 9162 case 0xa740: 9163 EMIT2('K') EMIT2(0x136) EMIT2(0x198) EMIT2(0x1e8) 9164 EMIT2(0x1e30) EMIT2(0x1e32) EMIT2(0x1e34) 9165 EMIT2(0x2c69) EMIT2(0xa740) 9166 return; 9167 9168 case 'L': 9169 case 0x139: 9170 case 0x13b: 9171 case 0x13d: 9172 case 0x13f: 9173 case 0x141: 9174 case 0x23d: 9175 case 0x1e36: 9176 case 0x1e38: 9177 case 0x1e3a: 9178 case 0x1e3c: 9179 case 0x2c60: 9180 EMIT2('L') EMIT2(0x139) EMIT2(0x13b) 9181 EMIT2(0x13d) EMIT2(0x13f) EMIT2(0x141) 9182 EMIT2(0x23d) EMIT2(0x1e36) EMIT2(0x1e38) 9183 EMIT2(0x1e3a) EMIT2(0x1e3c) EMIT2(0x2c60) 9184 return; 9185 9186 case 'M': 9187 case 0x1e3e: 9188 case 0x1e40: 9189 case 0x1e42: 9190 EMIT2('M') EMIT2(0x1e3e) EMIT2(0x1e40) 9191 EMIT2(0x1e42) 9192 return; 9193 9194 case 'N': 9195 case N_virguilla: 9196 case 0x143: 9197 case 0x145: 9198 case 0x147: 9199 case 0x1f8: 9200 case 0x1e44: 9201 case 0x1e46: 9202 case 0x1e48: 9203 case 0x1e4a: 9204 case 0xa7a4: 9205 EMIT2('N') EMIT2(N_virguilla) 9206 EMIT2(0x143) EMIT2(0x145) EMIT2(0x147) 9207 EMIT2(0x1f8) EMIT2(0x1e44) EMIT2(0x1e46) 9208 EMIT2(0x1e48) EMIT2(0x1e4a) EMIT2(0xa7a4) 9209 return; 9210 9211 case 'O': 9212 case O_grave: 9213 case O_acute: 9214 case O_circumflex: 9215 case O_virguilla: 9216 case O_diaeresis: 9217 case O_slash: 9218 case 0x14c: 9219 case 0x14e: 9220 case 0x150: 9221 case 0x19f: 9222 case 0x1a0: 9223 case 0x1d1: 9224 case 0x1ea: 9225 case 0x1ec: 9226 case 0x1fe: 9227 case 0x20c: 9228 case 0x20e: 9229 case 0x22a: 9230 case 0x22c: 9231 case 0x22e: 9232 case 0x230: 9233 case 0x1e4c: 9234 case 0x1e4e: 9235 case 0x1e50: 9236 case 0x1e52: 9237 case 0x1ecc: 9238 case 0x1ece: 9239 case 0x1ed0: 9240 
case 0x1ed2: 9241 case 0x1ed4: 9242 case 0x1ed6: 9243 case 0x1ed8: 9244 case 0x1eda: 9245 case 0x1edc: 9246 case 0x1ede: 9247 case 0x1ee0: 9248 case 0x1ee2: 9249 EMIT2('O') EMIT2(O_grave) EMIT2(O_acute) 9250 EMIT2(O_circumflex) EMIT2(O_virguilla) 9251 EMIT2(O_diaeresis) EMIT2(O_slash) 9252 EMIT2(0x14c) EMIT2(0x14e) EMIT2(0x150) 9253 EMIT2(0x19f) EMIT2(0x1a0) EMIT2(0x1d1) 9254 EMIT2(0x1ea) EMIT2(0x1ec) EMIT2(0x1fe) 9255 EMIT2(0x20c) EMIT2(0x20e) EMIT2(0x22a) 9256 EMIT2(0x22c) EMIT2(0x22e) EMIT2(0x230) 9257 EMIT2(0x1e4c) EMIT2(0x1e4e) EMIT2(0x1e50) 9258 EMIT2(0x1e52) EMIT2(0x1ecc) EMIT2(0x1ece) 9259 EMIT2(0x1ed0) EMIT2(0x1ed2) EMIT2(0x1ed4) 9260 EMIT2(0x1ed6) EMIT2(0x1ed8) EMIT2(0x1eda) 9261 EMIT2(0x1edc) EMIT2(0x1ede) EMIT2(0x1ee0) 9262 EMIT2(0x1ee2) 9263 return; 9264 9265 case 'P': 9266 case 0x1a4: 9267 case 0x1e54: 9268 case 0x1e56: 9269 case 0x2c63: 9270 EMIT2('P') EMIT2(0x1a4) EMIT2(0x1e54) EMIT2(0x1e56) 9271 EMIT2(0x2c63) 9272 return; 9273 9274 case 'Q': 9275 case 0x24a: 9276 EMIT2('Q') EMIT2(0x24a) 9277 return; 9278 9279 case 'R': 9280 case 0x154: 9281 case 0x156: 9282 case 0x158: 9283 case 0x210: 9284 case 0x212: 9285 case 0x24c: 9286 case 0x1e58: 9287 case 0x1e5a: 9288 case 0x1e5c: 9289 case 0x1e5e: 9290 case 0x2c64: 9291 case 0xa7a6: 9292 EMIT2('R') EMIT2(0x154) EMIT2(0x156) EMIT2(0x158) 9293 EMIT2(0x210) EMIT2(0x212) EMIT2(0x24c) EMIT2(0x1e58) 9294 EMIT2(0x1e5a) EMIT2(0x1e5c) EMIT2(0x1e5e) EMIT2(0x2c64) 9295 EMIT2(0xa7a6) 9296 return; 9297 9298 case 'S': 9299 case 0x15a: 9300 case 0x15c: 9301 case 0x15e: 9302 case 0x160: 9303 case 0x218: 9304 case 0x1e60: 9305 case 0x1e62: 9306 case 0x1e64: 9307 case 0x1e66: 9308 case 0x1e68: 9309 case 0x2c7e: 9310 case 0xa7a8: 9311 EMIT2('S') EMIT2(0x15a) EMIT2(0x15c) EMIT2(0x15e) 9312 EMIT2(0x160) EMIT2(0x218) EMIT2(0x1e60) EMIT2(0x1e62) 9313 EMIT2(0x1e64) EMIT2(0x1e66) EMIT2(0x1e68) EMIT2(0x2c7e) 9314 EMIT2(0xa7a8) 9315 return; 9316 9317 case 'T': 9318 case 0x162: 9319 case 0x164: 9320 case 0x166: 9321 case 0x1ac: 9322 
case 0x1ae: 9323 case 0x21a: 9324 case 0x23e: 9325 case 0x1e6a: 9326 case 0x1e6c: 9327 case 0x1e6e: 9328 case 0x1e70: 9329 EMIT2('T') EMIT2(0x162) EMIT2(0x164) EMIT2(0x166) 9330 EMIT2(0x1ac) EMIT2(0x1ae) EMIT2(0x23e) EMIT2(0x21a) 9331 EMIT2(0x1e6a) EMIT2(0x1e6c) EMIT2(0x1e6e) EMIT2(0x1e70) 9332 return; 9333 9334 case 'U': 9335 case U_grave: 9336 case U_acute: 9337 case U_diaeresis: 9338 case U_circumflex: 9339 case 0x168: 9340 case 0x16a: 9341 case 0x16c: 9342 case 0x16e: 9343 case 0x170: 9344 case 0x172: 9345 case 0x1af: 9346 case 0x1d3: 9347 case 0x1d5: 9348 case 0x1d7: 9349 case 0x1d9: 9350 case 0x1db: 9351 case 0x214: 9352 case 0x216: 9353 case 0x244: 9354 case 0x1e72: 9355 case 0x1e74: 9356 case 0x1e76: 9357 case 0x1e78: 9358 case 0x1e7a: 9359 case 0x1ee4: 9360 case 0x1ee6: 9361 case 0x1ee8: 9362 case 0x1eea: 9363 case 0x1eec: 9364 case 0x1eee: 9365 case 0x1ef0: 9366 EMIT2('U') EMIT2(U_grave) EMIT2(U_acute) 9367 EMIT2(U_diaeresis) EMIT2(U_circumflex) 9368 EMIT2(0x168) EMIT2(0x16a) 9369 EMIT2(0x16c) EMIT2(0x16e) EMIT2(0x170) 9370 EMIT2(0x172) EMIT2(0x1af) EMIT2(0x1d3) 9371 EMIT2(0x1d5) EMIT2(0x1d7) EMIT2(0x1d9) 9372 EMIT2(0x1db) EMIT2(0x214) EMIT2(0x216) 9373 EMIT2(0x244) EMIT2(0x1e72) EMIT2(0x1e74) 9374 EMIT2(0x1e76) EMIT2(0x1e78) EMIT2(0x1e7a) 9375 EMIT2(0x1ee4) EMIT2(0x1ee6) EMIT2(0x1ee8) 9376 EMIT2(0x1eea) EMIT2(0x1eec) EMIT2(0x1eee) 9377 EMIT2(0x1ef0) 9378 return; 9379 9380 case 'V': 9381 case 0x1b2: 9382 case 0x1e7c: 9383 case 0x1e7e: 9384 EMIT2('V') EMIT2(0x1b2) EMIT2(0x1e7c) EMIT2(0x1e7e) 9385 return; 9386 9387 case 'W': 9388 case 0x174: 9389 case 0x1e80: 9390 case 0x1e82: 9391 case 0x1e84: 9392 case 0x1e86: 9393 case 0x1e88: 9394 EMIT2('W') EMIT2(0x174) EMIT2(0x1e80) EMIT2(0x1e82) 9395 EMIT2(0x1e84) EMIT2(0x1e86) EMIT2(0x1e88) 9396 return; 9397 9398 case 'X': 9399 case 0x1e8a: 9400 case 0x1e8c: 9401 EMIT2('X') EMIT2(0x1e8a) EMIT2(0x1e8c) 9402 return; 9403 9404 case 'Y': 9405 case Y_acute: 9406 case 0x176: 9407 case 0x178: 9408 case 0x1b3: 9409 case 
0x232: 9410 case 0x24e: 9411 case 0x1e8e: 9412 case 0x1ef2: 9413 case 0x1ef4: 9414 case 0x1ef6: 9415 case 0x1ef8: 9416 EMIT2('Y') EMIT2(Y_acute) 9417 EMIT2(0x176) EMIT2(0x178) EMIT2(0x1b3) 9418 EMIT2(0x232) EMIT2(0x24e) EMIT2(0x1e8e) 9419 EMIT2(0x1ef2) EMIT2(0x1ef4) EMIT2(0x1ef6) 9420 EMIT2(0x1ef8) 9421 return; 9422 9423 case 'Z': 9424 case 0x179: 9425 case 0x17b: 9426 case 0x17d: 9427 case 0x1b5: 9428 case 0x1e90: 9429 case 0x1e92: 9430 case 0x1e94: 9431 case 0x2c6b: 9432 EMIT2('Z') EMIT2(0x179) EMIT2(0x17b) EMIT2(0x17d) 9433 EMIT2(0x1b5) EMIT2(0x1e90) EMIT2(0x1e92) 9434 EMIT2(0x1e94) EMIT2(0x2c6b) 9435 return; 9436 9437 case 'a': 9438 case a_grave: 9439 case a_acute: 9440 case a_circumflex: 9441 case a_virguilla: 9442 case a_diaeresis: 9443 case a_ring: 9444 case 0x101: 9445 case 0x103: 9446 case 0x105: 9447 case 0x1ce: 9448 case 0x1df: 9449 case 0x1e1: 9450 case 0x1fb: 9451 case 0x201: 9452 case 0x203: 9453 case 0x227: 9454 case 0x1d8f: 9455 case 0x1e01: 9456 case 0x1e9a: 9457 case 0x1ea1: 9458 case 0x1ea3: 9459 case 0x1ea5: 9460 case 0x1ea7: 9461 case 0x1ea9: 9462 case 0x1eab: 9463 case 0x1ead: 9464 case 0x1eaf: 9465 case 0x1eb1: 9466 case 0x1eb3: 9467 case 0x1eb5: 9468 case 0x1eb7: 9469 case 0x2c65: 9470 EMIT2('a') EMIT2(a_grave) EMIT2(a_acute) 9471 EMIT2(a_circumflex) EMIT2(a_virguilla) 9472 EMIT2(a_diaeresis) EMIT2(a_ring) 9473 EMIT2(0x101) EMIT2(0x103) EMIT2(0x105) 9474 EMIT2(0x1ce) EMIT2(0x1df) EMIT2(0x1e1) 9475 EMIT2(0x1fb) EMIT2(0x201) EMIT2(0x203) 9476 EMIT2(0x227) EMIT2(0x1d8f) EMIT2(0x1e01) 9477 EMIT2(0x1e9a) EMIT2(0x1ea1) EMIT2(0x1ea3) 9478 EMIT2(0x1ea5) EMIT2(0x1ea7) EMIT2(0x1ea9) 9479 EMIT2(0x1eab) EMIT2(0x1ead) EMIT2(0x1eaf) 9480 EMIT2(0x1eb1) EMIT2(0x1eb3) EMIT2(0x1eb5) 9481 EMIT2(0x1eb7) EMIT2(0x2c65) 9482 return; 9483 9484 case 'b': 9485 case 0x180: 9486 case 0x253: 9487 case 0x1d6c: 9488 case 0x1d80: 9489 case 0x1e03: 9490 case 0x1e05: 9491 case 0x1e07: 9492 EMIT2('b') EMIT2(0x180) EMIT2(0x253) EMIT2(0x1d6c) 9493 EMIT2(0x1d80) EMIT2(0x1e03) 
EMIT2(0x1e05) EMIT2(0x1e07) 9494 return; 9495 9496 case 'c': 9497 case c_cedilla: 9498 case 0x107: 9499 case 0x109: 9500 case 0x10b: 9501 case 0x10d: 9502 case 0x188: 9503 case 0x23c: 9504 case 0x1e09: 9505 case 0xa793: 9506 case 0xa794: 9507 EMIT2('c') EMIT2(c_cedilla) 9508 EMIT2(0x107) EMIT2(0x109) EMIT2(0x10b) 9509 EMIT2(0x10d) EMIT2(0x188) EMIT2(0x23c) 9510 EMIT2(0x1e09) EMIT2(0xa793) EMIT2(0xa794) 9511 return; 9512 9513 case 'd': 9514 case 0x10f: 9515 case 0x111: 9516 case 0x257: 9517 case 0x1d6d: 9518 case 0x1d81: 9519 case 0x1d91: 9520 case 0x1e0b: 9521 case 0x1e0d: 9522 case 0x1e0f: 9523 case 0x1e11: 9524 case 0x1e13: 9525 EMIT2('d') EMIT2(0x10f) EMIT2(0x111) 9526 EMIT2(0x257) EMIT2(0x1d6d) EMIT2(0x1d81) 9527 EMIT2(0x1d91) EMIT2(0x1e0b) EMIT2(0x1e0d) 9528 EMIT2(0x1e0f) EMIT2(0x1e11) EMIT2(0x1e13) 9529 return; 9530 9531 case 'e': 9532 case e_grave: 9533 case e_acute: 9534 case e_circumflex: 9535 case e_diaeresis: 9536 case 0x113: 9537 case 0x115: 9538 case 0x117: 9539 case 0x119: 9540 case 0x11b: 9541 case 0x205: 9542 case 0x207: 9543 case 0x229: 9544 case 0x247: 9545 case 0x1d92: 9546 case 0x1e15: 9547 case 0x1e17: 9548 case 0x1e19: 9549 case 0x1e1b: 9550 case 0x1e1d: 9551 case 0x1eb9: 9552 case 0x1ebb: 9553 case 0x1ebd: 9554 case 0x1ebf: 9555 case 0x1ec1: 9556 case 0x1ec3: 9557 case 0x1ec5: 9558 case 0x1ec7: 9559 EMIT2('e') EMIT2(e_grave) EMIT2(e_acute) 9560 EMIT2(e_circumflex) EMIT2(e_diaeresis) 9561 EMIT2(0x113) EMIT2(0x115) 9562 EMIT2(0x117) EMIT2(0x119) EMIT2(0x11b) 9563 EMIT2(0x205) EMIT2(0x207) EMIT2(0x229) 9564 EMIT2(0x247) EMIT2(0x1d92) EMIT2(0x1e15) 9565 EMIT2(0x1e17) EMIT2(0x1e19) EMIT2(0x1e1b) 9566 EMIT2(0x1e1d) EMIT2(0x1eb9) EMIT2(0x1ebb) 9567 EMIT2(0x1ebd) EMIT2(0x1ebf) EMIT2(0x1ec1) 9568 EMIT2(0x1ec3) EMIT2(0x1ec5) EMIT2(0x1ec7) 9569 return; 9570 9571 case 'f': 9572 case 0x192: 9573 case 0x1d6e: 9574 case 0x1d82: 9575 case 0x1e1f: 9576 case 0xa799: 9577 EMIT2('f') EMIT2(0x192) EMIT2(0x1d6e) EMIT2(0x1d82) 9578 EMIT2(0x1e1f) EMIT2(0xa799) 9579 
return; 9580 9581 case 'g': 9582 case 0x11d: 9583 case 0x11f: 9584 case 0x121: 9585 case 0x123: 9586 case 0x1e5: 9587 case 0x1e7: 9588 case 0x1f5: 9589 case 0x260: 9590 case 0x1d83: 9591 case 0x1e21: 9592 case 0xa7a1: 9593 EMIT2('g') EMIT2(0x11d) EMIT2(0x11f) EMIT2(0x121) 9594 EMIT2(0x123) EMIT2(0x1e5) EMIT2(0x1e7) 9595 EMIT2(0x1f5) EMIT2(0x260) EMIT2(0x1d83) 9596 EMIT2(0x1e21) EMIT2(0xa7a1) 9597 return; 9598 9599 case 'h': 9600 case 0x125: 9601 case 0x127: 9602 case 0x21f: 9603 case 0x1e23: 9604 case 0x1e25: 9605 case 0x1e27: 9606 case 0x1e29: 9607 case 0x1e2b: 9608 case 0x1e96: 9609 case 0x2c68: 9610 case 0xa795: 9611 EMIT2('h') EMIT2(0x125) EMIT2(0x127) EMIT2(0x21f) 9612 EMIT2(0x1e23) EMIT2(0x1e25) EMIT2(0x1e27) 9613 EMIT2(0x1e29) EMIT2(0x1e2b) EMIT2(0x1e96) 9614 EMIT2(0x2c68) EMIT2(0xa795) 9615 return; 9616 9617 case 'i': 9618 case i_grave: 9619 case i_acute: 9620 case i_circumflex: 9621 case i_diaeresis: 9622 case 0x129: 9623 case 0x12b: 9624 case 0x12d: 9625 case 0x12f: 9626 case 0x1d0: 9627 case 0x209: 9628 case 0x20b: 9629 case 0x268: 9630 case 0x1d96: 9631 case 0x1e2d: 9632 case 0x1e2f: 9633 case 0x1ec9: 9634 case 0x1ecb: 9635 EMIT2('i') EMIT2(i_grave) EMIT2(i_acute) 9636 EMIT2(i_circumflex) EMIT2(i_diaeresis) 9637 EMIT2(0x129) EMIT2(0x12b) EMIT2(0x12d) 9638 EMIT2(0x12f) EMIT2(0x1d0) EMIT2(0x209) 9639 EMIT2(0x20b) EMIT2(0x268) EMIT2(0x1d96) 9640 EMIT2(0x1e2d) EMIT2(0x1e2f) EMIT2(0x1ec9) 9641 EMIT2(0x1ecb) EMIT2(0x1ecb) 9642 return; 9643 9644 case 'j': 9645 case 0x135: 9646 case 0x1f0: 9647 case 0x249: 9648 EMIT2('j') EMIT2(0x135) EMIT2(0x1f0) EMIT2(0x249) 9649 return; 9650 9651 case 'k': 9652 case 0x137: 9653 case 0x199: 9654 case 0x1e9: 9655 case 0x1d84: 9656 case 0x1e31: 9657 case 0x1e33: 9658 case 0x1e35: 9659 case 0x2c6a: 9660 case 0xa741: 9661 EMIT2('k') EMIT2(0x137) EMIT2(0x199) EMIT2(0x1e9) 9662 EMIT2(0x1d84) EMIT2(0x1e31) EMIT2(0x1e33) 9663 EMIT2(0x1e35) EMIT2(0x2c6a) EMIT2(0xa741) 9664 return; 9665 9666 case 'l': 9667 case 0x13a: 9668 case 0x13c: 
9669 case 0x13e: 9670 case 0x140: 9671 case 0x142: 9672 case 0x19a: 9673 case 0x1e37: 9674 case 0x1e39: 9675 case 0x1e3b: 9676 case 0x1e3d: 9677 case 0x2c61: 9678 EMIT2('l') EMIT2(0x13a) EMIT2(0x13c) 9679 EMIT2(0x13e) EMIT2(0x140) EMIT2(0x142) 9680 EMIT2(0x19a) EMIT2(0x1e37) EMIT2(0x1e39) 9681 EMIT2(0x1e3b) EMIT2(0x1e3d) EMIT2(0x2c61) 9682 return; 9683 9684 case 'm': 9685 case 0x1d6f: 9686 case 0x1e3f: 9687 case 0x1e41: 9688 case 0x1e43: 9689 EMIT2('m') EMIT2(0x1d6f) EMIT2(0x1e3f) 9690 EMIT2(0x1e41) EMIT2(0x1e43) 9691 return; 9692 9693 case 'n': 9694 case n_virguilla: 9695 case 0x144: 9696 case 0x146: 9697 case 0x148: 9698 case 0x149: 9699 case 0x1f9: 9700 case 0x1d70: 9701 case 0x1d87: 9702 case 0x1e45: 9703 case 0x1e47: 9704 case 0x1e49: 9705 case 0x1e4b: 9706 case 0xa7a5: 9707 EMIT2('n') EMIT2(n_virguilla) 9708 EMIT2(0x144) EMIT2(0x146) EMIT2(0x148) 9709 EMIT2(0x149) EMIT2(0x1f9) EMIT2(0x1d70) 9710 EMIT2(0x1d87) EMIT2(0x1e45) EMIT2(0x1e47) 9711 EMIT2(0x1e49) EMIT2(0x1e4b) EMIT2(0xa7a5) 9712 return; 9713 9714 case 'o': 9715 case o_grave: 9716 case o_acute: 9717 case o_circumflex: 9718 case o_virguilla: 9719 case o_diaeresis: 9720 case o_slash: 9721 case 0x14d: 9722 case 0x14f: 9723 case 0x151: 9724 case 0x1a1: 9725 case 0x1d2: 9726 case 0x1eb: 9727 case 0x1ed: 9728 case 0x1ff: 9729 case 0x20d: 9730 case 0x20f: 9731 case 0x22b: 9732 case 0x22d: 9733 case 0x22f: 9734 case 0x231: 9735 case 0x275: 9736 case 0x1e4d: 9737 case 0x1e4f: 9738 case 0x1e51: 9739 case 0x1e53: 9740 case 0x1ecd: 9741 case 0x1ecf: 9742 case 0x1ed1: 9743 case 0x1ed3: 9744 case 0x1ed5: 9745 case 0x1ed7: 9746 case 0x1ed9: 9747 case 0x1edb: 9748 case 0x1edd: 9749 case 0x1edf: 9750 case 0x1ee1: 9751 case 0x1ee3: 9752 EMIT2('o') EMIT2(o_grave) EMIT2(o_acute) 9753 EMIT2(o_circumflex) EMIT2(o_virguilla) 9754 EMIT2(o_diaeresis) EMIT2(o_slash) 9755 EMIT2(0x14d) EMIT2(0x14f) EMIT2(0x151) 9756 EMIT2(0x1a1) EMIT2(0x1d2) EMIT2(0x1eb) 9757 EMIT2(0x1ed) EMIT2(0x1ff) EMIT2(0x20d) 9758 EMIT2(0x20f) EMIT2(0x22b) 
EMIT2(0x22d) 9759 EMIT2(0x22f) EMIT2(0x231) EMIT2(0x275) 9760 EMIT2(0x1e4d) EMIT2(0x1e4f) EMIT2(0x1e51) 9761 EMIT2(0x1e53) EMIT2(0x1ecd) EMIT2(0x1ecf) 9762 EMIT2(0x1ed1) EMIT2(0x1ed3) EMIT2(0x1ed5) 9763 EMIT2(0x1ed7) EMIT2(0x1ed9) EMIT2(0x1edb) 9764 EMIT2(0x1edd) EMIT2(0x1edf) EMIT2(0x1ee1) 9765 EMIT2(0x1ee3) 9766 return; 9767 9768 case 'p': 9769 case 0x1a5: 9770 case 0x1d71: 9771 case 0x1d7d: 9772 case 0x1d88: 9773 case 0x1e55: 9774 case 0x1e57: 9775 EMIT2('p') EMIT2(0x1a5) EMIT2(0x1d71) EMIT2(0x1d7d) 9776 EMIT2(0x1d88) EMIT2(0x1e55) EMIT2(0x1e57) 9777 return; 9778 9779 case 'q': 9780 case 0x24b: 9781 case 0x2a0: 9782 EMIT2('q') EMIT2(0x24b) EMIT2(0x2a0) 9783 return; 9784 9785 case 'r': 9786 case 0x155: 9787 case 0x157: 9788 case 0x159: 9789 case 0x211: 9790 case 0x213: 9791 case 0x24d: 9792 case 0x27d: 9793 case 0x1d72: 9794 case 0x1d73: 9795 case 0x1d89: 9796 case 0x1e59: 9797 case 0x1e5b: 9798 case 0x1e5d: 9799 case 0x1e5f: 9800 case 0xa7a7: 9801 EMIT2('r') EMIT2(0x155) EMIT2(0x157) EMIT2(0x159) 9802 EMIT2(0x211) EMIT2(0x213) EMIT2(0x24d) EMIT2(0x27d) 9803 EMIT2(0x1d72) EMIT2(0x1d73) EMIT2(0x1d89) EMIT2(0x1e59) 9804 EMIT2(0x1e5b) EMIT2(0x1e5d) EMIT2(0x1e5f) EMIT2(0xa7a7) 9805 return; 9806 9807 case 's': 9808 case 0x15b: 9809 case 0x15d: 9810 case 0x15f: 9811 case 0x161: 9812 case 0x219: 9813 case 0x23f: 9814 case 0x1d74: 9815 case 0x1d8a: 9816 case 0x1e61: 9817 case 0x1e63: 9818 case 0x1e65: 9819 case 0x1e67: 9820 case 0x1e69: 9821 case 0xa7a9: 9822 EMIT2('s') EMIT2(0x15b) EMIT2(0x15d) EMIT2(0x15f) 9823 EMIT2(0x161) EMIT2(0x219) EMIT2(0x23f) EMIT2(0x1d74) 9824 EMIT2(0x1d8a) EMIT2(0x1e61) EMIT2(0x1e63) EMIT2(0x1e65) 9825 EMIT2(0x1e67) EMIT2(0x1e69) EMIT2(0xa7a9) 9826 return; 9827 9828 case 't': 9829 case 0x163: 9830 case 0x165: 9831 case 0x167: 9832 case 0x1ab: 9833 case 0x1ad: 9834 case 0x21b: 9835 case 0x288: 9836 case 0x1d75: 9837 case 0x1e6b: 9838 case 0x1e6d: 9839 case 0x1e6f: 9840 case 0x1e71: 9841 case 0x1e97: 9842 case 0x2c66: 9843 EMIT2('t') 
EMIT2(0x163) EMIT2(0x165) EMIT2(0x167) 9844 EMIT2(0x1ab) EMIT2(0x1ad) EMIT2(0x21b) EMIT2(0x288) 9845 EMIT2(0x1d75) EMIT2(0x1e6b) EMIT2(0x1e6d) EMIT2(0x1e6f) 9846 EMIT2(0x1e71) EMIT2(0x1e97) EMIT2(0x2c66) 9847 return; 9848 9849 case 'u': 9850 case u_grave: 9851 case u_acute: 9852 case u_circumflex: 9853 case u_diaeresis: 9854 case 0x169: 9855 case 0x16b: 9856 case 0x16d: 9857 case 0x16f: 9858 case 0x171: 9859 case 0x173: 9860 case 0x1b0: 9861 case 0x1d4: 9862 case 0x1d6: 9863 case 0x1d8: 9864 case 0x1da: 9865 case 0x1dc: 9866 case 0x215: 9867 case 0x217: 9868 case 0x289: 9869 case 0x1d7e: 9870 case 0x1d99: 9871 case 0x1e73: 9872 case 0x1e75: 9873 case 0x1e77: 9874 case 0x1e79: 9875 case 0x1e7b: 9876 case 0x1ee5: 9877 case 0x1ee7: 9878 case 0x1ee9: 9879 case 0x1eeb: 9880 case 0x1eed: 9881 case 0x1eef: 9882 case 0x1ef1: 9883 EMIT2('u') EMIT2(u_grave) EMIT2(u_acute) 9884 EMIT2(u_circumflex) EMIT2(u_diaeresis) 9885 EMIT2(0x169) EMIT2(0x16b) 9886 EMIT2(0x16d) EMIT2(0x16f) EMIT2(0x171) 9887 EMIT2(0x173) EMIT2(0x1d6) EMIT2(0x1d8) 9888 EMIT2(0x215) EMIT2(0x217) EMIT2(0x1b0) 9889 EMIT2(0x1d4) EMIT2(0x1da) EMIT2(0x1dc) 9890 EMIT2(0x289) EMIT2(0x1e73) EMIT2(0x1d7e) 9891 EMIT2(0x1d99) EMIT2(0x1e75) EMIT2(0x1e77) 9892 EMIT2(0x1e79) EMIT2(0x1e7b) EMIT2(0x1ee5) 9893 EMIT2(0x1ee7) EMIT2(0x1ee9) EMIT2(0x1eeb) 9894 EMIT2(0x1eed) EMIT2(0x1eef) EMIT2(0x1ef1) 9895 return; 9896 9897 case 'v': 9898 case 0x28b: 9899 case 0x1d8c: 9900 case 0x1e7d: 9901 case 0x1e7f: 9902 EMIT2('v') EMIT2(0x28b) EMIT2(0x1d8c) EMIT2(0x1e7d) 9903 EMIT2(0x1e7f) 9904 return; 9905 9906 case 'w': 9907 case 0x175: 9908 case 0x1e81: 9909 case 0x1e83: 9910 case 0x1e85: 9911 case 0x1e87: 9912 case 0x1e89: 9913 case 0x1e98: 9914 EMIT2('w') EMIT2(0x175) EMIT2(0x1e81) EMIT2(0x1e83) 9915 EMIT2(0x1e85) EMIT2(0x1e87) EMIT2(0x1e89) EMIT2(0x1e98) 9916 return; 9917 9918 case 'x': 9919 case 0x1e8b: 9920 case 0x1e8d: 9921 EMIT2('x') EMIT2(0x1e8b) EMIT2(0x1e8d) 9922 return; 9923 9924 case 'y': 9925 case y_acute: 9926 case 
y_diaeresis: 9927 case 0x177: 9928 case 0x1b4: 9929 case 0x233: 9930 case 0x24f: 9931 case 0x1e8f: 9932 case 0x1e99: 9933 case 0x1ef3: 9934 case 0x1ef5: 9935 case 0x1ef7: 9936 case 0x1ef9: 9937 EMIT2('y') EMIT2(y_acute) EMIT2(y_diaeresis) 9938 EMIT2(0x177) EMIT2(0x1b4) EMIT2(0x233) EMIT2(0x24f) 9939 EMIT2(0x1e8f) EMIT2(0x1e99) EMIT2(0x1ef3) 9940 EMIT2(0x1ef5) EMIT2(0x1ef7) EMIT2(0x1ef9) 9941 return; 9942 9943 case 'z': 9944 case 0x17a: 9945 case 0x17c: 9946 case 0x17e: 9947 case 0x1b6: 9948 case 0x1d76: 9949 case 0x1d8e: 9950 case 0x1e91: 9951 case 0x1e93: 9952 case 0x1e95: 9953 case 0x2c6c: 9954 EMIT2('z') EMIT2(0x17a) EMIT2(0x17c) EMIT2(0x17e) 9955 EMIT2(0x1b6) EMIT2(0x1d76) EMIT2(0x1d8e) EMIT2(0x1e91) 9956 EMIT2(0x1e93) EMIT2(0x1e95) EMIT2(0x2c6c) 9957 return; 9958 9959 // default: character itself 9960 } 9961 } 9962 9963 EMIT2(c); 9964 #undef EMIT2 9965 } 9966 9967 // Code to parse regular expression. 9968 // 9969 // We try to reuse parsing functions in regexp.c to 9970 // minimize surprise and keep the syntax consistent. 9971 9972 // Parse the lowest level. 9973 // 9974 // An atom can be one of a long list of items. Many atoms match one character 9975 // in the text. It is often an ordinary character or a character class. 9976 // Braces can be used to make a pattern into an atom. The "\z(\)" construct 9977 // is only for syntax highlighting. 
9978 // 9979 // atom ::= ordinary-atom 9980 // or \( pattern \) 9981 // or \%( pattern \) 9982 // or \z( pattern \) 9983 static int nfa_regatom(void) 9984 { 9985 int c; 9986 int charclass; 9987 int equiclass; 9988 int collclass; 9989 int got_coll_char; 9990 uint8_t *p; 9991 uint8_t *endp; 9992 uint8_t *old_regparse = (uint8_t *)regparse; 9993 int extra = 0; 9994 int emit_range; 9995 int negated; 9996 int startc = -1; 9997 int save_prev_at_start = prev_at_start; 9998 9999 c = getchr(); 10000 switch (c) { 10001 case NUL: 10002 EMSG_RET_FAIL(_(e_nul_found)); 10003 10004 case Magic('^'): 10005 EMIT(NFA_BOL); 10006 break; 10007 10008 case Magic('$'): 10009 EMIT(NFA_EOL); 10010 had_eol = true; 10011 break; 10012 10013 case Magic('<'): 10014 EMIT(NFA_BOW); 10015 break; 10016 10017 case Magic('>'): 10018 EMIT(NFA_EOW); 10019 break; 10020 10021 case Magic('_'): 10022 c = no_Magic(getchr()); 10023 if (c == NUL) { 10024 EMSG_RET_FAIL(_(e_nul_found)); 10025 } 10026 10027 if (c == '^') { // "\_^" is start-of-line 10028 EMIT(NFA_BOL); 10029 break; 10030 } 10031 if (c == '$') { // "\_$" is end-of-line 10032 EMIT(NFA_EOL); 10033 had_eol = true; 10034 break; 10035 } 10036 10037 extra = NFA_ADD_NL; 10038 10039 // "\_[" is collection plus newline 10040 if (c == '[') { 10041 goto collection; 10042 } 10043 10044 // "\_x" is character class plus newline 10045 FALLTHROUGH; 10046 10047 // Character classes. 
10048 case Magic('.'): 10049 case Magic('i'): 10050 case Magic('I'): 10051 case Magic('k'): 10052 case Magic('K'): 10053 case Magic('f'): 10054 case Magic('F'): 10055 case Magic('p'): 10056 case Magic('P'): 10057 case Magic('s'): 10058 case Magic('S'): 10059 case Magic('d'): 10060 case Magic('D'): 10061 case Magic('x'): 10062 case Magic('X'): 10063 case Magic('o'): 10064 case Magic('O'): 10065 case Magic('w'): 10066 case Magic('W'): 10067 case Magic('h'): 10068 case Magic('H'): 10069 case Magic('a'): 10070 case Magic('A'): 10071 case Magic('l'): 10072 case Magic('L'): 10073 case Magic('u'): 10074 case Magic('U'): 10075 p = (uint8_t *)vim_strchr((char *)classchars, no_Magic(c)); 10076 if (p == NULL) { 10077 if (extra == NFA_ADD_NL) { 10078 semsg(_(e_ill_char_class), (int64_t)c); 10079 rc_did_emsg = true; 10080 return FAIL; 10081 } 10082 siemsg("INTERNAL: Unknown character class char: %d", c); 10083 return FAIL; 10084 } 10085 // When '.' is followed by a composing char ignore the dot, so that 10086 // the composing char is matched here. 10087 if (c == Magic('.') && utf_iscomposing_legacy(peekchr())) { 10088 old_regparse = (uint8_t *)regparse; 10089 c = getchr(); 10090 goto nfa_do_multibyte; 10091 } 10092 EMIT(nfa_classcodes[p - classchars]); 10093 if (extra == NFA_ADD_NL) { 10094 EMIT(NFA_NEWL); 10095 EMIT(NFA_OR); 10096 regflags |= RF_HASNL; 10097 } 10098 break; 10099 10100 case Magic('n'): 10101 if (reg_string) { 10102 // In a string "\n" matches a newline character. 10103 EMIT(NL); 10104 } else { 10105 // In buffer text "\n" matches the end of a line. 
10106 EMIT(NFA_NEWL); 10107 regflags |= RF_HASNL; 10108 } 10109 break; 10110 10111 case Magic('('): 10112 if (nfa_reg(REG_PAREN) == FAIL) { 10113 return FAIL; // cascaded error 10114 } 10115 break; 10116 10117 case Magic('|'): 10118 case Magic('&'): 10119 case Magic(')'): 10120 semsg(_(e_misplaced), (char)no_Magic(c)); 10121 return FAIL; 10122 10123 case Magic('='): 10124 case Magic('?'): 10125 case Magic('+'): 10126 case Magic('@'): 10127 case Magic('*'): 10128 case Magic('{'): 10129 // these should follow an atom, not form an atom 10130 semsg(_(e_misplaced), (char)no_Magic(c)); 10131 return FAIL; 10132 10133 case Magic('~'): { 10134 uint8_t *lp; 10135 10136 // Previous substitute pattern. 10137 // Generated as "\%(pattern\)". 10138 if (reg_prev_sub == NULL) { 10139 emsg(_(e_nopresub)); 10140 return FAIL; 10141 } 10142 for (lp = (uint8_t *)reg_prev_sub; *lp != NUL; lp += utf_ptr2len((char *)lp)) { 10143 EMIT(utf_ptr2char((char *)lp)); 10144 if (lp != (uint8_t *)reg_prev_sub) { 10145 EMIT(NFA_CONCAT); 10146 } 10147 } 10148 EMIT(NFA_NOPEN); 10149 break; 10150 } 10151 10152 case Magic('1'): 10153 case Magic('2'): 10154 case Magic('3'): 10155 case Magic('4'): 10156 case Magic('5'): 10157 case Magic('6'): 10158 case Magic('7'): 10159 case Magic('8'): 10160 case Magic('9'): { 10161 int refnum = no_Magic(c) - '1'; 10162 10163 if (!seen_endbrace(refnum + 1)) { 10164 return FAIL; 10165 } 10166 EMIT(NFA_BACKREF1 + refnum); 10167 rex.nfa_has_backref = true; 10168 } 10169 break; 10170 10171 case Magic('z'): 10172 c = no_Magic(getchr()); 10173 switch (c) { 10174 case 's': 10175 EMIT(NFA_ZSTART); 10176 if (!re_mult_next("\\zs")) { 10177 return false; 10178 } 10179 break; 10180 case 'e': 10181 EMIT(NFA_ZEND); 10182 rex.nfa_has_zend = true; 10183 if (!re_mult_next("\\ze")) { 10184 return false; 10185 } 10186 break; 10187 case '1': 10188 case '2': 10189 case '3': 10190 case '4': 10191 case '5': 10192 case '6': 10193 case '7': 10194 case '8': 10195 case '9': 10196 // \z1...\z9 
10197 if ((reg_do_extmatch & REX_USE) == 0) { 10198 EMSG_RET_FAIL(_(e_z1_not_allowed)); 10199 } 10200 EMIT(NFA_ZREF1 + (no_Magic(c) - '1')); 10201 // No need to set rex.nfa_has_backref, the sub-matches don't 10202 // change when \z1 .. \z9 matches or not. 10203 re_has_z = REX_USE; 10204 break; 10205 case '(': 10206 // \z( 10207 if (reg_do_extmatch != REX_SET) { 10208 EMSG_RET_FAIL(_(e_z_not_allowed)); 10209 } 10210 if (nfa_reg(REG_ZPAREN) == FAIL) { 10211 return FAIL; // cascaded error 10212 } 10213 re_has_z = REX_SET; 10214 break; 10215 default: 10216 semsg(_("E867: (NFA) Unknown operator '\\z%c'"), 10217 no_Magic(c)); 10218 return FAIL; 10219 } 10220 break; 10221 10222 case Magic('%'): 10223 c = no_Magic(getchr()); 10224 switch (c) { 10225 // () without a back reference 10226 case '(': 10227 if (nfa_reg(REG_NPAREN) == FAIL) { 10228 return FAIL; 10229 } 10230 EMIT(NFA_NOPEN); 10231 break; 10232 10233 case 'd': // %d123 decimal 10234 case 'o': // %o123 octal 10235 case 'x': // %xab hex 2 10236 case 'u': // %uabcd hex 4 10237 case 'U': // %U1234abcd hex 8 10238 { 10239 int64_t nr; 10240 10241 switch (c) { 10242 case 'd': 10243 nr = getdecchrs(); break; 10244 case 'o': 10245 nr = getoctchrs(); break; 10246 case 'x': 10247 nr = gethexchrs(2); break; 10248 case 'u': 10249 nr = gethexchrs(4); break; 10250 case 'U': 10251 nr = gethexchrs(8); break; 10252 default: 10253 nr = -1; break; 10254 } 10255 10256 if (nr < 0 || nr > INT_MAX) { 10257 EMSG2_RET_FAIL(_("E678: Invalid character after %s%%[dxouU]"), 10258 reg_magic == MAGIC_ALL); 10259 } 10260 // A NUL is stored in the text as NL 10261 // TODO(vim): what if a composing character follows? 10262 EMIT(nr == 0 ? 0x0a : (int)nr); 10263 } 10264 break; 10265 10266 // Catch \%^ and \%$ regardless of where they appear in the 10267 // pattern -- regardless of whether or not it makes sense. 
10268 case '^': 10269 EMIT(NFA_BOF); 10270 break; 10271 10272 case '$': 10273 EMIT(NFA_EOF); 10274 break; 10275 10276 case '#': 10277 if (regparse[0] == '=' && regparse[1] >= 48 10278 && regparse[1] <= 50) { 10279 // misplaced \%#=1 10280 semsg(_(e_atom_engine_must_be_at_start_of_pattern), regparse[1]); 10281 return FAIL; 10282 } 10283 EMIT(NFA_CURSOR); 10284 break; 10285 10286 case 'V': 10287 EMIT(NFA_VISUAL); 10288 break; 10289 10290 case 'C': 10291 EMIT(NFA_ANY_COMPOSING); 10292 break; 10293 10294 case '[': { 10295 int n; 10296 10297 // \%[abc] 10298 for (n = 0; (c = peekchr()) != ']'; n++) { 10299 if (c == NUL) { 10300 EMSG2_RET_FAIL(_(e_missing_sb), 10301 reg_magic == MAGIC_ALL); 10302 } 10303 // recursive call! 10304 if (nfa_regatom() == FAIL) { 10305 return FAIL; 10306 } 10307 } 10308 (void)getchr(); // get the ] 10309 if (n == 0) { 10310 EMSG2_RET_FAIL(_(e_empty_sb), reg_magic == MAGIC_ALL); 10311 } 10312 EMIT(NFA_OPT_CHARS); 10313 EMIT(n); 10314 10315 // Emit as "\%(\%[abc]\)" to be able to handle 10316 // "\%[abc]*" which would cause the empty string to be 10317 // matched an unlimited number of times. NFA_NOPEN is 10318 // added only once at a position, while NFA_SPLIT is 10319 // added multiple times. This is more efficient than 10320 // not allowing NFA_SPLIT multiple times, it is used 10321 // a lot. 10322 EMIT(NFA_NOPEN); 10323 break; 10324 } 10325 10326 default: { 10327 int64_t n = 0; 10328 const int cmp = c; 10329 bool cur = false; 10330 bool got_digit = false; 10331 10332 if (c == '<' || c == '>') { 10333 c = getchr(); 10334 } 10335 if (no_Magic(c) == '.') { 10336 cur = true; 10337 c = getchr(); 10338 } 10339 while (ascii_isdigit(c)) { 10340 if (cur) { 10341 semsg(_(e_regexp_number_after_dot_pos_search_chr), no_Magic(c)); 10342 return FAIL; 10343 } 10344 if (n > (INT32_MAX - (c - '0')) / 10) { 10345 // overflow. 
10346 emsg(_(e_value_too_large)); 10347 return FAIL; 10348 } 10349 n = n * 10 + (c - '0'); 10350 c = getchr(); 10351 got_digit = true; 10352 } 10353 if (c == 'l' || c == 'c' || c == 'v') { 10354 int32_t limit = INT32_MAX; 10355 10356 if (!cur && !got_digit) { 10357 semsg(_(e_nfa_regexp_missing_value_in_chr), no_Magic(c)); 10358 return FAIL; 10359 } 10360 if (c == 'l') { 10361 if (cur) { 10362 n = curwin->w_cursor.lnum; 10363 } 10364 // \%{n}l \%{n}<l \%{n}>l 10365 EMIT(cmp == '<' ? NFA_LNUM_LT 10366 : cmp == '>' ? NFA_LNUM_GT : NFA_LNUM); 10367 if (save_prev_at_start) { 10368 at_start = true; 10369 } 10370 } else if (c == 'c') { 10371 if (cur) { 10372 n = curwin->w_cursor.col; 10373 n++; 10374 } 10375 // \%{n}c \%{n}<c \%{n}>c 10376 EMIT(cmp == '<' ? NFA_COL_LT 10377 : cmp == '>' ? NFA_COL_GT : NFA_COL); 10378 } else { 10379 if (cur) { 10380 colnr_T vcol = 0; 10381 getvvcol(curwin, &curwin->w_cursor, NULL, NULL, &vcol); 10382 n = ++vcol; 10383 } 10384 // \%{n}v \%{n}<v \%{n}>v 10385 EMIT(cmp == '<' ? NFA_VCOL_LT 10386 : cmp == '>' ? NFA_VCOL_GT : NFA_VCOL); 10387 limit = INT32_MAX / MB_MAXBYTES; 10388 } 10389 if (n >= limit) { 10390 emsg(_(e_value_too_large)); 10391 return FAIL; 10392 } 10393 EMIT((int)n); 10394 break; 10395 } else if (no_Magic(c) == '\'' && n == 0) { 10396 // \%'m \%<'m \%>'m 10397 EMIT(cmp == '<' ? NFA_MARK_LT 10398 : cmp == '>' ? NFA_MARK_GT : NFA_MARK); 10399 EMIT(getchr()); 10400 break; 10401 } 10402 } 10403 semsg(_("E867: (NFA) Unknown operator '\\%%%c'"), 10404 no_Magic(c)); 10405 return FAIL; 10406 } 10407 break; 10408 10409 case Magic('['): 10410 collection: 10411 // [abc] uses NFA_START_COLL - NFA_END_COLL 10412 // [^abc] uses NFA_START_NEG_COLL - NFA_END_NEG_COLL 10413 // Each character is produced as a regular state, using 10414 // NFA_CONCAT to bind them together. 10415 // Besides normal characters there can be: 10416 // - character classes NFA_CLASS_* 10417 // - ranges, two characters followed by NFA_RANGE. 
10418 10419 p = (uint8_t *)regparse; 10420 endp = (uint8_t *)skip_anyof((char *)p); 10421 if (*endp == ']') { 10422 // Try to reverse engineer character classes. For example, 10423 // recognize that [0-9] stands for \d and [A-Za-z_] for \h, 10424 // and perform the necessary substitutions in the NFA. 10425 int result = nfa_recognize_char_class((uint8_t *)regparse, endp, extra == NFA_ADD_NL); 10426 if (result != FAIL) { 10427 if (result >= NFA_FIRST_NL && result <= NFA_LAST_NL) { 10428 EMIT(result - NFA_ADD_NL); 10429 EMIT(NFA_NEWL); 10430 EMIT(NFA_OR); 10431 } else { 10432 EMIT(result); 10433 } 10434 regparse = (char *)endp; 10435 MB_PTR_ADV(regparse); 10436 return OK; 10437 } 10438 // Failed to recognize a character class. Use the simple 10439 // version that turns [abc] into 'a' OR 'b' OR 'c' 10440 negated = false; 10441 if (*regparse == '^') { // negated range 10442 negated = true; 10443 MB_PTR_ADV(regparse); 10444 EMIT(NFA_START_NEG_COLL); 10445 } else { 10446 EMIT(NFA_START_COLL); 10447 } 10448 if (*regparse == '-') { 10449 startc = '-'; 10450 EMIT(startc); 10451 EMIT(NFA_CONCAT); 10452 MB_PTR_ADV(regparse); 10453 } 10454 // Emit the OR branches for each character in the [] 10455 emit_range = false; 10456 while ((uint8_t *)regparse < endp) { 10457 int oldstartc = startc; 10458 startc = -1; 10459 got_coll_char = false; 10460 if (*regparse == '[') { 10461 // Check for [: :], [= =], [. .] 
10462 equiclass = collclass = 0; 10463 charclass = get_char_class(®parse); 10464 if (charclass == CLASS_NONE) { 10465 equiclass = get_equi_class(®parse); 10466 if (equiclass == 0) { 10467 collclass = get_coll_element(®parse); 10468 } 10469 } 10470 10471 // Character class like [:alpha:] 10472 if (charclass != CLASS_NONE) { 10473 switch (charclass) { 10474 case CLASS_ALNUM: 10475 EMIT(NFA_CLASS_ALNUM); 10476 break; 10477 case CLASS_ALPHA: 10478 EMIT(NFA_CLASS_ALPHA); 10479 break; 10480 case CLASS_BLANK: 10481 EMIT(NFA_CLASS_BLANK); 10482 break; 10483 case CLASS_CNTRL: 10484 EMIT(NFA_CLASS_CNTRL); 10485 break; 10486 case CLASS_DIGIT: 10487 EMIT(NFA_CLASS_DIGIT); 10488 break; 10489 case CLASS_GRAPH: 10490 EMIT(NFA_CLASS_GRAPH); 10491 break; 10492 case CLASS_LOWER: 10493 wants_nfa = true; 10494 EMIT(NFA_CLASS_LOWER); 10495 break; 10496 case CLASS_PRINT: 10497 EMIT(NFA_CLASS_PRINT); 10498 break; 10499 case CLASS_PUNCT: 10500 EMIT(NFA_CLASS_PUNCT); 10501 break; 10502 case CLASS_SPACE: 10503 EMIT(NFA_CLASS_SPACE); 10504 break; 10505 case CLASS_UPPER: 10506 wants_nfa = true; 10507 EMIT(NFA_CLASS_UPPER); 10508 break; 10509 case CLASS_XDIGIT: 10510 EMIT(NFA_CLASS_XDIGIT); 10511 break; 10512 case CLASS_TAB: 10513 EMIT(NFA_CLASS_TAB); 10514 break; 10515 case CLASS_RETURN: 10516 EMIT(NFA_CLASS_RETURN); 10517 break; 10518 case CLASS_BACKSPACE: 10519 EMIT(NFA_CLASS_BACKSPACE); 10520 break; 10521 case CLASS_ESCAPE: 10522 EMIT(NFA_CLASS_ESCAPE); 10523 break; 10524 case CLASS_IDENT: 10525 EMIT(NFA_CLASS_IDENT); 10526 break; 10527 case CLASS_KEYWORD: 10528 EMIT(NFA_CLASS_KEYWORD); 10529 break; 10530 case CLASS_FNAME: 10531 EMIT(NFA_CLASS_FNAME); 10532 break; 10533 } 10534 EMIT(NFA_CONCAT); 10535 continue; 10536 } 10537 // Try equivalence class [=a=] and the like 10538 if (equiclass != 0) { 10539 nfa_emit_equi_class(equiclass); 10540 continue; 10541 } 10542 // Try collating class like [. .] 
10543 if (collclass != 0) { 10544 startc = collclass; // allow [.a.]-x as a range 10545 // Will emit the proper atom at the end of the 10546 // while loop. 10547 } 10548 } 10549 // Try a range like 'a-x' or '\t-z'. Also allows '-' as a 10550 // start character. 10551 if (*regparse == '-' && oldstartc != -1) { 10552 emit_range = true; 10553 startc = oldstartc; 10554 MB_PTR_ADV(regparse); 10555 continue; // reading the end of the range 10556 } 10557 10558 // Now handle simple and escaped characters. 10559 // Only "\]", "\^", "\]" and "\\" are special in Vi. Vim 10560 // accepts "\t", "\e", etc., but only when the 'l' flag in 10561 // 'cpoptions' is not included. 10562 if (*regparse == '\\' 10563 && (uint8_t *)regparse + 1 <= endp 10564 && (vim_strchr(REGEXP_INRANGE, (uint8_t)regparse[1]) != NULL 10565 || (!reg_cpo_lit 10566 && vim_strchr(REGEXP_ABBR, (uint8_t)regparse[1]) 10567 != NULL))) { 10568 MB_PTR_ADV(regparse); 10569 10570 if (*regparse == 'n') { 10571 startc = (reg_string || emit_range || regparse[1] == '-') 10572 ? NL : NFA_NEWL; 10573 } else if (*regparse == 'd' 10574 || *regparse == 'o' 10575 || *regparse == 'x' 10576 || *regparse == 'u' 10577 || *regparse == 'U') { 10578 // TODO(RE): This needs more testing 10579 startc = coll_get_char(); 10580 // max UTF-8 Codepoint is U+10FFFF, 10581 // but allow values until INT_MAX 10582 if (startc == INT_MAX) { 10583 EMSG_RET_FAIL(_(e_unicode_val_too_large)); 10584 } 10585 got_coll_char = true; 10586 MB_PTR_BACK(old_regparse, regparse); 10587 } else { 10588 // \r,\t,\e,\b 10589 startc = backslash_trans(*regparse); 10590 } 10591 } 10592 10593 // Normal printable char 10594 if (startc == -1) { 10595 startc = utf_ptr2char(regparse); 10596 } 10597 10598 // Previous char was '-', so this char is end of range. 
10599 if (emit_range) { 10600 int endc = startc; 10601 startc = oldstartc; 10602 if (startc > endc) { 10603 EMSG_RET_FAIL(_(e_reverse_range)); 10604 } 10605 10606 if (endc > startc + 2) { 10607 // Emit a range instead of the sequence of 10608 // individual characters. 10609 if (startc == 0) { 10610 // \x00 is translated to \x0a, start at \x01. 10611 EMIT(1); 10612 } else { 10613 post_ptr--; // remove NFA_CONCAT 10614 } 10615 EMIT(endc); 10616 EMIT(NFA_RANGE); 10617 EMIT(NFA_CONCAT); 10618 } else if (utf_char2len(startc) > 1 10619 || utf_char2len(endc) > 1) { 10620 // Emit the characters in the range. 10621 // "startc" was already emitted, so skip it. 10622 for (c = startc + 1; c <= endc; c++) { 10623 EMIT(c); 10624 EMIT(NFA_CONCAT); 10625 } 10626 } else { 10627 // Emit the range. "startc" was already emitted, so 10628 // skip it. 10629 for (c = startc + 1; c <= endc; c++) { 10630 EMIT(c); 10631 EMIT(NFA_CONCAT); 10632 } 10633 } 10634 emit_range = false; 10635 startc = -1; 10636 } else { 10637 // This char (startc) is not part of a range. Just 10638 // emit it. 10639 // Normally, simply emit startc. But if we get char 10640 // code=0 from a collating char, then replace it with 10641 // 0x0a. 10642 // This is needed to completely mimic the behaviour of 10643 // the backtracking engine. 10644 if (startc == NFA_NEWL) { 10645 // Line break can't be matched as part of the 10646 // collection, add an OR below. But not for negated 10647 // range. 
10648 if (!negated) { 10649 extra = NFA_ADD_NL; 10650 } 10651 } else { 10652 if (got_coll_char == true && startc == 0) { 10653 EMIT(0x0a); 10654 EMIT(NFA_CONCAT); 10655 } else { 10656 EMIT(startc); 10657 if (utf_ptr2len(regparse) == utfc_ptr2len(regparse)) { 10658 EMIT(NFA_CONCAT); 10659 } 10660 } 10661 } 10662 } 10663 10664 int plen; 10665 if (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse))) { 10666 int i = utf_ptr2len(regparse); 10667 10668 c = utf_ptr2char(regparse + i); 10669 10670 // Add composing characters 10671 while (true) { 10672 if (c == 0) { 10673 // \x00 is translated to \x0a, start at \x01. 10674 EMIT(1); 10675 } else { 10676 EMIT(c); 10677 } 10678 EMIT(NFA_CONCAT); 10679 if ((i += utf_char2len(c)) >= plen) { 10680 break; 10681 } 10682 c = utf_ptr2char(regparse + i); 10683 } 10684 EMIT(NFA_COMPOSING); 10685 EMIT(NFA_CONCAT); 10686 } 10687 MB_PTR_ADV(regparse); 10688 } // while (p < endp) 10689 10690 MB_PTR_BACK(old_regparse, regparse); 10691 if (*regparse == '-') { // if last, '-' is just a char 10692 EMIT('-'); 10693 EMIT(NFA_CONCAT); 10694 } 10695 10696 // skip the trailing ] 10697 regparse = (char *)endp; 10698 MB_PTR_ADV(regparse); 10699 10700 // Mark end of the collection. 10701 if (negated == true) { 10702 EMIT(NFA_END_NEG_COLL); 10703 } else { 10704 EMIT(NFA_END_COLL); 10705 } 10706 10707 // \_[] also matches \n but it's not negated 10708 if (extra == NFA_ADD_NL) { 10709 EMIT(reg_string ? 
NL : NFA_NEWL); 10710 EMIT(NFA_OR); 10711 } 10712 10713 return OK; 10714 } // if exists closing ] 10715 10716 if (reg_strict) { 10717 EMSG_RET_FAIL(_(e_missingbracket)); 10718 } 10719 FALLTHROUGH; 10720 10721 default: { 10722 int plen; 10723 10724 nfa_do_multibyte: 10725 // plen is length of current char with composing chars 10726 if (utf_char2len(c) != (plen = utfc_ptr2len((char *)old_regparse)) 10727 || utf_iscomposing_legacy(c)) { 10728 int i = 0; 10729 10730 // A base character plus composing characters, or just one 10731 // or more composing characters. 10732 // This requires creating a separate atom as if enclosing 10733 // the characters in (), where NFA_COMPOSING is the ( and 10734 // NFA_END_COMPOSING is the ). Note that right now we are 10735 // building the postfix form, not the NFA itself; 10736 // a composing char could be: a, b, c, NFA_COMPOSING 10737 // where 'b' and 'c' are chars with codes > 256. 10738 while (true) { 10739 EMIT(c); 10740 if (i > 0) { 10741 EMIT(NFA_CONCAT); 10742 } 10743 if ((i += utf_char2len(c)) >= plen) { 10744 break; 10745 } 10746 c = utf_ptr2char((char *)old_regparse + i); 10747 } 10748 EMIT(NFA_COMPOSING); 10749 regparse = (char *)old_regparse + plen; 10750 } else { 10751 c = no_Magic(c); 10752 EMIT(c); 10753 } 10754 return OK; 10755 } 10756 } 10757 10758 return OK; 10759 } 10760 10761 // Parse something followed by possible [*+=]. 10762 // 10763 // A piece is an atom, possibly followed by a multi, an indication of how many 10764 // times the atom can be matched. Example: "a*" matches any sequence of "a" 10765 // characters: "", "a", "aa", etc. 10766 // 10767 // piece ::= atom 10768 // or atom multi 10769 static int nfa_regpiece(void) 10770 { 10771 int i; 10772 int op; 10773 int ret; 10774 int minval, maxval; 10775 bool greedy = true; // Braces are prefixed with '-' ? 
10776 parse_state_T old_state; 10777 parse_state_T new_state; 10778 int64_t c2; 10779 int old_post_pos; 10780 int my_post_start; 10781 int quest; 10782 10783 // Save the current parse state, so that we can use it if <atom>{m,n} is 10784 // next. 10785 save_parse_state(&old_state); 10786 10787 // store current pos in the postfix form, for \{m,n} involving 0s 10788 my_post_start = (int)(post_ptr - post_start); 10789 10790 ret = nfa_regatom(); 10791 if (ret == FAIL) { 10792 return FAIL; // cascaded error 10793 } 10794 op = peekchr(); 10795 if (re_multi_type(op) == NOT_MULTI) { 10796 return OK; 10797 } 10798 10799 skipchr(); 10800 switch (op) { 10801 case Magic('*'): 10802 EMIT(NFA_STAR); 10803 break; 10804 10805 case Magic('+'): 10806 // Trick: Normally, (a*)\+ would match the whole input "aaa". The 10807 // first and only submatch would be "aaa". But the backtracking 10808 // engine interprets the plus as "try matching one more time", and 10809 // a* matches a second time at the end of the input, the empty 10810 // string. 10811 // The submatch will be the empty string. 10812 // 10813 // In order to be consistent with the old engine, we replace 10814 // <atom>+ with <atom><atom>* 10815 restore_parse_state(&old_state); 10816 curchr = -1; 10817 if (nfa_regatom() == FAIL) { 10818 return FAIL; 10819 } 10820 EMIT(NFA_STAR); 10821 EMIT(NFA_CONCAT); 10822 skipchr(); // skip the \+ 10823 break; 10824 10825 case Magic('@'): 10826 c2 = getdecchrs(); 10827 op = no_Magic(getchr()); 10828 i = 0; 10829 switch (op) { 10830 case '=': 10831 // \@= 10832 i = NFA_PREV_ATOM_NO_WIDTH; 10833 break; 10834 case '!': 10835 // \@! 10836 i = NFA_PREV_ATOM_NO_WIDTH_NEG; 10837 break; 10838 case '<': 10839 op = no_Magic(getchr()); 10840 if (op == '=') { 10841 // \@<= 10842 i = NFA_PREV_ATOM_JUST_BEFORE; 10843 } else if (op == '!') { 10844 // \@<! 
10845 i = NFA_PREV_ATOM_JUST_BEFORE_NEG; 10846 } 10847 break; 10848 case '>': 10849 // \@> 10850 i = NFA_PREV_ATOM_LIKE_PATTERN; 10851 break; 10852 } 10853 if (i == 0) { 10854 semsg(_("E869: (NFA) Unknown operator '\\@%c'"), op); 10855 return FAIL; 10856 } 10857 EMIT(i); 10858 if (i == NFA_PREV_ATOM_JUST_BEFORE 10859 || i == NFA_PREV_ATOM_JUST_BEFORE_NEG) { 10860 EMIT((int)c2); 10861 } 10862 break; 10863 10864 case Magic('?'): 10865 case Magic('='): 10866 EMIT(NFA_QUEST); 10867 break; 10868 10869 case Magic('{'): 10870 // a{2,5} will expand to 'aaa?a?a?' 10871 // a{-1,3} will expand to 'aa??a??', where ?? is the nongreedy 10872 // version of '?' 10873 // \v(ab){2,3} will expand to '(ab)(ab)(ab)?', where all the 10874 // parenthesis have the same id 10875 10876 greedy = true; 10877 c2 = peekchr(); 10878 if (c2 == '-' || c2 == Magic('-')) { 10879 skipchr(); 10880 greedy = false; 10881 } 10882 if (!read_limits(&minval, &maxval)) { 10883 EMSG_RET_FAIL(_("E870: (NFA regexp) Error reading repetition limits")); 10884 } 10885 10886 // <atom>{0,inf}, <atom>{0,} and <atom>{} are equivalent to 10887 // <atom>* 10888 if (minval == 0 && maxval == MAX_LIMIT) { 10889 if (greedy) { 10890 // \{}, \{0,} 10891 EMIT(NFA_STAR); 10892 } else { 10893 // \{-}, \{-0,} 10894 EMIT(NFA_STAR_NONGREEDY); 10895 } 10896 break; 10897 } 10898 10899 // Special case: x{0} or x{-0} 10900 if (maxval == 0) { 10901 // Ignore result of previous call to nfa_regatom() 10902 post_ptr = post_start + my_post_start; 10903 // NFA_EMPTY is 0-length and works everywhere 10904 EMIT(NFA_EMPTY); 10905 return OK; 10906 } 10907 10908 // The engine is very inefficient (uses too many states) when the 10909 // maximum is much larger than the minimum and when the maximum is 10910 // large. However, when maxval is MAX_LIMIT, it is okay, as this 10911 // will emit NFA_STAR. 10912 // Bail out if we can use the other engine, but only, when the 10913 // pattern does not need the NFA engine like (e.g. 
[[:upper:]]\{2,\} 10914 // does not work with characters > 8 bit with the BT engine) 10915 if ((nfa_re_flags & RE_AUTO) 10916 && (maxval > 500 || maxval > minval + 200) 10917 && (maxval != MAX_LIMIT && minval < 200) 10918 && !wants_nfa) { 10919 return FAIL; 10920 } 10921 10922 // Ignore previous call to nfa_regatom() 10923 post_ptr = post_start + my_post_start; 10924 // Save parse state after the repeated atom and the \{} 10925 save_parse_state(&new_state); 10926 10927 quest = (greedy == true ? NFA_QUEST : NFA_QUEST_NONGREEDY); 10928 for (i = 0; i < maxval; i++) { 10929 // Goto beginning of the repeated atom 10930 restore_parse_state(&old_state); 10931 old_post_pos = (int)(post_ptr - post_start); 10932 if (nfa_regatom() == FAIL) { 10933 return FAIL; 10934 } 10935 // after "minval" times, atoms are optional 10936 if (i + 1 > minval) { 10937 if (maxval == MAX_LIMIT) { 10938 if (greedy) { 10939 EMIT(NFA_STAR); 10940 } else { 10941 EMIT(NFA_STAR_NONGREEDY); 10942 } 10943 } else { 10944 EMIT(quest); 10945 } 10946 } 10947 if (old_post_pos != my_post_start) { 10948 EMIT(NFA_CONCAT); 10949 } 10950 if (i + 1 > minval && maxval == MAX_LIMIT) { 10951 break; 10952 } 10953 } 10954 10955 // Go to just after the repeated atom and the \{} 10956 restore_parse_state(&new_state); 10957 curchr = -1; 10958 10959 break; 10960 10961 default: 10962 break; 10963 } // end switch 10964 10965 if (re_multi_type(peekchr()) != NOT_MULTI) { 10966 // Can't have a multi follow a multi. 10967 EMSG_RET_FAIL(_("E871: (NFA regexp) Can't have a multi follow a multi")); 10968 } 10969 10970 return OK; 10971 } 10972 10973 // Parse one or more pieces, concatenated. It matches a match for the 10974 // first piece, followed by a match for the second piece, etc. Example: 10975 // "f[0-9]b", first matches "f", then a digit and then "b". 10976 // 10977 // concat ::= piece 10978 // or piece piece 10979 // or piece piece piece 10980 // etc. 
10981 static int nfa_regconcat(void) 10982 { 10983 bool cont = true; 10984 bool first = true; 10985 10986 while (cont) { 10987 switch (peekchr()) { 10988 case NUL: 10989 case Magic('|'): 10990 case Magic('&'): 10991 case Magic(')'): 10992 cont = false; 10993 break; 10994 10995 case Magic('Z'): 10996 regflags |= RF_ICOMBINE; 10997 skipchr_keepstart(); 10998 break; 10999 case Magic('c'): 11000 regflags |= RF_ICASE; 11001 skipchr_keepstart(); 11002 break; 11003 case Magic('C'): 11004 regflags |= RF_NOICASE; 11005 skipchr_keepstart(); 11006 break; 11007 case Magic('v'): 11008 reg_magic = MAGIC_ALL; 11009 skipchr_keepstart(); 11010 curchr = -1; 11011 break; 11012 case Magic('m'): 11013 reg_magic = MAGIC_ON; 11014 skipchr_keepstart(); 11015 curchr = -1; 11016 break; 11017 case Magic('M'): 11018 reg_magic = MAGIC_OFF; 11019 skipchr_keepstart(); 11020 curchr = -1; 11021 break; 11022 case Magic('V'): 11023 reg_magic = MAGIC_NONE; 11024 skipchr_keepstart(); 11025 curchr = -1; 11026 break; 11027 11028 default: 11029 if (nfa_regpiece() == FAIL) { 11030 return FAIL; 11031 } 11032 if (first == false) { 11033 EMIT(NFA_CONCAT); 11034 } else { 11035 first = false; 11036 } 11037 break; 11038 } 11039 } 11040 11041 return OK; 11042 } 11043 11044 // Parse a branch, one or more concats, separated by "\&". It matches the 11045 // last concat, but only if all the preceding concats also match at the same 11046 // position. Examples: 11047 // "foobeep\&..." matches "foo" in "foobeep". 11048 // ".*Peter\&.*Bob" matches in a line containing both "Peter" and "Bob" 11049 // 11050 // branch ::= concat 11051 // or concat \& concat 11052 // or concat \& concat \& concat 11053 // etc. 
11054 static int nfa_regbranch(void) 11055 { 11056 int old_post_pos; 11057 11058 old_post_pos = (int)(post_ptr - post_start); 11059 11060 // First branch, possibly the only one 11061 if (nfa_regconcat() == FAIL) { 11062 return FAIL; 11063 } 11064 11065 // Try next concats 11066 while (peekchr() == Magic('&')) { 11067 skipchr(); 11068 // if concat is empty do emit a node 11069 if (old_post_pos == (int)(post_ptr - post_start)) { 11070 EMIT(NFA_EMPTY); 11071 } 11072 EMIT(NFA_NOPEN); 11073 EMIT(NFA_PREV_ATOM_NO_WIDTH); 11074 old_post_pos = (int)(post_ptr - post_start); 11075 if (nfa_regconcat() == FAIL) { 11076 return FAIL; 11077 } 11078 // if concat is empty do emit a node 11079 if (old_post_pos == (int)(post_ptr - post_start)) { 11080 EMIT(NFA_EMPTY); 11081 } 11082 EMIT(NFA_CONCAT); 11083 } 11084 11085 // if a branch is empty, emit one node for it 11086 if (old_post_pos == (int)(post_ptr - post_start)) { 11087 EMIT(NFA_EMPTY); 11088 } 11089 11090 return OK; 11091 } 11092 11093 /// Parse a pattern, one or more branches, separated by "\|". It matches 11094 /// anything that matches one of the branches. Example: "foo\|beep" matches 11095 /// "foo" and matches "beep". If more than one branch matches, the first one 11096 /// is used. 11097 /// 11098 /// pattern ::= branch 11099 /// or branch \| branch 11100 /// or branch \| branch \| branch 11101 /// etc. 11102 /// 11103 /// @param paren REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN 11104 static int nfa_reg(int paren) 11105 { 11106 int parno = 0; 11107 11108 if (paren == REG_PAREN) { 11109 if (regnpar >= NSUBEXP) { // Too many `(' 11110 EMSG_RET_FAIL(_("E872: (NFA regexp) Too many '('")); 11111 } 11112 parno = regnpar++; 11113 } else if (paren == REG_ZPAREN) { 11114 // Make a ZOPEN node. 
11115 if (regnzpar >= NSUBEXP) { 11116 EMSG_RET_FAIL(_("E879: (NFA regexp) Too many \\z(")); 11117 } 11118 parno = regnzpar++; 11119 } 11120 11121 if (nfa_regbranch() == FAIL) { 11122 return FAIL; // cascaded error 11123 } 11124 while (peekchr() == Magic('|')) { 11125 skipchr(); 11126 if (nfa_regbranch() == FAIL) { 11127 return FAIL; // cascaded error 11128 } 11129 EMIT(NFA_OR); 11130 } 11131 11132 // Check for proper termination. 11133 if (paren != REG_NOPAREN && getchr() != Magic(')')) { 11134 if (paren == REG_NPAREN) { 11135 EMSG2_RET_FAIL(_(e_unmatchedpp), reg_magic == MAGIC_ALL); 11136 } else { 11137 EMSG2_RET_FAIL(_(e_unmatchedp), reg_magic == MAGIC_ALL); 11138 } 11139 } else if (paren == REG_NOPAREN && peekchr() != NUL) { 11140 if (peekchr() == Magic(')')) { 11141 EMSG2_RET_FAIL(_(e_unmatchedpar), reg_magic == MAGIC_ALL); 11142 } else { 11143 EMSG_RET_FAIL(_("E873: (NFA regexp) proper termination error")); 11144 } 11145 } 11146 // Here we set the flag allowing back references to this set of 11147 // parentheses. 
11148 if (paren == REG_PAREN) { 11149 had_endbrace[parno] = true; // have seen the close paren 11150 EMIT(NFA_MOPEN + parno); 11151 } else if (paren == REG_ZPAREN) { 11152 EMIT(NFA_ZOPEN + parno); 11153 } 11154 11155 return OK; 11156 } 11157 11158 #ifdef REGEXP_DEBUG 11159 static uint8_t code[50]; 11160 11161 static void nfa_set_code(int c) 11162 { 11163 int addnl = false; 11164 11165 if (c >= NFA_FIRST_NL && c <= NFA_LAST_NL) { 11166 addnl = true; 11167 c -= NFA_ADD_NL; 11168 } 11169 11170 STRCPY(code, ""); 11171 switch (c) { 11172 case NFA_MATCH: 11173 STRCPY(code, "NFA_MATCH "); break; 11174 case NFA_SPLIT: 11175 STRCPY(code, "NFA_SPLIT "); break; 11176 case NFA_CONCAT: 11177 STRCPY(code, "NFA_CONCAT "); break; 11178 case NFA_NEWL: 11179 STRCPY(code, "NFA_NEWL "); break; 11180 case NFA_ZSTART: 11181 STRCPY(code, "NFA_ZSTART"); break; 11182 case NFA_ZEND: 11183 STRCPY(code, "NFA_ZEND"); break; 11184 11185 case NFA_BACKREF1: 11186 STRCPY(code, "NFA_BACKREF1"); break; 11187 case NFA_BACKREF2: 11188 STRCPY(code, "NFA_BACKREF2"); break; 11189 case NFA_BACKREF3: 11190 STRCPY(code, "NFA_BACKREF3"); break; 11191 case NFA_BACKREF4: 11192 STRCPY(code, "NFA_BACKREF4"); break; 11193 case NFA_BACKREF5: 11194 STRCPY(code, "NFA_BACKREF5"); break; 11195 case NFA_BACKREF6: 11196 STRCPY(code, "NFA_BACKREF6"); break; 11197 case NFA_BACKREF7: 11198 STRCPY(code, "NFA_BACKREF7"); break; 11199 case NFA_BACKREF8: 11200 STRCPY(code, "NFA_BACKREF8"); break; 11201 case NFA_BACKREF9: 11202 STRCPY(code, "NFA_BACKREF9"); break; 11203 case NFA_ZREF1: 11204 STRCPY(code, "NFA_ZREF1"); break; 11205 case NFA_ZREF2: 11206 STRCPY(code, "NFA_ZREF2"); break; 11207 case NFA_ZREF3: 11208 STRCPY(code, "NFA_ZREF3"); break; 11209 case NFA_ZREF4: 11210 STRCPY(code, "NFA_ZREF4"); break; 11211 case NFA_ZREF5: 11212 STRCPY(code, "NFA_ZREF5"); break; 11213 case NFA_ZREF6: 11214 STRCPY(code, "NFA_ZREF6"); break; 11215 case NFA_ZREF7: 11216 STRCPY(code, "NFA_ZREF7"); break; 11217 case NFA_ZREF8: 11218 
STRCPY(code, "NFA_ZREF8"); break; 11219 case NFA_ZREF9: 11220 STRCPY(code, "NFA_ZREF9"); break; 11221 case NFA_SKIP: 11222 STRCPY(code, "NFA_SKIP"); break; 11223 11224 case NFA_PREV_ATOM_NO_WIDTH: 11225 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH"); break; 11226 case NFA_PREV_ATOM_NO_WIDTH_NEG: 11227 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH_NEG"); break; 11228 case NFA_PREV_ATOM_JUST_BEFORE: 11229 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE"); break; 11230 case NFA_PREV_ATOM_JUST_BEFORE_NEG: 11231 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE_NEG"); break; 11232 case NFA_PREV_ATOM_LIKE_PATTERN: 11233 STRCPY(code, "NFA_PREV_ATOM_LIKE_PATTERN"); break; 11234 11235 case NFA_NOPEN: 11236 STRCPY(code, "NFA_NOPEN"); break; 11237 case NFA_NCLOSE: 11238 STRCPY(code, "NFA_NCLOSE"); break; 11239 case NFA_START_INVISIBLE: 11240 STRCPY(code, "NFA_START_INVISIBLE"); break; 11241 case NFA_START_INVISIBLE_FIRST: 11242 STRCPY(code, "NFA_START_INVISIBLE_FIRST"); break; 11243 case NFA_START_INVISIBLE_NEG: 11244 STRCPY(code, "NFA_START_INVISIBLE_NEG"); break; 11245 case NFA_START_INVISIBLE_NEG_FIRST: 11246 STRCPY(code, "NFA_START_INVISIBLE_NEG_FIRST"); break; 11247 case NFA_START_INVISIBLE_BEFORE: 11248 STRCPY(code, "NFA_START_INVISIBLE_BEFORE"); break; 11249 case NFA_START_INVISIBLE_BEFORE_FIRST: 11250 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_FIRST"); break; 11251 case NFA_START_INVISIBLE_BEFORE_NEG: 11252 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_NEG"); break; 11253 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST: 11254 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_NEG_FIRST"); break; 11255 case NFA_START_PATTERN: 11256 STRCPY(code, "NFA_START_PATTERN"); break; 11257 case NFA_END_INVISIBLE: 11258 STRCPY(code, "NFA_END_INVISIBLE"); break; 11259 case NFA_END_INVISIBLE_NEG: 11260 STRCPY(code, "NFA_END_INVISIBLE_NEG"); break; 11261 case NFA_END_PATTERN: 11262 STRCPY(code, "NFA_END_PATTERN"); break; 11263 11264 case NFA_COMPOSING: 11265 STRCPY(code, "NFA_COMPOSING"); break; 11266 case NFA_END_COMPOSING: 11267 
STRCPY(code, "NFA_END_COMPOSING"); break; 11268 case NFA_OPT_CHARS: 11269 STRCPY(code, "NFA_OPT_CHARS"); break; 11270 11271 case NFA_MOPEN: 11272 case NFA_MOPEN1: 11273 case NFA_MOPEN2: 11274 case NFA_MOPEN3: 11275 case NFA_MOPEN4: 11276 case NFA_MOPEN5: 11277 case NFA_MOPEN6: 11278 case NFA_MOPEN7: 11279 case NFA_MOPEN8: 11280 case NFA_MOPEN9: 11281 STRCPY(code, "NFA_MOPEN(x)"); 11282 code[10] = c - NFA_MOPEN + '0'; 11283 break; 11284 case NFA_MCLOSE: 11285 case NFA_MCLOSE1: 11286 case NFA_MCLOSE2: 11287 case NFA_MCLOSE3: 11288 case NFA_MCLOSE4: 11289 case NFA_MCLOSE5: 11290 case NFA_MCLOSE6: 11291 case NFA_MCLOSE7: 11292 case NFA_MCLOSE8: 11293 case NFA_MCLOSE9: 11294 STRCPY(code, "NFA_MCLOSE(x)"); 11295 code[11] = c - NFA_MCLOSE + '0'; 11296 break; 11297 case NFA_ZOPEN: 11298 case NFA_ZOPEN1: 11299 case NFA_ZOPEN2: 11300 case NFA_ZOPEN3: 11301 case NFA_ZOPEN4: 11302 case NFA_ZOPEN5: 11303 case NFA_ZOPEN6: 11304 case NFA_ZOPEN7: 11305 case NFA_ZOPEN8: 11306 case NFA_ZOPEN9: 11307 STRCPY(code, "NFA_ZOPEN(x)"); 11308 code[10] = c - NFA_ZOPEN + '0'; 11309 break; 11310 case NFA_ZCLOSE: 11311 case NFA_ZCLOSE1: 11312 case NFA_ZCLOSE2: 11313 case NFA_ZCLOSE3: 11314 case NFA_ZCLOSE4: 11315 case NFA_ZCLOSE5: 11316 case NFA_ZCLOSE6: 11317 case NFA_ZCLOSE7: 11318 case NFA_ZCLOSE8: 11319 case NFA_ZCLOSE9: 11320 STRCPY(code, "NFA_ZCLOSE(x)"); 11321 code[11] = c - NFA_ZCLOSE + '0'; 11322 break; 11323 case NFA_EOL: 11324 STRCPY(code, "NFA_EOL "); break; 11325 case NFA_BOL: 11326 STRCPY(code, "NFA_BOL "); break; 11327 case NFA_EOW: 11328 STRCPY(code, "NFA_EOW "); break; 11329 case NFA_BOW: 11330 STRCPY(code, "NFA_BOW "); break; 11331 case NFA_EOF: 11332 STRCPY(code, "NFA_EOF "); break; 11333 case NFA_BOF: 11334 STRCPY(code, "NFA_BOF "); break; 11335 case NFA_LNUM: 11336 STRCPY(code, "NFA_LNUM "); break; 11337 case NFA_LNUM_GT: 11338 STRCPY(code, "NFA_LNUM_GT "); break; 11339 case NFA_LNUM_LT: 11340 STRCPY(code, "NFA_LNUM_LT "); break; 11341 case NFA_COL: 11342 STRCPY(code, 
"NFA_COL "); break; 11343 case NFA_COL_GT: 11344 STRCPY(code, "NFA_COL_GT "); break; 11345 case NFA_COL_LT: 11346 STRCPY(code, "NFA_COL_LT "); break; 11347 case NFA_VCOL: 11348 STRCPY(code, "NFA_VCOL "); break; 11349 case NFA_VCOL_GT: 11350 STRCPY(code, "NFA_VCOL_GT "); break; 11351 case NFA_VCOL_LT: 11352 STRCPY(code, "NFA_VCOL_LT "); break; 11353 case NFA_MARK: 11354 STRCPY(code, "NFA_MARK "); break; 11355 case NFA_MARK_GT: 11356 STRCPY(code, "NFA_MARK_GT "); break; 11357 case NFA_MARK_LT: 11358 STRCPY(code, "NFA_MARK_LT "); break; 11359 case NFA_CURSOR: 11360 STRCPY(code, "NFA_CURSOR "); break; 11361 case NFA_VISUAL: 11362 STRCPY(code, "NFA_VISUAL "); break; 11363 case NFA_ANY_COMPOSING: 11364 STRCPY(code, "NFA_ANY_COMPOSING "); break; 11365 11366 case NFA_STAR: 11367 STRCPY(code, "NFA_STAR "); break; 11368 case NFA_STAR_NONGREEDY: 11369 STRCPY(code, "NFA_STAR_NONGREEDY "); break; 11370 case NFA_QUEST: 11371 STRCPY(code, "NFA_QUEST"); break; 11372 case NFA_QUEST_NONGREEDY: 11373 STRCPY(code, "NFA_QUEST_NON_GREEDY"); break; 11374 case NFA_EMPTY: 11375 STRCPY(code, "NFA_EMPTY"); break; 11376 case NFA_OR: 11377 STRCPY(code, "NFA_OR"); break; 11378 11379 case NFA_START_COLL: 11380 STRCPY(code, "NFA_START_COLL"); break; 11381 case NFA_END_COLL: 11382 STRCPY(code, "NFA_END_COLL"); break; 11383 case NFA_START_NEG_COLL: 11384 STRCPY(code, "NFA_START_NEG_COLL"); break; 11385 case NFA_END_NEG_COLL: 11386 STRCPY(code, "NFA_END_NEG_COLL"); break; 11387 case NFA_RANGE: 11388 STRCPY(code, "NFA_RANGE"); break; 11389 case NFA_RANGE_MIN: 11390 STRCPY(code, "NFA_RANGE_MIN"); break; 11391 case NFA_RANGE_MAX: 11392 STRCPY(code, "NFA_RANGE_MAX"); break; 11393 11394 case NFA_CLASS_ALNUM: 11395 STRCPY(code, "NFA_CLASS_ALNUM"); break; 11396 case NFA_CLASS_ALPHA: 11397 STRCPY(code, "NFA_CLASS_ALPHA"); break; 11398 case NFA_CLASS_BLANK: 11399 STRCPY(code, "NFA_CLASS_BLANK"); break; 11400 case NFA_CLASS_CNTRL: 11401 STRCPY(code, "NFA_CLASS_CNTRL"); break; 11402 case NFA_CLASS_DIGIT: 11403 
STRCPY(code, "NFA_CLASS_DIGIT"); break; 11404 case NFA_CLASS_GRAPH: 11405 STRCPY(code, "NFA_CLASS_GRAPH"); break; 11406 case NFA_CLASS_LOWER: 11407 STRCPY(code, "NFA_CLASS_LOWER"); break; 11408 case NFA_CLASS_PRINT: 11409 STRCPY(code, "NFA_CLASS_PRINT"); break; 11410 case NFA_CLASS_PUNCT: 11411 STRCPY(code, "NFA_CLASS_PUNCT"); break; 11412 case NFA_CLASS_SPACE: 11413 STRCPY(code, "NFA_CLASS_SPACE"); break; 11414 case NFA_CLASS_UPPER: 11415 STRCPY(code, "NFA_CLASS_UPPER"); break; 11416 case NFA_CLASS_XDIGIT: 11417 STRCPY(code, "NFA_CLASS_XDIGIT"); break; 11418 case NFA_CLASS_TAB: 11419 STRCPY(code, "NFA_CLASS_TAB"); break; 11420 case NFA_CLASS_RETURN: 11421 STRCPY(code, "NFA_CLASS_RETURN"); break; 11422 case NFA_CLASS_BACKSPACE: 11423 STRCPY(code, "NFA_CLASS_BACKSPACE"); break; 11424 case NFA_CLASS_ESCAPE: 11425 STRCPY(code, "NFA_CLASS_ESCAPE"); break; 11426 case NFA_CLASS_IDENT: 11427 STRCPY(code, "NFA_CLASS_IDENT"); break; 11428 case NFA_CLASS_KEYWORD: 11429 STRCPY(code, "NFA_CLASS_KEYWORD"); break; 11430 case NFA_CLASS_FNAME: 11431 STRCPY(code, "NFA_CLASS_FNAME"); break; 11432 11433 case NFA_ANY: 11434 STRCPY(code, "NFA_ANY"); break; 11435 case NFA_IDENT: 11436 STRCPY(code, "NFA_IDENT"); break; 11437 case NFA_SIDENT: 11438 STRCPY(code, "NFA_SIDENT"); break; 11439 case NFA_KWORD: 11440 STRCPY(code, "NFA_KWORD"); break; 11441 case NFA_SKWORD: 11442 STRCPY(code, "NFA_SKWORD"); break; 11443 case NFA_FNAME: 11444 STRCPY(code, "NFA_FNAME"); break; 11445 case NFA_SFNAME: 11446 STRCPY(code, "NFA_SFNAME"); break; 11447 case NFA_PRINT: 11448 STRCPY(code, "NFA_PRINT"); break; 11449 case NFA_SPRINT: 11450 STRCPY(code, "NFA_SPRINT"); break; 11451 case NFA_WHITE: 11452 STRCPY(code, "NFA_WHITE"); break; 11453 case NFA_NWHITE: 11454 STRCPY(code, "NFA_NWHITE"); break; 11455 case NFA_DIGIT: 11456 STRCPY(code, "NFA_DIGIT"); break; 11457 case NFA_NDIGIT: 11458 STRCPY(code, "NFA_NDIGIT"); break; 11459 case NFA_HEX: 11460 STRCPY(code, "NFA_HEX"); break; 11461 case NFA_NHEX: 11462 
STRCPY(code, "NFA_NHEX"); break; 11463 case NFA_OCTAL: 11464 STRCPY(code, "NFA_OCTAL"); break; 11465 case NFA_NOCTAL: 11466 STRCPY(code, "NFA_NOCTAL"); break; 11467 case NFA_WORD: 11468 STRCPY(code, "NFA_WORD"); break; 11469 case NFA_NWORD: 11470 STRCPY(code, "NFA_NWORD"); break; 11471 case NFA_HEAD: 11472 STRCPY(code, "NFA_HEAD"); break; 11473 case NFA_NHEAD: 11474 STRCPY(code, "NFA_NHEAD"); break; 11475 case NFA_ALPHA: 11476 STRCPY(code, "NFA_ALPHA"); break; 11477 case NFA_NALPHA: 11478 STRCPY(code, "NFA_NALPHA"); break; 11479 case NFA_LOWER: 11480 STRCPY(code, "NFA_LOWER"); break; 11481 case NFA_NLOWER: 11482 STRCPY(code, "NFA_NLOWER"); break; 11483 case NFA_UPPER: 11484 STRCPY(code, "NFA_UPPER"); break; 11485 case NFA_NUPPER: 11486 STRCPY(code, "NFA_NUPPER"); break; 11487 case NFA_LOWER_IC: 11488 STRCPY(code, "NFA_LOWER_IC"); break; 11489 case NFA_NLOWER_IC: 11490 STRCPY(code, "NFA_NLOWER_IC"); break; 11491 case NFA_UPPER_IC: 11492 STRCPY(code, "NFA_UPPER_IC"); break; 11493 case NFA_NUPPER_IC: 11494 STRCPY(code, "NFA_NUPPER_IC"); break; 11495 11496 default: 11497 STRCPY(code, "CHAR(x)"); 11498 code[5] = c; 11499 } 11500 11501 if (addnl == true) { 11502 strcat(code, " + NEWLINE "); 11503 } 11504 } 11505 11506 static FILE *log_fd; 11507 static const uint8_t e_log_open_failed[] = 11508 N_("Could not open temporary log file for writing, displaying on stderr... "); 11509 11510 // Print the postfix notation of the current regexp. 11511 static void nfa_postfix_dump(uint8_t *expr, int retval) 11512 { 11513 int *p; 11514 FILE *f; 11515 11516 f = fopen(NFA_REGEXP_DUMP_LOG, "a"); 11517 if (f == NULL) { 11518 return; 11519 } 11520 11521 fprintf(f, "\n-------------------------\n"); 11522 if (retval == FAIL) { 11523 fprintf(f, ">>> NFA engine failed... 
\n"); 11524 } else if (retval == OK) { 11525 fprintf(f, ">>> NFA engine succeeded !\n"); 11526 } 11527 fprintf(f, "Regexp: \"%s\"\nPostfix notation (char): \"", expr); 11528 for (p = post_start; *p && p < post_ptr; p++) { 11529 nfa_set_code(*p); 11530 fprintf(f, "%s, ", code); 11531 } 11532 fprintf(f, "\"\nPostfix notation (int): "); 11533 for (p = post_start; *p && p < post_ptr; p++) { 11534 fprintf(f, "%d ", *p); 11535 } 11536 fprintf(f, "\n\n"); 11537 fclose(f); 11538 } 11539 11540 // Print the NFA starting with a root node "state". 11541 static void nfa_print_state(FILE *debugf, nfa_state_T *state) 11542 { 11543 garray_T indent; 11544 11545 ga_init(&indent, 1, 64); 11546 ga_append(&indent, NUL); 11547 nfa_print_state2(debugf, state, &indent); 11548 ga_clear(&indent); 11549 } 11550 11551 static void nfa_print_state2(FILE *debugf, nfa_state_T *state, garray_T *indent) 11552 { 11553 uint8_t *p; 11554 11555 if (state == NULL) { 11556 return; 11557 } 11558 11559 fprintf(debugf, "(%2d)", abs(state->id)); 11560 11561 // Output indent 11562 p = (uint8_t *)indent->ga_data; 11563 if (indent->ga_len >= 3) { 11564 int last = indent->ga_len - 3; 11565 uint8_t save[2]; 11566 11567 strncpy(save, &p[last], 2); // NOLINT(runtime/printf) 11568 memcpy(&p[last], "+-", 2); 11569 fprintf(debugf, " %s", p); 11570 strncpy(&p[last], save, 2); // NOLINT(runtime/printf) 11571 } else { 11572 fprintf(debugf, " %s", p); 11573 } 11574 11575 nfa_set_code(state->c); 11576 fprintf(debugf, "%s (%d) (id=%d) val=%d\n", 11577 code, 11578 state->c, 11579 abs(state->id), 11580 state->val); 11581 if (state->id < 0) { 11582 return; 11583 } 11584 11585 state->id = abs(state->id) * -1; 11586 11587 // grow indent for state->out 11588 indent->ga_len -= 1; 11589 if (state->out1) { 11590 GA_CONCAT_LITERAL(indent, "| "); 11591 } else { 11592 GA_CONCAT_LITERAL(indent, " "); 11593 } 11594 ga_append(indent, NUL); 11595 11596 nfa_print_state2(debugf, state->out, indent); 11597 11598 // replace last part of indent 
for state->out1 11599 indent->ga_len -= 3; 11600 GA_CONCAT_LITERAL(indent, " "); 11601 ga_append(indent, NUL); 11602 11603 nfa_print_state2(debugf, state->out1, indent); 11604 11605 // shrink indent 11606 indent->ga_len -= 3; 11607 ga_append(indent, NUL); 11608 } 11609 11610 // Print the NFA state machine. 11611 static void nfa_dump(nfa_regprog_T *prog) 11612 { 11613 FILE *debugf = fopen(NFA_REGEXP_DUMP_LOG, "a"); 11614 11615 if (debugf == NULL) { 11616 return; 11617 } 11618 11619 nfa_print_state(debugf, prog->start); 11620 11621 if (prog->reganch) { 11622 fprintf(debugf, "reganch: %d\n", prog->reganch); 11623 } 11624 if (prog->regstart != NUL) { 11625 fprintf(debugf, "regstart: %c (decimal: %d)\n", 11626 prog->regstart, prog->regstart); 11627 } 11628 if (prog->match_text != NULL) { 11629 fprintf(debugf, "match_text: \"%s\"\n", prog->match_text); 11630 } 11631 11632 fclose(debugf); 11633 } 11634 #endif // REGEXP_DEBUG 11635 11636 // Parse r.e. @expr and convert it into postfix form. 11637 // Return the postfix string on success, NULL otherwise. 11638 static int *re2post(void) 11639 { 11640 if (nfa_reg(REG_NOPAREN) == FAIL) { 11641 return NULL; 11642 } 11643 EMIT(NFA_MOPEN); 11644 return post_start; 11645 } 11646 11647 // NB. Some of the code below is inspired by Russ's. 11648 11649 // Represents an NFA state plus zero or one or two arrows exiting. 11650 // if c == MATCH, no arrows out; matching state. 11651 // If c == SPLIT, unlabeled arrows to out and out1 (if != NULL). 11652 // If c < 256, labeled arrow with character c to out. 11653 11654 static nfa_state_T *state_ptr; // points to nfa_prog->state 11655 11656 // Allocate and initialize nfa_state_T. 
11657 static nfa_state_T *alloc_state(int c, nfa_state_T *out, nfa_state_T *out1) 11658 { 11659 nfa_state_T *s; 11660 11661 if (istate >= nstate) { 11662 return NULL; 11663 } 11664 11665 s = &state_ptr[istate++]; 11666 11667 s->c = c; 11668 s->out = out; 11669 s->out1 = out1; 11670 s->val = 0; 11671 11672 s->id = istate; 11673 s->lastlist[0] = 0; 11674 s->lastlist[1] = 0; 11675 11676 return s; 11677 } 11678 11679 // A partially built NFA without the matching state filled in. 11680 // Frag_T.start points at the start state. 11681 // Frag_T.out is a list of places that need to be set to the 11682 // next state for this fragment. 11683 11684 // Initialize a Frag_T struct and return it. 11685 static Frag_T frag(nfa_state_T *start, Ptrlist *out) 11686 { 11687 Frag_T n; 11688 11689 n.start = start; 11690 n.out = out; 11691 return n; 11692 } 11693 11694 // Create singleton list containing just outp. 11695 static Ptrlist *list1(nfa_state_T **outp) 11696 { 11697 Ptrlist *l; 11698 11699 l = (Ptrlist *)outp; 11700 l->next = NULL; 11701 return l; 11702 } 11703 11704 // Patch the list of states at out to point to start. 11705 static void patch(Ptrlist *l, nfa_state_T *s) 11706 { 11707 Ptrlist *next; 11708 11709 for (; l; l = next) { 11710 next = l->next; 11711 l->s = s; 11712 } 11713 } 11714 11715 // Join the two lists l1 and l2, returning the combination. 11716 static Ptrlist *append(Ptrlist *l1, Ptrlist *l2) 11717 { 11718 Ptrlist *oldl1; 11719 11720 oldl1 = l1; 11721 while (l1->next) { 11722 l1 = l1->next; 11723 } 11724 l1->next = l2; 11725 return oldl1; 11726 } 11727 11728 // Stack used for transforming postfix form into NFA. 
11729 static Frag_T empty; 11730 11731 static void st_error(int *postfix, int *end, int *p) 11732 { 11733 #ifdef NFA_REGEXP_ERROR_LOG 11734 FILE *df; 11735 int *p2; 11736 11737 df = fopen(NFA_REGEXP_ERROR_LOG, "a"); 11738 if (df) { 11739 fprintf(df, "Error popping the stack!\n"); 11740 # ifdef REGEXP_DEBUG 11741 fprintf(df, "Current regexp is \"%s\"\n", nfa_regengine.expr); 11742 # endif 11743 fprintf(df, "Postfix form is: "); 11744 # ifdef REGEXP_DEBUG 11745 for (p2 = postfix; p2 < end; p2++) { 11746 nfa_set_code(*p2); 11747 fprintf(df, "%s, ", code); 11748 } 11749 nfa_set_code(*p); 11750 fprintf(df, "\nCurrent position is: "); 11751 for (p2 = postfix; p2 <= p; p2++) { 11752 nfa_set_code(*p2); 11753 fprintf(df, "%s, ", code); 11754 } 11755 # else 11756 for (p2 = postfix; p2 < end; p2++) { 11757 fprintf(df, "%d, ", *p2); 11758 } 11759 fprintf(df, "\nCurrent position is: "); 11760 for (p2 = postfix; p2 <= p; p2++) { 11761 fprintf(df, "%d, ", *p2); 11762 } 11763 # endif 11764 fprintf(df, "\n--------------------------\n"); 11765 fclose(df); 11766 } 11767 #endif 11768 emsg(_("E874: (NFA) Could not pop the stack!")); 11769 } 11770 11771 // Push an item onto the stack. 11772 static void st_push(Frag_T s, Frag_T **p, Frag_T *stack_end) 11773 { 11774 Frag_T *stackp = *p; 11775 11776 if (stackp >= stack_end) { 11777 return; 11778 } 11779 *stackp = s; 11780 *p = *p + 1; 11781 } 11782 11783 // Pop an item from the stack. 11784 static Frag_T st_pop(Frag_T **p, Frag_T *stack) 11785 { 11786 Frag_T *stackp; 11787 11788 *p = *p - 1; 11789 stackp = *p; 11790 if (stackp < stack) { 11791 return empty; 11792 } 11793 return **p; 11794 } 11795 11796 // Estimate the maximum byte length of anything matching "state". 11797 // When unknown or unlimited return -1. 
11798 static int nfa_max_width(nfa_state_T *startstate, int depth) 11799 { 11800 int l, r; 11801 nfa_state_T *state = startstate; 11802 int len = 0; 11803 11804 // detect looping in a NFA_SPLIT 11805 if (depth > 4) { 11806 return -1; 11807 } 11808 11809 while (state != NULL) { 11810 switch (state->c) { 11811 case NFA_END_INVISIBLE: 11812 case NFA_END_INVISIBLE_NEG: 11813 // the end, return what we have 11814 return len; 11815 11816 case NFA_SPLIT: 11817 // two alternatives, use the maximum 11818 l = nfa_max_width(state->out, depth + 1); 11819 r = nfa_max_width(state->out1, depth + 1); 11820 if (l < 0 || r < 0) { 11821 return -1; 11822 } 11823 return len + (l > r ? l : r); 11824 11825 case NFA_ANY: 11826 case NFA_START_COLL: 11827 case NFA_START_NEG_COLL: 11828 // Matches some character, including composing chars. 11829 len += MB_MAXBYTES; 11830 if (state->c != NFA_ANY) { 11831 // Skip over the characters. 11832 state = state->out1->out; 11833 continue; 11834 } 11835 break; 11836 11837 case NFA_DIGIT: 11838 case NFA_WHITE: 11839 case NFA_HEX: 11840 case NFA_OCTAL: 11841 // ascii 11842 len++; 11843 break; 11844 11845 case NFA_IDENT: 11846 case NFA_SIDENT: 11847 case NFA_KWORD: 11848 case NFA_SKWORD: 11849 case NFA_FNAME: 11850 case NFA_SFNAME: 11851 case NFA_PRINT: 11852 case NFA_SPRINT: 11853 case NFA_NWHITE: 11854 case NFA_NDIGIT: 11855 case NFA_NHEX: 11856 case NFA_NOCTAL: 11857 case NFA_WORD: 11858 case NFA_NWORD: 11859 case NFA_HEAD: 11860 case NFA_NHEAD: 11861 case NFA_ALPHA: 11862 case NFA_NALPHA: 11863 case NFA_LOWER: 11864 case NFA_NLOWER: 11865 case NFA_UPPER: 11866 case NFA_NUPPER: 11867 case NFA_LOWER_IC: 11868 case NFA_NLOWER_IC: 11869 case NFA_UPPER_IC: 11870 case NFA_NUPPER_IC: 11871 case NFA_ANY_COMPOSING: 11872 // possibly non-ascii 11873 len += 3; 11874 break; 11875 11876 case NFA_START_INVISIBLE: 11877 case NFA_START_INVISIBLE_NEG: 11878 case NFA_START_INVISIBLE_BEFORE: 11879 case NFA_START_INVISIBLE_BEFORE_NEG: 11880 // zero-width, out1 points to 
the END state 11881 state = state->out1->out; 11882 continue; 11883 11884 case NFA_BACKREF1: 11885 case NFA_BACKREF2: 11886 case NFA_BACKREF3: 11887 case NFA_BACKREF4: 11888 case NFA_BACKREF5: 11889 case NFA_BACKREF6: 11890 case NFA_BACKREF7: 11891 case NFA_BACKREF8: 11892 case NFA_BACKREF9: 11893 case NFA_ZREF1: 11894 case NFA_ZREF2: 11895 case NFA_ZREF3: 11896 case NFA_ZREF4: 11897 case NFA_ZREF5: 11898 case NFA_ZREF6: 11899 case NFA_ZREF7: 11900 case NFA_ZREF8: 11901 case NFA_ZREF9: 11902 case NFA_NEWL: 11903 case NFA_SKIP: 11904 // unknown width 11905 return -1; 11906 11907 case NFA_BOL: 11908 case NFA_EOL: 11909 case NFA_BOF: 11910 case NFA_EOF: 11911 case NFA_BOW: 11912 case NFA_EOW: 11913 case NFA_MOPEN: 11914 case NFA_MOPEN1: 11915 case NFA_MOPEN2: 11916 case NFA_MOPEN3: 11917 case NFA_MOPEN4: 11918 case NFA_MOPEN5: 11919 case NFA_MOPEN6: 11920 case NFA_MOPEN7: 11921 case NFA_MOPEN8: 11922 case NFA_MOPEN9: 11923 case NFA_ZOPEN: 11924 case NFA_ZOPEN1: 11925 case NFA_ZOPEN2: 11926 case NFA_ZOPEN3: 11927 case NFA_ZOPEN4: 11928 case NFA_ZOPEN5: 11929 case NFA_ZOPEN6: 11930 case NFA_ZOPEN7: 11931 case NFA_ZOPEN8: 11932 case NFA_ZOPEN9: 11933 case NFA_ZCLOSE: 11934 case NFA_ZCLOSE1: 11935 case NFA_ZCLOSE2: 11936 case NFA_ZCLOSE3: 11937 case NFA_ZCLOSE4: 11938 case NFA_ZCLOSE5: 11939 case NFA_ZCLOSE6: 11940 case NFA_ZCLOSE7: 11941 case NFA_ZCLOSE8: 11942 case NFA_ZCLOSE9: 11943 case NFA_MCLOSE: 11944 case NFA_MCLOSE1: 11945 case NFA_MCLOSE2: 11946 case NFA_MCLOSE3: 11947 case NFA_MCLOSE4: 11948 case NFA_MCLOSE5: 11949 case NFA_MCLOSE6: 11950 case NFA_MCLOSE7: 11951 case NFA_MCLOSE8: 11952 case NFA_MCLOSE9: 11953 case NFA_NOPEN: 11954 case NFA_NCLOSE: 11955 11956 case NFA_LNUM_GT: 11957 case NFA_LNUM_LT: 11958 case NFA_COL_GT: 11959 case NFA_COL_LT: 11960 case NFA_VCOL_GT: 11961 case NFA_VCOL_LT: 11962 case NFA_MARK_GT: 11963 case NFA_MARK_LT: 11964 case NFA_VISUAL: 11965 case NFA_LNUM: 11966 case NFA_CURSOR: 11967 case NFA_COL: 11968 case NFA_VCOL: 11969 case 
NFA_MARK: 11970 11971 case NFA_ZSTART: 11972 case NFA_ZEND: 11973 case NFA_OPT_CHARS: 11974 case NFA_EMPTY: 11975 case NFA_START_PATTERN: 11976 case NFA_END_PATTERN: 11977 case NFA_COMPOSING: 11978 case NFA_END_COMPOSING: 11979 // zero-width 11980 break; 11981 11982 default: 11983 if (state->c < 0) { 11984 // don't know what this is 11985 return -1; 11986 } 11987 // normal character 11988 len += utf_char2len(state->c); 11989 break; 11990 } 11991 11992 // normal way to continue 11993 state = state->out; 11994 } 11995 11996 // unrecognized, "cannot happen" 11997 return -1; 11998 } 11999 12000 // Convert a postfix form into its equivalent NFA. 12001 // Return the NFA start state on success, NULL otherwise. 12002 static nfa_state_T *post2nfa(int *postfix, int *end, int nfa_calc_size) 12003 { 12004 int *p; 12005 int mopen; 12006 int mclose; 12007 Frag_T *stack = NULL; 12008 Frag_T *stackp = NULL; 12009 Frag_T *stack_end = NULL; 12010 Frag_T e1; 12011 Frag_T e2; 12012 Frag_T e; 12013 nfa_state_T *s; 12014 nfa_state_T *s1; 12015 nfa_state_T *matchstate; 12016 nfa_state_T *ret = NULL; 12017 12018 if (postfix == NULL) { 12019 return NULL; 12020 } 12021 12022 #define PUSH(s) st_push((s), &stackp, stack_end) 12023 #define POP() st_pop(&stackp, stack); \ 12024 if (stackp < stack) { \ 12025 st_error(postfix, end, p); \ 12026 xfree(stack); \ 12027 return NULL; \ 12028 } 12029 12030 if (nfa_calc_size == false) { 12031 // Allocate space for the stack. Max states on the stack: "nstate". 12032 stack = xmalloc((size_t)(nstate + 1) * sizeof(Frag_T)); 12033 stackp = stack; 12034 stack_end = stack + (nstate + 1); 12035 } 12036 12037 for (p = postfix; p < end; p++) { 12038 switch (*p) { 12039 case NFA_CONCAT: 12040 // Concatenation. 12041 // Pay attention: this operator does not exist in the r.e. itself 12042 // (it is implicit, really). It is added when r.e. is translated 12043 // to postfix form in re2post(). 
12044 if (nfa_calc_size == true) { 12045 // nstate += 0; 12046 break; 12047 } 12048 e2 = POP(); 12049 e1 = POP(); 12050 patch(e1.out, e2.start); 12051 PUSH(frag(e1.start, e2.out)); 12052 break; 12053 12054 case NFA_OR: 12055 // Alternation 12056 if (nfa_calc_size == true) { 12057 nstate++; 12058 break; 12059 } 12060 e2 = POP(); 12061 e1 = POP(); 12062 s = alloc_state(NFA_SPLIT, e1.start, e2.start); 12063 if (s == NULL) { 12064 goto theend; 12065 } 12066 PUSH(frag(s, append(e1.out, e2.out))); 12067 break; 12068 12069 case NFA_STAR: 12070 // Zero or more, prefer more 12071 if (nfa_calc_size == true) { 12072 nstate++; 12073 break; 12074 } 12075 e = POP(); 12076 s = alloc_state(NFA_SPLIT, e.start, NULL); 12077 if (s == NULL) { 12078 goto theend; 12079 } 12080 patch(e.out, s); 12081 PUSH(frag(s, list1(&s->out1))); 12082 break; 12083 12084 case NFA_STAR_NONGREEDY: 12085 // Zero or more, prefer zero 12086 if (nfa_calc_size == true) { 12087 nstate++; 12088 break; 12089 } 12090 e = POP(); 12091 s = alloc_state(NFA_SPLIT, NULL, e.start); 12092 if (s == NULL) { 12093 goto theend; 12094 } 12095 patch(e.out, s); 12096 PUSH(frag(s, list1(&s->out))); 12097 break; 12098 12099 case NFA_QUEST: 12100 // one or zero atoms=> greedy match 12101 if (nfa_calc_size == true) { 12102 nstate++; 12103 break; 12104 } 12105 e = POP(); 12106 s = alloc_state(NFA_SPLIT, e.start, NULL); 12107 if (s == NULL) { 12108 goto theend; 12109 } 12110 PUSH(frag(s, append(e.out, list1(&s->out1)))); 12111 break; 12112 12113 case NFA_QUEST_NONGREEDY: 12114 // zero or one atoms => non-greedy match 12115 if (nfa_calc_size == true) { 12116 nstate++; 12117 break; 12118 } 12119 e = POP(); 12120 s = alloc_state(NFA_SPLIT, NULL, e.start); 12121 if (s == NULL) { 12122 goto theend; 12123 } 12124 PUSH(frag(s, append(e.out, list1(&s->out)))); 12125 break; 12126 12127 case NFA_END_COLL: 12128 case NFA_END_NEG_COLL: 12129 // On the stack is the sequence starting with NFA_START_COLL or 12130 // NFA_START_NEG_COLL and all 
possible characters. Patch it to 12131 // add the output to the start. 12132 if (nfa_calc_size == true) { 12133 nstate++; 12134 break; 12135 } 12136 e = POP(); 12137 s = alloc_state(NFA_END_COLL, NULL, NULL); 12138 if (s == NULL) { 12139 goto theend; 12140 } 12141 patch(e.out, s); 12142 e.start->out1 = s; 12143 PUSH(frag(e.start, list1(&s->out))); 12144 break; 12145 12146 case NFA_RANGE: 12147 // Before this are two characters, the low and high end of a 12148 // range. Turn them into two states with MIN and MAX. 12149 if (nfa_calc_size == true) { 12150 // nstate += 0; 12151 break; 12152 } 12153 e2 = POP(); 12154 e1 = POP(); 12155 e2.start->val = e2.start->c; 12156 e2.start->c = NFA_RANGE_MAX; 12157 e1.start->val = e1.start->c; 12158 e1.start->c = NFA_RANGE_MIN; 12159 patch(e1.out, e2.start); 12160 PUSH(frag(e1.start, e2.out)); 12161 break; 12162 12163 case NFA_EMPTY: 12164 // 0-length, used in a repetition with max/min count of 0 12165 if (nfa_calc_size == true) { 12166 nstate++; 12167 break; 12168 } 12169 s = alloc_state(NFA_EMPTY, NULL, NULL); 12170 if (s == NULL) { 12171 goto theend; 12172 } 12173 PUSH(frag(s, list1(&s->out))); 12174 break; 12175 12176 case NFA_OPT_CHARS: { 12177 int n; 12178 12179 // \%[abc] implemented as: 12180 // NFA_SPLIT 12181 // +-CHAR(a) 12182 // | +-NFA_SPLIT 12183 // | +-CHAR(b) 12184 // | | +-NFA_SPLIT 12185 // | | +-CHAR(c) 12186 // | | | +-next 12187 // | | +- next 12188 // | +- next 12189 // +- next 12190 n = *++p; // get number of characters 12191 if (nfa_calc_size == true) { 12192 nstate += n; 12193 break; 12194 } 12195 s = NULL; // avoid compiler warning 12196 e1.out = NULL; // stores list with out1's 12197 s1 = NULL; // previous NFA_SPLIT to connect to 12198 while (n-- > 0) { 12199 e = POP(); // get character 12200 s = alloc_state(NFA_SPLIT, e.start, NULL); 12201 if (s == NULL) { 12202 goto theend; 12203 } 12204 if (e1.out == NULL) { 12205 e1 = e; 12206 } 12207 patch(e.out, s1); 12208 append(e1.out, list1(&s->out1)); 12209 s1 = 
s; 12210 } 12211 PUSH(frag(s, e1.out)); 12212 break; 12213 } 12214 12215 case NFA_PREV_ATOM_NO_WIDTH: 12216 case NFA_PREV_ATOM_NO_WIDTH_NEG: 12217 case NFA_PREV_ATOM_JUST_BEFORE: 12218 case NFA_PREV_ATOM_JUST_BEFORE_NEG: 12219 case NFA_PREV_ATOM_LIKE_PATTERN: { 12220 int before = (*p == NFA_PREV_ATOM_JUST_BEFORE 12221 || *p == NFA_PREV_ATOM_JUST_BEFORE_NEG); 12222 int pattern = (*p == NFA_PREV_ATOM_LIKE_PATTERN); 12223 int start_state; 12224 int end_state; 12225 int n = 0; 12226 nfa_state_T *zend; 12227 nfa_state_T *skip; 12228 12229 switch (*p) { 12230 case NFA_PREV_ATOM_NO_WIDTH: 12231 start_state = NFA_START_INVISIBLE; 12232 end_state = NFA_END_INVISIBLE; 12233 break; 12234 case NFA_PREV_ATOM_NO_WIDTH_NEG: 12235 start_state = NFA_START_INVISIBLE_NEG; 12236 end_state = NFA_END_INVISIBLE_NEG; 12237 break; 12238 case NFA_PREV_ATOM_JUST_BEFORE: 12239 start_state = NFA_START_INVISIBLE_BEFORE; 12240 end_state = NFA_END_INVISIBLE; 12241 break; 12242 case NFA_PREV_ATOM_JUST_BEFORE_NEG: 12243 start_state = NFA_START_INVISIBLE_BEFORE_NEG; 12244 end_state = NFA_END_INVISIBLE_NEG; 12245 break; 12246 default: // NFA_PREV_ATOM_LIKE_PATTERN: 12247 start_state = NFA_START_PATTERN; 12248 end_state = NFA_END_PATTERN; 12249 break; 12250 } 12251 12252 if (before) { 12253 n = *++p; // get the count 12254 } 12255 // The \@= operator: match the preceding atom with zero width. 12256 // The \@! operator: no match for the preceding atom. 12257 // The \@<= operator: match for the preceding atom. 12258 // The \@<! operator: no match for the preceding atom. 12259 // Surrounds the preceding atom with START_INVISIBLE and 12260 // END_INVISIBLE, similarly to MOPEN. 12261 12262 if (nfa_calc_size == true) { 12263 nstate += pattern ? 
4 : 2; 12264 break; 12265 } 12266 e = POP(); 12267 s1 = alloc_state(end_state, NULL, NULL); 12268 if (s1 == NULL) { 12269 goto theend; 12270 } 12271 12272 s = alloc_state(start_state, e.start, s1); 12273 if (s == NULL) { 12274 goto theend; 12275 } 12276 if (pattern) { 12277 // NFA_ZEND -> NFA_END_PATTERN -> NFA_SKIP -> what follows. 12278 skip = alloc_state(NFA_SKIP, NULL, NULL); 12279 if (skip == NULL) { 12280 goto theend; 12281 } 12282 zend = alloc_state(NFA_ZEND, s1, NULL); 12283 if (zend == NULL) { 12284 goto theend; 12285 } 12286 s1->out = skip; 12287 patch(e.out, zend); 12288 PUSH(frag(s, list1(&skip->out))); 12289 } else { 12290 patch(e.out, s1); 12291 PUSH(frag(s, list1(&s1->out))); 12292 if (before) { 12293 if (n <= 0) { 12294 // See if we can guess the maximum width, it avoids a 12295 // lot of pointless tries. 12296 n = nfa_max_width(e.start, 0); 12297 } 12298 s->val = n; // store the count 12299 } 12300 } 12301 break; 12302 } 12303 12304 case NFA_COMPOSING: // char with composing char 12305 FALLTHROUGH; 12306 12307 case NFA_MOPEN: // \( \) Submatch 12308 case NFA_MOPEN1: 12309 case NFA_MOPEN2: 12310 case NFA_MOPEN3: 12311 case NFA_MOPEN4: 12312 case NFA_MOPEN5: 12313 case NFA_MOPEN6: 12314 case NFA_MOPEN7: 12315 case NFA_MOPEN8: 12316 case NFA_MOPEN9: 12317 case NFA_ZOPEN: // \z( \) Submatch 12318 case NFA_ZOPEN1: 12319 case NFA_ZOPEN2: 12320 case NFA_ZOPEN3: 12321 case NFA_ZOPEN4: 12322 case NFA_ZOPEN5: 12323 case NFA_ZOPEN6: 12324 case NFA_ZOPEN7: 12325 case NFA_ZOPEN8: 12326 case NFA_ZOPEN9: 12327 case NFA_NOPEN: // \%( \) "Invisible Submatch" 12328 if (nfa_calc_size == true) { 12329 nstate += 2; 12330 break; 12331 } 12332 12333 mopen = *p; 12334 switch (*p) { 12335 case NFA_NOPEN: 12336 mclose = NFA_NCLOSE; break; 12337 case NFA_ZOPEN: 12338 mclose = NFA_ZCLOSE; break; 12339 case NFA_ZOPEN1: 12340 mclose = NFA_ZCLOSE1; break; 12341 case NFA_ZOPEN2: 12342 mclose = NFA_ZCLOSE2; break; 12343 case NFA_ZOPEN3: 12344 mclose = NFA_ZCLOSE3; break; 12345 
case NFA_ZOPEN4: 12346 mclose = NFA_ZCLOSE4; break; 12347 case NFA_ZOPEN5: 12348 mclose = NFA_ZCLOSE5; break; 12349 case NFA_ZOPEN6: 12350 mclose = NFA_ZCLOSE6; break; 12351 case NFA_ZOPEN7: 12352 mclose = NFA_ZCLOSE7; break; 12353 case NFA_ZOPEN8: 12354 mclose = NFA_ZCLOSE8; break; 12355 case NFA_ZOPEN9: 12356 mclose = NFA_ZCLOSE9; break; 12357 case NFA_COMPOSING: 12358 mclose = NFA_END_COMPOSING; break; 12359 default: 12360 // NFA_MOPEN, NFA_MOPEN1 .. NFA_MOPEN9 12361 mclose = *p + NSUBEXP; 12362 break; 12363 } 12364 12365 // Allow "NFA_MOPEN" as a valid postfix representation for 12366 // the empty regexp "". In this case, the NFA will be 12367 // NFA_MOPEN -> NFA_MCLOSE. Note that this also allows 12368 // empty groups of parenthesis, and empty mbyte chars 12369 if (stackp == stack) { 12370 s = alloc_state(mopen, NULL, NULL); 12371 if (s == NULL) { 12372 goto theend; 12373 } 12374 s1 = alloc_state(mclose, NULL, NULL); 12375 if (s1 == NULL) { 12376 goto theend; 12377 } 12378 patch(list1(&s->out), s1); 12379 PUSH(frag(s, list1(&s1->out))); 12380 break; 12381 } 12382 12383 // At least one node was emitted before NFA_MOPEN, so 12384 // at least one node will be between NFA_MOPEN and NFA_MCLOSE 12385 e = POP(); 12386 s = alloc_state(mopen, e.start, NULL); // `(' 12387 if (s == NULL) { 12388 goto theend; 12389 } 12390 12391 s1 = alloc_state(mclose, NULL, NULL); // `)' 12392 if (s1 == NULL) { 12393 goto theend; 12394 } 12395 patch(e.out, s1); 12396 12397 if (mopen == NFA_COMPOSING) { 12398 // COMPOSING->out1 = END_COMPOSING 12399 patch(list1(&s->out1), s1); 12400 } 12401 12402 PUSH(frag(s, list1(&s1->out))); 12403 break; 12404 12405 case NFA_BACKREF1: 12406 case NFA_BACKREF2: 12407 case NFA_BACKREF3: 12408 case NFA_BACKREF4: 12409 case NFA_BACKREF5: 12410 case NFA_BACKREF6: 12411 case NFA_BACKREF7: 12412 case NFA_BACKREF8: 12413 case NFA_BACKREF9: 12414 case NFA_ZREF1: 12415 case NFA_ZREF2: 12416 case NFA_ZREF3: 12417 case NFA_ZREF4: 12418 case NFA_ZREF5: 12419 case 
NFA_ZREF6: 12420 case NFA_ZREF7: 12421 case NFA_ZREF8: 12422 case NFA_ZREF9: 12423 if (nfa_calc_size == true) { 12424 nstate += 2; 12425 break; 12426 } 12427 s = alloc_state(*p, NULL, NULL); 12428 if (s == NULL) { 12429 goto theend; 12430 } 12431 s1 = alloc_state(NFA_SKIP, NULL, NULL); 12432 if (s1 == NULL) { 12433 goto theend; 12434 } 12435 patch(list1(&s->out), s1); 12436 PUSH(frag(s, list1(&s1->out))); 12437 break; 12438 12439 case NFA_LNUM: 12440 case NFA_LNUM_GT: 12441 case NFA_LNUM_LT: 12442 case NFA_VCOL: 12443 case NFA_VCOL_GT: 12444 case NFA_VCOL_LT: 12445 case NFA_COL: 12446 case NFA_COL_GT: 12447 case NFA_COL_LT: 12448 case NFA_MARK: 12449 case NFA_MARK_GT: 12450 case NFA_MARK_LT: { 12451 int n = *++p; // lnum, col or mark name 12452 12453 if (nfa_calc_size == true) { 12454 nstate += 1; 12455 break; 12456 } 12457 s = alloc_state(p[-1], NULL, NULL); 12458 if (s == NULL) { 12459 goto theend; 12460 } 12461 s->val = n; 12462 PUSH(frag(s, list1(&s->out))); 12463 break; 12464 } 12465 12466 case NFA_ZSTART: 12467 case NFA_ZEND: 12468 default: 12469 // Operands 12470 if (nfa_calc_size == true) { 12471 nstate++; 12472 break; 12473 } 12474 s = alloc_state(*p, NULL, NULL); 12475 if (s == NULL) { 12476 goto theend; 12477 } 12478 PUSH(frag(s, list1(&s->out))); 12479 break; 12480 } // switch(*p) 12481 } // for(p = postfix; *p; ++p) 12482 12483 if (nfa_calc_size == true) { 12484 nstate++; 12485 goto theend; // Return value when counting size is ignored anyway 12486 } 12487 12488 e = POP(); 12489 if (stackp != stack) { 12490 xfree(stack); 12491 EMSG_RET_NULL(_("E875: (NFA regexp) (While converting from postfix to NFA)," 12492 "too many states left on stack")); 12493 } 12494 12495 if (istate >= nstate) { 12496 xfree(stack); 12497 EMSG_RET_NULL(_("E876: (NFA regexp) " 12498 "Not enough space to store the whole NFA ")); 12499 } 12500 12501 matchstate = &state_ptr[istate++]; // the match state 12502 matchstate->c = NFA_MATCH; 12503 matchstate->out = matchstate->out1 = NULL; 
12504 matchstate->id = 0; 12505 12506 patch(e.out, matchstate); 12507 ret = e.start; 12508 12509 theend: 12510 xfree(stack); 12511 return ret; 12512 12513 #undef POP1 12514 #undef PUSH1 12515 #undef POP2 12516 #undef PUSH2 12517 #undef POP 12518 #undef PUSH 12519 } 12520 12521 // After building the NFA program, inspect it to add optimization hints. 12522 static void nfa_postprocess(nfa_regprog_T *prog) 12523 { 12524 int i; 12525 int c; 12526 12527 for (i = 0; i < prog->nstate; i++) { 12528 c = prog->state[i].c; 12529 if (c == NFA_START_INVISIBLE 12530 || c == NFA_START_INVISIBLE_NEG 12531 || c == NFA_START_INVISIBLE_BEFORE 12532 || c == NFA_START_INVISIBLE_BEFORE_NEG) { 12533 int directly; 12534 12535 // Do it directly when what follows is possibly the end of the 12536 // match. 12537 if (match_follows(prog->state[i].out1->out, 0)) { 12538 directly = true; 12539 } else { 12540 int ch_invisible = failure_chance(prog->state[i].out, 0); 12541 int ch_follows = failure_chance(prog->state[i].out1->out, 0); 12542 12543 // Postpone when the invisible match is expensive or has a 12544 // lower chance of failing. 12545 if (c == NFA_START_INVISIBLE_BEFORE 12546 || c == NFA_START_INVISIBLE_BEFORE_NEG) { 12547 // "before" matches are very expensive when 12548 // unbounded, always prefer what follows then, 12549 // unless what follows will always match. 12550 // Otherwise strongly prefer what follows. 12551 if (prog->state[i].val <= 0 && ch_follows > 0) { 12552 directly = false; 12553 } else { 12554 directly = ch_follows * 10 < ch_invisible; 12555 } 12556 } else { 12557 // normal invisible, first do the one with the 12558 // highest failure chance 12559 directly = ch_follows < ch_invisible; 12560 } 12561 } 12562 if (directly) { 12563 // switch to the _FIRST state 12564 prog->state[i].c++; 12565 } 12566 } 12567 } 12568 } 12569 12570 ///////////////////////////////////////////////////////////////// 12571 // NFA execution code. 
12572 ///////////////////////////////////////////////////////////////// 12573 12574 // Values for done in nfa_pim_T. 12575 #define NFA_PIM_UNUSED 0 // pim not used 12576 #define NFA_PIM_TODO 1 // pim not done yet 12577 #define NFA_PIM_MATCH 2 // pim executed, matches 12578 #define NFA_PIM_NOMATCH 3 // pim executed, no match 12579 12580 #ifdef REGEXP_DEBUG 12581 static void log_subsexpr(regsubs_T *subs) 12582 { 12583 log_subexpr(&subs->norm); 12584 if (rex.nfa_has_zsubexpr) { 12585 log_subexpr(&subs->synt); 12586 } 12587 } 12588 12589 static void log_subexpr(regsub_T *sub) 12590 { 12591 int j; 12592 12593 for (j = 0; j < sub->in_use; j++) { 12594 if (REG_MULTI) { 12595 fprintf(log_fd, "*** group %d, start: c=%d, l=%d, end: c=%d, l=%d\n", 12596 j, 12597 sub->list.multi[j].start_col, 12598 (int)sub->list.multi[j].start_lnum, 12599 sub->list.multi[j].end_col, 12600 (int)sub->list.multi[j].end_lnum); 12601 } else { 12602 char *s = (char *)sub->list.line[j].start; 12603 char *e = (char *)sub->list.line[j].end; 12604 12605 fprintf(log_fd, "*** group %d, start: \"%s\", end: \"%s\"\n", 12606 j, 12607 s == NULL ? "NULL" : s, 12608 e == NULL ? "NULL" : e); 12609 } 12610 } 12611 } 12612 12613 static char *pim_info(const nfa_pim_T *pim) 12614 { 12615 static char buf[30]; 12616 12617 if (pim == NULL || pim->result == NFA_PIM_UNUSED) { 12618 buf[0] = NUL; 12619 } else { 12620 snprintf(buf, sizeof(buf), " PIM col %d", 12621 REG_MULTI 12622 ? (int)pim->end.pos.col 12623 : (int)(pim->end.ptr - rex.input)); 12624 } 12625 return buf; 12626 } 12627 12628 #endif 12629 12630 // Used during execution: whether a match has been found. 12631 static int nfa_match; 12632 static proftime_T *nfa_time_limit; 12633 static int *nfa_timed_out; 12634 static int nfa_time_count; 12635 12636 // Copy postponed invisible match info from "from" to "to". 
12637 static void copy_pim(nfa_pim_T *to, nfa_pim_T *from) 12638 { 12639 to->result = from->result; 12640 to->state = from->state; 12641 copy_sub(&to->subs.norm, &from->subs.norm); 12642 if (rex.nfa_has_zsubexpr) { 12643 copy_sub(&to->subs.synt, &from->subs.synt); 12644 } 12645 to->end = from->end; 12646 } 12647 12648 static void clear_sub(regsub_T *sub) 12649 { 12650 if (REG_MULTI) { 12651 // Use 0xff to set lnum to -1 12652 memset(sub->list.multi, 0xff, sizeof(struct multipos) * (size_t)rex.nfa_nsubexpr); 12653 } else { 12654 memset(sub->list.line, 0, sizeof(struct linepos) * (size_t)rex.nfa_nsubexpr); 12655 } 12656 sub->in_use = 0; 12657 } 12658 12659 // Copy the submatches from "from" to "to". 12660 static void copy_sub(regsub_T *to, regsub_T *from) 12661 { 12662 to->in_use = from->in_use; 12663 if (from->in_use <= 0) { 12664 return; 12665 } 12666 12667 // Copy the match start and end positions. 12668 if (REG_MULTI) { 12669 memmove(&to->list.multi[0], &from->list.multi[0], 12670 sizeof(struct multipos) * (size_t)from->in_use); 12671 to->orig_start_col = from->orig_start_col; 12672 } else { 12673 memmove(&to->list.line[0], &from->list.line[0], 12674 sizeof(struct linepos) * (size_t)from->in_use); 12675 } 12676 } 12677 12678 // Like copy_sub() but exclude the main match. 12679 static void copy_sub_off(regsub_T *to, regsub_T *from) 12680 { 12681 if (to->in_use < from->in_use) { 12682 to->in_use = from->in_use; 12683 } 12684 if (from->in_use <= 1) { 12685 return; 12686 } 12687 12688 // Copy the match start and end positions. 12689 if (REG_MULTI) { 12690 memmove(&to->list.multi[1], &from->list.multi[1], 12691 sizeof(struct multipos) * (size_t)(from->in_use - 1)); 12692 } else { 12693 memmove(&to->list.line[1], &from->list.line[1], 12694 sizeof(struct linepos) * (size_t)(from->in_use - 1)); 12695 } 12696 } 12697 12698 // Like copy_sub() but only do the end of the main match if \ze is present. 
12699 static void copy_ze_off(regsub_T *to, regsub_T *from) 12700 { 12701 if (!rex.nfa_has_zend) { 12702 return; 12703 } 12704 12705 if (REG_MULTI) { 12706 if (from->list.multi[0].end_lnum >= 0) { 12707 to->list.multi[0].end_lnum = from->list.multi[0].end_lnum; 12708 to->list.multi[0].end_col = from->list.multi[0].end_col; 12709 } 12710 } else { 12711 if (from->list.line[0].end != NULL) { 12712 to->list.line[0].end = from->list.line[0].end; 12713 } 12714 } 12715 } 12716 12717 // Return true if "sub1" and "sub2" have the same start positions. 12718 // When using back-references also check the end position. 12719 static bool sub_equal(regsub_T *sub1, regsub_T *sub2) 12720 { 12721 int i; 12722 int todo; 12723 linenr_T s1; 12724 linenr_T s2; 12725 uint8_t *sp1; 12726 uint8_t *sp2; 12727 12728 todo = sub1->in_use > sub2->in_use ? sub1->in_use : sub2->in_use; 12729 if (REG_MULTI) { 12730 for (i = 0; i < todo; i++) { 12731 if (i < sub1->in_use) { 12732 s1 = sub1->list.multi[i].start_lnum; 12733 } else { 12734 s1 = -1; 12735 } 12736 if (i < sub2->in_use) { 12737 s2 = sub2->list.multi[i].start_lnum; 12738 } else { 12739 s2 = -1; 12740 } 12741 if (s1 != s2) { 12742 return false; 12743 } 12744 if (s1 != -1 && sub1->list.multi[i].start_col 12745 != sub2->list.multi[i].start_col) { 12746 return false; 12747 } 12748 if (rex.nfa_has_backref) { 12749 if (i < sub1->in_use) { 12750 s1 = sub1->list.multi[i].end_lnum; 12751 } else { 12752 s1 = -1; 12753 } 12754 if (i < sub2->in_use) { 12755 s2 = sub2->list.multi[i].end_lnum; 12756 } else { 12757 s2 = -1; 12758 } 12759 if (s1 != s2) { 12760 return false; 12761 } 12762 if (s1 != -1 12763 && sub1->list.multi[i].end_col != sub2->list.multi[i].end_col) { 12764 return false; 12765 } 12766 } 12767 } 12768 } else { 12769 for (i = 0; i < todo; i++) { 12770 if (i < sub1->in_use) { 12771 sp1 = sub1->list.line[i].start; 12772 } else { 12773 sp1 = NULL; 12774 } 12775 if (i < sub2->in_use) { 12776 sp2 = sub2->list.line[i].start; 12777 } else { 
12778 sp2 = NULL; 12779 } 12780 if (sp1 != sp2) { 12781 return false; 12782 } 12783 if (rex.nfa_has_backref) { 12784 if (i < sub1->in_use) { 12785 sp1 = sub1->list.line[i].end; 12786 } else { 12787 sp1 = NULL; 12788 } 12789 if (i < sub2->in_use) { 12790 sp2 = sub2->list.line[i].end; 12791 } else { 12792 sp2 = NULL; 12793 } 12794 if (sp1 != sp2) { 12795 return false; 12796 } 12797 } 12798 } 12799 } 12800 12801 return true; 12802 } 12803 12804 #ifdef REGEXP_DEBUG 12805 static void open_debug_log(TriState result) 12806 { 12807 log_fd = fopen(NFA_REGEXP_RUN_LOG, "a"); 12808 if (log_fd == NULL) { 12809 emsg(_(e_log_open_failed)); 12810 log_fd = stderr; 12811 } 12812 12813 fprintf(log_fd, "****************************\n"); 12814 fprintf(log_fd, "FINISHED RUNNING nfa_regmatch() recursively\n"); 12815 fprintf(log_fd, "MATCH = %s\n", result == kTrue ? "OK" : result == kNone ? "MAYBE" : "FALSE"); 12816 fprintf(log_fd, "****************************\n"); 12817 } 12818 12819 static void report_state(char *action, regsub_T *sub, nfa_state_T *state, int lid, nfa_pim_T *pim) 12820 { 12821 int col; 12822 12823 if (sub->in_use <= 0) { 12824 col = -1; 12825 } else if (REG_MULTI) { 12826 col = sub->list.multi[0].start_col; 12827 } else { 12828 col = (int)(sub->list.line[0].start - rex.line); 12829 } 12830 nfa_set_code(state->c); 12831 if (log_fd == NULL) { 12832 open_debug_log(kNone); 12833 } 12834 fprintf(log_fd, "> %s state %d to list %d. char %d: %s (start col %d)%s\n", 12835 action, abs(state->id), lid, state->c, code, col, 12836 pim_info(pim)); 12837 } 12838 12839 #endif 12840 12841 /// @param l runtime state list 12842 /// @param state state to update 12843 /// @param subs pointers to subexpressions 12844 /// @param pim postponed match or NULL 12845 /// 12846 /// @return true if the same state is already in list "l" with the same 12847 /// positions as "subs". 
12848 static bool has_state_with_pos(nfa_list_T *l, nfa_state_T *state, regsubs_T *subs, nfa_pim_T *pim) 12849 FUNC_ATTR_NONNULL_ARG(1, 2, 3) 12850 { 12851 for (int i = 0; i < l->n; i++) { 12852 nfa_thread_T *thread = &l->t[i]; 12853 if (thread->state->id == state->id 12854 && sub_equal(&thread->subs.norm, &subs->norm) 12855 && (!rex.nfa_has_zsubexpr 12856 || sub_equal(&thread->subs.synt, &subs->synt)) 12857 && pim_equal(&thread->pim, pim)) { 12858 return true; 12859 } 12860 } 12861 return false; 12862 } 12863 12864 // Return true if "one" and "two" are equal. That includes when both are not 12865 // set. 12866 static bool pim_equal(const nfa_pim_T *one, const nfa_pim_T *two) 12867 { 12868 const bool one_unused = (one == NULL || one->result == NFA_PIM_UNUSED); 12869 const bool two_unused = (two == NULL || two->result == NFA_PIM_UNUSED); 12870 12871 if (one_unused) { 12872 // one is unused: equal when two is also unused 12873 return two_unused; 12874 } 12875 if (two_unused) { 12876 // one is used and two is not: not equal 12877 return false; 12878 } 12879 // compare the state id 12880 if (one->state->id != two->state->id) { 12881 return false; 12882 } 12883 // compare the position 12884 if (REG_MULTI) { 12885 return one->end.pos.lnum == two->end.pos.lnum 12886 && one->end.pos.col == two->end.pos.col; 12887 } 12888 return one->end.ptr == two->end.ptr; 12889 } 12890 12891 // Return true if "state" leads to a NFA_MATCH without advancing the input. 
12892 static bool match_follows(const nfa_state_T *startstate, int depth) 12893 FUNC_ATTR_NONNULL_ALL 12894 { 12895 const nfa_state_T *state = startstate; 12896 12897 // avoid too much recursion 12898 if (depth > 10) { 12899 return false; 12900 } 12901 while (state != NULL) { 12902 switch (state->c) { 12903 case NFA_MATCH: 12904 case NFA_MCLOSE: 12905 case NFA_END_INVISIBLE: 12906 case NFA_END_INVISIBLE_NEG: 12907 case NFA_END_PATTERN: 12908 return true; 12909 12910 case NFA_SPLIT: 12911 return match_follows(state->out, depth + 1) 12912 || match_follows(state->out1, depth + 1); 12913 12914 case NFA_START_INVISIBLE: 12915 case NFA_START_INVISIBLE_FIRST: 12916 case NFA_START_INVISIBLE_BEFORE: 12917 case NFA_START_INVISIBLE_BEFORE_FIRST: 12918 case NFA_START_INVISIBLE_NEG: 12919 case NFA_START_INVISIBLE_NEG_FIRST: 12920 case NFA_START_INVISIBLE_BEFORE_NEG: 12921 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST: 12922 case NFA_COMPOSING: 12923 // skip ahead to next state 12924 state = state->out1->out; 12925 continue; 12926 12927 case NFA_ANY: 12928 case NFA_ANY_COMPOSING: 12929 case NFA_IDENT: 12930 case NFA_SIDENT: 12931 case NFA_KWORD: 12932 case NFA_SKWORD: 12933 case NFA_FNAME: 12934 case NFA_SFNAME: 12935 case NFA_PRINT: 12936 case NFA_SPRINT: 12937 case NFA_WHITE: 12938 case NFA_NWHITE: 12939 case NFA_DIGIT: 12940 case NFA_NDIGIT: 12941 case NFA_HEX: 12942 case NFA_NHEX: 12943 case NFA_OCTAL: 12944 case NFA_NOCTAL: 12945 case NFA_WORD: 12946 case NFA_NWORD: 12947 case NFA_HEAD: 12948 case NFA_NHEAD: 12949 case NFA_ALPHA: 12950 case NFA_NALPHA: 12951 case NFA_LOWER: 12952 case NFA_NLOWER: 12953 case NFA_UPPER: 12954 case NFA_NUPPER: 12955 case NFA_LOWER_IC: 12956 case NFA_NLOWER_IC: 12957 case NFA_UPPER_IC: 12958 case NFA_NUPPER_IC: 12959 case NFA_START_COLL: 12960 case NFA_START_NEG_COLL: 12961 case NFA_NEWL: 12962 // state will advance input 12963 return false; 12964 12965 default: 12966 if (state->c > 0) { 12967 // state will advance input 12968 return false; 12969 } 
12970 // Others: zero-width or possibly zero-width, might still find 12971 // a match at the same position, keep looking. 12972 break; 12973 } 12974 state = state->out; 12975 } 12976 return false; 12977 } 12978 12979 /// @param l runtime state list 12980 /// @param state state to update 12981 /// @param subs pointers to subexpressions 12982 /// 12983 /// @return true if "state" is already in list "l". 12984 static bool state_in_list(nfa_list_T *l, nfa_state_T *state, regsubs_T *subs) 12985 FUNC_ATTR_NONNULL_ALL 12986 { 12987 if (state->lastlist[nfa_ll_index] == l->id) { 12988 if (!rex.nfa_has_backref || has_state_with_pos(l, state, subs, NULL)) { 12989 return true; 12990 } 12991 } 12992 return false; 12993 } 12994 12995 // Offset used for "off" by addstate_here(). 12996 #define ADDSTATE_HERE_OFFSET 10 12997 12998 /// Add "state" and possibly what follows to state list ".". 12999 /// 13000 /// @param l runtime state list 13001 /// @param state state to update 13002 /// @param subs_arg pointers to subexpressions 13003 /// @param pim postponed look-behind match 13004 /// @param off_arg byte offset, when -1 go to next line 13005 /// 13006 /// @return "subs_arg", possibly copied into temp_subs. 13007 /// NULL when recursiveness is too deep. 13008 static regsubs_T *addstate(nfa_list_T *l, nfa_state_T *state, regsubs_T *subs_arg, nfa_pim_T *pim, 13009 int off_arg) 13010 FUNC_ATTR_NONNULL_ARG(1, 2) FUNC_ATTR_WARN_UNUSED_RESULT 13011 { 13012 int subidx; 13013 int off = off_arg; 13014 int add_here = false; 13015 int listindex = 0; 13016 int k; 13017 int found = false; 13018 nfa_thread_T *thread; 13019 struct multipos save_multipos; 13020 int save_in_use; 13021 uint8_t *save_ptr; 13022 int i; 13023 regsub_T *sub; 13024 regsubs_T *subs = subs_arg; 13025 static regsubs_T temp_subs; 13026 #ifdef REGEXP_DEBUG 13027 int did_print = false; 13028 #endif 13029 static int depth = 0; 13030 13031 // This function is called recursively. 
When the depth is too much we run 13032 // out of stack and crash, limit recursiveness here. 13033 if (++depth >= 5000 || subs == NULL) { 13034 depth--; 13035 return NULL; 13036 } 13037 13038 if (off_arg <= -ADDSTATE_HERE_OFFSET) { 13039 add_here = true; 13040 off = 0; 13041 listindex = -(off_arg + ADDSTATE_HERE_OFFSET); 13042 } 13043 13044 switch (state->c) { 13045 case NFA_NCLOSE: 13046 case NFA_MCLOSE: 13047 case NFA_MCLOSE1: 13048 case NFA_MCLOSE2: 13049 case NFA_MCLOSE3: 13050 case NFA_MCLOSE4: 13051 case NFA_MCLOSE5: 13052 case NFA_MCLOSE6: 13053 case NFA_MCLOSE7: 13054 case NFA_MCLOSE8: 13055 case NFA_MCLOSE9: 13056 case NFA_ZCLOSE: 13057 case NFA_ZCLOSE1: 13058 case NFA_ZCLOSE2: 13059 case NFA_ZCLOSE3: 13060 case NFA_ZCLOSE4: 13061 case NFA_ZCLOSE5: 13062 case NFA_ZCLOSE6: 13063 case NFA_ZCLOSE7: 13064 case NFA_ZCLOSE8: 13065 case NFA_ZCLOSE9: 13066 case NFA_MOPEN: 13067 case NFA_ZEND: 13068 case NFA_SPLIT: 13069 case NFA_EMPTY: 13070 // These nodes are not added themselves but their "out" and/or 13071 // "out1" may be added below. 13072 break; 13073 13074 case NFA_BOL: 13075 case NFA_BOF: 13076 // "^" won't match past end-of-line, don't bother trying. 13077 // Except when at the end of the line, or when we are going to the 13078 // next line for a look-behind match. 
13079 if (rex.input > rex.line 13080 && *rex.input != NUL 13081 && (nfa_endp == NULL 13082 || !REG_MULTI 13083 || rex.lnum == nfa_endp->se_u.pos.lnum)) { 13084 goto skip_add; 13085 } 13086 FALLTHROUGH; 13087 13088 case NFA_MOPEN1: 13089 case NFA_MOPEN2: 13090 case NFA_MOPEN3: 13091 case NFA_MOPEN4: 13092 case NFA_MOPEN5: 13093 case NFA_MOPEN6: 13094 case NFA_MOPEN7: 13095 case NFA_MOPEN8: 13096 case NFA_MOPEN9: 13097 case NFA_ZOPEN: 13098 case NFA_ZOPEN1: 13099 case NFA_ZOPEN2: 13100 case NFA_ZOPEN3: 13101 case NFA_ZOPEN4: 13102 case NFA_ZOPEN5: 13103 case NFA_ZOPEN6: 13104 case NFA_ZOPEN7: 13105 case NFA_ZOPEN8: 13106 case NFA_ZOPEN9: 13107 case NFA_NOPEN: 13108 case NFA_ZSTART: 13109 // These nodes need to be added so that we can bail out when it 13110 // was added to this list before at the same position to avoid an 13111 // endless loop for "\(\)*" 13112 13113 default: 13114 if (state->lastlist[nfa_ll_index] == l->id && state->c != NFA_SKIP) { 13115 // This state is already in the list, don't add it again, 13116 // unless it is an MOPEN that is used for a backreference or 13117 // when there is a PIM. For NFA_MATCH check the position, 13118 // lower position is preferred. 13119 if (!rex.nfa_has_backref && pim == NULL && !l->has_pim 13120 && state->c != NFA_MATCH) { 13121 // When called from addstate_here() do insert before 13122 // existing states. 13123 if (add_here) { 13124 for (k = 0; k < l->n && k < listindex; k++) { 13125 if (l->t[k].state->id == state->id) { 13126 found = true; 13127 break; 13128 } 13129 } 13130 } 13131 13132 if (!add_here || found) { 13133 skip_add: 13134 #ifdef REGEXP_DEBUG 13135 nfa_set_code(state->c); 13136 fprintf(log_fd, 13137 "> Not adding state %d to list %d. char %d: %s pim: %s has_pim: %d found: %d\n", 13138 abs(state->id), l->id, state->c, code, 13139 pim == NULL ? 
"NULL" : "yes", l->has_pim, found); 13140 #endif 13141 depth--; 13142 return subs; 13143 } 13144 } 13145 13146 // Do not add the state again when it exists with the same 13147 // positions. 13148 if (has_state_with_pos(l, state, subs, pim)) { 13149 goto skip_add; 13150 } 13151 } 13152 13153 // When there are backreferences or PIMs the number of states may 13154 // be (a lot) bigger than anticipated. 13155 if (l->n == l->len) { 13156 const int newlen = l->len * 3 / 2 + 50; 13157 const size_t newsize = (size_t)newlen * sizeof(nfa_thread_T); 13158 13159 if ((int64_t)(newsize >> 10) >= p_mmp) { 13160 emsg(_(e_pattern_uses_more_memory_than_maxmempattern)); 13161 depth--; 13162 return NULL; 13163 } 13164 if (subs != &temp_subs) { 13165 // "subs" may point into the current array, need to make a 13166 // copy before it becomes invalid. 13167 copy_sub(&temp_subs.norm, &subs->norm); 13168 if (rex.nfa_has_zsubexpr) { 13169 copy_sub(&temp_subs.synt, &subs->synt); 13170 } 13171 subs = &temp_subs; 13172 } 13173 13174 nfa_thread_T *const newt = xrealloc(l->t, newsize); 13175 l->t = newt; 13176 l->len = newlen; 13177 } 13178 13179 // add the state to the list 13180 state->lastlist[nfa_ll_index] = l->id; 13181 thread = &l->t[l->n++]; 13182 thread->state = state; 13183 if (pim == NULL) { 13184 thread->pim.result = NFA_PIM_UNUSED; 13185 } else { 13186 copy_pim(&thread->pim, pim); 13187 l->has_pim = true; 13188 } 13189 copy_sub(&thread->subs.norm, &subs->norm); 13190 if (rex.nfa_has_zsubexpr) { 13191 copy_sub(&thread->subs.synt, &subs->synt); 13192 } 13193 #ifdef REGEXP_DEBUG 13194 report_state("Adding", &thread->subs.norm, state, l->id, pim); 13195 did_print = true; 13196 #endif 13197 } 13198 13199 #ifdef REGEXP_DEBUG 13200 if (!did_print) { 13201 report_state("Processing", &subs->norm, state, l->id, pim); 13202 } 13203 #endif 13204 switch (state->c) { 13205 case NFA_MATCH: 13206 break; 13207 13208 case NFA_SPLIT: 13209 // order matters here 13210 subs = addstate(l, state->out, subs, 
pim, off_arg); 13211 subs = addstate(l, state->out1, subs, pim, off_arg); 13212 break; 13213 13214 case NFA_EMPTY: 13215 case NFA_NOPEN: 13216 case NFA_NCLOSE: 13217 subs = addstate(l, state->out, subs, pim, off_arg); 13218 break; 13219 13220 case NFA_MOPEN: 13221 case NFA_MOPEN1: 13222 case NFA_MOPEN2: 13223 case NFA_MOPEN3: 13224 case NFA_MOPEN4: 13225 case NFA_MOPEN5: 13226 case NFA_MOPEN6: 13227 case NFA_MOPEN7: 13228 case NFA_MOPEN8: 13229 case NFA_MOPEN9: 13230 case NFA_ZOPEN: 13231 case NFA_ZOPEN1: 13232 case NFA_ZOPEN2: 13233 case NFA_ZOPEN3: 13234 case NFA_ZOPEN4: 13235 case NFA_ZOPEN5: 13236 case NFA_ZOPEN6: 13237 case NFA_ZOPEN7: 13238 case NFA_ZOPEN8: 13239 case NFA_ZOPEN9: 13240 case NFA_ZSTART: 13241 if (state->c == NFA_ZSTART) { 13242 subidx = 0; 13243 sub = &subs->norm; 13244 } else if (state->c >= NFA_ZOPEN && state->c <= NFA_ZOPEN9) { 13245 subidx = state->c - NFA_ZOPEN; 13246 sub = &subs->synt; 13247 } else { 13248 subidx = state->c - NFA_MOPEN; 13249 sub = &subs->norm; 13250 } 13251 13252 // avoid compiler warnings 13253 save_ptr = NULL; 13254 CLEAR_FIELD(save_multipos); 13255 13256 // Set the position (with "off" added) in the subexpression. Save 13257 // and restore it when it was in use. Otherwise fill any gap. 
13258 if (REG_MULTI) { 13259 if (subidx < sub->in_use) { 13260 save_multipos = sub->list.multi[subidx]; 13261 save_in_use = -1; 13262 } else { 13263 save_in_use = sub->in_use; 13264 for (i = sub->in_use; i < subidx; i++) { 13265 sub->list.multi[i].start_lnum = -1; 13266 sub->list.multi[i].end_lnum = -1; 13267 } 13268 sub->in_use = subidx + 1; 13269 } 13270 if (off == -1) { 13271 sub->list.multi[subidx].start_lnum = rex.lnum + 1; 13272 sub->list.multi[subidx].start_col = 0; 13273 } else { 13274 sub->list.multi[subidx].start_lnum = rex.lnum; 13275 sub->list.multi[subidx].start_col = 13276 (colnr_T)(rex.input - rex.line + off); 13277 } 13278 sub->list.multi[subidx].end_lnum = -1; 13279 } else { 13280 if (subidx < sub->in_use) { 13281 save_ptr = sub->list.line[subidx].start; 13282 save_in_use = -1; 13283 } else { 13284 save_in_use = sub->in_use; 13285 for (i = sub->in_use; i < subidx; i++) { 13286 sub->list.line[i].start = NULL; 13287 sub->list.line[i].end = NULL; 13288 } 13289 sub->in_use = subidx + 1; 13290 } 13291 sub->list.line[subidx].start = rex.input + off; 13292 } 13293 13294 subs = addstate(l, state->out, subs, pim, off_arg); 13295 if (subs == NULL) { 13296 break; 13297 } 13298 // "subs" may have changed, need to set "sub" again. 13299 if (state->c >= NFA_ZOPEN && state->c <= NFA_ZOPEN9) { 13300 sub = &subs->synt; 13301 } else { 13302 sub = &subs->norm; 13303 } 13304 13305 if (save_in_use == -1) { 13306 if (REG_MULTI) { 13307 sub->list.multi[subidx] = save_multipos; 13308 } else { 13309 sub->list.line[subidx].start = save_ptr; 13310 } 13311 } else { 13312 sub->in_use = save_in_use; 13313 } 13314 break; 13315 13316 case NFA_MCLOSE: 13317 if (rex.nfa_has_zend 13318 && (REG_MULTI 13319 ? subs->norm.list.multi[0].end_lnum >= 0 13320 : subs->norm.list.line[0].end != NULL)) { 13321 // Do not overwrite the position set by \ze. 
13322 subs = addstate(l, state->out, subs, pim, off_arg); 13323 break; 13324 } 13325 FALLTHROUGH; 13326 case NFA_MCLOSE1: 13327 case NFA_MCLOSE2: 13328 case NFA_MCLOSE3: 13329 case NFA_MCLOSE4: 13330 case NFA_MCLOSE5: 13331 case NFA_MCLOSE6: 13332 case NFA_MCLOSE7: 13333 case NFA_MCLOSE8: 13334 case NFA_MCLOSE9: 13335 case NFA_ZCLOSE: 13336 case NFA_ZCLOSE1: 13337 case NFA_ZCLOSE2: 13338 case NFA_ZCLOSE3: 13339 case NFA_ZCLOSE4: 13340 case NFA_ZCLOSE5: 13341 case NFA_ZCLOSE6: 13342 case NFA_ZCLOSE7: 13343 case NFA_ZCLOSE8: 13344 case NFA_ZCLOSE9: 13345 case NFA_ZEND: 13346 if (state->c == NFA_ZEND) { 13347 subidx = 0; 13348 sub = &subs->norm; 13349 } else if (state->c >= NFA_ZCLOSE && state->c <= NFA_ZCLOSE9) { 13350 subidx = state->c - NFA_ZCLOSE; 13351 sub = &subs->synt; 13352 } else { 13353 subidx = state->c - NFA_MCLOSE; 13354 sub = &subs->norm; 13355 } 13356 13357 // We don't fill in gaps here, there must have been an MOPEN that 13358 // has done that. 13359 save_in_use = sub->in_use; 13360 if (sub->in_use <= subidx) { 13361 sub->in_use = subidx + 1; 13362 } 13363 if (REG_MULTI) { 13364 save_multipos = sub->list.multi[subidx]; 13365 if (off == -1) { 13366 sub->list.multi[subidx].end_lnum = rex.lnum + 1; 13367 sub->list.multi[subidx].end_col = 0; 13368 } else { 13369 sub->list.multi[subidx].end_lnum = rex.lnum; 13370 sub->list.multi[subidx].end_col = 13371 (colnr_T)(rex.input - rex.line + off); 13372 } 13373 // avoid compiler warnings 13374 save_ptr = NULL; 13375 } else { 13376 save_ptr = sub->list.line[subidx].end; 13377 sub->list.line[subidx].end = rex.input + off; 13378 // avoid compiler warnings 13379 CLEAR_FIELD(save_multipos); 13380 } 13381 13382 subs = addstate(l, state->out, subs, pim, off_arg); 13383 if (subs == NULL) { 13384 break; 13385 } 13386 // "subs" may have changed, need to set "sub" again. 
13387 if (state->c >= NFA_ZCLOSE && state->c <= NFA_ZCLOSE9) { 13388 sub = &subs->synt; 13389 } else { 13390 sub = &subs->norm; 13391 } 13392 13393 if (REG_MULTI) { 13394 sub->list.multi[subidx] = save_multipos; 13395 } else { 13396 sub->list.line[subidx].end = save_ptr; 13397 } 13398 sub->in_use = save_in_use; 13399 break; 13400 } 13401 depth--; 13402 return subs; 13403 } 13404 13405 /// Like addstate(), but the new state(s) are put at position "*ip". 13406 /// Used for zero-width matches, next state to use is the added one. 13407 /// This makes sure the order of states to be tried does not change, which 13408 /// matters for alternatives. 13409 /// 13410 /// @param l runtime state list 13411 /// @param state state to update 13412 /// @param subs pointers to subexpressions 13413 /// @param pim postponed look-behind match 13414 static regsubs_T *addstate_here(nfa_list_T *l, nfa_state_T *state, regsubs_T *subs, nfa_pim_T *pim, 13415 int *ip) 13416 FUNC_ATTR_NONNULL_ARG(1, 2, 5) FUNC_ATTR_WARN_UNUSED_RESULT 13417 { 13418 int tlen = l->n; 13419 int count; 13420 int listidx = *ip; 13421 13422 // First add the state(s) at the end, so that we know how many there are. 13423 // Pass the listidx as offset (avoids adding another argument to 13424 // addstate()). 
13425 regsubs_T *r = addstate(l, state, subs, pim, -listidx - ADDSTATE_HERE_OFFSET); 13426 if (r == NULL) { 13427 return NULL; 13428 } 13429 13430 // when "*ip" was at the end of the list, nothing to do 13431 if (listidx + 1 == tlen) { 13432 return r; 13433 } 13434 13435 // re-order to put the new state at the current position 13436 count = l->n - tlen; 13437 if (count == 0) { 13438 return r; // no state got added 13439 } 13440 if (count == 1) { 13441 // overwrite the current state 13442 l->t[listidx] = l->t[l->n - 1]; 13443 } else if (count > 1) { 13444 if (l->n + count - 1 >= l->len) { 13445 // not enough space to move the new states, reallocate the list 13446 // and move the states to the right position 13447 const int newlen = l->len * 3 / 2 + 50; 13448 const size_t newsize = (size_t)newlen * sizeof(nfa_thread_T); 13449 13450 if ((int64_t)(newsize >> 10) >= p_mmp) { 13451 emsg(_(e_pattern_uses_more_memory_than_maxmempattern)); 13452 return NULL; 13453 } 13454 nfa_thread_T *const newl = xmalloc(newsize); 13455 l->len = newlen; 13456 memmove(&(newl[0]), 13457 &(l->t[0]), 13458 sizeof(nfa_thread_T) * (size_t)listidx); 13459 memmove(&(newl[listidx]), 13460 &(l->t[l->n - count]), 13461 sizeof(nfa_thread_T) * (size_t)count); 13462 memmove(&(newl[listidx + count]), 13463 &(l->t[listidx + 1]), 13464 sizeof(nfa_thread_T) * (size_t)(l->n - count - listidx - 1)); 13465 xfree(l->t); 13466 l->t = newl; 13467 } else { 13468 // make space for new states, then move them from the 13469 // end to the current position 13470 memmove(&(l->t[listidx + count]), 13471 &(l->t[listidx + 1]), 13472 sizeof(nfa_thread_T) * (size_t)(l->n - listidx - 1)); 13473 memmove(&(l->t[listidx]), 13474 &(l->t[l->n - 1]), 13475 sizeof(nfa_thread_T) * (size_t)count); 13476 } 13477 } 13478 l->n--; 13479 *ip = listidx - 1; 13480 13481 return r; 13482 } 13483 13484 // Check character class "class" against current character c. 
13485 static int check_char_class(int cls, int c) 13486 { 13487 switch (cls) { 13488 case NFA_CLASS_ALNUM: 13489 if (c >= 1 && c < 128 && isalnum(c)) { 13490 return OK; 13491 } 13492 break; 13493 case NFA_CLASS_ALPHA: 13494 if (c >= 1 && c < 128 && isalpha(c)) { 13495 return OK; 13496 } 13497 break; 13498 case NFA_CLASS_BLANK: 13499 if (c == ' ' || c == '\t') { 13500 return OK; 13501 } 13502 break; 13503 case NFA_CLASS_CNTRL: 13504 if (c >= 1 && c <= 127 && iscntrl(c)) { 13505 return OK; 13506 } 13507 break; 13508 case NFA_CLASS_DIGIT: 13509 if (ascii_isdigit(c)) { 13510 return OK; 13511 } 13512 break; 13513 case NFA_CLASS_GRAPH: 13514 if (c >= 1 && c <= 127 && isgraph(c)) { 13515 return OK; 13516 } 13517 break; 13518 case NFA_CLASS_LOWER: 13519 if (mb_islower(c) && c != 170 && c != 186) { 13520 return OK; 13521 } 13522 break; 13523 case NFA_CLASS_PRINT: 13524 if (vim_isprintc(c)) { 13525 return OK; 13526 } 13527 break; 13528 case NFA_CLASS_PUNCT: 13529 if (c >= 1 && c < 128 && ispunct(c)) { 13530 return OK; 13531 } 13532 break; 13533 case NFA_CLASS_SPACE: 13534 if ((c >= 9 && c <= 13) || (c == ' ')) { 13535 return OK; 13536 } 13537 break; 13538 case NFA_CLASS_UPPER: 13539 if (mb_isupper(c)) { 13540 return OK; 13541 } 13542 break; 13543 case NFA_CLASS_XDIGIT: 13544 if (ascii_isxdigit(c)) { 13545 return OK; 13546 } 13547 break; 13548 case NFA_CLASS_TAB: 13549 if (c == '\t') { 13550 return OK; 13551 } 13552 break; 13553 case NFA_CLASS_RETURN: 13554 if (c == '\r') { 13555 return OK; 13556 } 13557 break; 13558 case NFA_CLASS_BACKSPACE: 13559 if (c == '\b') { 13560 return OK; 13561 } 13562 break; 13563 case NFA_CLASS_ESCAPE: 13564 if (c == ESC) { 13565 return OK; 13566 } 13567 break; 13568 case NFA_CLASS_IDENT: 13569 if (vim_isIDc(c)) { 13570 return OK; 13571 } 13572 break; 13573 case NFA_CLASS_KEYWORD: 13574 if (reg_iswordc(c)) { 13575 return OK; 13576 } 13577 break; 13578 case NFA_CLASS_FNAME: 13579 if (vim_isfilec(c)) { 13580 return OK; 13581 } 13582 break; 13583 
13584 default: 13585 // should not be here :P 13586 siemsg(_(e_ill_char_class), (int64_t)cls); 13587 return FAIL; 13588 } 13589 return FAIL; 13590 } 13591 13592 /// Check for a match with subexpression "subidx". 13593 /// 13594 /// @param sub pointers to subexpressions 13595 /// @param bytelen out: length of match in bytes 13596 /// 13597 /// @return true if it matches. 13598 static int match_backref(regsub_T *sub, int subidx, int *bytelen) 13599 { 13600 int len; 13601 13602 if (sub->in_use <= subidx) { 13603 retempty: 13604 // backref was not set, match an empty string 13605 *bytelen = 0; 13606 return true; 13607 } 13608 13609 if (REG_MULTI) { 13610 if (sub->list.multi[subidx].start_lnum < 0 13611 || sub->list.multi[subidx].end_lnum < 0) { 13612 goto retempty; 13613 } 13614 if (sub->list.multi[subidx].start_lnum == rex.lnum 13615 && sub->list.multi[subidx].end_lnum == rex.lnum) { 13616 len = sub->list.multi[subidx].end_col 13617 - sub->list.multi[subidx].start_col; 13618 if (cstrncmp((char *)rex.line + sub->list.multi[subidx].start_col, 13619 (char *)rex.input, &len) == 0) { 13620 *bytelen = len; 13621 return true; 13622 } 13623 } else { 13624 if (match_with_backref(sub->list.multi[subidx].start_lnum, 13625 sub->list.multi[subidx].start_col, 13626 sub->list.multi[subidx].end_lnum, 13627 sub->list.multi[subidx].end_col, 13628 bytelen) == RA_MATCH) { 13629 return true; 13630 } 13631 } 13632 } else { 13633 if (sub->list.line[subidx].start == NULL 13634 || sub->list.line[subidx].end == NULL) { 13635 goto retempty; 13636 } 13637 len = (int)(sub->list.line[subidx].end - sub->list.line[subidx].start); 13638 if (cstrncmp((char *)sub->list.line[subidx].start, (char *)rex.input, &len) == 0) { 13639 *bytelen = len; 13640 return true; 13641 } 13642 } 13643 return false; 13644 } 13645 13646 /// Check for a match with \z subexpression "subidx". 13647 /// 13648 /// @param bytelen out: length of match in bytes 13649 /// 13650 /// @return true if it matches. 
13651 static int match_zref(int subidx, int *bytelen) 13652 { 13653 int len; 13654 13655 cleanup_zsubexpr(); 13656 if (re_extmatch_in == NULL || re_extmatch_in->matches[subidx] == NULL) { 13657 // backref was not set, match an empty string 13658 *bytelen = 0; 13659 return true; 13660 } 13661 13662 len = (int)strlen((char *)re_extmatch_in->matches[subidx]); 13663 if (cstrncmp((char *)re_extmatch_in->matches[subidx], (char *)rex.input, &len) == 0) { 13664 *bytelen = len; 13665 return true; 13666 } 13667 return false; 13668 } 13669 13670 // Save list IDs for all NFA states of "prog" into "list". 13671 // Also reset the IDs to zero. 13672 // Only used for the recursive value lastlist[1]. 13673 static void nfa_save_listids(nfa_regprog_T *prog, int *list) 13674 { 13675 int i; 13676 nfa_state_T *p; 13677 13678 // Order in the list is reverse, it's a bit faster that way. 13679 p = &prog->state[0]; 13680 for (i = prog->nstate; --i >= 0;) { 13681 list[i] = p->lastlist[1]; 13682 p->lastlist[1] = 0; 13683 p++; 13684 } 13685 } 13686 13687 // Restore list IDs from "list" to all NFA states. 13688 static void nfa_restore_listids(nfa_regprog_T *prog, const int *list) 13689 { 13690 int i; 13691 nfa_state_T *p; 13692 13693 p = &prog->state[0]; 13694 for (i = prog->nstate; --i >= 0;) { 13695 p->lastlist[1] = list[i]; 13696 p++; 13697 } 13698 } 13699 13700 static bool nfa_re_num_cmp(uintmax_t val, int op, uintmax_t pos) 13701 { 13702 if (op == 1) { 13703 return pos > val; 13704 } 13705 if (op == 2) { 13706 return pos < val; 13707 } 13708 return val == pos; 13709 } 13710 13711 // Recursively call nfa_regmatch() 13712 // "pim" is NULL or contains info about a Postponed Invisible Match (start 13713 // position). 
13714 static int recursive_regmatch(nfa_state_T *state, nfa_pim_T *pim, nfa_regprog_T *prog, 13715 regsubs_T *submatch, regsubs_T *m, int **listids, int *listids_len) 13716 FUNC_ATTR_NONNULL_ARG(1, 3, 5, 6, 7) 13717 { 13718 const int save_reginput_col = (int)(rex.input - rex.line); 13719 const int save_reglnum = rex.lnum; 13720 const int save_nfa_match = nfa_match; 13721 const int save_nfa_listid = rex.nfa_listid; 13722 save_se_T *const save_nfa_endp = nfa_endp; 13723 save_se_T endpos; 13724 save_se_T *endposp = NULL; 13725 int need_restore = false; 13726 13727 if (pim != NULL) { 13728 // start at the position where the postponed match was 13729 if (REG_MULTI) { 13730 rex.input = rex.line + pim->end.pos.col; 13731 } else { 13732 rex.input = pim->end.ptr; 13733 } 13734 } 13735 13736 if (state->c == NFA_START_INVISIBLE_BEFORE 13737 || state->c == NFA_START_INVISIBLE_BEFORE_FIRST 13738 || state->c == NFA_START_INVISIBLE_BEFORE_NEG 13739 || state->c == NFA_START_INVISIBLE_BEFORE_NEG_FIRST) { 13740 // The recursive match must end at the current position. When "pim" is 13741 // not NULL it specifies the current position. 13742 endposp = &endpos; 13743 if (REG_MULTI) { 13744 if (pim == NULL) { 13745 endpos.se_u.pos.col = (int)(rex.input - rex.line); 13746 endpos.se_u.pos.lnum = rex.lnum; 13747 } else { 13748 endpos.se_u.pos = pim->end.pos; 13749 } 13750 } else { 13751 if (pim == NULL) { 13752 endpos.se_u.ptr = rex.input; 13753 } else { 13754 endpos.se_u.ptr = pim->end.ptr; 13755 } 13756 } 13757 13758 // Go back the specified number of bytes, or as far as the 13759 // start of the previous line, to try matching "\@<=" or 13760 // not matching "\@<!". This is very inefficient, limit the number of 13761 // bytes if possible. 
13762 if (state->val <= 0) { 13763 if (REG_MULTI) { 13764 rex.line = (uint8_t *)reg_getline(--rex.lnum); 13765 if (rex.line == NULL) { 13766 // can't go before the first line 13767 rex.line = (uint8_t *)reg_getline(++rex.lnum); 13768 } 13769 } 13770 rex.input = rex.line; 13771 } else { 13772 if (REG_MULTI && (int)(rex.input - rex.line) < state->val) { 13773 // Not enough bytes in this line, go to end of 13774 // previous line. 13775 rex.line = (uint8_t *)reg_getline(--rex.lnum); 13776 if (rex.line == NULL) { 13777 // can't go before the first line 13778 rex.line = (uint8_t *)reg_getline(++rex.lnum); 13779 rex.input = rex.line; 13780 } else { 13781 rex.input = rex.line + reg_getline_len(rex.lnum); 13782 } 13783 } 13784 if ((int)(rex.input - rex.line) >= state->val) { 13785 rex.input -= state->val; 13786 rex.input -= utf_head_off((char *)rex.line, (char *)rex.input); 13787 } else { 13788 rex.input = rex.line; 13789 } 13790 } 13791 } 13792 13793 #ifdef REGEXP_DEBUG 13794 if (log_fd != stderr) { 13795 fclose(log_fd); 13796 } 13797 log_fd = NULL; 13798 #endif 13799 // Have to clear the lastlist field of the NFA nodes, so that 13800 // nfa_regmatch() and addstate() can run properly after recursion. 13801 if (nfa_ll_index == 1) { 13802 // Already calling nfa_regmatch() recursively. Save the lastlist[1] 13803 // values and clear them. 13804 if (*listids == NULL || *listids_len < prog->nstate) { 13805 xfree(*listids); 13806 *listids = xmalloc(sizeof(**listids) * (size_t)prog->nstate); 13807 *listids_len = prog->nstate; 13808 } 13809 nfa_save_listids(prog, *listids); 13810 need_restore = true; 13811 // any value of rex.nfa_listid will do 13812 } else { 13813 // First recursive nfa_regmatch() call, switch to the second lastlist 13814 // entry. Make sure rex.nfa_listid is different from a previous 13815 // recursive call, because some states may still have this ID. 
13816 nfa_ll_index++; 13817 if (rex.nfa_listid <= rex.nfa_alt_listid) { 13818 rex.nfa_listid = rex.nfa_alt_listid; 13819 } 13820 } 13821 13822 // Call nfa_regmatch() to check if the current concat matches at this 13823 // position. The concat ends with the node NFA_END_INVISIBLE 13824 nfa_endp = endposp; 13825 const int result = nfa_regmatch(prog, state->out, submatch, m); 13826 13827 if (need_restore) { 13828 nfa_restore_listids(prog, *listids); 13829 } else { 13830 nfa_ll_index--; 13831 rex.nfa_alt_listid = rex.nfa_listid; 13832 } 13833 13834 // restore position in input text 13835 rex.lnum = save_reglnum; 13836 if (REG_MULTI) { 13837 rex.line = (uint8_t *)reg_getline(rex.lnum); 13838 } 13839 rex.input = rex.line + save_reginput_col; 13840 if (result != NFA_TOO_EXPENSIVE) { 13841 nfa_match = save_nfa_match; 13842 rex.nfa_listid = save_nfa_listid; 13843 } 13844 nfa_endp = save_nfa_endp; 13845 13846 #ifdef REGEXP_DEBUG 13847 open_debug_log(result); 13848 #endif 13849 13850 return result; 13851 } 13852 13853 // Estimate the chance of a match with "state" failing. 13854 // empty match: 0 13855 // NFA_ANY: 1 13856 // specific character: 99 13857 static int failure_chance(nfa_state_T *state, int depth) 13858 { 13859 int c = state->c; 13860 int l, r; 13861 13862 // detect looping 13863 if (depth > 4) { 13864 return 1; 13865 } 13866 13867 switch (c) { 13868 case NFA_SPLIT: 13869 if (state->out->c == NFA_SPLIT || state->out1->c == NFA_SPLIT) { 13870 // avoid recursive stuff 13871 return 1; 13872 } 13873 // two alternatives, use the lowest failure chance 13874 l = failure_chance(state->out, depth + 1); 13875 r = failure_chance(state->out1, depth + 1); 13876 return l < r ? 
l : r; 13877 13878 case NFA_ANY: 13879 // matches anything, unlikely to fail 13880 return 1; 13881 13882 case NFA_MATCH: 13883 case NFA_MCLOSE: 13884 case NFA_ANY_COMPOSING: 13885 // empty match works always 13886 return 0; 13887 13888 case NFA_START_INVISIBLE: 13889 case NFA_START_INVISIBLE_FIRST: 13890 case NFA_START_INVISIBLE_NEG: 13891 case NFA_START_INVISIBLE_NEG_FIRST: 13892 case NFA_START_INVISIBLE_BEFORE: 13893 case NFA_START_INVISIBLE_BEFORE_FIRST: 13894 case NFA_START_INVISIBLE_BEFORE_NEG: 13895 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST: 13896 case NFA_START_PATTERN: 13897 // recursive regmatch is expensive, use low failure chance 13898 return 5; 13899 13900 case NFA_BOL: 13901 case NFA_EOL: 13902 case NFA_BOF: 13903 case NFA_EOF: 13904 case NFA_NEWL: 13905 return 99; 13906 13907 case NFA_BOW: 13908 case NFA_EOW: 13909 return 90; 13910 13911 case NFA_MOPEN: 13912 case NFA_MOPEN1: 13913 case NFA_MOPEN2: 13914 case NFA_MOPEN3: 13915 case NFA_MOPEN4: 13916 case NFA_MOPEN5: 13917 case NFA_MOPEN6: 13918 case NFA_MOPEN7: 13919 case NFA_MOPEN8: 13920 case NFA_MOPEN9: 13921 case NFA_ZOPEN: 13922 case NFA_ZOPEN1: 13923 case NFA_ZOPEN2: 13924 case NFA_ZOPEN3: 13925 case NFA_ZOPEN4: 13926 case NFA_ZOPEN5: 13927 case NFA_ZOPEN6: 13928 case NFA_ZOPEN7: 13929 case NFA_ZOPEN8: 13930 case NFA_ZOPEN9: 13931 case NFA_ZCLOSE: 13932 case NFA_ZCLOSE1: 13933 case NFA_ZCLOSE2: 13934 case NFA_ZCLOSE3: 13935 case NFA_ZCLOSE4: 13936 case NFA_ZCLOSE5: 13937 case NFA_ZCLOSE6: 13938 case NFA_ZCLOSE7: 13939 case NFA_ZCLOSE8: 13940 case NFA_ZCLOSE9: 13941 case NFA_NOPEN: 13942 case NFA_MCLOSE1: 13943 case NFA_MCLOSE2: 13944 case NFA_MCLOSE3: 13945 case NFA_MCLOSE4: 13946 case NFA_MCLOSE5: 13947 case NFA_MCLOSE6: 13948 case NFA_MCLOSE7: 13949 case NFA_MCLOSE8: 13950 case NFA_MCLOSE9: 13951 case NFA_NCLOSE: 13952 return failure_chance(state->out, depth + 1); 13953 13954 case NFA_BACKREF1: 13955 case NFA_BACKREF2: 13956 case NFA_BACKREF3: 13957 case NFA_BACKREF4: 13958 case NFA_BACKREF5: 
13959 case NFA_BACKREF6: 13960 case NFA_BACKREF7: 13961 case NFA_BACKREF8: 13962 case NFA_BACKREF9: 13963 case NFA_ZREF1: 13964 case NFA_ZREF2: 13965 case NFA_ZREF3: 13966 case NFA_ZREF4: 13967 case NFA_ZREF5: 13968 case NFA_ZREF6: 13969 case NFA_ZREF7: 13970 case NFA_ZREF8: 13971 case NFA_ZREF9: 13972 // backreferences don't match in many places 13973 return 94; 13974 13975 case NFA_LNUM_GT: 13976 case NFA_LNUM_LT: 13977 case NFA_COL_GT: 13978 case NFA_COL_LT: 13979 case NFA_VCOL_GT: 13980 case NFA_VCOL_LT: 13981 case NFA_MARK_GT: 13982 case NFA_MARK_LT: 13983 case NFA_VISUAL: 13984 // before/after positions don't match very often 13985 return 85; 13986 13987 case NFA_LNUM: 13988 return 90; 13989 13990 case NFA_CURSOR: 13991 case NFA_COL: 13992 case NFA_VCOL: 13993 case NFA_MARK: 13994 // specific positions rarely match 13995 return 98; 13996 13997 case NFA_COMPOSING: 13998 return 95; 13999 14000 default: 14001 if (c > 0) { 14002 // character match fails often 14003 return 95; 14004 } 14005 } 14006 14007 // something else, includes character classes 14008 return 50; 14009 } 14010 14011 // Skip until the char "c" we know a match must start with. 14012 static int skip_to_start(int c, colnr_T *colp) 14013 { 14014 const uint8_t *const s = (uint8_t *)cstrchr((char *)rex.line + *colp, c); 14015 if (s == NULL) { 14016 return FAIL; 14017 } 14018 *colp = (int)(s - rex.line); 14019 return OK; 14020 } 14021 14022 // Check for a match with match_text. 14023 // Called after skip_to_start() has found regstart. 14024 // Returns zero for no match, 1 for a match. 
14025 static int find_match_text(colnr_T *startcol, int regstart, uint8_t *match_text) 14026 { 14027 colnr_T col = *startcol; 14028 const int regstart_len = utf_char2len(regstart); 14029 14030 while (true) { 14031 bool match = true; 14032 uint8_t *s1 = match_text; 14033 // skip regstart 14034 int regstart_len2 = regstart_len; 14035 if (regstart_len2 > 1 && utf_ptr2len((char *)rex.line + col) != regstart_len2) { 14036 // because of case-folding of the previously matched text, we may need 14037 // to skip fewer bytes than utf_char2len(regstart) 14038 regstart_len2 = utf_char2len(utf_fold(regstart)); 14039 } 14040 uint8_t *s2 = rex.line + col + regstart_len2; 14041 while (*s1) { 14042 int c1_len = utf_ptr2len((char *)s1); 14043 int c1 = utf_ptr2char((char *)s1); 14044 int c2_len = utf_ptr2len((char *)s2); 14045 int c2 = utf_ptr2char((char *)s2); 14046 if (c1 != c2 && (!rex.reg_ic || utf_fold(c1) != utf_fold(c2))) { 14047 match = false; 14048 break; 14049 } 14050 s1 += c1_len; 14051 s2 += c2_len; 14052 } 14053 if (match 14054 // check that no composing char follows 14055 && !utf_iscomposing_legacy(utf_ptr2char((char *)s2))) { 14056 cleanup_subexpr(); 14057 if (REG_MULTI) { 14058 rex.reg_startpos[0].lnum = rex.lnum; 14059 rex.reg_startpos[0].col = col; 14060 rex.reg_endpos[0].lnum = rex.lnum; 14061 rex.reg_endpos[0].col = (colnr_T)(s2 - rex.line); 14062 } else { 14063 rex.reg_startp[0] = rex.line + col; 14064 rex.reg_endp[0] = s2; 14065 } 14066 *startcol = col; 14067 return 1L; 14068 } 14069 14070 // Try finding regstart after the current match. 
14071 col += regstart_len; // skip regstart 14072 if (skip_to_start(regstart, &col) == FAIL) { 14073 break; 14074 } 14075 } 14076 14077 *startcol = col; 14078 return 0L; 14079 } 14080 14081 static int nfa_did_time_out(void) 14082 { 14083 if (nfa_time_limit != NULL && profile_passed_limit(*nfa_time_limit)) { 14084 if (nfa_timed_out != NULL) { 14085 *nfa_timed_out = true; 14086 } 14087 return true; 14088 } 14089 return false; 14090 } 14091 14092 /// Main matching routine. 14093 /// 14094 /// Run NFA to determine whether it matches rex.input. 14095 /// 14096 /// When "nfa_endp" is not NULL it is a required end-of-match position. 14097 /// 14098 /// Return true if there is a match, false if there is no match, 14099 /// NFA_TOO_EXPENSIVE if we end up with too many states. 14100 /// When there is a match "submatch" contains the positions. 14101 /// 14102 /// Note: Caller must ensure that: start != NULL. 14103 static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *submatch, regsubs_T *m) 14104 FUNC_ATTR_NONNULL_ARG(1, 2, 4) 14105 { 14106 int result = false; 14107 int flag = 0; 14108 bool go_to_nextline = false; 14109 nfa_thread_T *t; 14110 nfa_list_T list[2]; 14111 int listidx; 14112 nfa_list_T *thislist; 14113 nfa_list_T *nextlist; 14114 int *listids = NULL; 14115 int listids_len = 0; 14116 nfa_state_T *add_state; 14117 bool add_here; 14118 int add_count; 14119 int add_off = 0; 14120 int toplevel = start->c == NFA_MOPEN; 14121 regsubs_T *r; 14122 // Some patterns may take a long time to match, especially when using 14123 // recursive_regmatch(). Allow interrupting them with CTRL-C. 
14124 reg_breakcheck(); 14125 if (got_int) { 14126 return false; 14127 } 14128 if (nfa_did_time_out()) { 14129 return false; 14130 } 14131 14132 #ifdef NFA_REGEXP_DEBUG_LOG 14133 FILE *debug = fopen(NFA_REGEXP_DEBUG_LOG, "a"); 14134 14135 if (debug == NULL) { 14136 semsg("(NFA) COULD NOT OPEN %s!", NFA_REGEXP_DEBUG_LOG); 14137 return false; 14138 } 14139 #endif 14140 nfa_match = false; 14141 14142 // Allocate memory for the lists of nodes. 14143 size_t size = (size_t)(prog->nstate + 1) * sizeof(nfa_thread_T); 14144 list[0].t = xmalloc(size); 14145 list[0].len = prog->nstate + 1; 14146 list[1].t = xmalloc(size); 14147 list[1].len = prog->nstate + 1; 14148 14149 #ifdef REGEXP_DEBUG 14150 log_fd = fopen(NFA_REGEXP_RUN_LOG, "a"); 14151 if (log_fd == NULL) { 14152 emsg(_(e_log_open_failed)); 14153 log_fd = stderr; 14154 } 14155 fprintf(log_fd, "**********************************\n"); 14156 nfa_set_code(start->c); 14157 fprintf(log_fd, " RUNNING nfa_regmatch() starting with state %d, code %s\n", 14158 abs(start->id), code); 14159 fprintf(log_fd, "**********************************\n"); 14160 #endif 14161 14162 thislist = &list[0]; 14163 thislist->n = 0; 14164 thislist->has_pim = false; 14165 nextlist = &list[1]; 14166 nextlist->n = 0; 14167 nextlist->has_pim = false; 14168 #ifdef REGEXP_DEBUG 14169 fprintf(log_fd, "(---) STARTSTATE first\n"); 14170 #endif 14171 thislist->id = rex.nfa_listid + 1; 14172 14173 // Inline optimized code for addstate(thislist, start, m, 0) if we know 14174 // it's the first MOPEN. 
14175 if (toplevel) { 14176 if (REG_MULTI) { 14177 m->norm.list.multi[0].start_lnum = rex.lnum; 14178 m->norm.list.multi[0].start_col = (colnr_T)(rex.input - rex.line); 14179 m->norm.orig_start_col = m->norm.list.multi[0].start_col; 14180 } else { 14181 m->norm.list.line[0].start = rex.input; 14182 } 14183 m->norm.in_use = 1; 14184 r = addstate(thislist, start->out, m, NULL, 0); 14185 } else { 14186 r = addstate(thislist, start, m, NULL, 0); 14187 } 14188 if (r == NULL) { 14189 nfa_match = NFA_TOO_EXPENSIVE; 14190 goto theend; 14191 } 14192 14193 #define ADD_STATE_IF_MATCH(state) \ 14194 if (result) { \ 14195 add_state = (state)->out; \ 14196 add_off = clen; \ 14197 } 14198 14199 // Run for each character. 14200 while (true) { 14201 int curc = utf_ptr2char((char *)rex.input); 14202 int clen = utfc_ptr2len((char *)rex.input); 14203 if (curc == NUL) { 14204 clen = 0; 14205 go_to_nextline = false; 14206 } 14207 14208 // swap lists 14209 thislist = &list[flag]; 14210 nextlist = &list[flag ^= 1]; 14211 nextlist->n = 0; // clear nextlist 14212 nextlist->has_pim = false; 14213 rex.nfa_listid++; 14214 if (prog->re_engine == AUTOMATIC_ENGINE 14215 && (rex.nfa_listid >= NFA_MAX_STATES)) { 14216 // Too many states, retry with old engine. 14217 nfa_match = NFA_TOO_EXPENSIVE; 14218 goto theend; 14219 } 14220 14221 thislist->id = rex.nfa_listid; 14222 nextlist->id = rex.nfa_listid + 1; 14223 14224 #ifdef REGEXP_DEBUG 14225 fprintf(log_fd, "------------------------------------------\n"); 14226 fprintf(log_fd, ">>> Reginput is \"%s\"\n", rex.input); 14227 fprintf(log_fd, 14228 ">>> Advanced one character... 
Current char is %c (code %d) \n", 14229 curc, 14230 (int)curc); 14231 fprintf(log_fd, ">>> Thislist has %d states available: ", thislist->n); 14232 { 14233 int i; 14234 14235 for (i = 0; i < thislist->n; i++) { 14236 fprintf(log_fd, "%d ", abs(thislist->t[i].state->id)); 14237 } 14238 } 14239 fprintf(log_fd, "\n"); 14240 #endif 14241 14242 #ifdef NFA_REGEXP_DEBUG_LOG 14243 fprintf(debug, "\n-------------------\n"); 14244 #endif 14245 // If the state lists are empty we can stop. 14246 if (thislist->n == 0) { 14247 break; 14248 } 14249 14250 // compute nextlist 14251 for (listidx = 0; listidx < thislist->n; listidx++) { 14252 // If the list gets very long there probably is something wrong. 14253 // At least allow interrupting with CTRL-C. 14254 reg_breakcheck(); 14255 if (got_int) { 14256 break; 14257 } 14258 if (nfa_time_limit != NULL && ++nfa_time_count == 20) { 14259 nfa_time_count = 0; 14260 if (nfa_did_time_out()) { 14261 break; 14262 } 14263 } 14264 t = &thislist->t[listidx]; 14265 14266 #ifdef NFA_REGEXP_DEBUG_LOG 14267 nfa_set_code(t->state->c); 14268 fprintf(debug, "%s, ", code); 14269 #endif 14270 #ifdef REGEXP_DEBUG 14271 { 14272 int col; 14273 14274 if (t->subs.norm.in_use <= 0) { 14275 col = -1; 14276 } else if (REG_MULTI) { 14277 col = t->subs.norm.list.multi[0].start_col; 14278 } else { 14279 col = (int)(t->subs.norm.list.line[0].start - rex.line); 14280 } 14281 nfa_set_code(t->state->c); 14282 fprintf(log_fd, "(%d) char %d %s (start col %d)%s... \n", 14283 abs(t->state->id), (int)t->state->c, code, col, 14284 pim_info(&t->pim)); 14285 } 14286 #endif 14287 14288 // Handle the possible codes of the current state. 14289 // The most important is NFA_MATCH. 14290 add_state = NULL; 14291 add_here = false; 14292 add_count = 0; 14293 switch (t->state->c) { 14294 case NFA_MATCH: 14295 // If the match is not at the start of the line, ends before a 14296 // composing characters and rex.reg_icombine is not set, that 14297 // is not really a match. 
14298 if (!rex.reg_icombine 14299 && rex.input != rex.line 14300 && utf_iscomposing_legacy(curc)) { 14301 break; 14302 } 14303 nfa_match = true; 14304 copy_sub(&submatch->norm, &t->subs.norm); 14305 if (rex.nfa_has_zsubexpr) { 14306 copy_sub(&submatch->synt, &t->subs.synt); 14307 } 14308 #ifdef REGEXP_DEBUG 14309 log_subsexpr(&t->subs); 14310 #endif 14311 // Found the left-most longest match, do not look at any other 14312 // states at this position. When the list of states is going 14313 // to be empty quit without advancing, so that "rex.input" is 14314 // correct. 14315 if (nextlist->n == 0) { 14316 clen = 0; 14317 } 14318 goto nextchar; 14319 14320 case NFA_END_INVISIBLE: 14321 case NFA_END_INVISIBLE_NEG: 14322 case NFA_END_PATTERN: 14323 // This is only encountered after a NFA_START_INVISIBLE or 14324 // NFA_START_INVISIBLE_BEFORE node. 14325 // They surround a zero-width group, used with "\@=", "\&", 14326 // "\@!", "\@<=" and "\@<!". 14327 // If we got here, it means that the current "invisible" group 14328 // finished successfully, so return control to the parent 14329 // nfa_regmatch(). For a look-behind match only when it ends 14330 // in the position in "nfa_endp". 14331 // Submatches are stored in *m, and used in the parent call. 14332 #ifdef REGEXP_DEBUG 14333 if (nfa_endp != NULL) { 14334 if (REG_MULTI) { 14335 fprintf(log_fd, 14336 "Current lnum: %d, endp lnum: %d;" 14337 " current col: %d, endp col: %d\n", 14338 (int)rex.lnum, 14339 (int)nfa_endp->se_u.pos.lnum, 14340 (int)(rex.input - rex.line), 14341 nfa_endp->se_u.pos.col); 14342 } else { 14343 fprintf(log_fd, "Current col: %d, endp col: %d\n", 14344 (int)(rex.input - rex.line), 14345 (int)(nfa_endp->se_u.ptr - rex.input)); 14346 } 14347 } 14348 #endif 14349 // If "nfa_endp" is set it's only a match if it ends at 14350 // "nfa_endp" 14351 if (nfa_endp != NULL 14352 && (REG_MULTI 14353 ? 
(rex.lnum != nfa_endp->se_u.pos.lnum 14354 || (int)(rex.input - rex.line) != nfa_endp->se_u.pos.col) 14355 : rex.input != nfa_endp->se_u.ptr)) { 14356 break; 14357 } 14358 // do not set submatches for \@! 14359 if (t->state->c != NFA_END_INVISIBLE_NEG) { 14360 copy_sub(&m->norm, &t->subs.norm); 14361 if (rex.nfa_has_zsubexpr) { 14362 copy_sub(&m->synt, &t->subs.synt); 14363 } 14364 } 14365 #ifdef REGEXP_DEBUG 14366 fprintf(log_fd, "Match found:\n"); 14367 log_subsexpr(m); 14368 #endif 14369 nfa_match = true; 14370 // See comment above at "goto nextchar". 14371 if (nextlist->n == 0) { 14372 clen = 0; 14373 } 14374 goto nextchar; 14375 14376 case NFA_START_INVISIBLE: 14377 case NFA_START_INVISIBLE_FIRST: 14378 case NFA_START_INVISIBLE_NEG: 14379 case NFA_START_INVISIBLE_NEG_FIRST: 14380 case NFA_START_INVISIBLE_BEFORE: 14381 case NFA_START_INVISIBLE_BEFORE_FIRST: 14382 case NFA_START_INVISIBLE_BEFORE_NEG: 14383 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST: 14384 #ifdef REGEXP_DEBUG 14385 fprintf(log_fd, "Failure chance invisible: %d, what follows: %d\n", 14386 failure_chance(t->state->out, 0), 14387 failure_chance(t->state->out1->out, 0)); 14388 #endif 14389 // Do it directly if there already is a PIM or when 14390 // nfa_postprocess() detected it will work better. 14391 if (t->pim.result != NFA_PIM_UNUSED 14392 || t->state->c == NFA_START_INVISIBLE_FIRST 14393 || t->state->c == NFA_START_INVISIBLE_NEG_FIRST 14394 || t->state->c == NFA_START_INVISIBLE_BEFORE_FIRST 14395 || t->state->c == NFA_START_INVISIBLE_BEFORE_NEG_FIRST) { 14396 int in_use = m->norm.in_use; 14397 14398 // Copy submatch info for the recursive call, opposite 14399 // of what happens on success below. 14400 copy_sub_off(&m->norm, &t->subs.norm); 14401 if (rex.nfa_has_zsubexpr) { 14402 copy_sub_off(&m->synt, &t->subs.synt); 14403 } 14404 // First try matching the invisible match, then what 14405 // follows. 
14406 result = recursive_regmatch(t->state, NULL, prog, submatch, m, 14407 &listids, &listids_len); 14408 if (result == NFA_TOO_EXPENSIVE) { 14409 nfa_match = result; 14410 goto theend; 14411 } 14412 14413 // for \@! and \@<! it is a match when the result is 14414 // false 14415 if (result != (t->state->c == NFA_START_INVISIBLE_NEG 14416 || t->state->c == NFA_START_INVISIBLE_NEG_FIRST 14417 || t->state->c 14418 == NFA_START_INVISIBLE_BEFORE_NEG 14419 || t->state->c 14420 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)) { 14421 // Copy submatch info from the recursive call 14422 copy_sub_off(&t->subs.norm, &m->norm); 14423 if (rex.nfa_has_zsubexpr) { 14424 copy_sub_off(&t->subs.synt, &m->synt); 14425 } 14426 // If the pattern has \ze and it matched in the 14427 // sub pattern, use it. 14428 copy_ze_off(&t->subs.norm, &m->norm); 14429 14430 // t->state->out1 is the corresponding 14431 // END_INVISIBLE node; Add its out to the current 14432 // list (zero-width match). 14433 add_here = true; 14434 add_state = t->state->out1->out; 14435 } 14436 m->norm.in_use = in_use; 14437 } else { 14438 nfa_pim_T pim; 14439 14440 // First try matching what follows. Only if a match 14441 // is found verify the invisible match matches. Add a 14442 // nfa_pim_T to the following states, it contains info 14443 // about the invisible match. 14444 pim.state = t->state; 14445 pim.result = NFA_PIM_TODO; 14446 pim.subs.norm.in_use = 0; 14447 pim.subs.synt.in_use = 0; 14448 if (REG_MULTI) { 14449 pim.end.pos.col = (int)(rex.input - rex.line); 14450 pim.end.pos.lnum = rex.lnum; 14451 } else { 14452 pim.end.ptr = rex.input; 14453 } 14454 // t->state->out1 is the corresponding END_INVISIBLE 14455 // node; Add its out to the current list (zero-width 14456 // match). 
14457 if (addstate_here(thislist, t->state->out1->out, &t->subs, 14458 &pim, &listidx) == NULL) { 14459 nfa_match = NFA_TOO_EXPENSIVE; 14460 goto theend; 14461 } 14462 } 14463 break; 14464 14465 case NFA_START_PATTERN: { 14466 nfa_state_T *skip = NULL; 14467 #ifdef REGEXP_DEBUG 14468 int skip_lid = 0; 14469 #endif 14470 14471 // There is no point in trying to match the pattern if the 14472 // output state is not going to be added to the list. 14473 if (state_in_list(nextlist, t->state->out1->out, &t->subs)) { 14474 skip = t->state->out1->out; 14475 #ifdef REGEXP_DEBUG 14476 skip_lid = nextlist->id; 14477 #endif 14478 } else if (state_in_list(nextlist, 14479 t->state->out1->out->out, &t->subs)) { 14480 skip = t->state->out1->out->out; 14481 #ifdef REGEXP_DEBUG 14482 skip_lid = nextlist->id; 14483 #endif 14484 } else if (state_in_list(thislist, 14485 t->state->out1->out->out, &t->subs)) { 14486 skip = t->state->out1->out->out; 14487 #ifdef REGEXP_DEBUG 14488 skip_lid = thislist->id; 14489 #endif 14490 } 14491 if (skip != NULL) { 14492 #ifdef REGEXP_DEBUG 14493 nfa_set_code(skip->c); 14494 fprintf(log_fd, 14495 "> Not trying to match pattern, output state %d is already in list %d. char %d: %s\n", 14496 abs(skip->id), skip_lid, skip->c, code); 14497 #endif 14498 break; 14499 } 14500 // Copy submatch info to the recursive call, opposite of what 14501 // happens afterwards. 14502 copy_sub_off(&m->norm, &t->subs.norm); 14503 if (rex.nfa_has_zsubexpr) { 14504 copy_sub_off(&m->synt, &t->subs.synt); 14505 } 14506 14507 // First try matching the pattern. 
14508 result = recursive_regmatch(t->state, NULL, prog, submatch, m, 14509 &listids, &listids_len); 14510 if (result == NFA_TOO_EXPENSIVE) { 14511 nfa_match = result; 14512 goto theend; 14513 } 14514 if (result) { 14515 int bytelen; 14516 14517 #ifdef REGEXP_DEBUG 14518 fprintf(log_fd, "NFA_START_PATTERN matches:\n"); 14519 log_subsexpr(m); 14520 #endif 14521 // Copy submatch info from the recursive call 14522 copy_sub_off(&t->subs.norm, &m->norm); 14523 if (rex.nfa_has_zsubexpr) { 14524 copy_sub_off(&t->subs.synt, &m->synt); 14525 } 14526 // Now we need to skip over the matched text and then 14527 // continue with what follows. 14528 if (REG_MULTI) { 14529 // TODO(RE): multi-line match 14530 bytelen = m->norm.list.multi[0].end_col 14531 - (int)(rex.input - rex.line); 14532 } else { 14533 bytelen = (int)(m->norm.list.line[0].end - rex.input); 14534 } 14535 14536 #ifdef REGEXP_DEBUG 14537 fprintf(log_fd, "NFA_START_PATTERN length: %d\n", bytelen); 14538 #endif 14539 if (bytelen == 0) { 14540 // empty match, output of corresponding 14541 // NFA_END_PATTERN/NFA_SKIP to be used at current 14542 // position 14543 add_here = true; 14544 add_state = t->state->out1->out->out; 14545 } else if (bytelen <= clen) { 14546 // match current character, output of corresponding 14547 // NFA_END_PATTERN to be used at next position. 
14548 add_state = t->state->out1->out->out; 14549 add_off = clen; 14550 } else { 14551 // skip over the matched characters, set character 14552 // count in NFA_SKIP 14553 add_state = t->state->out1->out; 14554 add_off = bytelen; 14555 add_count = bytelen - clen; 14556 } 14557 } 14558 break; 14559 } 14560 14561 case NFA_BOL: 14562 if (rex.input == rex.line) { 14563 add_here = true; 14564 add_state = t->state->out; 14565 } 14566 break; 14567 14568 case NFA_EOL: 14569 if (curc == NUL) { 14570 add_here = true; 14571 add_state = t->state->out; 14572 } 14573 break; 14574 14575 case NFA_BOW: 14576 result = true; 14577 14578 if (curc == NUL) { 14579 result = false; 14580 } else { 14581 int this_class; 14582 14583 // Get class of current and previous char (if it exists). 14584 this_class = mb_get_class_tab((char *)rex.input, rex.reg_buf->b_chartab); 14585 if (this_class <= 1) { 14586 result = false; 14587 } else if (reg_prev_class() == this_class) { 14588 result = false; 14589 } 14590 } 14591 if (result) { 14592 add_here = true; 14593 add_state = t->state->out; 14594 } 14595 break; 14596 14597 case NFA_EOW: 14598 result = true; 14599 if (rex.input == rex.line) { 14600 result = false; 14601 } else { 14602 int this_class, prev_class; 14603 14604 // Get class of current and previous char (if it exists). 
14605 this_class = mb_get_class_tab((char *)rex.input, rex.reg_buf->b_chartab); 14606 prev_class = reg_prev_class(); 14607 if (this_class == prev_class 14608 || prev_class == 0 || prev_class == 1) { 14609 result = false; 14610 } 14611 } 14612 if (result) { 14613 add_here = true; 14614 add_state = t->state->out; 14615 } 14616 break; 14617 14618 case NFA_BOF: 14619 if (rex.lnum == 0 && rex.input == rex.line 14620 && (!REG_MULTI || rex.reg_firstlnum == 1)) { 14621 add_here = true; 14622 add_state = t->state->out; 14623 } 14624 break; 14625 14626 case NFA_EOF: 14627 if (rex.lnum == rex.reg_maxline && curc == NUL) { 14628 add_here = true; 14629 add_state = t->state->out; 14630 } 14631 break; 14632 14633 case NFA_COMPOSING: { 14634 int mc = curc; 14635 int len = 0; 14636 nfa_state_T *end; 14637 nfa_state_T *sta; 14638 int cchars[MAX_MCO]; 14639 int ccount = 0; 14640 int j; 14641 14642 sta = t->state->out; 14643 len = 0; 14644 if (utf_iscomposing_legacy(sta->c)) { 14645 // Only match composing character(s), ignore base 14646 // character. Used for ".{composing}" and "{composing}" 14647 // (no preceding character). 14648 len += utf_char2len(mc); 14649 } 14650 if (rex.reg_icombine && len == 0) { 14651 // If \Z was present, then ignore composing characters. 14652 // When ignoring the base character this always matches. 14653 if (sta->c != curc) { 14654 result = FAIL; 14655 } else { 14656 result = OK; 14657 } 14658 while (sta->c != NFA_END_COMPOSING) { 14659 sta = sta->out; 14660 } 14661 } else if (len > 0 || mc == sta->c) { 14662 // Check base character matches first, unless ignored. 14663 if (len == 0) { 14664 len += utf_char2len(mc); 14665 sta = sta->out; 14666 } 14667 14668 // We don't care about the order of composing characters. 14669 // Get them into cchars[] first. 
14670 while (len < clen) { 14671 mc = utf_ptr2char((char *)rex.input + len); 14672 cchars[ccount++] = mc; 14673 len += utf_char2len(mc); 14674 if (ccount == MAX_MCO) { 14675 break; 14676 } 14677 } 14678 14679 // Check that each composing char in the pattern matches a 14680 // composing char in the text. We do not check if all 14681 // composing chars are matched. 14682 result = OK; 14683 while (sta->c != NFA_END_COMPOSING) { 14684 for (j = 0; j < ccount; j++) { 14685 if (cchars[j] == sta->c) { 14686 break; 14687 } 14688 } 14689 if (j == ccount) { 14690 result = FAIL; 14691 break; 14692 } 14693 sta = sta->out; 14694 } 14695 } else { 14696 result = FAIL; 14697 } 14698 14699 end = t->state->out1; // NFA_END_COMPOSING 14700 ADD_STATE_IF_MATCH(end); 14701 break; 14702 } 14703 14704 case NFA_NEWL: 14705 if (curc == NUL && !rex.reg_line_lbr && REG_MULTI 14706 && rex.lnum <= rex.reg_maxline) { 14707 go_to_nextline = true; 14708 // Pass -1 for the offset, which means taking the position 14709 // at the start of the next line. 14710 add_state = t->state->out; 14711 add_off = -1; 14712 } else if (curc == '\n' && rex.reg_line_lbr) { 14713 // match \n as if it is an ordinary character 14714 add_state = t->state->out; 14715 add_off = 1; 14716 } 14717 break; 14718 14719 case NFA_START_COLL: 14720 case NFA_START_NEG_COLL: { 14721 // What follows is a list of characters, until NFA_END_COLL. 14722 // One of them must match or none of them must match. 14723 nfa_state_T *state; 14724 int result_if_matched; 14725 int c1, c2; 14726 14727 // Never match EOL. If it's part of the collection it is added 14728 // as a separate state with an OR. 
14729 if (curc == NUL) { 14730 break; 14731 } 14732 14733 state = t->state->out; 14734 result_if_matched = (t->state->c == NFA_START_COLL); 14735 while (true) { 14736 if (state->c == NFA_COMPOSING) { 14737 int mc = curc; 14738 int len = 0; 14739 nfa_state_T *end; 14740 nfa_state_T *sta; 14741 int cchars[MAX_MCO]; 14742 int ccount = 0; 14743 int j; 14744 14745 sta = t->state->out->out; 14746 if (utf_iscomposing_legacy(sta->c)) { 14747 // Only match composing character(s), ignore base 14748 // character. Used for ".{composing}" and "{composing}" 14749 // (no preceding character). 14750 len += utf_char2len(mc); 14751 } 14752 if (rex.reg_icombine && len == 0) { 14753 // If \Z was present, then ignore composing characters. 14754 // When ignoring the base character this always matches. 14755 if (sta->c != curc) { 14756 result = FAIL; 14757 } else { 14758 result = OK; 14759 } 14760 while (sta->c != NFA_END_COMPOSING) { 14761 sta = sta->out; 14762 } 14763 } 14764 // Check base character matches first, unless ignored. 14765 else if (len > 0 || mc == sta->c) { 14766 if (len == 0) { 14767 len += utf_char2len(mc); 14768 sta = sta->out; 14769 } 14770 14771 // We don't care about the order of composing characters. 14772 // Get them into cchars[] first. 14773 while (len < clen) { 14774 mc = utf_ptr2char((char *)rex.input + len); 14775 cchars[ccount++] = mc; 14776 len += utf_char2len(mc); 14777 if (ccount == MAX_MCO) { 14778 break; 14779 } 14780 } 14781 14782 // Check that each composing char in the pattern matches a 14783 // composing char in the text. We do not check if all 14784 // composing chars are matched. 
14785 result = OK; 14786 while (sta->c != NFA_END_COMPOSING) { 14787 for (j = 0; j < ccount; j++) { 14788 if (cchars[j] == sta->c) { 14789 break; 14790 } 14791 } 14792 if (j == ccount) { 14793 result = FAIL; 14794 break; 14795 } 14796 sta = sta->out; 14797 } 14798 } else { 14799 result = FAIL; 14800 } 14801 14802 if (t->state->out->out1 != NULL 14803 && t->state->out->out1->c == NFA_END_COMPOSING) { 14804 end = t->state->out->out1; 14805 ADD_STATE_IF_MATCH(end); 14806 } 14807 break; 14808 } 14809 if (state->c == NFA_END_COLL) { 14810 result = !result_if_matched; 14811 break; 14812 } 14813 if (state->c == NFA_RANGE_MIN) { 14814 c1 = state->val; 14815 state = state->out; // advance to NFA_RANGE_MAX 14816 c2 = state->val; 14817 #ifdef REGEXP_DEBUG 14818 fprintf(log_fd, "NFA_RANGE_MIN curc=%d c1=%d c2=%d\n", 14819 curc, c1, c2); 14820 #endif 14821 if (curc >= c1 && curc <= c2) { 14822 result = result_if_matched; 14823 break; 14824 } 14825 if (rex.reg_ic) { 14826 int curc_low = utf_fold(curc); 14827 int done = false; 14828 14829 for (; c1 <= c2; c1++) { 14830 if (utf_fold(c1) == curc_low) { 14831 result = result_if_matched; 14832 done = true; 14833 break; 14834 } 14835 } 14836 if (done) { 14837 break; 14838 } 14839 } 14840 } else if (state->c < 0 ? check_char_class(state->c, curc) 14841 : (curc == state->c 14842 || (rex.reg_ic 14843 && utf_fold(curc) == utf_fold(state->c)))) { 14844 result = result_if_matched; 14845 break; 14846 } 14847 state = state->out; 14848 } 14849 if (result) { 14850 // next state is in out of the NFA_END_COLL, out1 of 14851 // START points to the END state 14852 add_state = t->state->out1->out; 14853 add_off = clen; 14854 } 14855 break; 14856 } 14857 14858 case NFA_ANY: 14859 // Any char except NUL, (end of input) does not match. 14860 if (curc > 0) { 14861 add_state = t->state->out; 14862 add_off = clen; 14863 } 14864 break; 14865 14866 case NFA_ANY_COMPOSING: 14867 // On a composing character skip over it. Otherwise do 14868 // nothing. 
Always matches. 14869 if (utf_iscomposing_legacy(curc)) { 14870 add_off = clen; 14871 } else { 14872 add_here = true; 14873 add_off = 0; 14874 } 14875 add_state = t->state->out; 14876 break; 14877 14878 // Character classes like \a for alpha, \d for digit etc. 14879 case NFA_IDENT: // \i 14880 result = vim_isIDc(curc); 14881 ADD_STATE_IF_MATCH(t->state); 14882 break; 14883 14884 case NFA_SIDENT: // \I 14885 result = !ascii_isdigit(curc) && vim_isIDc(curc); 14886 ADD_STATE_IF_MATCH(t->state); 14887 break; 14888 14889 case NFA_KWORD: // \k 14890 result = vim_iswordp_buf((char *)rex.input, rex.reg_buf); 14891 ADD_STATE_IF_MATCH(t->state); 14892 break; 14893 14894 case NFA_SKWORD: // \K 14895 result = !ascii_isdigit(curc) 14896 && vim_iswordp_buf((char *)rex.input, rex.reg_buf); 14897 ADD_STATE_IF_MATCH(t->state); 14898 break; 14899 14900 case NFA_FNAME: // \f 14901 result = vim_isfilec(curc); 14902 ADD_STATE_IF_MATCH(t->state); 14903 break; 14904 14905 case NFA_SFNAME: // \F 14906 result = !ascii_isdigit(curc) && vim_isfilec(curc); 14907 ADD_STATE_IF_MATCH(t->state); 14908 break; 14909 14910 case NFA_PRINT: // \p 14911 result = vim_isprintc(utf_ptr2char((char *)rex.input)); 14912 ADD_STATE_IF_MATCH(t->state); 14913 break; 14914 14915 case NFA_SPRINT: // \P 14916 result = !ascii_isdigit(curc) && vim_isprintc(utf_ptr2char((char *)rex.input)); 14917 ADD_STATE_IF_MATCH(t->state); 14918 break; 14919 14920 case NFA_WHITE: // \s 14921 result = ascii_iswhite(curc); 14922 ADD_STATE_IF_MATCH(t->state); 14923 break; 14924 14925 case NFA_NWHITE: // \S 14926 result = curc != NUL && !ascii_iswhite(curc); 14927 ADD_STATE_IF_MATCH(t->state); 14928 break; 14929 14930 case NFA_DIGIT: // \d 14931 result = ri_digit(curc); 14932 ADD_STATE_IF_MATCH(t->state); 14933 break; 14934 14935 case NFA_NDIGIT: // \D 14936 result = curc != NUL && !ri_digit(curc); 14937 ADD_STATE_IF_MATCH(t->state); 14938 break; 14939 14940 case NFA_HEX: // \x 14941 result = ri_hex(curc); 14942 
ADD_STATE_IF_MATCH(t->state); 14943 break; 14944 14945 case NFA_NHEX: // \X 14946 result = curc != NUL && !ri_hex(curc); 14947 ADD_STATE_IF_MATCH(t->state); 14948 break; 14949 14950 case NFA_OCTAL: // \o 14951 result = ri_octal(curc); 14952 ADD_STATE_IF_MATCH(t->state); 14953 break; 14954 14955 case NFA_NOCTAL: // \O 14956 result = curc != NUL && !ri_octal(curc); 14957 ADD_STATE_IF_MATCH(t->state); 14958 break; 14959 14960 case NFA_WORD: // \w 14961 result = ri_word(curc); 14962 ADD_STATE_IF_MATCH(t->state); 14963 break; 14964 14965 case NFA_NWORD: // \W 14966 result = curc != NUL && !ri_word(curc); 14967 ADD_STATE_IF_MATCH(t->state); 14968 break; 14969 14970 case NFA_HEAD: // \h 14971 result = ri_head(curc); 14972 ADD_STATE_IF_MATCH(t->state); 14973 break; 14974 14975 case NFA_NHEAD: // \H 14976 result = curc != NUL && !ri_head(curc); 14977 ADD_STATE_IF_MATCH(t->state); 14978 break; 14979 14980 case NFA_ALPHA: // \a 14981 result = ri_alpha(curc); 14982 ADD_STATE_IF_MATCH(t->state); 14983 break; 14984 14985 case NFA_NALPHA: // \A 14986 result = curc != NUL && !ri_alpha(curc); 14987 ADD_STATE_IF_MATCH(t->state); 14988 break; 14989 14990 case NFA_LOWER: // \l 14991 result = ri_lower(curc); 14992 ADD_STATE_IF_MATCH(t->state); 14993 break; 14994 14995 case NFA_NLOWER: // \L 14996 result = curc != NUL && !ri_lower(curc); 14997 ADD_STATE_IF_MATCH(t->state); 14998 break; 14999 15000 case NFA_UPPER: // \u 15001 result = ri_upper(curc); 15002 ADD_STATE_IF_MATCH(t->state); 15003 break; 15004 15005 case NFA_NUPPER: // \U 15006 result = curc != NUL && !ri_upper(curc); 15007 ADD_STATE_IF_MATCH(t->state); 15008 break; 15009 15010 case NFA_LOWER_IC: // [a-z] 15011 result = ri_lower(curc) || (rex.reg_ic && ri_upper(curc)); 15012 ADD_STATE_IF_MATCH(t->state); 15013 break; 15014 15015 case NFA_NLOWER_IC: // [^a-z] 15016 result = curc != NUL 15017 && !(ri_lower(curc) || (rex.reg_ic && ri_upper(curc))); 15018 ADD_STATE_IF_MATCH(t->state); 15019 break; 15020 15021 case NFA_UPPER_IC: // 
[A-Z] 15022 result = ri_upper(curc) || (rex.reg_ic && ri_lower(curc)); 15023 ADD_STATE_IF_MATCH(t->state); 15024 break; 15025 15026 case NFA_NUPPER_IC: // [^A-Z] 15027 result = curc != NUL 15028 && !(ri_upper(curc) || (rex.reg_ic && ri_lower(curc))); 15029 ADD_STATE_IF_MATCH(t->state); 15030 break; 15031 15032 case NFA_BACKREF1: 15033 case NFA_BACKREF2: 15034 case NFA_BACKREF3: 15035 case NFA_BACKREF4: 15036 case NFA_BACKREF5: 15037 case NFA_BACKREF6: 15038 case NFA_BACKREF7: 15039 case NFA_BACKREF8: 15040 case NFA_BACKREF9: 15041 case NFA_ZREF1: 15042 case NFA_ZREF2: 15043 case NFA_ZREF3: 15044 case NFA_ZREF4: 15045 case NFA_ZREF5: 15046 case NFA_ZREF6: 15047 case NFA_ZREF7: 15048 case NFA_ZREF8: 15049 case NFA_ZREF9: 15050 // \1 .. \9 \z1 .. \z9 15051 { 15052 int subidx; 15053 int bytelen; 15054 15055 if (t->state->c >= NFA_BACKREF1 && t->state->c <= NFA_BACKREF9) { 15056 subidx = t->state->c - NFA_BACKREF1 + 1; 15057 result = match_backref(&t->subs.norm, subidx, &bytelen); 15058 } else { 15059 subidx = t->state->c - NFA_ZREF1 + 1; 15060 result = match_zref(subidx, &bytelen); 15061 } 15062 15063 if (result) { 15064 if (bytelen == 0) { 15065 // empty match always works, output of NFA_SKIP to be 15066 // used next 15067 add_here = true; 15068 add_state = t->state->out->out; 15069 } else if (bytelen <= clen) { 15070 // match current character, jump ahead to out of 15071 // NFA_SKIP 15072 add_state = t->state->out->out; 15073 add_off = clen; 15074 } else { 15075 // skip over the matched characters, set character 15076 // count in NFA_SKIP 15077 add_state = t->state->out; 15078 add_off = bytelen; 15079 add_count = bytelen - clen; 15080 } 15081 } 15082 break; 15083 } 15084 case NFA_SKIP: 15085 // character of previous matching \1 .. 
\9 or \@> 15086 if (t->count - clen <= 0) { 15087 // end of match, go to what follows 15088 add_state = t->state->out; 15089 add_off = clen; 15090 } else { 15091 // add state again with decremented count 15092 add_state = t->state; 15093 add_off = 0; 15094 add_count = t->count - clen; 15095 } 15096 break; 15097 15098 case NFA_LNUM: 15099 case NFA_LNUM_GT: 15100 case NFA_LNUM_LT: 15101 assert(t->state->val >= 0 15102 && !((rex.reg_firstlnum > 0 15103 && rex.lnum > LONG_MAX - rex.reg_firstlnum) 15104 || (rex.reg_firstlnum < 0 15105 && rex.lnum < LONG_MIN + rex.reg_firstlnum)) 15106 && rex.lnum + rex.reg_firstlnum >= 0); 15107 result = (REG_MULTI 15108 && nfa_re_num_cmp((uintmax_t)t->state->val, 15109 t->state->c - NFA_LNUM, 15110 (uintmax_t)rex.lnum + (uintmax_t)rex.reg_firstlnum)); 15111 if (result) { 15112 add_here = true; 15113 add_state = t->state->out; 15114 } 15115 break; 15116 15117 case NFA_COL: 15118 case NFA_COL_GT: 15119 case NFA_COL_LT: 15120 assert(t->state->val >= 0 15121 && rex.input >= rex.line 15122 && (uintmax_t)(rex.input - rex.line) <= UINTMAX_MAX - 1); 15123 result = nfa_re_num_cmp((uintmax_t)t->state->val, 15124 t->state->c - NFA_COL, 15125 (uintmax_t)(rex.input - rex.line + 1)); 15126 if (result) { 15127 add_here = true; 15128 add_state = t->state->out; 15129 } 15130 break; 15131 15132 case NFA_VCOL: 15133 case NFA_VCOL_GT: 15134 case NFA_VCOL_LT: { 15135 int op = t->state->c - NFA_VCOL; 15136 colnr_T col = (colnr_T)(rex.input - rex.line); 15137 15138 // Bail out quickly when there can't be a match, avoid the overhead of 15139 // win_linetabsize() on long lines. 15140 if (op != 1 && col > t->state->val * MB_MAXBYTES) { 15141 break; 15142 } 15143 15144 result = false; 15145 win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win; 15146 if (op == 1 && col - 1 > t->state->val && col > 100) { 15147 int64_t ts = (int64_t)wp->w_buffer->b_p_ts; 15148 15149 // Guess that a character won't use more columns than 'tabstop', 15150 // with a minimum of 4. 
15151 if (ts < 4) { 15152 ts = 4; 15153 } 15154 result = col > t->state->val * ts; 15155 } 15156 if (!result) { 15157 linenr_T lnum = REG_MULTI ? rex.reg_firstlnum + rex.lnum : 1; 15158 if (REG_MULTI && (lnum <= 0 || lnum > wp->w_buffer->b_ml.ml_line_count)) { 15159 lnum = 1; 15160 } 15161 int vcol = win_linetabsize(wp, lnum, (char *)rex.line, col); 15162 assert(t->state->val >= 0); 15163 result = nfa_re_num_cmp((uintmax_t)t->state->val, op, (uintmax_t)vcol + 1); 15164 } 15165 if (result) { 15166 add_here = true; 15167 add_state = t->state->out; 15168 } 15169 } 15170 break; 15171 15172 case NFA_MARK: 15173 case NFA_MARK_GT: 15174 case NFA_MARK_LT: { 15175 size_t col = REG_MULTI ? (size_t)(rex.input - rex.line) : 0; 15176 fmark_T *fm = mark_get(rex.reg_buf, curwin, NULL, kMarkBufLocal, t->state->val); 15177 15178 // Line may have been freed, get it again. 15179 if (REG_MULTI) { 15180 rex.line = (uint8_t *)reg_getline(rex.lnum); 15181 rex.input = rex.line + col; 15182 } 15183 15184 // Compare the mark position to the match position, if the mark 15185 // exists and mark is set in reg_buf. 15186 if (fm != NULL && fm->mark.lnum > 0) { 15187 pos_T *pos = &fm->mark; 15188 const colnr_T pos_col = pos->lnum == rex.lnum + rex.reg_firstlnum 15189 && pos->col == MAXCOL 15190 ? reg_getline_len(pos->lnum - rex.reg_firstlnum) 15191 : pos->col; 15192 15193 result = pos->lnum == rex.lnum + rex.reg_firstlnum 15194 ? (pos_col == (colnr_T)(rex.input - rex.line) 15195 ? t->state->c == NFA_MARK 15196 : (pos_col < (colnr_T)(rex.input - rex.line) 15197 ? t->state->c == NFA_MARK_GT 15198 : t->state->c == NFA_MARK_LT)) 15199 : (pos->lnum < rex.lnum + rex.reg_firstlnum 15200 ? 
t->state->c == NFA_MARK_GT 15201 : t->state->c == NFA_MARK_LT); 15202 if (result) { 15203 add_here = true; 15204 add_state = t->state->out; 15205 } 15206 } 15207 break; 15208 } 15209 15210 case NFA_CURSOR: 15211 result = rex.reg_win != NULL 15212 && (rex.lnum + rex.reg_firstlnum == rex.reg_win->w_cursor.lnum) 15213 && ((colnr_T)(rex.input - rex.line) == rex.reg_win->w_cursor.col); 15214 if (result) { 15215 add_here = true; 15216 add_state = t->state->out; 15217 } 15218 break; 15219 15220 case NFA_VISUAL: 15221 result = reg_match_visual(); 15222 if (result) { 15223 add_here = true; 15224 add_state = t->state->out; 15225 } 15226 break; 15227 15228 case NFA_MOPEN1: 15229 case NFA_MOPEN2: 15230 case NFA_MOPEN3: 15231 case NFA_MOPEN4: 15232 case NFA_MOPEN5: 15233 case NFA_MOPEN6: 15234 case NFA_MOPEN7: 15235 case NFA_MOPEN8: 15236 case NFA_MOPEN9: 15237 case NFA_ZOPEN: 15238 case NFA_ZOPEN1: 15239 case NFA_ZOPEN2: 15240 case NFA_ZOPEN3: 15241 case NFA_ZOPEN4: 15242 case NFA_ZOPEN5: 15243 case NFA_ZOPEN6: 15244 case NFA_ZOPEN7: 15245 case NFA_ZOPEN8: 15246 case NFA_ZOPEN9: 15247 case NFA_NOPEN: 15248 case NFA_ZSTART: 15249 // These states are only added to be able to bail out when 15250 // they are added again, nothing is to be done. 15251 break; 15252 15253 default: // regular character 15254 { 15255 int c = t->state->c; 15256 15257 #ifdef REGEXP_DEBUG 15258 if (c < 0) { 15259 siemsg("INTERNAL: Negative state char: %" PRId64, (int64_t)c); 15260 } 15261 #endif 15262 result = (c == curc); 15263 15264 if (!result && rex.reg_ic) { 15265 result = utf_fold(c) == utf_fold(curc); 15266 } 15267 15268 // If rex.reg_icombine is not set only skip over the character 15269 // itself. When it is set skip over composing characters. 
15270 if (result && !rex.reg_icombine) { 15271 clen = utf_ptr2len((char *)rex.input); 15272 } 15273 15274 ADD_STATE_IF_MATCH(t->state); 15275 break; 15276 } 15277 } // switch (t->state->c) 15278 15279 if (add_state != NULL) { 15280 nfa_pim_T *pim; 15281 nfa_pim_T pim_copy; 15282 15283 if (t->pim.result == NFA_PIM_UNUSED) { 15284 pim = NULL; 15285 } else { 15286 pim = &t->pim; 15287 } 15288 15289 // Handle the postponed invisible match if the match might end 15290 // without advancing and before the end of the line. 15291 if (pim != NULL && (clen == 0 || match_follows(add_state, 0))) { 15292 if (pim->result == NFA_PIM_TODO) { 15293 #ifdef REGEXP_DEBUG 15294 fprintf(log_fd, "\n"); 15295 fprintf(log_fd, "==================================\n"); 15296 fprintf(log_fd, "Postponed recursive nfa_regmatch()\n"); 15297 fprintf(log_fd, "\n"); 15298 #endif 15299 result = recursive_regmatch(pim->state, pim, prog, submatch, m, 15300 &listids, &listids_len); 15301 pim->result = result ? NFA_PIM_MATCH : NFA_PIM_NOMATCH; 15302 // for \@! and \@<! it is a match when the result is 15303 // false 15304 if (result != (pim->state->c == NFA_START_INVISIBLE_NEG 15305 || pim->state->c == NFA_START_INVISIBLE_NEG_FIRST 15306 || pim->state->c 15307 == NFA_START_INVISIBLE_BEFORE_NEG 15308 || pim->state->c 15309 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)) { 15310 // Copy submatch info from the recursive call 15311 copy_sub_off(&pim->subs.norm, &m->norm); 15312 if (rex.nfa_has_zsubexpr) { 15313 copy_sub_off(&pim->subs.synt, &m->synt); 15314 } 15315 } 15316 } else { 15317 result = (pim->result == NFA_PIM_MATCH); 15318 #ifdef REGEXP_DEBUG 15319 fprintf(log_fd, "\n"); 15320 fprintf(log_fd, 15321 "Using previous recursive nfa_regmatch() result, result == %d\n", 15322 pim->result); 15323 fprintf(log_fd, "MATCH = %s\n", result ? "OK" : "false"); 15324 fprintf(log_fd, "\n"); 15325 #endif 15326 } 15327 15328 // for \@! and \@<! 
it is a match when result is false 15329 if (result != (pim->state->c == NFA_START_INVISIBLE_NEG 15330 || pim->state->c == NFA_START_INVISIBLE_NEG_FIRST 15331 || pim->state->c 15332 == NFA_START_INVISIBLE_BEFORE_NEG 15333 || pim->state->c 15334 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)) { 15335 // Copy submatch info from the recursive call 15336 copy_sub_off(&t->subs.norm, &pim->subs.norm); 15337 if (rex.nfa_has_zsubexpr) { 15338 copy_sub_off(&t->subs.synt, &pim->subs.synt); 15339 } 15340 } else { 15341 // look-behind match failed, don't add the state 15342 continue; 15343 } 15344 15345 // Postponed invisible match was handled, don't add it to 15346 // following states. 15347 pim = NULL; 15348 } 15349 15350 // If "pim" points into l->t it will become invalid when 15351 // adding the state causes the list to be reallocated. Make a 15352 // local copy to avoid that. 15353 if (pim == &t->pim) { 15354 copy_pim(&pim_copy, pim); 15355 pim = &pim_copy; 15356 } 15357 15358 if (add_here) { 15359 r = addstate_here(thislist, add_state, &t->subs, pim, &listidx); 15360 } else { 15361 r = addstate(nextlist, add_state, &t->subs, pim, add_off); 15362 if (add_count > 0) { 15363 nextlist->t[nextlist->n - 1].count = add_count; 15364 } 15365 } 15366 if (r == NULL) { 15367 nfa_match = NFA_TOO_EXPENSIVE; 15368 goto theend; 15369 } 15370 } 15371 } // for (thislist = thislist; thislist->state; thislist++) 15372 15373 // Look for the start of a match in the current position by adding the 15374 // start state to the list of states. 15375 // The first found match is the leftmost one, thus the order of states 15376 // matters! 15377 // Do not add the start state in recursive calls of nfa_regmatch(), 15378 // because recursive calls should only start in the first position. 15379 // Unless "nfa_endp" is not NULL, then we match the end position. 15380 // Also don't start a match past the first line. 
15381 if (!nfa_match 15382 && ((toplevel 15383 && rex.lnum == 0 15384 && clen != 0 15385 && (rex.reg_maxcol == 0 15386 || (colnr_T)(rex.input - rex.line) < rex.reg_maxcol)) 15387 || (nfa_endp != NULL 15388 && (REG_MULTI 15389 ? (rex.lnum < nfa_endp->se_u.pos.lnum 15390 || (rex.lnum == nfa_endp->se_u.pos.lnum 15391 && (int)(rex.input - rex.line) 15392 < nfa_endp->se_u.pos.col)) 15393 : rex.input < nfa_endp->se_u.ptr)))) { 15394 #ifdef REGEXP_DEBUG 15395 fprintf(log_fd, "(---) STARTSTATE\n"); 15396 #endif 15397 // Inline optimized code for addstate() if we know the state is 15398 // the first MOPEN. 15399 if (toplevel) { 15400 int add = true; 15401 15402 if (prog->regstart != NUL && clen != 0) { 15403 if (nextlist->n == 0) { 15404 colnr_T col = (colnr_T)(rex.input - rex.line) + clen; 15405 15406 // Nextlist is empty, we can skip ahead to the 15407 // character that must appear at the start. 15408 if (skip_to_start(prog->regstart, &col) == FAIL) { 15409 break; 15410 } 15411 #ifdef REGEXP_DEBUG 15412 fprintf(log_fd, " Skipping ahead %d bytes to regstart\n", 15413 col - ((colnr_T)(rex.input - rex.line) + clen)); 15414 #endif 15415 rex.input = rex.line + col - clen; 15416 } else { 15417 // Checking if the required start character matches is 15418 // cheaper than adding a state that won't match. 
15419 const int c = utf_ptr2char((char *)rex.input + clen); 15420 if (c != prog->regstart 15421 && (!rex.reg_ic 15422 || utf_fold(c) != utf_fold(prog->regstart))) { 15423 #ifdef REGEXP_DEBUG 15424 fprintf(log_fd, 15425 " Skipping start state, regstart does not match\n"); 15426 #endif 15427 add = false; 15428 } 15429 } 15430 } 15431 15432 if (add) { 15433 if (REG_MULTI) { 15434 m->norm.list.multi[0].start_col = 15435 (colnr_T)(rex.input - rex.line) + clen; 15436 m->norm.orig_start_col = 15437 m->norm.list.multi[0].start_col; 15438 } else { 15439 m->norm.list.line[0].start = rex.input + clen; 15440 } 15441 if (addstate(nextlist, start->out, m, NULL, clen) == NULL) { 15442 nfa_match = NFA_TOO_EXPENSIVE; 15443 goto theend; 15444 } 15445 } 15446 } else { 15447 if (addstate(nextlist, start, m, NULL, clen) == NULL) { 15448 nfa_match = NFA_TOO_EXPENSIVE; 15449 goto theend; 15450 } 15451 } 15452 } 15453 15454 #ifdef REGEXP_DEBUG 15455 fprintf(log_fd, ">>> Thislist had %d states available: ", thislist->n); 15456 { 15457 int i; 15458 15459 for (i = 0; i < thislist->n; i++) { 15460 fprintf(log_fd, "%d ", abs(thislist->t[i].state->id)); 15461 } 15462 } 15463 fprintf(log_fd, "\n"); 15464 #endif 15465 15466 nextchar: 15467 // Advance to the next character, or advance to the next line, or 15468 // finish. 15469 if (clen != 0) { 15470 rex.input += clen; 15471 } else if (go_to_nextline || (nfa_endp != NULL && REG_MULTI 15472 && rex.lnum < nfa_endp->se_u.pos.lnum)) { 15473 reg_nextline(); 15474 } else { 15475 break; 15476 } 15477 15478 // Allow interrupting with CTRL-C. 15479 reg_breakcheck(); 15480 if (got_int) { 15481 break; 15482 } 15483 // Check for timeout once every twenty times to avoid overhead. 
// Only consult the clock every twentieth time this point is reached, to keep
// the overhead of timeout checking low.
    if (nfa_time_limit != NULL && ++nfa_time_count == 20) {
      nfa_time_count = 0;
      if (nfa_did_time_out()) {
        break;
      }
    }
  }

#ifdef REGEXP_DEBUG
  // Never close stderr; only close a log file that was opened separately.
  if (log_fd != stderr) {
    fclose(log_fd);
  }
  log_fd = NULL;
#endif

theend:
  // Free memory
  xfree(list[0].t);
  xfree(list[1].t);
  xfree(listids);
#undef ADD_STATE_IF_MATCH
#ifdef NFA_REGEXP_DEBUG_LOG
  fclose(debug);
#endif

  return nfa_match;
}

/// Try to match "prog" at rex.line["col"].
///
/// On success the submatch positions found by nfa_regmatch() are copied into
/// rex.reg_startpos/rex.reg_endpos (multi-line matching) or
/// rex.reg_startp/rex.reg_endp (single string), and any \z(...\) matches are
/// exported through "re_extmatch_out".
///
/// @param prog       compiled NFA program
/// @param col        column in rex.line where the match attempt starts
/// @param tm         timeout limit or NULL
/// @param timed_out  flag set on timeout or NULL
///
/// @return 0 when there is no match, NFA_TOO_EXPENSIVE when nfa_regmatch()
///         reported that, otherwise 1 + rex.lnum (the number of lines
///         contained in the match).
static int nfa_regtry(nfa_regprog_T *prog, colnr_T col, proftime_T *tm, int *timed_out)
{
  int i;
  regsubs_T subs, m;
  nfa_state_T *start = prog->start;
#ifdef REGEXP_DEBUG
  FILE *f;
#endif

  rex.input = rex.line + col;
  // Publish the timeout parameters through file-level variables.
  nfa_time_limit = tm;
  nfa_timed_out = timed_out;
  nfa_time_count = 0;

#ifdef REGEXP_DEBUG
  f = fopen(NFA_REGEXP_RUN_LOG, "a");
  if (f != NULL) {
    fprintf(f,
            "\n\n\t=======================================================\n");
# ifdef REGEXP_DEBUG
    fprintf(f, "\tRegexp is \"%s\"\n", nfa_regengine.expr);
# endif
    fprintf(f, "\tInput text is \"%s\" \n", rex.input);
    fprintf(f, "\t=======================================================\n\n");
    nfa_print_state(f, start);
    fprintf(f, "\n\n");
    fclose(f);
  } else {
    emsg("Could not open temporary log file for writing");
  }
#endif

  clear_sub(&subs.norm);
  clear_sub(&m.norm);
  clear_sub(&subs.synt);
  clear_sub(&m.synt);

  int result = nfa_regmatch(prog, start, &subs, &m);
  if (!result) {
    return 0;
  } else if (result == NFA_TOO_EXPENSIVE) {
    // Pass the "too expensive" verdict on to the caller unchanged.
    return result;
  }

  cleanup_subexpr();
  if (REG_MULTI) {
    for (i = 0; i < subs.norm.in_use; i++) {
      rex.reg_startpos[i].lnum = subs.norm.list.multi[i].start_lnum;
      rex.reg_startpos[i].col = subs.norm.list.multi[i].start_col;

      rex.reg_endpos[i].lnum = subs.norm.list.multi[i].end_lnum;
      rex.reg_endpos[i].col = subs.norm.list.multi[i].end_col;
    }
    if (rex.reg_mmatch != NULL) {
      rex.reg_mmatch->rmm_matchcol = subs.norm.orig_start_col;
    }

    // A negative line number means the start was not set: fall back to the
    // position where this attempt began.
    if (rex.reg_startpos[0].lnum < 0) {
      rex.reg_startpos[0].lnum = 0;
      rex.reg_startpos[0].col = col;
    }
    if (rex.reg_endpos[0].lnum < 0) {
      // pattern has a \ze but it didn't match, use current end
      rex.reg_endpos[0].lnum = rex.lnum;
      rex.reg_endpos[0].col = (int)(rex.input - rex.line);
    } else {
      // Use line number of "\ze".
      rex.lnum = rex.reg_endpos[0].lnum;
    }
  } else {
    for (i = 0; i < subs.norm.in_use; i++) {
      rex.reg_startp[i] = subs.norm.list.line[i].start;
      rex.reg_endp[i] = subs.norm.list.line[i].end;
    }

    // Unset start/end of the whole match: use where the attempt began and
    // the current input position.
    if (rex.reg_startp[0] == NULL) {
      rex.reg_startp[0] = rex.line + col;
    }
    if (rex.reg_endp[0] == NULL) {
      rex.reg_endp[0] = rex.input;
    }
  }

  // Package any found \z(...\) matches for export.  Default is none.
  unref_extmatch(re_extmatch_out);
  re_extmatch_out = NULL;

  if (prog->reghasz == REX_SET) {
    cleanup_zsubexpr();
    re_extmatch_out = make_extmatch();
    // Loop over \z1, \z2, etc.  There is no \z0.
    for (i = 1; i < subs.synt.in_use; i++) {
      if (REG_MULTI) {
        struct multipos *mpos = &subs.synt.list.multi[i];

        // Only accept single line matches that are valid.
        if (mpos->start_lnum >= 0
            && mpos->start_lnum == mpos->end_lnum
            && mpos->end_col >= mpos->start_col) {
          re_extmatch_out->matches[i] =
            (uint8_t *)xstrnsave(reg_getline(mpos->start_lnum) + mpos->start_col,
                                 (size_t)(mpos->end_col - mpos->start_col));
        }
      } else {
        struct linepos *lpos = &subs.synt.list.line[i];

        if (lpos->start != NULL && lpos->end != NULL) {
          re_extmatch_out->matches[i] =
            (uint8_t *)xstrnsave((char *)lpos->start, (size_t)(lpos->end - lpos->start));
        }
      }
    }
  }

  return 1 + rex.lnum;
}

/// Match a regexp against a string ("line" points to the string) or multiple
/// lines (if "line" is NULL, use reg_getline()).
///
/// The program is taken from rex.reg_mmatch (multi-line) or rex.reg_match
/// (single string), so one of those must have been set up by the caller.
///
/// @param line       String in which to search or NULL
/// @param startcol   Column to start looking for match
/// @param tm         Timeout limit or NULL
/// @param timed_out  Flag set on timeout or NULL
///
/// @return <= 0 if there is no match and number of lines contained in the
///         match otherwise.
static int nfa_regexec_both(uint8_t *line, colnr_T startcol, proftime_T *tm, int *timed_out)
{
  nfa_regprog_T *prog;
  int retval = 0;
  colnr_T col = startcol;

  if (REG_MULTI) {
    prog = (nfa_regprog_T *)rex.reg_mmatch->regprog;
    line = (uint8_t *)reg_getline(0);  // relative to the cursor
    rex.reg_startpos = rex.reg_mmatch->startpos;
    rex.reg_endpos = rex.reg_mmatch->endpos;
  } else {
    prog = (nfa_regprog_T *)rex.reg_match->regprog;
    rex.reg_startp = (uint8_t **)rex.reg_match->startp;
    rex.reg_endp = (uint8_t **)rex.reg_match->endp;
  }

  // Be paranoid...
  if (prog == NULL || line == NULL) {
    iemsg(_(e_null));
    goto theend;
  }

  // If pattern contains "\c" or "\C": overrule value of rex.reg_ic
  if (prog->regflags & RF_ICASE) {
    rex.reg_ic = true;
  } else if (prog->regflags & RF_NOICASE) {
    rex.reg_ic = false;
  }

  // If pattern contains "\Z" overrule value of rex.reg_icombine
  if (prog->regflags & RF_ICOMBINE) {
    rex.reg_icombine = true;
  }

  rex.line = line;
  rex.lnum = 0;  // relative to line

  // Copy the per-program properties into the execution state.
  rex.nfa_has_zend = prog->has_zend;
  rex.nfa_has_backref = prog->has_backref;
  rex.nfa_nsubexpr = prog->nsubexp;
  rex.nfa_listid = 1;
  rex.nfa_alt_listid = 2;
#ifdef REGEXP_DEBUG
  nfa_regengine.expr = prog->pattern;
#endif

  // An anchored pattern (reganch set) is not tried at a column other than 0.
  if (prog->reganch && col > 0) {
    return 0L;
  }

  rex.need_clear_subexpr = true;
  // Clear the external match subpointers if necessary.
  if (prog->reghasz == REX_SET) {
    rex.nfa_has_zsubexpr = true;
    rex.need_clear_zsubexpr = true;
  } else {
    rex.nfa_has_zsubexpr = false;
    rex.need_clear_zsubexpr = false;
  }

  if (prog->regstart != NUL) {
    // Skip ahead until a character we know the match must start with.
    // When there is none there is no match.
    if (skip_to_start(prog->regstart, &col) == FAIL) {
      return 0L;
    }

    // If match_text is set it contains the full text that must match.
    // Nothing else to try.  Doesn't handle combining chars well.
    if (prog->match_text != NULL && *prog->match_text != NUL && !rex.reg_icombine) {
      retval = find_match_text(&col, prog->regstart, prog->match_text);
      if (REG_MULTI) {
        rex.reg_mmatch->rmm_matchcol = col;
      } else {
        rex.reg_match->rm_matchcol = col;
      }
      return retval;
    }
  }

  // If the start column is past the maximum column: no need to try.
  if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol) {
    goto theend;
  }

  // Set the "nstate" used by nfa_regcomp() to zero to trigger an error when
  // it's accidentally used during execution.
  nstate = 0;
  // Reset the per-state bookkeeping before running the program.
  for (int i = 0; i < prog->nstate; i++) {
    prog->state[i].id = i;
    prog->state[i].lastlist[0] = 0;
    prog->state[i].lastlist[1] = 0;
  }

  retval = nfa_regtry(prog, col, tm, timed_out);

#ifdef REGEXP_DEBUG
  nfa_regengine.expr = NULL;
#endif

theend:
  if (retval > 0) {
    // Make sure the end is never before the start.  Can happen when \zs and
    // \ze are used.
    if (REG_MULTI) {
      const lpos_T *const start = &rex.reg_mmatch->startpos[0];
      const lpos_T *const end = &rex.reg_mmatch->endpos[0];

      if (end->lnum < start->lnum
          || (end->lnum == start->lnum && end->col < start->col)) {
        rex.reg_mmatch->endpos[0] = rex.reg_mmatch->startpos[0];
      }
    } else {
      if (rex.reg_match->endp[0] < rex.reg_match->startp[0]) {
        rex.reg_match->endp[0] = rex.reg_match->startp[0];
      }

      // startpos[0] may be set by "\zs", also return the column where
      // the whole pattern matched.
      rex.reg_match->rm_matchcol = col;
    }
  }

  return retval;
}

/// Compile a regular expression into internal code for the NFA matcher.
///
/// @param expr      the pattern to compile; NULL yields NULL
/// @param re_flags  flags for compilation, stored in "nfa_re_flags" and passed
///                  to nfa_regcomp_start()
///
/// @return the program in allocated space, or NULL for an error.
static regprog_T *nfa_regcomp(uint8_t *expr, int re_flags)
{
  nfa_regprog_T *prog = NULL;
  int *postfix;

  if (expr == NULL) {
    return NULL;
  }

#ifdef REGEXP_DEBUG
  nfa_regengine.expr = expr;
#endif
  nfa_re_flags = re_flags;

  init_class_tab();

  nfa_regcomp_start(expr, re_flags);

  // Build postfix form of the regexp. Needed to build the NFA
  // (and count its size).
  postfix = re2post();
  if (postfix == NULL) {
    goto fail;  // Cascaded (syntax?) error
  }

  // In order to build the NFA, we parse the input regexp twice:
  // 1. first pass to count size (so we can allocate space)
  // 2. second to emit code
#ifdef REGEXP_DEBUG
  {
    FILE *f = fopen(NFA_REGEXP_RUN_LOG, "a");

    if (f != NULL) {
      fprintf(f,
              "\n*****************************\n\n\n\n\t"
              "Compiling regexp \"%s\"... hold on !\n",
              expr);
      fclose(f);
    }
  }
#endif

  // PASS 1
  // Count number of NFA states in "nstate". Do not build the NFA.
  post2nfa(postfix, post_ptr, true);

  // allocate the regprog with space for the compiled regexp
  size_t prog_size = offsetof(nfa_regprog_T, state) + sizeof(nfa_state_T) * (size_t)nstate;
  prog = xmalloc(prog_size);
  state_ptr = prog->state;
  prog->re_in_use = false;

  // PASS 2
  // Build the NFA
  prog->start = post2nfa(postfix, post_ptr, false);
  if (prog->start == NULL) {
    goto fail;
  }
  prog->regflags = regflags;
  prog->engine = &nfa_regengine;
  prog->nstate = nstate;
  prog->has_zend = rex.nfa_has_zend;
  prog->has_backref = rex.nfa_has_backref;
  prog->nsubexp = regnpar;

  nfa_postprocess(prog);

  // Derive the values nfa_regexec_both() uses to skip work before matching.
  prog->reganch = nfa_get_reganch(prog->start, 0);
  prog->regstart = nfa_get_regstart(prog->start, 0);
  prog->match_text = nfa_get_match_text(prog->start);

#ifdef REGEXP_DEBUG
  nfa_postfix_dump(expr, OK);
  nfa_dump(prog);
#endif
  // Remember whether this pattern has any \z specials in it.
  prog->reghasz = re_has_z;
  // Keep a private copy of the pattern; nfa_regfree() releases it.
  prog->pattern = xstrdup((char *)expr);
#ifdef REGEXP_DEBUG
  nfa_regengine.expr = NULL;
#endif

out:
  // Shared exit for success and failure: release the postfix buffer and
  // reset the compile-time pointers.
  xfree(post_start);
  post_start = post_ptr = post_end = NULL;
  state_ptr = NULL;
  return (regprog_T *)prog;

fail:
  // Free the partially built program (if any) so that NULL is returned.
  XFREE_CLEAR(prog);
#ifdef REGEXP_DEBUG
  nfa_postfix_dump(expr, FAIL);
  nfa_regengine.expr = NULL;
#endif
  goto out;
}

/// Free a compiled regexp program, returned by nfa_regcomp().
///
/// Releases the "match_text" and "pattern" members as well as the program
/// itself.  Does nothing when "prog" is NULL.
static void nfa_regfree(regprog_T *prog)
{
  if (prog == NULL) {
    return;
  }

  xfree(((nfa_regprog_T *)prog)->match_text);
  xfree(((nfa_regprog_T *)prog)->pattern);
  xfree(prog);
}

/// Match a regexp against a string.
/// "rmp->regprog" is a compiled regexp as returned by nfa_regcomp().
/// Uses curbuf for line count and 'iskeyword'.
/// If "line_lbr" is true, consider a "\n" in "line" to be a line break.
///
/// @param rmp       match descriptor; supplies the program and "rm_ic"
/// @param line      string to match against
/// @param col       column to start looking for match
/// @param line_lbr  stored in rex.reg_line_lbr, see above
///
/// @return <= 0 for failure, number of lines contained in the match otherwise.
static int nfa_regexec_nl(regmatch_T *rmp, uint8_t *line, colnr_T col, bool line_lbr)
{
  // Set up the execution state for matching a single string: no multi-line
  // match descriptor, no window, no column limit.
  rex.reg_match = rmp;
  rex.reg_mmatch = NULL;
  rex.reg_maxline = 0;
  rex.reg_line_lbr = line_lbr;
  rex.reg_buf = curbuf;
  rex.reg_win = NULL;
  rex.reg_ic = rmp->rm_ic;
  rex.reg_icombine = false;
  rex.reg_nobreak = rmp->regprog->re_flags & RE_NOBREAK;
  rex.reg_maxcol = 0;
  return nfa_regexec_both(line, col, NULL, NULL);
}

/// Matches a regexp against multiple lines.
/// "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
/// Uses curbuf for line count and 'iskeyword'.
15910 /// 15911 /// @param win Window in which to search or NULL 15912 /// @param buf Buffer in which to search 15913 /// @param lnum Number of line to start looking for match 15914 /// @param col Column to start looking for match 15915 /// @param tm Timeout limit or NULL 15916 /// @param timed_out Flag set on timeout or NULL 15917 /// 15918 /// @return <= 0 if there is no match and number of lines contained in the match 15919 /// otherwise. 15920 /// 15921 /// @note The body is the same as bt_regexec() except for nfa_regexec_both() 15922 /// 15923 /// @warning 15924 /// Match may actually be in another line. e.g.: 15925 /// when r.e. is \nc, cursor is at 'a' and the text buffer looks like 15926 /// 15927 /// @par 15928 /// 15929 /// +-------------------------+ 15930 /// |a | 15931 /// |b | 15932 /// |c | 15933 /// | | 15934 /// +-------------------------+ 15935 /// 15936 /// @par 15937 /// then nfa_regexec_multi() returns 3. while the original vim_regexec_multi() 15938 /// returns 0 and a second call at line 2 will return 2. 15939 /// 15940 /// @par 15941 /// FIXME if this behavior is not compatible. 15942 static int nfa_regexec_multi(regmmatch_T *rmp, win_T *win, buf_T *buf, linenr_T lnum, colnr_T col, 15943 proftime_T *tm, int *timed_out) 15944 { 15945 init_regexec_multi(rmp, win, buf, lnum); 15946 return nfa_regexec_both(NULL, col, tm, timed_out); 15947 } 15948 // }}}1 15949 15950 static regengine_T bt_regengine = { 15951 bt_regcomp, 15952 bt_regfree, 15953 bt_regexec_nl, 15954 bt_regexec_multi, 15955 #ifdef REGEXP_DEBUG 15956 "", 15957 #endif 15958 }; 15959 15960 static regengine_T nfa_regengine = { 15961 nfa_regcomp, 15962 nfa_regfree, 15963 nfa_regexec_nl, 15964 nfa_regexec_multi, 15965 #ifdef REGEXP_DEBUG 15966 "", 15967 #endif 15968 }; 15969 15970 // Which regexp engine to use? Needed for vim_regcomp(). 15971 // Must match with 'regexpengine'. 
15972 static int regexp_engine = 0; 15973 15974 #ifdef REGEXP_DEBUG 15975 static uint8_t regname[][30] = { 15976 "AUTOMATIC Regexp Engine", 15977 "BACKTRACKING Regexp Engine", 15978 "NFA Regexp Engine" 15979 }; 15980 #endif 15981 15982 // Compile a regular expression into internal code. 15983 // Returns the program in allocated memory. 15984 // Use vim_regfree() to free the memory. 15985 // Returns NULL for an error. 15986 regprog_T *vim_regcomp(const char *expr_arg, int re_flags) 15987 { 15988 regprog_T *prog = NULL; 15989 const char *expr = expr_arg; 15990 15991 regexp_engine = (int)p_re; 15992 15993 // Check for prefix "\%#=", that sets the regexp engine 15994 if (strncmp(expr, "\\%#=", 4) == 0) { 15995 int newengine = expr[4] - '0'; 15996 15997 if (newengine == AUTOMATIC_ENGINE 15998 || newengine == BACKTRACKING_ENGINE 15999 || newengine == NFA_ENGINE) { 16000 regexp_engine = expr[4] - '0'; 16001 expr += 5; 16002 #ifdef REGEXP_DEBUG 16003 smsg(0, "New regexp mode selected (%d): %s", 16004 regexp_engine, 16005 regname[newengine]); 16006 #endif 16007 } else { 16008 emsg(_("E864: \\%#= can only be followed by 0, 1, or 2. The automatic engine will be used ")); 16009 regexp_engine = AUTOMATIC_ENGINE; 16010 } 16011 } 16012 #ifdef REGEXP_DEBUG 16013 bt_regengine.expr = expr; 16014 nfa_regengine.expr = expr; 16015 #endif 16016 // reg_iswordc() uses rex.reg_buf 16017 rex.reg_buf = curbuf; 16018 16019 // 16020 // First try the NFA engine, unless backtracking was requested. 16021 // 16022 const int called_emsg_before = called_emsg; 16023 if (regexp_engine != BACKTRACKING_ENGINE) { 16024 prog = nfa_regengine.regcomp((uint8_t *)expr, 16025 re_flags + (regexp_engine == AUTOMATIC_ENGINE ? RE_AUTO : 0)); 16026 } else { 16027 prog = bt_regengine.regcomp((uint8_t *)expr, re_flags); 16028 } 16029 16030 // Check for error compiling regexp with initial engine. 16031 if (prog == NULL) { 16032 #ifdef BT_REGEXP_DEBUG_LOG 16033 // Debugging log for BT engine. 
16034 if (regexp_engine != BACKTRACKING_ENGINE) { 16035 FILE *f = fopen(BT_REGEXP_DEBUG_LOG_NAME, "a"); 16036 if (f) { 16037 fprintf(f, "Syntax error in \"%s\"\n", expr); 16038 fclose(f); 16039 } else { 16040 semsg("(NFA) Could not open \"%s\" to write !!!", 16041 BT_REGEXP_DEBUG_LOG_NAME); 16042 } 16043 } 16044 #endif 16045 // If the NFA engine failed, try the backtracking engine. The NFA engine 16046 // also fails for patterns that it can't handle well but are still valid 16047 // patterns, thus a retry should work. 16048 // But don't try if an error message was given. 16049 if (regexp_engine == AUTOMATIC_ENGINE && called_emsg == called_emsg_before) { 16050 regexp_engine = BACKTRACKING_ENGINE; 16051 report_re_switch(expr); 16052 prog = bt_regengine.regcomp((uint8_t *)expr, re_flags); 16053 } 16054 } 16055 16056 if (prog != NULL) { 16057 // Store the info needed to call regcomp() again when the engine turns out 16058 // to be very slow when executing it. 16059 prog->re_engine = (unsigned)regexp_engine; 16060 prog->re_flags = (unsigned)re_flags; 16061 } 16062 16063 return prog; 16064 } 16065 16066 // Free a compiled regexp program, returned by vim_regcomp(). 16067 void vim_regfree(regprog_T *prog) 16068 { 16069 if (prog != NULL) { 16070 prog->engine->regfree(prog); 16071 } 16072 } 16073 16074 #if defined(EXITFREE) 16075 void free_regexp_stuff(void) 16076 { 16077 ga_clear(®stack); 16078 ga_clear(&backpos); 16079 xfree(reg_tofree); 16080 xfree(reg_prev_sub); 16081 } 16082 16083 #endif 16084 16085 static void report_re_switch(const char *pat) 16086 { 16087 if (p_verbose > 0) { 16088 verbose_enter(); 16089 msg_puts(_("Switching to backtracking RE engine for pattern: ")); 16090 msg_puts(pat); 16091 verbose_leave(); 16092 } 16093 } 16094 16095 /// Match a regexp against a string. 16096 /// "rmp->regprog" must be a compiled regexp as returned by vim_regcomp(). 16097 /// Note: "rmp->regprog" may be freed and changed. 16098 /// Uses curbuf for line count and 'iskeyword'. 
16099 /// When "nl" is true consider a "\n" in "line" to be a line break. 16100 /// 16101 /// @param rmp 16102 /// @param line the string to match against 16103 /// @param col the column to start looking for match 16104 /// @param nl 16105 /// 16106 /// @return true if there is a match, false if not. 16107 static bool vim_regexec_string(regmatch_T *rmp, const char *line, colnr_T col, bool nl) 16108 { 16109 regexec_T rex_save; 16110 bool rex_in_use_save = rex_in_use; 16111 16112 // Cannot use the same prog recursively, it contains state. 16113 if (rmp->regprog->re_in_use) { 16114 emsg(_(e_recursive)); 16115 return false; 16116 } 16117 rmp->regprog->re_in_use = true; 16118 16119 if (rex_in_use) { 16120 // Being called recursively, save the state. 16121 rex_save = rex; 16122 } 16123 rex_in_use = true; 16124 16125 rex.reg_startp = NULL; 16126 rex.reg_endp = NULL; 16127 rex.reg_startpos = NULL; 16128 rex.reg_endpos = NULL; 16129 16130 int result = rmp->regprog->engine->regexec_nl(rmp, (uint8_t *)line, col, nl); 16131 rmp->regprog->re_in_use = false; 16132 16133 // NFA engine aborted because it's very slow, use backtracking engine instead. 16134 if (rmp->regprog->re_engine == AUTOMATIC_ENGINE 16135 && result == NFA_TOO_EXPENSIVE) { 16136 int save_p_re = (int)p_re; 16137 int re_flags = (int)rmp->regprog->re_flags; 16138 char *pat = xstrdup(((nfa_regprog_T *)rmp->regprog)->pattern); 16139 16140 p_re = BACKTRACKING_ENGINE; 16141 vim_regfree(rmp->regprog); 16142 report_re_switch(pat); 16143 rmp->regprog = vim_regcomp(pat, re_flags); 16144 if (rmp->regprog != NULL) { 16145 rmp->regprog->re_in_use = true; 16146 result = rmp->regprog->engine->regexec_nl(rmp, (uint8_t *)line, col, nl); 16147 rmp->regprog->re_in_use = false; 16148 } 16149 16150 xfree(pat); 16151 p_re = save_p_re; 16152 } 16153 16154 rex_in_use = rex_in_use_save; 16155 if (rex_in_use) { 16156 rex = rex_save; 16157 } 16158 16159 return result > 0; 16160 } 16161 16162 // Note: "*prog" may be freed and changed. 
16163 // Return true if there is a match, false if not. 16164 bool vim_regexec_prog(regprog_T **prog, bool ignore_case, const char *line, colnr_T col) 16165 { 16166 regmatch_T regmatch = { .regprog = *prog, .rm_ic = ignore_case }; 16167 bool r = vim_regexec_string(®match, line, col, false); 16168 *prog = regmatch.regprog; 16169 return r; 16170 } 16171 16172 // Note: "rmp->regprog" may be freed and changed. 16173 // Return true if there is a match, false if not. 16174 bool vim_regexec(regmatch_T *rmp, const char *line, colnr_T col) 16175 { 16176 return vim_regexec_string(rmp, line, col, false); 16177 } 16178 16179 // Like vim_regexec(), but consider a "\n" in "line" to be a line break. 16180 // Note: "rmp->regprog" may be freed and changed. 16181 // Return true if there is a match, false if not. 16182 bool vim_regexec_nl(regmatch_T *rmp, const char *line, colnr_T col) 16183 { 16184 return vim_regexec_string(rmp, line, col, true); 16185 } 16186 16187 /// Match a regexp against multiple lines. 16188 /// "rmp->regprog" must be a compiled regexp as returned by vim_regcomp(). 16189 /// Note: "rmp->regprog" may be freed and changed, even set to NULL. 16190 /// Uses curbuf for line count and 'iskeyword'. 16191 /// 16192 /// @param win window in which to search or NULL 16193 /// @param buf buffer in which to search 16194 /// @param lnum nr of line to start looking for match 16195 /// @param col column to start looking for match 16196 /// @param tm timeout limit or NULL 16197 /// @param timed_out flag is set when timeout limit reached 16198 /// 16199 /// @return zero if there is no match. Return number of lines contained in the 16200 /// match otherwise. 16201 int vim_regexec_multi(regmmatch_T *rmp, win_T *win, buf_T *buf, linenr_T lnum, colnr_T col, 16202 proftime_T *tm, int *timed_out) 16203 FUNC_ATTR_NONNULL_ARG(1) 16204 { 16205 regexec_T rex_save; 16206 bool rex_in_use_save = rex_in_use; 16207 16208 // Cannot use the same prog recursively, it contains state. 
16209 if (rmp->regprog->re_in_use) { 16210 emsg(_(e_recursive)); 16211 return false; 16212 } 16213 rmp->regprog->re_in_use = true; 16214 16215 if (rex_in_use) { 16216 // Being called recursively, save the state. 16217 rex_save = rex; 16218 } 16219 rex_in_use = true; 16220 16221 int result = rmp->regprog->engine->regexec_multi(rmp, win, buf, lnum, col, tm, timed_out); 16222 rmp->regprog->re_in_use = false; 16223 16224 // NFA engine aborted because it's very slow, use backtracking engine instead. 16225 if (rmp->regprog->re_engine == AUTOMATIC_ENGINE 16226 && result == NFA_TOO_EXPENSIVE) { 16227 int save_p_re = (int)p_re; 16228 int re_flags = (int)rmp->regprog->re_flags; 16229 char *pat = xstrdup(((nfa_regprog_T *)rmp->regprog)->pattern); 16230 16231 p_re = BACKTRACKING_ENGINE; 16232 regprog_T *prev_prog = rmp->regprog; 16233 16234 report_re_switch(pat); 16235 // checking for \z misuse was already done when compiling for NFA, 16236 // allow all here 16237 reg_do_extmatch = REX_ALL; 16238 rmp->regprog = vim_regcomp(pat, re_flags); 16239 reg_do_extmatch = 0; 16240 16241 if (rmp->regprog == NULL) { 16242 // Somehow compiling the pattern failed now, put back the 16243 // previous one to avoid "regprog" becoming NULL. 16244 rmp->regprog = prev_prog; 16245 } else { 16246 vim_regfree(prev_prog); 16247 16248 rmp->regprog->re_in_use = true; 16249 result = rmp->regprog->engine->regexec_multi(rmp, win, buf, lnum, col, tm, timed_out); 16250 rmp->regprog->re_in_use = false; 16251 } 16252 16253 xfree(pat); 16254 p_re = save_p_re; 16255 } 16256 16257 rex_in_use = rex_in_use_save; 16258 if (rex_in_use) { 16259 rex = rex_save; 16260 } 16261 16262 return result <= 0 ? 0 : result; 16263 }