spellsuggest.c (124194B)
1 // spellsuggest.c: functions for spelling suggestions 2 3 #include <assert.h> 4 #include <inttypes.h> 5 #include <limits.h> 6 #include <stdbool.h> 7 #include <stddef.h> 8 #include <stdio.h> 9 #include <stdlib.h> 10 #include <string.h> 11 12 #include "nvim/ascii_defs.h" 13 #include "nvim/buffer_defs.h" 14 #include "nvim/change.h" 15 #include "nvim/charset.h" 16 #include "nvim/cursor.h" 17 #include "nvim/errors.h" 18 #include "nvim/eval/typval.h" 19 #include "nvim/eval/typval_defs.h" 20 #include "nvim/eval/vars.h" 21 #include "nvim/fileio.h" 22 #include "nvim/garray.h" 23 #include "nvim/garray_defs.h" 24 #include "nvim/getchar.h" 25 #include "nvim/gettext_defs.h" 26 #include "nvim/globals.h" 27 #include "nvim/hashtab.h" 28 #include "nvim/hashtab_defs.h" 29 #include "nvim/highlight_defs.h" 30 #include "nvim/input.h" 31 #include "nvim/macros_defs.h" 32 #include "nvim/mbyte.h" 33 #include "nvim/mbyte_defs.h" 34 #include "nvim/memline.h" 35 #include "nvim/memory.h" 36 #include "nvim/message.h" 37 #include "nvim/normal.h" 38 #include "nvim/option.h" 39 #include "nvim/option_vars.h" 40 #include "nvim/os/fs.h" 41 #include "nvim/os/input.h" 42 #include "nvim/os/os_defs.h" 43 #include "nvim/pos_defs.h" 44 #include "nvim/profile.h" 45 #include "nvim/spell.h" 46 #include "nvim/spell_defs.h" 47 #include "nvim/spellfile.h" 48 #include "nvim/spellsuggest.h" 49 #include "nvim/strings.h" 50 #include "nvim/types_defs.h" 51 #include "nvim/ui.h" 52 #include "nvim/ui_defs.h" 53 #include "nvim/undo.h" 54 #include "nvim/vim_defs.h" 55 56 // Use this to adjust the score after finding suggestions, based on the 57 // suggested word sounding like the bad word. This is much faster than doing 58 // it for every possible suggestion. 59 // Disadvantage: When "the" is typed as "hte" it sounds quite different ("@" 60 // vs "ht") and goes down in the list. 61 // Used when 'spellsuggest' is set to "best". 62 #define RESCORE(word_score, sound_score) ((3 * (word_score) + (sound_score)) / 4) 63 64 // Do the opposite: based on a maximum end score and a known sound score, 65 // compute the maximum word score that can be used. 66 #define MAXSCORE(word_score, sound_score) ((4 * (word_score) - (sound_score)) / 3) 67 68 // only used for su_badflags 69 #define WF_MIXCAP 0x20 // mix of upper and lower case: macaRONI 70 71 /// Information used when looking for suggestions. 72 typedef struct { 73 garray_T su_ga; ///< suggestions, contains "suggest_T" 74 int su_maxcount; ///< max. number of suggestions displayed 75 int su_maxscore; ///< maximum score for adding to su_ga 76 int su_sfmaxscore; ///< idem, for when doing soundfold words 77 garray_T su_sga; ///< like su_ga, sound-folded scoring 78 char *su_badptr; ///< start of bad word in line 79 int su_badlen; ///< length of detected bad word in line 80 int su_badflags; ///< caps flags for bad word 81 char su_badword[MAXWLEN]; ///< bad word truncated at su_badlen 82 char su_fbadword[MAXWLEN]; ///< su_badword case-folded 83 char su_sal_badword[MAXWLEN]; ///< su_badword soundfolded 84 hashtab_T su_banned; ///< table with banned words 85 slang_T *su_sallang; ///< default language for sound folding 86 } suginfo_T; 87 88 /// One word suggestion. Used in "si_ga". 89 typedef struct { 90 char *st_word; ///< suggested word, allocated string 91 int st_wordlen; ///< strlen(st_word) 92 int st_orglen; ///< length of replaced text 93 int st_score; ///< lower is better 94 int st_altscore; ///< used when st_score compares equal 95 bool st_salscore; ///< st_score is for soundalike 96 bool st_had_bonus; ///< bonus already included in score 97 slang_T *st_slang; ///< language used for sound folding 98 } suggest_T; 99 100 #define SUG(ga, i) (((suggest_T *)(ga).ga_data)[i]) 101 102 // True if a word appears in the list of banned words. 103 #define WAS_BANNED(su, word) (!HASHITEM_EMPTY(hash_find(&(su)->su_banned, word))) 104 105 // Number of suggestions kept when cleaning up. We need to keep more than 106 // what is displayed, because when rescore_suggestions() is called the score 107 // may change and wrong suggestions may be removed later. 108 #define SUG_CLEAN_COUNT(su) ((su)->su_maxcount < \ 109 130 ? 150 : (su)->su_maxcount + 20) 110 111 // Threshold for sorting and cleaning up suggestions. Don't want to keep lots 112 // of suggestions that are not going to be displayed. 113 #define SUG_MAX_COUNT(su) (SUG_CLEAN_COUNT(su) + 50) 114 115 // score for various changes 116 enum { 117 SCORE_SPLIT = 149, // split bad word 118 SCORE_SPLIT_NO = 249, // split bad word with NOSPLITSUGS 119 SCORE_ICASE = 52, // slightly different case 120 SCORE_REGION = 200, // word is for different region 121 SCORE_RARE = 180, // rare word 122 SCORE_SWAP = 75, // swap two characters 123 SCORE_SWAP3 = 110, // swap two characters in three 124 SCORE_REP = 65, // REP replacement 125 SCORE_SUBST = 93, // substitute a character 126 SCORE_SIMILAR = 33, // substitute a similar character 127 SCORE_SUBCOMP = 33, // substitute a composing character 128 SCORE_DEL = 94, // delete a character 129 SCORE_DELDUP = 66, // delete a duplicated character 130 SCORE_DELCOMP = 28, // delete a composing character 131 SCORE_INS = 96, // insert a character 132 SCORE_INSDUP = 67, // insert a duplicate character 133 SCORE_INSCOMP = 30, // insert a composing character 134 SCORE_NONWORD = 103, // change non-word to word char 135 }; 136 137 enum { 138 SCORE_FILE = 30, // suggestion from a file 139 SCORE_MAXINIT = 350, // Initial maximum score: higher == slower. 140 // 350 allows for about three changes. 141 }; 142 143 enum { 144 SCORE_COMMON1 = 30, // subtracted for words seen before 145 SCORE_COMMON2 = 40, // subtracted for words often seen 146 SCORE_COMMON3 = 50, // subtracted for words very often seen 147 SCORE_THRES2 = 10, // word count threshold for COMMON2 148 SCORE_THRES3 = 100, // word count threshold for COMMON3 149 }; 150 151 // When trying changed soundfold words it becomes slow when trying more than 152 // two changes. With less than two changes it's slightly faster but we miss a 153 // few good suggestions. In rare cases we need to try three of four changes. 154 enum { 155 SCORE_SFMAX1 = 200, // maximum score for first try 156 SCORE_SFMAX2 = 300, // maximum score for second try 157 SCORE_SFMAX3 = 400, // maximum score for third try 158 }; 159 160 #define SCORE_BIG (SCORE_INS * 3) // big difference 161 enum { 162 SCORE_MAXMAX = 999999, // accept any score 163 SCORE_LIMITMAX = 350, // for spell_edit_score_limit() 164 }; 165 166 // for spell_edit_score_limit() we need to know the minimum value of 167 // SCORE_ICASE, SCORE_SWAP, SCORE_DEL, SCORE_SIMILAR and SCORE_INS 168 #define SCORE_EDIT_MIN SCORE_SIMILAR 169 170 /// For finding suggestions: At each node in the tree these states are tried: 171 typedef enum { 172 STATE_START = 0, ///< At start of node check for NUL bytes (goodword 173 ///< ends); if badword ends there is a match, otherwise 174 ///< try splitting word. 175 STATE_NOPREFIX, ///< try without prefix 176 STATE_SPLITUNDO, ///< Undo splitting. 177 STATE_ENDNUL, ///< Past NUL bytes at start of the node. 178 STATE_PLAIN, ///< Use each byte of the node. 179 STATE_DEL, ///< Delete a byte from the bad word. 180 STATE_INS_PREP, ///< Prepare for inserting bytes. 181 STATE_INS, ///< Insert a byte in the bad word. 182 STATE_SWAP, ///< Swap two bytes. 183 STATE_UNSWAP, ///< Undo swap two characters. 184 STATE_SWAP3, ///< Swap two characters over three. 185 STATE_UNSWAP3, ///< Undo Swap two characters over three. 186 STATE_UNROT3L, ///< Undo rotate three characters left 187 STATE_UNROT3R, ///< Undo rotate three characters right 188 STATE_REP_INI, ///< Prepare for using REP items. 189 STATE_REP, ///< Use matching REP items from the .aff file. 190 STATE_REP_UNDO, ///< Undo a REP item replacement. 191 STATE_FINAL, ///< End of this node. 192 } state_T; 193 194 /// Struct to keep the state at each level in suggest_try_change(). 195 typedef struct { 196 state_T ts_state; ///< state at this level, STATE_ 197 int ts_score; ///< score 198 idx_T ts_arridx; ///< index in tree array, start of node 199 int16_t ts_curi; ///< index in list of child nodes 200 uint8_t ts_fidx; ///< index in fword[], case-folded bad word 201 uint8_t ts_fidxtry; ///< ts_fidx at which bytes may be changed 202 uint8_t ts_twordlen; ///< valid length of tword[] 203 uint8_t ts_prefixdepth; ///< stack depth for end of prefix or 204 ///< PFD_PREFIXTREE or PFD_NOPREFIX 205 uint8_t ts_flags; ///< TSF_ flags 206 uint8_t ts_tcharlen; ///< number of bytes in tword character 207 uint8_t ts_tcharidx; ///< current byte index in tword character 208 uint8_t ts_isdiff; ///< DIFF_ values 209 uint8_t ts_fcharstart; ///< index in fword where badword char started 210 uint8_t ts_prewordlen; ///< length of word in "preword[]" 211 uint8_t ts_splitoff; ///< index in "tword" after last split 212 uint8_t ts_splitfidx; ///< "ts_fidx" at word split 213 uint8_t ts_complen; ///< nr of compound words used 214 uint8_t ts_compsplit; ///< index for "compflags" where word was spit 215 uint8_t ts_save_badflags; ///< su_badflags saved here 216 uint8_t ts_delidx; ///< index in fword for char that was deleted, 217 ///< valid when "ts_flags" has TSF_DIDDEL 218 } trystate_T; 219 220 // values for ts_isdiff 221 enum { 222 DIFF_NONE = 0, // no different byte (yet) 223 DIFF_YES = 1, // different byte found 224 DIFF_INSERT = 2, // inserting character 225 }; 226 227 // values for ts_flags 228 enum { 229 TSF_PREFIXOK = 1, // already checked that prefix is OK 230 TSF_DIDSPLIT = 2, // tried split at this point 231 TSF_DIDDEL = 4, // did a delete, "ts_delidx" has index 232 }; 233 234 // special values ts_prefixdepth 235 enum { 236 PFD_NOPREFIX = 0xff, // not using prefixes 237 PFD_PREFIXTREE = 0xfe, // walking through the prefix tree 238 PFD_NOTSPECIAL = 0xfd, // highest value that's not special 239 }; 240 241 static int spell_suggest_timeout = 5000; 242 243 #include "spellsuggest.c.generated.h" 244 245 /// Returns true when the sequence of flags in "compflags" plus "flag" can 246 /// possibly form a valid compounded word. This also checks the COMPOUNDRULE 247 /// lines if they don't contain wildcards. 248 static bool can_be_compound(trystate_T *sp, slang_T *slang, uint8_t *compflags, int flag) 249 { 250 // If the flag doesn't appear in sl_compstartflags or sl_compallflags 251 // then it can't possibly compound. 252 if (!byte_in_str(sp->ts_complen == sp->ts_compsplit 253 ? slang->sl_compstartflags : slang->sl_compallflags, flag)) { 254 return false; 255 } 256 257 // If there are no wildcards, we can check if the flags collected so far 258 // possibly can form a match with COMPOUNDRULE patterns. This only 259 // makes sense when we have two or more words. 260 if (slang->sl_comprules != NULL && sp->ts_complen > sp->ts_compsplit) { 261 compflags[sp->ts_complen] = (uint8_t)flag; 262 compflags[sp->ts_complen + 1] = NUL; 263 bool v = match_compoundrule(slang, compflags + sp->ts_compsplit); 264 compflags[sp->ts_complen] = NUL; 265 return v; 266 } 267 268 return true; 269 } 270 271 /// Adjust the score of common words. 272 /// 273 /// @param split word was split, less bonus 274 static int score_wordcount_adj(slang_T *slang, int score, char *word, bool split) 275 { 276 int bonus; 277 278 hashitem_T *hi = hash_find(&slang->sl_wordcount, word); 279 if (HASHITEM_EMPTY(hi)) { 280 return score; 281 } 282 283 wordcount_T *wc = HI2WC(hi); 284 if (wc->wc_count < SCORE_THRES2) { 285 bonus = SCORE_COMMON1; 286 } else if (wc->wc_count < SCORE_THRES3) { 287 bonus = SCORE_COMMON2; 288 } else { 289 bonus = SCORE_COMMON3; 290 } 291 int newscore = split ? score - bonus / 2 292 : score - bonus; 293 if (newscore < 0) { 294 return 0; 295 } 296 return newscore; 297 } 298 299 /// Like captype() but for a KEEPCAP word add ONECAP if the word starts with a 300 /// capital. So that make_case_word() can turn WOrd into Word. 301 /// Add ALLCAP for "WOrD". 302 static int badword_captype(char *word, char *end) 303 FUNC_ATTR_NONNULL_ALL 304 { 305 int flags = captype(word, end); 306 307 if (!(flags & WF_KEEPCAP)) { 308 return flags; 309 } 310 311 // Count the number of UPPER and lower case letters. 312 int l = 0; 313 int u = 0; 314 bool first = false; 315 for (char *p = word; p < end; MB_PTR_ADV(p)) { 316 int c = utf_ptr2char(p); 317 if (SPELL_ISUPPER(c)) { 318 u++; 319 if (p == word) { 320 first = true; 321 } 322 } else { 323 l++; 324 } 325 } 326 327 // If there are more UPPER than lower case letters suggest an 328 // ALLCAP word. Otherwise, if the first letter is UPPER then 329 // suggest ONECAP. Exception: "ALl" most likely should be "All", 330 // require three upper case letters. 331 if (u > l && u > 2) { 332 flags |= WF_ALLCAP; 333 } else if (first) { 334 flags |= WF_ONECAP; 335 } 336 337 if (u >= 2 && l >= 2) { // maCARONI maCAroni 338 flags |= WF_MIXCAP; 339 } 340 341 return flags; 342 } 343 344 /// Opposite of offset2bytes(). 345 /// "pp" points to the bytes and is advanced over it. 346 /// 347 /// @return the offset. 348 static int bytes2offset(char **pp) 349 { 350 uint8_t *p = (uint8_t *)(*pp); 351 int nr; 352 353 int c = *p++; 354 if ((c & 0x80) == 0x00) { // 1 byte 355 nr = c - 1; 356 } else if ((c & 0xc0) == 0x80) { // 2 bytes 357 nr = (c & 0x3f) - 1; 358 nr = nr * 255 + (*p++ - 1); 359 } else if ((c & 0xe0) == 0xc0) { // 3 bytes 360 nr = (c & 0x1f) - 1; 361 nr = nr * 255 + (*p++ - 1); 362 nr = nr * 255 + (*p++ - 1); 363 } else { // 4 bytes 364 nr = (c & 0x0f) - 1; 365 nr = nr * 255 + (*p++ - 1); 366 nr = nr * 255 + (*p++ - 1); 367 nr = nr * 255 + (*p++ - 1); 368 } 369 370 *pp = (char *)p; 371 return nr; 372 } 373 374 // values for sps_flags 375 enum { 376 SPS_BEST = 1, 377 SPS_FAST = 2, 378 SPS_DOUBLE = 4, 379 }; 380 381 static int sps_flags = SPS_BEST; ///< flags from 'spellsuggest' 382 static int sps_limit = 9999; ///< max nr of suggestions given 383 384 /// Check the 'spellsuggest' option. Return FAIL if it's wrong. 385 /// Sets "sps_flags" and "sps_limit". 386 int spell_check_sps(void) 387 { 388 char buf[MAXPATHL]; 389 390 sps_flags = 0; 391 sps_limit = 9999; 392 393 for (char *p = p_sps; *p != NUL;) { 394 copy_option_part(&p, buf, MAXPATHL, ","); 395 396 int f = 0; 397 if (ascii_isdigit(*buf)) { 398 char *s = buf; 399 sps_limit = getdigits_int(&s, true, 0); 400 if (*s != NUL && !ascii_isdigit(*s)) { 401 f = -1; 402 } 403 // Note: Keep this in sync with opt_sps_values. 404 } else if (strcmp(buf, "best") == 0) { 405 f = SPS_BEST; 406 } else if (strcmp(buf, "fast") == 0) { 407 f = SPS_FAST; 408 } else if (strcmp(buf, "double") == 0) { 409 f = SPS_DOUBLE; 410 } else if (strncmp(buf, "expr:", 5) != 0 411 && strncmp(buf, "file:", 5) != 0 412 && (strncmp(buf, "timeout:", 8) != 0 413 || (!ascii_isdigit(buf[8]) 414 && !(buf[8] == '-' && ascii_isdigit(buf[9]))))) { 415 f = -1; 416 } 417 418 if (f == -1 || (sps_flags != 0 && f != 0)) { 419 sps_flags = SPS_BEST; 420 sps_limit = 9999; 421 return FAIL; 422 } 423 if (f != 0) { 424 sps_flags = f; 425 } 426 } 427 428 if (sps_flags == 0) { 429 sps_flags = SPS_BEST; 430 } 431 432 return OK; 433 } 434 435 /// "z=": Find badly spelled word under or after the cursor. 436 /// Give suggestions for the properly spelled word. 437 /// In Visual mode use the highlighted word as the bad word. 438 /// When "count" is non-zero use that suggestion. 439 void spell_suggest(int count) 440 { 441 pos_T prev_cursor = curwin->w_cursor; 442 bool mouse_used = false; 443 int badlen = 0; 444 int msg_scroll_save = msg_scroll; 445 const int wo_spell_save = curwin->w_p_spell; 446 447 if (!curwin->w_p_spell) { 448 parse_spelllang(curwin); 449 curwin->w_p_spell = true; 450 } 451 452 if (*curwin->w_s->b_p_spl == NUL) { 453 emsg(_(e_no_spell)); 454 goto skip; 455 } 456 457 if (VIsual_active) { 458 // Use the Visually selected text as the bad word. But reject 459 // a multi-line selection. 460 if (curwin->w_cursor.lnum != VIsual.lnum) { 461 vim_beep(kOptBoFlagSpell); 462 goto skip; 463 } 464 badlen = (int)curwin->w_cursor.col - (int)VIsual.col; 465 if (badlen < 0) { 466 badlen = -badlen; 467 } else { 468 curwin->w_cursor.col = VIsual.col; 469 } 470 badlen++; 471 end_visual_mode(); 472 // make sure we don't include the NUL at the end of the line 473 badlen = MIN(badlen, get_cursor_line_len() - curwin->w_cursor.col); 474 // Find the start of the badly spelled word. 475 } else if (spell_move_to(curwin, FORWARD, SMT_ALL, true, NULL) == 0 476 || curwin->w_cursor.col > prev_cursor.col) { 477 // No bad word or it starts after the cursor: use the word under the 478 // cursor. 479 curwin->w_cursor = prev_cursor; 480 char *curline = get_cursor_line_ptr(); 481 char *p = curline + curwin->w_cursor.col; 482 // Backup to before start of word. 483 while (p > curline && spell_iswordp_nmw(p, curwin)) { 484 MB_PTR_BACK(curline, p); 485 } 486 // Forward to start of word. 487 while (*p != NUL && !spell_iswordp_nmw(p, curwin)) { 488 MB_PTR_ADV(p); 489 } 490 491 if (!spell_iswordp_nmw(p, curwin)) { // No word found. 492 beep_flush(); 493 goto skip; 494 } 495 curwin->w_cursor.col = (colnr_T)(p - curline); 496 } 497 498 // Get the word and its length. 499 500 // Figure out if the word should be capitalised. 501 int need_cap = check_need_cap(curwin, curwin->w_cursor.lnum, curwin->w_cursor.col); 502 503 // Make a copy of current line since autocommands may free the line. 504 char *line = xstrnsave(get_cursor_line_ptr(), (size_t)get_cursor_line_len()); 505 spell_suggest_timeout = 5000; 506 507 // Get the list of suggestions. Limit to 'lines' - 2 or the number in 508 // 'spellsuggest', whatever is smaller. 509 suginfo_T sug; 510 int limit = MIN(sps_limit, Rows - 2); 511 spell_find_suggest(line + curwin->w_cursor.col, badlen, &sug, limit, 512 true, need_cap, true); 513 514 int selected = count; 515 msg_ext_set_kind("confirm"); 516 if (GA_EMPTY(&sug.su_ga)) { 517 msg(_("Sorry, no suggestions"), 0); 518 } else if (count > 0) { 519 if (count > sug.su_ga.ga_len) { 520 smsg(0, _("Sorry, only %" PRId64 " suggestions"), 521 (int64_t)sug.su_ga.ga_len); 522 } 523 } else { 524 // When 'rightleft' is set the list is drawn right-left. 525 cmdmsg_rl = curwin->w_p_rl; 526 527 // List the suggestions. 528 msg_start(); 529 msg_row = Rows - 1; // for when 'cmdheight' > 1 530 lines_left = Rows; // avoid more prompt 531 char *fmt = _("Change \"%.*s\" to:"); 532 if (cmdmsg_rl && strncmp(fmt, "Change", 6) == 0) { 533 // And now the rabbit from the high hat: Avoid showing the 534 // untranslated message rightleft. 535 fmt = ":ot \"%.*s\" egnahC"; 536 } 537 vim_snprintf(IObuff, IOSIZE, fmt, sug.su_badlen, sug.su_badptr); 538 msg_puts(IObuff); 539 msg_clr_eos(); 540 msg_putchar('\n'); 541 542 msg_scroll = true; 543 for (int i = 0; i < sug.su_ga.ga_len; i++) { 544 suggest_T *stp = &SUG(sug.su_ga, i); 545 546 // The suggested word may replace only part of the bad word, add 547 // the not replaced part. But only when it's not getting too long. 548 char wcopy[MAXWLEN + 2]; 549 xstrlcpy(wcopy, stp->st_word, MAXWLEN + 1); 550 int el = sug.su_badlen - stp->st_orglen; 551 if (el > 0 && stp->st_wordlen + el <= MAXWLEN) { 552 assert(sug.su_badptr != NULL); 553 xmemcpyz(wcopy + stp->st_wordlen, sug.su_badptr + stp->st_orglen, (size_t)el); 554 } 555 vim_snprintf(IObuff, IOSIZE, "%2d", i + 1); 556 if (cmdmsg_rl) { 557 rl_mirror_ascii(IObuff, NULL); 558 } 559 msg_puts(IObuff); 560 561 vim_snprintf(IObuff, IOSIZE, " \"%s\"", wcopy); 562 msg_puts(IObuff); 563 564 // The word may replace more than "su_badlen". 565 if (sug.su_badlen < stp->st_orglen) { 566 vim_snprintf(IObuff, IOSIZE, _(" < \"%.*s\""), 567 stp->st_orglen, sug.su_badptr); 568 msg_puts(IObuff); 569 } 570 571 if (p_verbose > 0) { 572 // Add the score. 573 if (sps_flags & (SPS_DOUBLE | SPS_BEST)) { 574 vim_snprintf(IObuff, IOSIZE, " (%s%d - %d)", 575 stp->st_salscore ? "s " : "", 576 stp->st_score, stp->st_altscore); 577 } else { 578 vim_snprintf(IObuff, IOSIZE, " (%d)", 579 stp->st_score); 580 } 581 if (cmdmsg_rl) { 582 // Mirror the numbers, but keep the leading space. 583 rl_mirror_ascii(IObuff + 1, NULL); 584 } 585 msg_advance(30); 586 msg_puts(IObuff); 587 } 588 if (!ui_has(kUIMessages) || i < sug.su_ga.ga_len - 1) { 589 msg_putchar('\n'); 590 } 591 } 592 593 cmdmsg_rl = false; 594 msg_col = 0; 595 // Ask for choice. 596 selected = prompt_for_input(NULL, 0, false, &mouse_used); 597 if (mouse_used) { 598 selected = sug.su_ga.ga_len + 1 - (cmdline_row - mouse_row); 599 } 600 601 lines_left = Rows; // avoid more prompt 602 // don't delay for 'smd' in normal_cmd() 603 msg_scroll = msg_scroll_save; 604 } 605 606 if (selected > 0 && selected <= sug.su_ga.ga_len && u_save_cursor() == OK) { 607 // Save the from and to text for :spellrepall. 608 XFREE_CLEAR(repl_from); 609 XFREE_CLEAR(repl_to); 610 611 suggest_T *stp = &SUG(sug.su_ga, selected - 1); 612 if (sug.su_badlen > stp->st_orglen) { 613 // Replacing less than "su_badlen", append the remainder to 614 // repl_to. 615 repl_from = xstrnsave(sug.su_badptr, (size_t)sug.su_badlen); 616 vim_snprintf(IObuff, IOSIZE, "%s%.*s", stp->st_word, 617 sug.su_badlen - stp->st_orglen, 618 sug.su_badptr + stp->st_orglen); 619 repl_to = xstrdup(IObuff); 620 } else { 621 // Replacing su_badlen or more, use the whole word. 622 repl_from = xstrnsave(sug.su_badptr, (size_t)stp->st_orglen); 623 repl_to = xstrdup(stp->st_word); 624 } 625 626 // Replace the word. 627 char *p = xmalloc(strlen(line) - (size_t)stp->st_orglen + (size_t)stp->st_wordlen + 1); 628 int c = (int)(sug.su_badptr - line); 629 memmove(p, line, (size_t)c); 630 STRCPY(p + c, stp->st_word); 631 strcat(p, sug.su_badptr + stp->st_orglen); 632 633 // For redo we use a change-word command. 634 ResetRedobuff(); 635 AppendToRedobuff("ciw"); 636 AppendToRedobuffLit(p + c, 637 stp->st_wordlen + sug.su_badlen - stp->st_orglen); 638 AppendCharToRedobuff(ESC); 639 640 // "p" may be freed here 641 ml_replace(curwin->w_cursor.lnum, p, false); 642 curwin->w_cursor.col = c; 643 644 inserted_bytes(curwin->w_cursor.lnum, c, stp->st_orglen, stp->st_wordlen); 645 } else { 646 curwin->w_cursor = prev_cursor; 647 } 648 649 spell_find_cleanup(&sug); 650 xfree(line); 651 skip: 652 curwin->w_p_spell = wo_spell_save; 653 } 654 655 /// Find spell suggestions for "word". Return them in the growarray "*gap" as 656 /// a list of allocated strings. 657 /// 658 /// @param maxcount maximum nr of suggestions 659 /// @param need_cap 'spellcapcheck' matched 660 void spell_suggest_list(garray_T *gap, char *word, int maxcount, bool need_cap, bool interactive) 661 { 662 suginfo_T sug; 663 664 spell_find_suggest(word, 0, &sug, maxcount, false, need_cap, interactive); 665 666 // Make room in "gap". 667 ga_init(gap, sizeof(char *), sug.su_ga.ga_len + 1); 668 ga_grow(gap, sug.su_ga.ga_len); 669 for (int i = 0; i < sug.su_ga.ga_len; i++) { 670 suggest_T *stp = &SUG(sug.su_ga, i); 671 672 // The suggested word may replace only part of "word", add the not 673 // replaced part. 674 char *wcopy = xmalloc((size_t)stp->st_wordlen + strlen(sug.su_badptr + stp->st_orglen) + 1); 675 STRCPY(wcopy, stp->st_word); 676 STRCPY(wcopy + stp->st_wordlen, sug.su_badptr + stp->st_orglen); 677 ((char **)gap->ga_data)[gap->ga_len++] = wcopy; 678 } 679 680 spell_find_cleanup(&sug); 681 } 682 683 /// Find spell suggestions for the word at the start of "badptr". 684 /// Return the suggestions in "su->su_ga". 685 /// The maximum number of suggestions is "maxcount". 686 /// Note: does use info for the current window. 687 /// This is based on the mechanisms of Aspell, but completely reimplemented. 688 /// 689 /// @param badlen length of bad word or 0 if unknown 690 /// @param banbadword don't include badword in suggestions 691 /// @param need_cap word should start with capital 692 static void spell_find_suggest(char *badptr, int badlen, suginfo_T *su, int maxcount, 693 bool banbadword, bool need_cap, bool interactive) 694 { 695 hlf_T attr = HLF_COUNT; 696 char buf[MAXPATHL]; 697 bool do_combine = false; 698 static bool expr_busy = false; 699 bool did_intern = false; 700 701 // Set the info in "*su". 702 CLEAR_POINTER(su); 703 ga_init(&su->su_ga, (int)sizeof(suggest_T), 10); 704 ga_init(&su->su_sga, (int)sizeof(suggest_T), 10); 705 if (*badptr == NUL) { 706 return; 707 } 708 hash_init(&su->su_banned); 709 710 su->su_badptr = badptr; 711 if (badlen != 0) { 712 su->su_badlen = badlen; 713 } else { 714 size_t tmplen = spell_check(curwin, su->su_badptr, &attr, NULL, false); 715 assert(tmplen <= INT_MAX); 716 su->su_badlen = (int)tmplen; 717 } 718 su->su_maxcount = maxcount; 719 su->su_maxscore = SCORE_MAXINIT; 720 su->su_badlen = MIN(su->su_badlen, MAXWLEN - 1); // just in case 721 xmemcpyz(su->su_badword, su->su_badptr, (size_t)su->su_badlen); 722 spell_casefold(curwin, su->su_badptr, su->su_badlen, su->su_fbadword, 723 MAXWLEN); 724 725 // TODO(vim): make this work if the case-folded text is longer than the 726 // original text. Currently an illegal byte causes wrong pointer 727 // computations. 728 su->su_fbadword[su->su_badlen] = NUL; 729 730 // get caps flags for bad word 731 su->su_badflags = badword_captype(su->su_badptr, 732 su->su_badptr + su->su_badlen); 733 if (need_cap) { 734 su->su_badflags |= WF_ONECAP; 735 } 736 737 // Find the default language for sound folding. We simply use the first 738 // one in 'spelllang' that supports sound folding. That's good for when 739 // using multiple files for one language, it's not that bad when mixing 740 // languages (e.g., "pl,en"). 741 for (int i = 0; i < curbuf->b_s.b_langp.ga_len; i++) { 742 langp_T *lp = LANGP_ENTRY(curbuf->b_s.b_langp, i); 743 if (lp->lp_sallang != NULL) { 744 su->su_sallang = lp->lp_sallang; 745 break; 746 } 747 } 748 749 // Soundfold the bad word with the default sound folding, so that we don't 750 // have to do this many times. 751 if (su->su_sallang != NULL) { 752 spell_soundfold(su->su_sallang, su->su_fbadword, true, 753 su->su_sal_badword); 754 } 755 756 // If the word is not capitalised and spell_check() doesn't consider the 757 // word to be bad then it might need to be capitalised. Add a suggestion 758 // for that. 759 int c = utf_ptr2char(su->su_badptr); 760 if (!SPELL_ISUPPER(c) && attr == HLF_COUNT) { 761 make_case_word(su->su_badword, buf, WF_ONECAP); 762 add_suggestion(su, &su->su_ga, buf, su->su_badlen, SCORE_ICASE, 763 0, true, su->su_sallang, false); 764 } 765 766 // Ban the bad word itself. It may appear in another region. 767 if (banbadword) { 768 add_banned(su, su->su_badword); 769 } 770 771 // Make a copy of 'spellsuggest', because the expression may change it. 772 char *sps_copy = xstrdup(p_sps); 773 774 // Loop over the items in 'spellsuggest'. 775 for (char *p = sps_copy; *p != NUL;) { 776 copy_option_part(&p, buf, MAXPATHL, ","); 777 778 if (strncmp(buf, "expr:", 5) == 0) { 779 // Evaluate an expression. Skip this when called recursively, 780 // when using spellsuggest() in the expression. 781 if (!expr_busy) { 782 expr_busy = true; 783 spell_suggest_expr(su, buf + 5); 784 expr_busy = false; 785 } 786 } else if (strncmp(buf, "file:", 5) == 0) { 787 // Use list of suggestions in a file. 788 spell_suggest_file(su, buf + 5); 789 } else if (strncmp(buf, "timeout:", 8) == 0) { 790 // Limit the time searching for suggestions. 791 spell_suggest_timeout = atoi(buf + 8); 792 } else if (!did_intern) { 793 // Use internal method once. 794 spell_suggest_intern(su, interactive); 795 if (sps_flags & SPS_DOUBLE) { 796 do_combine = true; 797 } 798 did_intern = true; 799 } 800 } 801 802 xfree(sps_copy); 803 804 if (do_combine) { 805 // Combine the two list of suggestions. This must be done last, 806 // because sorting changes the order again. 807 score_combine(su); 808 } 809 } 810 811 /// Find suggestions by evaluating expression "expr". 812 static void spell_suggest_expr(suginfo_T *su, char *expr) 813 { 814 const char *p; 815 816 // The work is split up in a few parts to avoid having to export 817 // suginfo_T. 818 // First evaluate the expression and get the resulting list. 819 list_T *const list = eval_spell_expr(su->su_badword, expr); 820 if (list != NULL) { 821 // Loop over the items in the list. 822 TV_LIST_ITER(list, li, { 823 if (TV_LIST_ITEM_TV(li)->v_type == VAR_LIST) { 824 // Get the word and the score from the items. 825 int score = get_spellword(TV_LIST_ITEM_TV(li)->vval.v_list, &p); 826 if (score >= 0 && score <= su->su_maxscore) { 827 add_suggestion(su, &su->su_ga, p, su->su_badlen, 828 score, 0, true, su->su_sallang, false); 829 } 830 } 831 }); 832 tv_list_unref(list); 833 } 834 835 // Remove bogus suggestions, sort and truncate at "maxcount". 836 check_suggestions(su, &su->su_ga); 837 cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); 838 } 839 840 /// Find suggestions in file "fname". Used for "file:" in 'spellsuggest'. 841 static void spell_suggest_file(suginfo_T *su, char *fname) 842 { 843 char line[MAXWLEN * 2]; 844 int len; 845 char cword[MAXWLEN]; 846 847 // Open the file. 848 FILE *fd = os_fopen(fname, "r"); 849 if (fd == NULL) { 850 semsg(_(e_notopen), fname); 851 return; 852 } 853 854 // Read it line by line. 855 while (!vim_fgets(line, MAXWLEN * 2, fd) && !got_int) { 856 line_breakcheck(); 857 858 char *p = vim_strchr(line, '/'); 859 if (p == NULL) { 860 continue; // No Tab found, just skip the line. 861 } 862 *p++ = NUL; 863 if (STRICMP(su->su_badword, line) == 0) { 864 // Match! Isolate the good word, until CR or NL. 865 for (len = 0; (uint8_t)p[len] >= ' '; len++) {} 866 p[len] = NUL; 867 868 // If the suggestion doesn't have specific case duplicate the case 869 // of the bad word. 870 if (captype(p, NULL) == 0) { 871 make_case_word(p, cword, su->su_badflags); 872 p = cword; 873 } 874 875 add_suggestion(su, &su->su_ga, p, su->su_badlen, 876 SCORE_FILE, 0, true, su->su_sallang, false); 877 } 878 } 879 880 fclose(fd); 881 882 // Remove bogus suggestions, sort and truncate at "maxcount". 883 check_suggestions(su, &su->su_ga); 884 cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); 885 } 886 887 /// Find suggestions for the internal method indicated by "sps_flags". 888 static void spell_suggest_intern(suginfo_T *su, bool interactive) 889 { 890 // Load the .sug file(s) that are available and not done yet. 891 suggest_load_files(); 892 893 // 1. Try special cases, such as repeating a word: "the the" -> "the". 894 // 895 // Set a maximum score to limit the combination of operations that is 896 // tried. 897 suggest_try_special(su); 898 899 // 2. Try inserting/deleting/swapping/changing a letter, use REP entries 900 // from the .aff file and inserting a space (split the word). 901 suggest_try_change(su); 902 903 // For the resulting top-scorers compute the sound-a-like score. 904 if (sps_flags & SPS_DOUBLE) { 905 score_comp_sal(su); 906 } 907 908 // 3. Try finding sound-a-like words. 909 if ((sps_flags & SPS_FAST) == 0) { 910 if (sps_flags & SPS_BEST) { 911 // Adjust the word score for the suggestions found so far for how 912 // they sounds like. 913 rescore_suggestions(su); 914 } 915 916 // While going through the soundfold tree "su_maxscore" is the score 917 // for the soundfold word, limits the changes that are being tried, 918 // and "su_sfmaxscore" the rescored score, which is set by 919 // cleanup_suggestions(). 920 // First find words with a small edit distance, because this is much 921 // faster and often already finds the top-N suggestions. If we didn't 922 // find many suggestions try again with a higher edit distance. 923 // "sl_sounddone" is used to avoid doing the same word twice. 924 suggest_try_soundalike_prep(); 925 su->su_maxscore = SCORE_SFMAX1; 926 su->su_sfmaxscore = SCORE_MAXINIT * 3; 927 suggest_try_soundalike(su); 928 if (su->su_ga.ga_len < SUG_CLEAN_COUNT(su)) { 929 // We didn't find enough matches, try again, allowing more 930 // changes to the soundfold word. 931 su->su_maxscore = SCORE_SFMAX2; 932 suggest_try_soundalike(su); 933 if (su->su_ga.ga_len < SUG_CLEAN_COUNT(su)) { 934 // Still didn't find enough matches, try again, allowing even 935 // more changes to the soundfold word. 936 su->su_maxscore = SCORE_SFMAX3; 937 suggest_try_soundalike(su); 938 } 939 } 940 su->su_maxscore = su->su_sfmaxscore; 941 suggest_try_soundalike_finish(); 942 } 943 944 // When CTRL-C was hit while searching do show the results. Only clear 945 // got_int when using a command, not for spellsuggest(). 946 os_breakcheck(); 947 if (interactive && got_int) { 948 vgetc(); 949 got_int = false; 950 } 951 952 if ((sps_flags & SPS_DOUBLE) == 0 && su->su_ga.ga_len != 0) { 953 if (sps_flags & SPS_BEST) { 954 // Adjust the word score for how it sounds like. 955 rescore_suggestions(su); 956 } 957 958 // Remove bogus suggestions, sort and truncate at "maxcount". 959 check_suggestions(su, &su->su_ga); 960 cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); 961 } 962 } 963 964 /// Free the info put in "*su" by spell_find_suggest(). 965 static void spell_find_cleanup(suginfo_T *su) 966 { 967 #define FREE_SUG_WORD(sug) xfree((sug)->st_word) 968 // Free the suggestions. 969 GA_DEEP_CLEAR(&su->su_ga, suggest_T, FREE_SUG_WORD); 970 GA_DEEP_CLEAR(&su->su_sga, suggest_T, FREE_SUG_WORD); 971 972 // Free the banned words. 973 hash_clear_all(&su->su_banned, 0); 974 } 975 976 /// Try finding suggestions by recognizing specific situations. 977 static void suggest_try_special(suginfo_T *su) 978 { 979 char word[MAXWLEN]; 980 981 // Recognize a word that is repeated: "the the". 982 char *p = skiptowhite(su->su_fbadword); 983 size_t len = (size_t)(p - su->su_fbadword); 984 p = skipwhite(p); 985 if (strlen(p) == len && strncmp(su->su_fbadword, p, len) == 0) { 986 // Include badflags: if the badword is onecap or allcap 987 // use that for the goodword too: "The the" -> "The". 988 char c = su->su_fbadword[len]; 989 su->su_fbadword[len] = NUL; 990 make_case_word(su->su_fbadword, word, su->su_badflags); 991 su->su_fbadword[len] = c; 992 993 // Give a soundalike score of 0, compute the score as if deleting one 994 // character. 995 add_suggestion(su, &su->su_ga, word, su->su_badlen, 996 RESCORE(SCORE_REP, 0), 0, true, su->su_sallang, false); 997 } 998 } 999 1000 // Measure how much time is spent in each state. 1001 // Output is dumped in "suggestprof". 1002 1003 #ifdef SUGGEST_PROFILE 1004 proftime_T current; 1005 proftime_T total; 1006 proftime_T times[STATE_FINAL + 1]; 1007 long counts[STATE_FINAL + 1]; 1008 1009 static void prof_init(void) 1010 { 1011 for (int i = 0; i <= STATE_FINAL; i++) { 1012 profile_zero(×[i]); 1013 counts[i] = 0; 1014 } 1015 profile_start(¤t); 1016 profile_start(&total); 1017 } 1018 1019 /// call before changing state 1020 static void prof_store(state_T state) 1021 { 1022 profile_end(¤t); 1023 profile_add(×[state], ¤t); 1024 counts[state]++; 1025 profile_start(¤t); 1026 } 1027 # define PROF_STORE(state) prof_store(state); 1028 1029 static void prof_report(char *name) 1030 { 1031 FILE *fd = fopen("suggestprof", "a"); 1032 1033 profile_end(&total); 1034 fprintf(fd, "-----------------------\n"); 1035 fprintf(fd, "%s: %s\n", name, profile_msg(&total)); 1036 for (int i = 0; i <= STATE_FINAL; i++) { 1037 fprintf(fd, "%d: %s ("%" PRId64)\n", i, profile_msg(×[i]), counts[i]); 1038 } 1039 fclose(fd); 1040 } 1041 #else 1042 # define PROF_STORE(state) 1043 #endif 1044 1045 /// Try finding suggestions by adding/removing/swapping letters. 1046 static void suggest_try_change(suginfo_T *su) 1047 { 1048 char fword[MAXWLEN]; // copy of the bad word, case-folded 1049 1050 // We make a copy of the case-folded bad word, so that we can modify it 1051 // to find matches (esp. REP items). Append some more text, changing 1052 // chars after the bad word may help. 1053 STRCPY(fword, su->su_fbadword); 1054 int n = (int)strlen(fword); 1055 char *p = su->su_badptr + su->su_badlen; 1056 spell_casefold(curwin, p, (int)strlen(p), fword + n, MAXWLEN - n); 1057 1058 // Make sure the resulting text is not longer than the original text. 1059 n = (int)strlen(su->su_badptr); 1060 if (n < MAXWLEN) { 1061 fword[n] = NUL; 1062 } 1063 1064 for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len; lpi++) { 1065 langp_T *lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 1066 1067 // If reloading a spell file fails it's still in the list but 1068 // everything has been cleared. 1069 if (lp->lp_slang->sl_fbyts == NULL) { 1070 continue; 1071 } 1072 1073 // Try it for this language. Will add possible suggestions. 1074 #ifdef SUGGEST_PROFILE 1075 prof_init(); 1076 #endif 1077 suggest_trie_walk(su, lp, fword, false); 1078 #ifdef SUGGEST_PROFILE 1079 prof_report("try_change"); 1080 #endif 1081 } 1082 } 1083 1084 // Check the maximum score, if we go over it we won't try this change. 1085 #define TRY_DEEPER(su, stack, depth, add) \ 1086 ((depth) < MAXWLEN - 1 && (stack)[depth].ts_score + (add) < (su)->su_maxscore) 1087 1088 /// Try finding suggestions by adding/removing/swapping letters. 1089 /// 1090 /// This uses a state machine. At each node in the tree we try various 1091 /// operations. When trying if an operation works "depth" is increased and the 1092 /// stack[] is used to store info. This allows combinations, thus insert one 1093 /// character, replace one and delete another. The number of changes is 1094 /// limited by su->su_maxscore. 1095 /// 1096 /// After implementing this I noticed an article by Kemal Oflazer that 1097 /// describes something similar: "Error-tolerant Finite State Recognition with 1098 /// Applications to Morphological Analysis and Spelling Correction" (1996). 1099 /// The implementation in the article is simplified and requires a stack of 1100 /// unknown depth. The implementation here only needs a stack depth equal to 1101 /// the length of the word. 1102 /// 1103 /// This is also used for the sound-folded word, "soundfold" is true then. 1104 /// The mechanism is the same, but we find a match with a sound-folded word 1105 /// that comes from one or more original words. Each of these words may be 1106 /// added, this is done by add_sound_suggest(). 1107 /// Don't use: 1108 /// the prefix tree or the keep-case tree 1109 /// "su->su_badlen" 1110 /// anything to do with upper and lower case 1111 /// anything to do with word or non-word characters ("spell_iswordp()") 1112 /// banned words 1113 /// word flags (rare, region, compounding) 1114 /// word splitting for now 1115 /// "similar_chars()" 1116 /// use "slang->sl_repsal" instead of "lp->lp_replang->sl_rep" 1117 static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char *fword, bool soundfold) 1118 { 1119 char tword[MAXWLEN]; // good word collected so far 1120 trystate_T stack[MAXWLEN]; 1121 char preword[MAXWLEN * 3] = { 0 }; // word found with proper case; 1122 // concatenation of prefix compound 1123 // words and split word. NUL terminated 1124 // when going deeper but not when coming 1125 // back. 1126 uint8_t compflags[MAXWLEN]; // compound flags, one for each word 1127 uint8_t *byts, *fbyts, *pbyts; 1128 idx_T *idxs, *fidxs, *pidxs; 1129 int c, c2, c3; 1130 int n = 0; 1131 idx_T arridx; 1132 int fl = 0; 1133 int tl; 1134 int repextra = 0; // extra bytes in fword[] from REP item 1135 slang_T *slang = lp->lp_slang; 1136 bool goodword_ends; 1137 #ifdef DEBUG_TRIEWALK 1138 // Stores the name of the change made at each level. 1139 uint8_t changename[MAXWLEN][80]; 1140 #endif 1141 int breakcheckcount = 1000; 1142 1143 // Go through the whole case-fold tree, try changes at each node. 1144 // "tword[]" contains the word collected from nodes in the tree. 1145 // "fword[]" the word we are trying to match with (initially the bad 1146 // word). 1147 int depth = 0; 1148 trystate_T *sp = &stack[0]; 1149 CLEAR_POINTER(sp); 1150 sp->ts_curi = 1; 1151 1152 if (soundfold) { 1153 // Going through the soundfold tree. 1154 byts = fbyts = slang->sl_sbyts; 1155 idxs = fidxs = slang->sl_sidxs; 1156 pbyts = NULL; 1157 pidxs = NULL; 1158 sp->ts_prefixdepth = PFD_NOPREFIX; 1159 sp->ts_state = STATE_START; 1160 } else { 1161 // When there are postponed prefixes we need to use these first. At 1162 // the end of the prefix we continue in the case-fold tree. 1163 fbyts = slang->sl_fbyts; 1164 fidxs = slang->sl_fidxs; 1165 pbyts = slang->sl_pbyts; 1166 pidxs = slang->sl_pidxs; 1167 if (pbyts != NULL) { 1168 byts = pbyts; 1169 idxs = pidxs; 1170 sp->ts_prefixdepth = PFD_PREFIXTREE; 1171 sp->ts_state = STATE_NOPREFIX; // try without prefix first 1172 } else { 1173 byts = fbyts; 1174 idxs = fidxs; 1175 sp->ts_prefixdepth = PFD_NOPREFIX; 1176 sp->ts_state = STATE_START; 1177 } 1178 } 1179 1180 // The loop may take an indefinite amount of time. Break out after some 1181 // time. 1182 proftime_T time_limit = 0; 1183 if (spell_suggest_timeout > 0) { 1184 time_limit = profile_setlimit(spell_suggest_timeout); 1185 } 1186 1187 // Loop to find all suggestions. At each round we either: 1188 // - For the current state try one operation, advance "ts_curi", 1189 // increase "depth". 1190 // - When a state is done go to the next, set "ts_state". 1191 // - When all states are tried decrease "depth". 1192 while (depth >= 0 && !got_int) { 1193 sp = &stack[depth]; 1194 switch (sp->ts_state) { 1195 case STATE_START: 1196 case STATE_NOPREFIX: 1197 // Start of node: Deal with NUL bytes, which means 1198 // tword[] may end here. 1199 arridx = sp->ts_arridx; // current node in the tree 1200 int len = byts[arridx]; // bytes in this node 1201 arridx += sp->ts_curi; // index of current byte 1202 1203 if (sp->ts_prefixdepth == PFD_PREFIXTREE) { 1204 // Skip over the NUL bytes, we use them later. 1205 for (n = 0; n < len && byts[arridx + n] == 0; n++) {} 1206 sp->ts_curi = (int16_t)(sp->ts_curi + n); 1207 1208 // Always past NUL bytes now. 1209 n = (int)sp->ts_state; 1210 PROF_STORE(sp->ts_state) 1211 sp->ts_state = STATE_ENDNUL; 1212 sp->ts_save_badflags = (uint8_t)su->su_badflags; 1213 1214 // At end of a prefix or at start of prefixtree: check for 1215 // following word. 1216 if (depth < MAXWLEN - 1 && (byts[arridx] == 0 || n == STATE_NOPREFIX)) { 1217 // Set su->su_badflags to the caps type at this position. 1218 // Use the caps type until here for the prefix itself. 1219 n = nofold_len(fword, sp->ts_fidx, su->su_badptr); 1220 int flags = badword_captype(su->su_badptr, su->su_badptr + n); 1221 su->su_badflags = badword_captype(su->su_badptr + n, 1222 su->su_badptr + su->su_badlen); 1223 #ifdef DEBUG_TRIEWALK 1224 sprintf(changename[depth], "prefix"); // NOLINT(runtime/printf) 1225 #endif 1226 go_deeper(stack, depth, 0); 1227 depth++; 1228 sp = &stack[depth]; 1229 sp->ts_prefixdepth = (uint8_t)(depth - 1); 1230 byts = fbyts; 1231 idxs = fidxs; 1232 sp->ts_arridx = 0; 1233 1234 // Move the prefix to preword[] with the right case 1235 // and make find_keepcap_word() works. 1236 tword[sp->ts_twordlen] = NUL; 1237 make_case_word(tword + sp->ts_splitoff, 1238 preword + sp->ts_prewordlen, flags); 1239 sp->ts_prewordlen = (uint8_t)strlen(preword); 1240 sp->ts_splitoff = sp->ts_twordlen; 1241 } 1242 break; 1243 } 1244 1245 if (sp->ts_curi > len || byts[arridx] != 0) { 1246 // Past bytes in node and/or past NUL bytes. 1247 PROF_STORE(sp->ts_state) 1248 sp->ts_state = STATE_ENDNUL; 1249 sp->ts_save_badflags = (uint8_t)su->su_badflags; 1250 break; 1251 } 1252 1253 // End of word in tree. 1254 sp->ts_curi++; // eat one NUL byte 1255 1256 int flags = (int)idxs[arridx]; 1257 1258 // Skip words with the NOSUGGEST flag. 1259 if (flags & WF_NOSUGGEST) { 1260 break; 1261 } 1262 1263 bool fword_ends = (fword[sp->ts_fidx] == NUL 1264 || (soundfold 1265 ? ascii_iswhite(fword[sp->ts_fidx]) 1266 : !spell_iswordp(fword + sp->ts_fidx, curwin))); 1267 tword[sp->ts_twordlen] = NUL; 1268 1269 if (sp->ts_prefixdepth <= PFD_NOTSPECIAL 1270 && (sp->ts_flags & TSF_PREFIXOK) == 0 1271 && pbyts != NULL) { 1272 // There was a prefix before the word. Check that the prefix 1273 // can be used with this word. 1274 // Count the length of the NULs in the prefix. If there are 1275 // none this must be the first try without a prefix. 1276 n = stack[sp->ts_prefixdepth].ts_arridx; 1277 len = pbyts[n++]; 1278 for (c = 0; c < len && pbyts[n + c] == 0; c++) {} 1279 if (c > 0) { 1280 c = valid_word_prefix(c, n, flags, 1281 tword + sp->ts_splitoff, slang, false); 1282 if (c == 0) { 1283 break; 1284 } 1285 1286 // Use the WF_RARE flag for a rare prefix. 1287 if (c & WF_RAREPFX) { 1288 flags |= WF_RARE; 1289 } 1290 1291 // Tricky: when checking for both prefix and compounding 1292 // we run into the prefix flag first. 1293 // Remember that it's OK, so that we accept the prefix 1294 // when arriving at a compound flag. 1295 sp->ts_flags |= TSF_PREFIXOK; 1296 } 1297 } 1298 1299 // Check NEEDCOMPOUND: can't use word without compounding. Do try 1300 // appending another compound word below. 1301 if (sp->ts_complen == sp->ts_compsplit && fword_ends 1302 && (flags & WF_NEEDCOMP)) { 1303 goodword_ends = false; 1304 } else { 1305 goodword_ends = true; 1306 } 1307 1308 char *p = NULL; 1309 bool compound_ok = true; 1310 if (sp->ts_complen > sp->ts_compsplit) { 1311 if (slang->sl_nobreak) { 1312 // There was a word before this word. When there was no 1313 // change in this word (it was correct) add the first word 1314 // as a suggestion. If this word was corrected too, we 1315 // need to check if a correct word follows. 1316 if (sp->ts_fidx - sp->ts_splitfidx 1317 == sp->ts_twordlen - sp->ts_splitoff 1318 && strncmp(fword + sp->ts_splitfidx, 1319 tword + sp->ts_splitoff, 1320 (size_t)(sp->ts_fidx - sp->ts_splitfidx)) == 0) { 1321 preword[sp->ts_prewordlen] = NUL; 1322 int newscore = score_wordcount_adj(slang, sp->ts_score, 1323 preword + sp->ts_prewordlen, 1324 sp->ts_prewordlen > 0); 1325 // Add the suggestion if the score isn't too bad. 1326 if (newscore <= su->su_maxscore) { 1327 add_suggestion(su, &su->su_ga, preword, 1328 sp->ts_splitfidx - repextra, 1329 newscore, 0, false, 1330 lp->lp_sallang, false); 1331 } 1332 break; 1333 } 1334 } else { 1335 // There was a compound word before this word. If this 1336 // word does not support compounding then give up 1337 // (splitting is tried for the word without compound 1338 // flag). 1339 if (((unsigned)flags >> 24) == 0 1340 || sp->ts_twordlen - sp->ts_splitoff 1341 < slang->sl_compminlen) { 1342 break; 1343 } 1344 // For multi-byte chars check character length against 1345 // COMPOUNDMIN. 1346 if (slang->sl_compminlen > 0 1347 && mb_charlen(tword + sp->ts_splitoff) 1348 < slang->sl_compminlen) { 1349 break; 1350 } 1351 1352 compflags[sp->ts_complen] = (uint8_t)((unsigned)flags >> 24); 1353 compflags[sp->ts_complen + 1] = NUL; 1354 xmemcpyz(preword + sp->ts_prewordlen, 1355 tword + sp->ts_splitoff, 1356 (size_t)(sp->ts_twordlen - sp->ts_splitoff)); 1357 1358 // Verify CHECKCOMPOUNDPATTERN rules. 1359 if (match_checkcompoundpattern(preword, sp->ts_prewordlen, 1360 &slang->sl_comppat)) { 1361 compound_ok = false; 1362 } 1363 1364 if (compound_ok) { 1365 p = preword; 1366 while (*skiptowhite(p) != NUL) { 1367 p = skipwhite(skiptowhite(p)); 1368 } 1369 if (fword_ends && !can_compound(slang, p, compflags + sp->ts_compsplit)) { 1370 // Compound is not allowed. But it may still be 1371 // possible if we add another (short) word. 1372 compound_ok = false; 1373 } 1374 } 1375 1376 // Get pointer to last char of previous word. 1377 p = preword + sp->ts_prewordlen; 1378 MB_PTR_BACK(preword, p); 1379 } 1380 } 1381 1382 // Form the word with proper case in preword. 1383 // If there is a word from a previous split, append. 1384 // For the soundfold tree don't change the case, simply append. 1385 if (soundfold) { 1386 STRCPY(preword + sp->ts_prewordlen, tword + sp->ts_splitoff); 1387 } else if (flags & WF_KEEPCAP) { 1388 // Must find the word in the keep-case tree. 1389 find_keepcap_word(slang, tword + sp->ts_splitoff, preword + sp->ts_prewordlen); 1390 } else { 1391 // Include badflags: If the badword is onecap or allcap 1392 // use that for the goodword too. But if the badword is 1393 // allcap and it's only one char long use onecap. 1394 c = su->su_badflags; 1395 if ((c & WF_ALLCAP) && su->su_badlen == utfc_ptr2len(su->su_badptr)) { 1396 c = WF_ONECAP; 1397 } 1398 c |= flags; 1399 1400 // When appending a compound word after a word character don't 1401 // use Onecap. 1402 if (p != NULL && spell_iswordp_nmw(p, curwin)) { 1403 c &= ~WF_ONECAP; 1404 } 1405 make_case_word(tword + sp->ts_splitoff, 1406 preword + sp->ts_prewordlen, c); 1407 } 1408 1409 if (!soundfold) { 1410 // Don't use a banned word. It may appear again as a good 1411 // word, thus remember it. 1412 if (flags & WF_BANNED) { 1413 add_banned(su, preword + sp->ts_prewordlen); 1414 break; 1415 } 1416 if ((sp->ts_complen == sp->ts_compsplit 1417 && WAS_BANNED(su, preword + sp->ts_prewordlen)) 1418 || WAS_BANNED(su, preword)) { 1419 if (slang->sl_compprog == NULL) { 1420 break; 1421 } 1422 // the word so far was banned but we may try compounding 1423 goodword_ends = false; 1424 } 1425 } 1426 1427 int newscore = 0; 1428 if (!soundfold) { // soundfold words don't have flags 1429 if ((flags & WF_REGION) 1430 && (((unsigned)flags >> 16) & (unsigned)lp->lp_region) == 0) { 1431 newscore += SCORE_REGION; 1432 } 1433 if (flags & WF_RARE) { 1434 newscore += SCORE_RARE; 1435 } 1436 1437 if (!spell_valid_case(su->su_badflags, 1438 captype(preword + sp->ts_prewordlen, NULL))) { 1439 newscore += SCORE_ICASE; 1440 } 1441 } 1442 1443 // TODO(vim): how about splitting in the soundfold tree? 1444 if (fword_ends 1445 && goodword_ends 1446 && sp->ts_fidx >= sp->ts_fidxtry 1447 && compound_ok) { 1448 // The badword also ends: add suggestions. 1449 #ifdef DEBUG_TRIEWALK 1450 if (soundfold && strcmp(preword, "smwrd") == 0) { 1451 int j; 1452 1453 // print the stack of changes that brought us here 1454 smsg(0, "------ %s -------", fword); 1455 for (j = 0; j < depth; j++) { 1456 smsg(0, "%s", changename[j]); 1457 } 1458 } 1459 #endif 1460 if (soundfold) { 1461 // For soundfolded words we need to find the original 1462 // words, the edit distance and then add them. 1463 add_sound_suggest(su, preword, sp->ts_score, lp); 1464 } else if (sp->ts_fidx > 0) { 1465 // Give a penalty when changing non-word char to word 1466 // char, e.g., "thes," -> "these". 1467 p = fword + sp->ts_fidx; 1468 MB_PTR_BACK(fword, p); 1469 if (!spell_iswordp(p, curwin) && *preword != NUL) { 1470 p = preword + strlen(preword); 1471 MB_PTR_BACK(preword, p); 1472 if (spell_iswordp(p, curwin)) { 1473 newscore += SCORE_NONWORD; 1474 } 1475 } 1476 1477 // Give a bonus to words seen before. 1478 int score = score_wordcount_adj(slang, 1479 sp->ts_score + newscore, 1480 preword + sp->ts_prewordlen, 1481 sp->ts_prewordlen > 0); 1482 1483 // Add the suggestion if the score isn't too bad. 1484 if (score <= su->su_maxscore) { 1485 add_suggestion(su, &su->su_ga, preword, 1486 sp->ts_fidx - repextra, 1487 score, 0, false, lp->lp_sallang, false); 1488 1489 if (su->su_badflags & WF_MIXCAP) { 1490 // We really don't know if the word should be 1491 // upper or lower case, add both. 1492 c = captype(preword, NULL); 1493 if (c == 0 || c == WF_ALLCAP) { 1494 make_case_word(tword + sp->ts_splitoff, 1495 preword + sp->ts_prewordlen, 1496 c == 0 ? WF_ALLCAP : 0); 1497 1498 add_suggestion(su, &su->su_ga, preword, 1499 sp->ts_fidx - repextra, 1500 score + SCORE_ICASE, 0, false, 1501 lp->lp_sallang, false); 1502 } 1503 } 1504 } 1505 } 1506 } 1507 1508 // Try word split and/or compounding. 1509 if ((sp->ts_fidx >= sp->ts_fidxtry || fword_ends) 1510 // Don't split in the middle of a character 1511 && (sp->ts_tcharlen == 0)) { 1512 bool try_compound; 1513 int try_split; 1514 1515 // If past the end of the bad word don't try a split. 1516 // Otherwise try changing the next word. E.g., find 1517 // suggestions for "the the" where the second "the" is 1518 // different. It's done like a split. 1519 // TODO(vim): word split for soundfold words 1520 try_split = (sp->ts_fidx - repextra < su->su_badlen) 1521 && !soundfold; 1522 1523 // Get here in several situations: 1524 // 1. The word in the tree ends: 1525 // If the word allows compounding try that. Otherwise try 1526 // a split by inserting a space. For both check that a 1527 // valid words starts at fword[sp->ts_fidx]. 1528 // For NOBREAK do like compounding to be able to check if 1529 // the next word is valid. 1530 // 2. The badword does end, but it was due to a change (e.g., 1531 // a swap). No need to split, but do check that the 1532 // following word is valid. 1533 // 3. The badword and the word in the tree end. It may still 1534 // be possible to compound another (short) word. 1535 try_compound = false; 1536 if (!soundfold 1537 && !slang->sl_nocompoundsugs 1538 && slang->sl_compprog != NULL 1539 && ((unsigned)flags >> 24) != 0 1540 && sp->ts_twordlen - sp->ts_splitoff 1541 >= slang->sl_compminlen 1542 && (slang->sl_compminlen == 0 1543 || mb_charlen(tword + sp->ts_splitoff) 1544 >= slang->sl_compminlen) 1545 && (slang->sl_compsylmax < MAXWLEN 1546 || sp->ts_complen + 1 - sp->ts_compsplit 1547 < slang->sl_compmax) 1548 && (can_be_compound(sp, slang, compflags, (int)((unsigned)flags >> 24)))) { 1549 try_compound = true; 1550 compflags[sp->ts_complen] = (uint8_t)((unsigned)flags >> 24); 1551 compflags[sp->ts_complen + 1] = NUL; 1552 } 1553 1554 // For NOBREAK we never try splitting, it won't make any word 1555 // valid. 1556 if (slang->sl_nobreak && !slang->sl_nocompoundsugs) { 1557 try_compound = true; 1558 } else if (!fword_ends 1559 && try_compound 1560 && (sp->ts_flags & TSF_DIDSPLIT) == 0) { 1561 // If we could add a compound word, and it's also possible to 1562 // split at this point, do the split first and set 1563 // TSF_DIDSPLIT to avoid doing it again. 1564 try_compound = false; 1565 sp->ts_flags |= TSF_DIDSPLIT; 1566 sp->ts_curi--; // do the same NUL again 1567 compflags[sp->ts_complen] = NUL; 1568 } else { 1569 sp->ts_flags &= (uint8_t) ~TSF_DIDSPLIT; 1570 } 1571 1572 if (try_split || try_compound) { 1573 if (!try_compound && (!fword_ends || !goodword_ends)) { 1574 // If we're going to split need to check that the 1575 // words so far are valid for compounding. If there 1576 // is only one word it must not have the NEEDCOMPOUND 1577 // flag. 1578 if (sp->ts_complen == sp->ts_compsplit 1579 && (flags & WF_NEEDCOMP)) { 1580 break; 1581 } 1582 p = preword; 1583 while (*skiptowhite(p) != NUL) { 1584 p = skipwhite(skiptowhite(p)); 1585 } 1586 if (sp->ts_complen > sp->ts_compsplit 1587 && !can_compound(slang, p, compflags + sp->ts_compsplit)) { 1588 break; 1589 } 1590 1591 if (slang->sl_nosplitsugs) { 1592 newscore += SCORE_SPLIT_NO; 1593 } else { 1594 newscore += SCORE_SPLIT; 1595 } 1596 1597 // Give a bonus to words seen before. 1598 newscore = score_wordcount_adj(slang, newscore, 1599 preword + sp->ts_prewordlen, true); 1600 } 1601 1602 if (TRY_DEEPER(su, stack, depth, newscore)) { 1603 go_deeper(stack, depth, newscore); 1604 #ifdef DEBUG_TRIEWALK 1605 if (!try_compound && !fword_ends) { 1606 sprintf(changename[depth], "%.*s-%s: split", // NOLINT(runtime/printf) 1607 sp->ts_twordlen, tword, fword + sp->ts_fidx); 1608 } else { 1609 sprintf(changename[depth], "%.*s-%s: compound", // NOLINT(runtime/printf) 1610 sp->ts_twordlen, tword, fword + sp->ts_fidx); 1611 } 1612 #endif 1613 // Save things to be restored at STATE_SPLITUNDO. 1614 sp->ts_save_badflags = (uint8_t)su->su_badflags; 1615 PROF_STORE(sp->ts_state) 1616 sp->ts_state = STATE_SPLITUNDO; 1617 1618 depth++; 1619 sp = &stack[depth]; 1620 1621 // Append a space to preword when splitting. 1622 if (!try_compound && !fword_ends) { 1623 strcat(preword, " "); 1624 } 1625 sp->ts_prewordlen = (uint8_t)strlen(preword); 1626 sp->ts_splitoff = sp->ts_twordlen; 1627 sp->ts_splitfidx = sp->ts_fidx; 1628 1629 // If the badword has a non-word character at this 1630 // position skip it. That means replacing the 1631 // non-word character with a space. Always skip a 1632 // character when the word ends. But only when the 1633 // good word can end. 1634 if (((!try_compound && !spell_iswordp_nmw(fword 1635 + sp->ts_fidx, 1636 curwin)) 1637 || fword_ends) 1638 && fword[sp->ts_fidx] != NUL 1639 && goodword_ends) { 1640 int l; 1641 1642 l = utfc_ptr2len(fword + sp->ts_fidx); 1643 if (fword_ends) { 1644 // Copy the skipped character to preword. 1645 memmove(preword + sp->ts_prewordlen, fword + sp->ts_fidx, (size_t)l); 1646 sp->ts_prewordlen = (uint8_t)(sp->ts_prewordlen + l); 1647 preword[sp->ts_prewordlen] = NUL; 1648 } else { 1649 sp->ts_score -= SCORE_SPLIT - SCORE_SUBST; 1650 } 1651 sp->ts_fidx = (uint8_t)(sp->ts_fidx + l); 1652 } 1653 1654 // When compounding include compound flag in 1655 // compflags[] (already set above). When splitting we 1656 // may start compounding over again. 1657 if (try_compound) { 1658 sp->ts_complen++; 1659 } else { 1660 sp->ts_compsplit = sp->ts_complen; 1661 } 1662 sp->ts_prefixdepth = PFD_NOPREFIX; 1663 1664 // set su->su_badflags to the caps type at this 1665 // position 1666 n = nofold_len(fword, sp->ts_fidx, su->su_badptr); 1667 su->su_badflags = badword_captype(su->su_badptr + n, 1668 su->su_badptr + su->su_badlen); 1669 1670 // Restart at top of the tree. 1671 sp->ts_arridx = 0; 1672 1673 // If there are postponed prefixes, try these too. 1674 if (pbyts != NULL) { 1675 byts = pbyts; 1676 idxs = pidxs; 1677 sp->ts_prefixdepth = PFD_PREFIXTREE; 1678 PROF_STORE(sp->ts_state) 1679 sp->ts_state = STATE_NOPREFIX; 1680 } 1681 } 1682 } 1683 } 1684 break; 1685 1686 case STATE_SPLITUNDO: 1687 // Undo the changes done for word split or compound word. 1688 su->su_badflags = sp->ts_save_badflags; 1689 1690 // Continue looking for NUL bytes. 1691 PROF_STORE(sp->ts_state) 1692 sp->ts_state = STATE_START; 1693 1694 // In case we went into the prefix tree. 1695 byts = fbyts; 1696 idxs = fidxs; 1697 break; 1698 1699 case STATE_ENDNUL: 1700 // Past the NUL bytes in the node. 1701 su->su_badflags = sp->ts_save_badflags; 1702 if (fword[sp->ts_fidx] == NUL 1703 && sp->ts_tcharlen == 0) { 1704 // The badword ends, can't use STATE_PLAIN. 1705 PROF_STORE(sp->ts_state) 1706 sp->ts_state = STATE_DEL; 1707 break; 1708 } 1709 PROF_STORE(sp->ts_state) 1710 sp->ts_state = STATE_PLAIN; 1711 FALLTHROUGH; 1712 1713 case STATE_PLAIN: 1714 // Go over all possible bytes at this node, add each to tword[] 1715 // and use child node. "ts_curi" is the index. 1716 arridx = sp->ts_arridx; 1717 if (sp->ts_curi > byts[arridx]) { 1718 // Done all bytes at this node, do next state. When still at 1719 // already changed bytes skip the other tricks. 1720 PROF_STORE(sp->ts_state) 1721 sp->ts_state = sp->ts_fidx >= sp->ts_fidxtry ? STATE_DEL 1722 : STATE_FINAL; 1723 } else { 1724 arridx += sp->ts_curi++; 1725 c = byts[arridx]; 1726 1727 // Normal byte, go one level deeper. If it's not equal to the 1728 // byte in the bad word adjust the score. But don't even try 1729 // when the byte was already changed. And don't try when we 1730 // just deleted this byte, accepting it is always cheaper than 1731 // delete + substitute. 1732 if (c == (uint8_t)fword[sp->ts_fidx] 1733 || (sp->ts_tcharlen > 0 1734 && sp->ts_isdiff != DIFF_NONE)) { 1735 newscore = 0; 1736 } else { 1737 newscore = SCORE_SUBST; 1738 } 1739 if ((newscore == 0 1740 || (sp->ts_fidx >= sp->ts_fidxtry 1741 && ((sp->ts_flags & TSF_DIDDEL) == 0 1742 || c != (uint8_t)fword[sp->ts_delidx]))) 1743 && TRY_DEEPER(su, stack, depth, newscore)) { 1744 go_deeper(stack, depth, newscore); 1745 #ifdef DEBUG_TRIEWALK 1746 if (newscore > 0) { 1747 sprintf(changename[depth], "%.*s-%s: subst %c to %c", // NOLINT(runtime/printf) 1748 sp->ts_twordlen, tword, fword + sp->ts_fidx, 1749 fword[sp->ts_fidx], c); 1750 } else { 1751 sprintf(changename[depth], "%.*s-%s: accept %c", // NOLINT(runtime/printf) 1752 sp->ts_twordlen, tword, fword + sp->ts_fidx, 1753 fword[sp->ts_fidx]); 1754 } 1755 #endif 1756 depth++; 1757 sp = &stack[depth]; 1758 if (fword[sp->ts_fidx] != NUL) { 1759 sp->ts_fidx++; 1760 } 1761 tword[sp->ts_twordlen++] = (char)c; 1762 sp->ts_arridx = idxs[arridx]; 1763 if (newscore == SCORE_SUBST) { 1764 sp->ts_isdiff = DIFF_YES; 1765 } 1766 // Multi-byte characters are a bit complicated to 1767 // handle: They differ when any of the bytes differ 1768 // and then their length may also differ. 1769 if (sp->ts_tcharlen == 0) { 1770 // First byte. 1771 sp->ts_tcharidx = 0; 1772 sp->ts_tcharlen = MB_BYTE2LEN(c); 1773 sp->ts_fcharstart = (uint8_t)(sp->ts_fidx - 1); 1774 sp->ts_isdiff = (newscore != 0) 1775 ? DIFF_YES : DIFF_NONE; 1776 } else if (sp->ts_isdiff == DIFF_INSERT && sp->ts_fidx > 0) { 1777 // When inserting trail bytes don't advance in the 1778 // bad word. 1779 sp->ts_fidx--; 1780 } 1781 if (++sp->ts_tcharidx == sp->ts_tcharlen) { 1782 // Last byte of character. 1783 if (sp->ts_isdiff == DIFF_YES) { 1784 // Correct ts_fidx for the byte length of the 1785 // character (we didn't check that before). 1786 sp->ts_fidx = (uint8_t)(sp->ts_fcharstart 1787 + utfc_ptr2len(fword + sp->ts_fcharstart)); 1788 1789 // For changing a composing character adjust 1790 // the score from SCORE_SUBST to 1791 // SCORE_SUBCOMP. 1792 if (utf_iscomposing_legacy(utf_ptr2char(tword + sp->ts_twordlen - sp->ts_tcharlen)) 1793 && utf_iscomposing_legacy(utf_ptr2char(fword + sp->ts_fcharstart))) { 1794 sp->ts_score -= SCORE_SUBST - SCORE_SUBCOMP; 1795 } else if (!soundfold 1796 && slang->sl_has_map 1797 && similar_chars(slang, 1798 utf_ptr2char(tword + sp->ts_twordlen - 1799 sp->ts_tcharlen), 1800 utf_ptr2char(fword + sp->ts_fcharstart))) { 1801 // For a similar character adjust score from 1802 // SCORE_SUBST to SCORE_SIMILAR. 1803 sp->ts_score -= SCORE_SUBST - SCORE_SIMILAR; 1804 } 1805 } else if (sp->ts_isdiff == DIFF_INSERT 1806 && sp->ts_twordlen > sp->ts_tcharlen) { 1807 p = tword + sp->ts_twordlen - sp->ts_tcharlen; 1808 c = utf_ptr2char(p); 1809 if (utf_iscomposing_legacy(c)) { 1810 // Inserting a composing char doesn't 1811 // count that much. 1812 sp->ts_score -= SCORE_INS - SCORE_INSCOMP; 1813 } else { 1814 // If the previous character was the same, 1815 // thus doubling a character, give a bonus 1816 // to the score. Also for the soundfold 1817 // tree (might seem illogical but does 1818 // give better scores). 1819 MB_PTR_BACK(tword, p); 1820 if (c == utf_ptr2char(p)) { 1821 sp->ts_score -= SCORE_INS - SCORE_INSDUP; 1822 } 1823 } 1824 } 1825 1826 // Starting a new char, reset the length. 1827 sp->ts_tcharlen = 0; 1828 } 1829 } 1830 } 1831 break; 1832 1833 case STATE_DEL: 1834 // When past the first byte of a multi-byte char don't try 1835 // delete/insert/swap a character. 1836 if (sp->ts_tcharlen > 0) { 1837 PROF_STORE(sp->ts_state) 1838 sp->ts_state = STATE_FINAL; 1839 break; 1840 } 1841 // Try skipping one character in the bad word (delete it). 1842 PROF_STORE(sp->ts_state) 1843 sp->ts_state = STATE_INS_PREP; 1844 sp->ts_curi = 1; 1845 if (soundfold && sp->ts_fidx == 0 && fword[sp->ts_fidx] == '*') { 1846 // Deleting a vowel at the start of a word counts less, see 1847 // soundalike_score(). 1848 newscore = 2 * SCORE_DEL / 3; 1849 } else { 1850 newscore = SCORE_DEL; 1851 } 1852 if (fword[sp->ts_fidx] != NUL 1853 && TRY_DEEPER(su, stack, depth, newscore)) { 1854 go_deeper(stack, depth, newscore); 1855 #ifdef DEBUG_TRIEWALK 1856 sprintf(changename[depth], "%.*s-%s: delete %c", // NOLINT(runtime/printf) 1857 sp->ts_twordlen, tword, fword + sp->ts_fidx, 1858 fword[sp->ts_fidx]); 1859 #endif 1860 depth++; 1861 1862 // Remember what character we deleted, so that we can avoid 1863 // inserting it again. 1864 stack[depth].ts_flags |= TSF_DIDDEL; 1865 stack[depth].ts_delidx = sp->ts_fidx; 1866 1867 // Advance over the character in fword[]. Give a bonus to the 1868 // score if the same character is following "nn" -> "n". It's 1869 // a bit illogical for soundfold tree but it does give better 1870 // results. 1871 c = utf_ptr2char(fword + sp->ts_fidx); 1872 stack[depth].ts_fidx = 1873 (uint8_t)(stack[depth].ts_fidx + utfc_ptr2len(fword + sp->ts_fidx)); 1874 if (utf_iscomposing_legacy(c)) { 1875 stack[depth].ts_score -= SCORE_DEL - SCORE_DELCOMP; 1876 } else if (c == utf_ptr2char(fword + stack[depth].ts_fidx)) { 1877 stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP; 1878 } 1879 1880 break; 1881 } 1882 FALLTHROUGH; 1883 1884 case STATE_INS_PREP: 1885 if (sp->ts_flags & TSF_DIDDEL) { 1886 // If we just deleted a byte then inserting won't make sense, 1887 // a substitute is always cheaper. 1888 PROF_STORE(sp->ts_state) 1889 sp->ts_state = STATE_SWAP; 1890 break; 1891 } 1892 1893 // skip over NUL bytes 1894 n = sp->ts_arridx; 1895 while (true) { 1896 if (sp->ts_curi > byts[n]) { 1897 // Only NUL bytes at this node, go to next state. 1898 PROF_STORE(sp->ts_state) 1899 sp->ts_state = STATE_SWAP; 1900 break; 1901 } 1902 if (byts[n + sp->ts_curi] != NUL) { 1903 // Found a byte to insert. 1904 PROF_STORE(sp->ts_state) 1905 sp->ts_state = STATE_INS; 1906 break; 1907 } 1908 sp->ts_curi++; 1909 } 1910 break; 1911 1912 case STATE_INS: 1913 // Insert one byte. Repeat this for each possible byte at this 1914 // node. 1915 n = sp->ts_arridx; 1916 if (sp->ts_curi > byts[n]) { 1917 // Done all bytes at this node, go to next state. 1918 PROF_STORE(sp->ts_state) 1919 sp->ts_state = STATE_SWAP; 1920 break; 1921 } 1922 1923 // Do one more byte at this node, but: 1924 // - Skip NUL bytes. 1925 // - Skip the byte if it's equal to the byte in the word, 1926 // accepting that byte is always better. 1927 n += sp->ts_curi++; 1928 1929 // break out, if we would be accessing byts buffer out of bounds 1930 if (byts == slang->sl_fbyts && n >= slang->sl_fbyts_len) { 1931 got_int = true; 1932 break; 1933 } 1934 c = byts[n]; 1935 if (soundfold && sp->ts_twordlen == 0 && c == '*') { 1936 // Inserting a vowel at the start of a word counts less, 1937 // see soundalike_score(). 1938 newscore = 2 * SCORE_INS / 3; 1939 } else { 1940 newscore = SCORE_INS; 1941 } 1942 if (c != (uint8_t)fword[sp->ts_fidx] 1943 && TRY_DEEPER(su, stack, depth, newscore)) { 1944 go_deeper(stack, depth, newscore); 1945 #ifdef DEBUG_TRIEWALK 1946 sprintf(changename[depth], "%.*s-%s: insert %c", // NOLINT(runtime/printf) 1947 sp->ts_twordlen, tword, fword + sp->ts_fidx, 1948 c); 1949 #endif 1950 depth++; 1951 sp = &stack[depth]; 1952 tword[sp->ts_twordlen++] = (char)c; 1953 sp->ts_arridx = idxs[n]; 1954 fl = MB_BYTE2LEN(c); 1955 if (fl > 1) { 1956 // There are following bytes for the same character. 1957 // We must find all bytes before trying 1958 // delete/insert/swap/etc. 1959 sp->ts_tcharlen = (uint8_t)fl; 1960 sp->ts_tcharidx = 1; 1961 sp->ts_isdiff = DIFF_INSERT; 1962 } 1963 if (fl == 1) { 1964 // If the previous character was the same, thus doubling a 1965 // character, give a bonus to the score. Also for 1966 // soundfold words (illogical but does give a better 1967 // score). 1968 if (sp->ts_twordlen >= 2 1969 && (uint8_t)tword[sp->ts_twordlen - 2] == c) { 1970 sp->ts_score -= SCORE_INS - SCORE_INSDUP; 1971 } 1972 } 1973 } 1974 break; 1975 1976 case STATE_SWAP: 1977 // Swap two bytes in the bad word: "12" -> "21". 1978 // We change "fword" here, it's changed back afterwards at 1979 // STATE_UNSWAP. 1980 p = fword + sp->ts_fidx; 1981 c = (uint8_t)(*p); 1982 if (c == NUL) { 1983 // End of word, can't swap or replace. 1984 PROF_STORE(sp->ts_state) 1985 sp->ts_state = STATE_FINAL; 1986 break; 1987 } 1988 1989 // Don't swap if the first character is not a word character. 1990 // SWAP3 etc. also don't make sense then. 1991 if (!soundfold && !spell_iswordp(p, curwin)) { 1992 PROF_STORE(sp->ts_state) 1993 sp->ts_state = STATE_REP_INI; 1994 break; 1995 } 1996 1997 n = utf_ptr2len(p); 1998 c = utf_ptr2char(p); 1999 if (p[n] == NUL) { 2000 c2 = NUL; 2001 } else if (!soundfold && !spell_iswordp(p + n, curwin)) { 2002 c2 = c; // don't swap non-word char 2003 } else { 2004 c2 = utf_ptr2char(p + n); 2005 } 2006 2007 // When the second character is NUL we can't swap. 2008 if (c2 == NUL) { 2009 PROF_STORE(sp->ts_state) 2010 sp->ts_state = STATE_REP_INI; 2011 break; 2012 } 2013 2014 // When characters are identical, swap won't do anything. 2015 // Also get here if the second char is not a word character. 2016 if (c == c2) { 2017 PROF_STORE(sp->ts_state) 2018 sp->ts_state = STATE_SWAP3; 2019 break; 2020 } 2021 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP)) { 2022 go_deeper(stack, depth, SCORE_SWAP); 2023 #ifdef DEBUG_TRIEWALK 2024 snprintf(changename[depth], sizeof(changename[0]), 2025 "%.*s-%s: swap %c and %c", 2026 sp->ts_twordlen, tword, fword + sp->ts_fidx, 2027 c, c2); 2028 #endif 2029 PROF_STORE(sp->ts_state) 2030 sp->ts_state = STATE_UNSWAP; 2031 depth++; 2032 fl = utf_char2len(c2); 2033 memmove(p, p + n, (size_t)fl); 2034 utf_char2bytes(c, p + fl); 2035 stack[depth].ts_fidxtry = (uint8_t)(sp->ts_fidx + n + fl); 2036 } else { 2037 // If this swap doesn't work then SWAP3 won't either. 2038 PROF_STORE(sp->ts_state) 2039 sp->ts_state = STATE_REP_INI; 2040 } 2041 break; 2042 2043 case STATE_UNSWAP: 2044 // Undo the STATE_SWAP swap: "21" -> "12". 2045 p = fword + sp->ts_fidx; 2046 n = utfc_ptr2len(p); 2047 c = utf_ptr2char(p + n); 2048 memmove(p + utfc_ptr2len(p + n), p, (size_t)n); 2049 utf_char2bytes(c, p); 2050 2051 FALLTHROUGH; 2052 2053 case STATE_SWAP3: 2054 // Swap two bytes, skipping one: "123" -> "321". We change 2055 // "fword" here, it's changed back afterwards at STATE_UNSWAP3. 2056 p = fword + sp->ts_fidx; 2057 n = utf_ptr2len(p); 2058 c = utf_ptr2char(p); 2059 fl = utf_ptr2len(p + n); 2060 c2 = utf_ptr2char(p + n); 2061 if (!soundfold && !spell_iswordp(p + n + fl, curwin)) { 2062 c3 = c; // don't swap non-word char 2063 } else { 2064 c3 = utf_ptr2char(p + n + fl); 2065 } 2066 2067 // When characters are identical: "121" then SWAP3 result is 2068 // identical, ROT3L result is same as SWAP: "211", ROT3L result is 2069 // same as SWAP on next char: "112". Thus skip all swapping. 2070 // Also skip when c3 is NUL. 2071 // Also get here when the third character is not a word character. 2072 // Second character may any char: "a.b" -> "b.a" 2073 if (c == c3 || c3 == NUL) { 2074 PROF_STORE(sp->ts_state) 2075 sp->ts_state = STATE_REP_INI; 2076 break; 2077 } 2078 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3)) { 2079 go_deeper(stack, depth, SCORE_SWAP3); 2080 #ifdef DEBUG_TRIEWALK 2081 sprintf(changename[depth], "%.*s-%s: swap3 %c and %c", // NOLINT(runtime/printf) 2082 sp->ts_twordlen, tword, fword + sp->ts_fidx, 2083 c, c3); 2084 #endif 2085 PROF_STORE(sp->ts_state) 2086 sp->ts_state = STATE_UNSWAP3; 2087 depth++; 2088 tl = utf_char2len(c3); 2089 memmove(p, p + n + fl, (size_t)tl); 2090 utf_char2bytes(c2, p + tl); 2091 utf_char2bytes(c, p + fl + tl); 2092 stack[depth].ts_fidxtry = (uint8_t)(sp->ts_fidx + n + fl + tl); 2093 } else { 2094 PROF_STORE(sp->ts_state) 2095 sp->ts_state = STATE_REP_INI; 2096 } 2097 break; 2098 2099 case STATE_UNSWAP3: 2100 // Undo STATE_SWAP3: "321" -> "123" 2101 p = fword + sp->ts_fidx; 2102 n = utfc_ptr2len(p); 2103 c2 = utf_ptr2char(p + n); 2104 fl = utfc_ptr2len(p + n); 2105 c = utf_ptr2char(p + n + fl); 2106 tl = utfc_ptr2len(p + n + fl); 2107 memmove(p + fl + tl, p, (size_t)n); 2108 utf_char2bytes(c, p); 2109 utf_char2bytes(c2, p + tl); 2110 p = p + tl; 2111 2112 if (!soundfold && !spell_iswordp(p, curwin)) { 2113 // Middle char is not a word char, skip the rotate. First and 2114 // third char were already checked at swap and swap3. 2115 PROF_STORE(sp->ts_state) 2116 sp->ts_state = STATE_REP_INI; 2117 break; 2118 } 2119 2120 // Rotate three characters left: "123" -> "231". We change 2121 // "fword" here, it's changed back afterwards at STATE_UNROT3L. 2122 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3)) { 2123 go_deeper(stack, depth, SCORE_SWAP3); 2124 #ifdef DEBUG_TRIEWALK 2125 p = fword + sp->ts_fidx; 2126 sprintf(changename[depth], "%.*s-%s: rotate left %c%c%c", // NOLINT(runtime/printf) 2127 sp->ts_twordlen, tword, fword + sp->ts_fidx, 2128 p[0], p[1], p[2]); 2129 #endif 2130 PROF_STORE(sp->ts_state) 2131 sp->ts_state = STATE_UNROT3L; 2132 depth++; 2133 p = fword + sp->ts_fidx; 2134 n = utf_ptr2len(p); 2135 c = utf_ptr2char(p); 2136 fl = utf_ptr2len(p + n); 2137 fl += utf_ptr2len(p + n + fl); 2138 memmove(p, p + n, (size_t)fl); 2139 utf_char2bytes(c, p + fl); 2140 stack[depth].ts_fidxtry = (uint8_t)(sp->ts_fidx + n + fl); 2141 } else { 2142 PROF_STORE(sp->ts_state) 2143 sp->ts_state = STATE_REP_INI; 2144 } 2145 break; 2146 2147 case STATE_UNROT3L: 2148 // Undo ROT3L: "231" -> "123" 2149 p = fword + sp->ts_fidx; 2150 n = utfc_ptr2len(p); 2151 n += utfc_ptr2len(p + n); 2152 c = utf_ptr2char(p + n); 2153 tl = utfc_ptr2len(p + n); 2154 memmove(p + tl, p, (size_t)n); 2155 utf_char2bytes(c, p); 2156 2157 // Rotate three bytes right: "123" -> "312". We change "fword" 2158 // here, it's changed back afterwards at STATE_UNROT3R. 2159 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3)) { 2160 go_deeper(stack, depth, SCORE_SWAP3); 2161 #ifdef DEBUG_TRIEWALK 2162 p = fword + sp->ts_fidx; 2163 sprintf(changename[depth], "%.*s-%s: rotate right %c%c%c", // NOLINT(runtime/printf) 2164 sp->ts_twordlen, tword, fword + sp->ts_fidx, 2165 p[0], p[1], p[2]); 2166 #endif 2167 PROF_STORE(sp->ts_state) 2168 sp->ts_state = STATE_UNROT3R; 2169 depth++; 2170 p = fword + sp->ts_fidx; 2171 n = utf_ptr2len(p); 2172 n += utf_ptr2len(p + n); 2173 c = utf_ptr2char(p + n); 2174 tl = utf_ptr2len(p + n); 2175 memmove(p + tl, p, (size_t)n); 2176 utf_char2bytes(c, p); 2177 stack[depth].ts_fidxtry = (uint8_t)(sp->ts_fidx + n + tl); 2178 } else { 2179 PROF_STORE(sp->ts_state) 2180 sp->ts_state = STATE_REP_INI; 2181 } 2182 break; 2183 2184 case STATE_UNROT3R: 2185 // Undo ROT3R: "312" -> "123" 2186 p = fword + sp->ts_fidx; 2187 c = utf_ptr2char(p); 2188 tl = utfc_ptr2len(p); 2189 n = utfc_ptr2len(p + tl); 2190 n += utfc_ptr2len(p + tl + n); 2191 memmove(p, p + tl, (size_t)n); 2192 utf_char2bytes(c, p + n); 2193 2194 FALLTHROUGH; 2195 2196 case STATE_REP_INI: 2197 // Check if matching with REP items from the .aff file would work. 2198 // Quickly skip if: 2199 // - there are no REP items and we are not in the soundfold trie 2200 // - the score is going to be too high anyway 2201 // - already applied a REP item or swapped here 2202 if ((lp->lp_replang == NULL && !soundfold) 2203 || sp->ts_score + SCORE_REP >= su->su_maxscore 2204 || sp->ts_fidx < sp->ts_fidxtry) { 2205 PROF_STORE(sp->ts_state) 2206 sp->ts_state = STATE_FINAL; 2207 break; 2208 } 2209 2210 // Use the first byte to quickly find the first entry that may 2211 // match. If the index is -1 there is none. 2212 if (soundfold) { 2213 sp->ts_curi = slang->sl_repsal_first[(uint8_t)fword[sp->ts_fidx]]; 2214 } else { 2215 sp->ts_curi = lp->lp_replang->sl_rep_first[(uint8_t)fword[sp->ts_fidx]]; 2216 } 2217 2218 if (sp->ts_curi < 0) { 2219 PROF_STORE(sp->ts_state) 2220 sp->ts_state = STATE_FINAL; 2221 break; 2222 } 2223 2224 PROF_STORE(sp->ts_state) 2225 sp->ts_state = STATE_REP; 2226 FALLTHROUGH; 2227 2228 case STATE_REP: 2229 // Try matching with REP items from the .aff file. For each match 2230 // replace the characters and check if the resulting word is 2231 // valid. 2232 p = fword + sp->ts_fidx; 2233 2234 garray_T *gap = soundfold ? &slang->sl_repsal 2235 : &lp->lp_replang->sl_rep; 2236 while (sp->ts_curi < gap->ga_len) { 2237 fromto_T *ftp = (fromto_T *)gap->ga_data + sp->ts_curi++; 2238 if (*ftp->ft_from != *p) { 2239 // past possible matching entries 2240 sp->ts_curi = (int16_t)gap->ga_len; 2241 break; 2242 } 2243 if (strncmp(ftp->ft_from, p, strlen(ftp->ft_from)) == 0 2244 && TRY_DEEPER(su, stack, depth, SCORE_REP)) { 2245 go_deeper(stack, depth, SCORE_REP); 2246 #ifdef DEBUG_TRIEWALK 2247 sprintf(changename[depth], "%.*s-%s: replace %s with %s", // NOLINT(runtime/printf) 2248 sp->ts_twordlen, tword, fword + sp->ts_fidx, 2249 ftp->ft_from, ftp->ft_to); 2250 #endif 2251 // Need to undo this afterwards. 2252 PROF_STORE(sp->ts_state) 2253 sp->ts_state = STATE_REP_UNDO; 2254 2255 // Change the "from" to the "to" string. 2256 depth++; 2257 fl = (int)strlen(ftp->ft_from); 2258 tl = (int)strlen(ftp->ft_to); 2259 if (fl != tl) { 2260 STRMOVE(p + tl, p + fl); 2261 repextra += tl - fl; 2262 } 2263 memmove(p, ftp->ft_to, (size_t)tl); 2264 stack[depth].ts_fidxtry = (uint8_t)(sp->ts_fidx + tl); 2265 stack[depth].ts_tcharlen = 0; 2266 break; 2267 } 2268 } 2269 2270 if (sp->ts_curi >= gap->ga_len && sp->ts_state == STATE_REP) { 2271 // No (more) matches. 2272 PROF_STORE(sp->ts_state) 2273 sp->ts_state = STATE_FINAL; 2274 } 2275 2276 break; 2277 2278 case STATE_REP_UNDO: 2279 // Undo a REP replacement and continue with the next one. 2280 if (soundfold) { 2281 gap = &slang->sl_repsal; 2282 } else { 2283 gap = &lp->lp_replang->sl_rep; 2284 } 2285 fromto_T *ftp = (fromto_T *)gap->ga_data + sp->ts_curi - 1; 2286 fl = (int)strlen(ftp->ft_from); 2287 tl = (int)strlen(ftp->ft_to); 2288 p = fword + sp->ts_fidx; 2289 if (fl != tl) { 2290 STRMOVE(p + fl, p + tl); 2291 repextra -= tl - fl; 2292 } 2293 memmove(p, ftp->ft_from, (size_t)fl); 2294 PROF_STORE(sp->ts_state) 2295 sp->ts_state = STATE_REP; 2296 break; 2297 2298 default: 2299 // Did all possible states at this level, go up one level. 2300 depth--; 2301 2302 if (depth >= 0 && stack[depth].ts_prefixdepth == PFD_PREFIXTREE) { 2303 // Continue in or go back to the prefix tree. 2304 byts = pbyts; 2305 idxs = pidxs; 2306 } 2307 2308 // Don't check for CTRL-C too often, it takes time. 2309 if (--breakcheckcount == 0) { 2310 os_breakcheck(); 2311 breakcheckcount = 1000; 2312 if (spell_suggest_timeout > 0 && profile_passed_limit(time_limit)) { 2313 got_int = true; 2314 } 2315 } 2316 } 2317 } 2318 } 2319 2320 /// Go one level deeper in the tree. 2321 static void go_deeper(trystate_T *stack, int depth, int score_add) 2322 { 2323 stack[depth + 1] = stack[depth]; 2324 stack[depth + 1].ts_state = STATE_START; 2325 stack[depth + 1].ts_score = stack[depth].ts_score + score_add; 2326 stack[depth + 1].ts_curi = 1; // start just after length byte 2327 stack[depth + 1].ts_flags = 0; 2328 } 2329 2330 /// "fword" is a good word with case folded. Find the matching keep-case 2331 /// words and put it in "kword". 2332 /// Theoretically there could be several keep-case words that result in the 2333 /// same case-folded word, but we only find one... 2334 static void find_keepcap_word(slang_T *slang, char *fword, char *kword) 2335 { 2336 char uword[MAXWLEN]; // "fword" in upper-case 2337 idx_T tryidx; 2338 2339 // The following arrays are used at each depth in the tree. 2340 idx_T arridx[MAXWLEN]; 2341 int round[MAXWLEN]; 2342 int fwordidx[MAXWLEN]; 2343 int uwordidx[MAXWLEN]; 2344 int kwordlen[MAXWLEN]; 2345 2346 int l; 2347 char *p; 2348 uint8_t *byts = slang->sl_kbyts; // array with bytes of the words 2349 idx_T *idxs = slang->sl_kidxs; // array with indexes 2350 2351 if (byts == NULL) { 2352 // array is empty: "cannot happen" 2353 *kword = NUL; 2354 return; 2355 } 2356 2357 // Make an all-cap version of "fword". 2358 allcap_copy(fword, uword); 2359 2360 // Each character needs to be tried both case-folded and upper-case. 2361 // All this gets very complicated if we keep in mind that changing case 2362 // may change the byte length of a multi-byte character... 2363 int depth = 0; 2364 arridx[0] = 0; 2365 round[0] = 0; 2366 fwordidx[0] = 0; 2367 uwordidx[0] = 0; 2368 kwordlen[0] = 0; 2369 while (depth >= 0) { 2370 if (fword[fwordidx[depth]] == NUL) { 2371 // We are at the end of "fword". If the tree allows a word to end 2372 // here we have found a match. 2373 if (byts[arridx[depth] + 1] == 0) { 2374 kword[kwordlen[depth]] = NUL; 2375 return; 2376 } 2377 2378 // kword is getting too long, continue one level up 2379 depth--; 2380 } else if (++round[depth] > 2) { 2381 // tried both fold-case and upper-case character, continue one 2382 // level up 2383 depth--; 2384 } else { 2385 // round[depth] == 1: Try using the folded-case character. 2386 // round[depth] == 2: Try using the upper-case character. 2387 int flen = utf_ptr2len(fword + fwordidx[depth]); 2388 int ulen = utf_ptr2len(uword + uwordidx[depth]); 2389 if (round[depth] == 1) { 2390 p = fword + fwordidx[depth]; 2391 l = flen; 2392 } else { 2393 p = uword + uwordidx[depth]; 2394 l = ulen; 2395 } 2396 2397 for (tryidx = arridx[depth]; l > 0; l--) { 2398 // Perform a binary search in the list of accepted bytes. 2399 int len = byts[tryidx++]; 2400 int c = (uint8_t)(*p++); 2401 idx_T lo = tryidx; 2402 idx_T hi = tryidx + len - 1; 2403 while (lo < hi) { 2404 idx_T m = (lo + hi) / 2; 2405 if (byts[m] > c) { 2406 hi = m - 1; 2407 } else if (byts[m] < c) { 2408 lo = m + 1; 2409 } else { 2410 lo = hi = m; 2411 break; 2412 } 2413 } 2414 2415 // Stop if there is no matching byte. 2416 if (hi < lo || byts[lo] != c) { 2417 break; 2418 } 2419 2420 // Continue at the child (if there is one). 2421 tryidx = idxs[lo]; 2422 } 2423 2424 if (l == 0) { 2425 // Found the matching char. Copy it to "kword" and go a 2426 // level deeper. 2427 if (round[depth] == 1) { 2428 strncpy(kword + kwordlen[depth], // NOLINT(runtime/printf) 2429 fword + fwordidx[depth], 2430 (size_t)flen); 2431 kwordlen[depth + 1] = kwordlen[depth] + flen; 2432 } else { 2433 strncpy(kword + kwordlen[depth], // NOLINT(runtime/printf) 2434 uword + uwordidx[depth], 2435 (size_t)ulen); 2436 kwordlen[depth + 1] = kwordlen[depth] + ulen; 2437 } 2438 fwordidx[depth + 1] = fwordidx[depth] + flen; 2439 uwordidx[depth + 1] = uwordidx[depth] + ulen; 2440 2441 depth++; 2442 arridx[depth] = tryidx; 2443 round[depth] = 0; 2444 } 2445 } 2446 } 2447 2448 // Didn't find it: "cannot happen". 2449 *kword = NUL; 2450 } 2451 2452 /// Compute the sound-a-like score for suggestions in su->su_ga and add them to 2453 /// su->su_sga. 2454 static void score_comp_sal(suginfo_T *su) 2455 { 2456 char badsound[MAXWLEN]; 2457 2458 ga_grow(&su->su_sga, su->su_ga.ga_len); 2459 2460 // Use the sound-folding of the first language that supports it. 2461 for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len; lpi++) { 2462 langp_T *lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 2463 if (!GA_EMPTY(&lp->lp_slang->sl_sal)) { 2464 // soundfold the bad word 2465 spell_soundfold(lp->lp_slang, su->su_fbadword, true, badsound); 2466 2467 for (int i = 0; i < su->su_ga.ga_len; i++) { 2468 suggest_T *stp = &SUG(su->su_ga, i); 2469 2470 // Case-fold the suggested word, sound-fold it and compute the 2471 // sound-a-like score. 2472 int score = stp_sal_score(stp, su, lp->lp_slang, badsound); 2473 if (score < SCORE_MAXMAX) { 2474 // Add the suggestion. 2475 suggest_T *sstp = &SUG(su->su_sga, su->su_sga.ga_len); 2476 sstp->st_word = xstrdup(stp->st_word); 2477 sstp->st_wordlen = stp->st_wordlen; 2478 sstp->st_score = score; 2479 sstp->st_altscore = 0; 2480 sstp->st_orglen = stp->st_orglen; 2481 su->su_sga.ga_len++; 2482 } 2483 } 2484 break; 2485 } 2486 } 2487 } 2488 2489 /// Combine the list of suggestions in su->su_ga and su->su_sga. 2490 /// They are entwined. 2491 static void score_combine(suginfo_T *su) 2492 { 2493 garray_T ga; 2494 char *p; 2495 char badsound[MAXWLEN]; 2496 slang_T *slang = NULL; 2497 2498 // Add the alternate score to su_ga. 2499 for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len; lpi++) { 2500 langp_T *lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 2501 if (!GA_EMPTY(&lp->lp_slang->sl_sal)) { 2502 // soundfold the bad word 2503 slang = lp->lp_slang; 2504 spell_soundfold(slang, su->su_fbadword, true, badsound); 2505 2506 for (int i = 0; i < su->su_ga.ga_len; i++) { 2507 suggest_T *stp = &SUG(su->su_ga, i); 2508 stp->st_altscore = stp_sal_score(stp, su, slang, badsound); 2509 if (stp->st_altscore == SCORE_MAXMAX) { 2510 stp->st_score = (stp->st_score * 3 + SCORE_BIG) / 4; 2511 } else { 2512 stp->st_score = (stp->st_score * 3 + stp->st_altscore) / 4; 2513 } 2514 stp->st_salscore = false; 2515 } 2516 break; 2517 } 2518 } 2519 2520 if (slang == NULL) { // Using "double" without sound folding. 2521 cleanup_suggestions(&su->su_ga, su->su_maxscore, 2522 su->su_maxcount); 2523 return; 2524 } 2525 2526 // Add the alternate score to su_sga. 2527 for (int i = 0; i < su->su_sga.ga_len; i++) { 2528 suggest_T *stp = &SUG(su->su_sga, i); 2529 stp->st_altscore = spell_edit_score(slang, su->su_badword, stp->st_word); 2530 if (stp->st_score == SCORE_MAXMAX) { 2531 stp->st_score = (SCORE_BIG * 7 + stp->st_altscore) / 8; 2532 } else { 2533 stp->st_score = (stp->st_score * 7 + stp->st_altscore) / 8; 2534 } 2535 stp->st_salscore = true; 2536 } 2537 2538 // Remove bad suggestions, sort the suggestions and truncate at "maxcount" 2539 // for both lists. 2540 check_suggestions(su, &su->su_ga); 2541 cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); 2542 check_suggestions(su, &su->su_sga); 2543 cleanup_suggestions(&su->su_sga, su->su_maxscore, su->su_maxcount); 2544 2545 ga_init(&ga, (int)sizeof(suginfo_T), 1); 2546 ga_grow(&ga, su->su_ga.ga_len + su->su_sga.ga_len); 2547 2548 suggest_T *stp = &SUG(ga, 0); 2549 for (int i = 0; i < su->su_ga.ga_len || i < su->su_sga.ga_len; i++) { 2550 // round 1: get a suggestion from su_ga 2551 // round 2: get a suggestion from su_sga 2552 for (int round = 1; round <= 2; round++) { 2553 garray_T *gap = round == 1 ? &su->su_ga : &su->su_sga; 2554 if (i < gap->ga_len) { 2555 // Don't add a word if it's already there. 2556 p = SUG(*gap, i).st_word; 2557 int j; 2558 for (j = 0; j < ga.ga_len; j++) { 2559 if (strcmp(stp[j].st_word, p) == 0) { 2560 break; 2561 } 2562 } 2563 if (j == ga.ga_len) { 2564 stp[ga.ga_len++] = SUG(*gap, i); 2565 } else { 2566 xfree(p); 2567 } 2568 } 2569 } 2570 } 2571 2572 ga_clear(&su->su_ga); 2573 ga_clear(&su->su_sga); 2574 2575 // Truncate the list to the number of suggestions that will be displayed. 2576 if (ga.ga_len > su->su_maxcount) { 2577 for (int i = su->su_maxcount; i < ga.ga_len; i++) { 2578 xfree(stp[i].st_word); 2579 } 2580 ga.ga_len = su->su_maxcount; 2581 } 2582 2583 su->su_ga = ga; 2584 } 2585 2586 /// For the goodword in "stp" compute the soundalike score compared to the 2587 /// badword. 2588 /// 2589 /// @param badsound sound-folded badword 2590 static int stp_sal_score(suggest_T *stp, suginfo_T *su, slang_T *slang, char *badsound) 2591 { 2592 char *pbad; 2593 char *pgood; 2594 char badsound2[MAXWLEN]; 2595 char fword[MAXWLEN]; 2596 char goodsound[MAXWLEN]; 2597 char goodword[MAXWLEN]; 2598 2599 int lendiff = su->su_badlen - stp->st_orglen; 2600 if (lendiff >= 0) { 2601 pbad = badsound; 2602 } else { 2603 // soundfold the bad word with more characters following 2604 spell_casefold(curwin, su->su_badptr, stp->st_orglen, fword, MAXWLEN); 2605 2606 // When joining two words the sound often changes a lot. E.g., "t he" 2607 // sounds like "t h" while "the" sounds like "@". Avoid that by 2608 // removing the space. Don't do it when the good word also contains a 2609 // space. 2610 if (ascii_iswhite(su->su_badptr[su->su_badlen]) 2611 && *skiptowhite(stp->st_word) == NUL) { 2612 for (char *p = fword; *(p = skiptowhite(p)) != NUL;) { 2613 STRMOVE(p, p + 1); 2614 } 2615 } 2616 2617 spell_soundfold(slang, fword, true, badsound2); 2618 pbad = badsound2; 2619 } 2620 2621 if (lendiff > 0 && stp->st_wordlen + lendiff < MAXWLEN) { 2622 // Add part of the bad word to the good word, so that we soundfold 2623 // what replaces the bad word. 2624 STRCPY(goodword, stp->st_word); 2625 xmemcpyz(goodword + stp->st_wordlen, 2626 su->su_badptr + su->su_badlen - lendiff, (size_t)lendiff); 2627 pgood = goodword; 2628 } else { 2629 pgood = stp->st_word; 2630 } 2631 2632 // Sound-fold the word and compute the score for the difference. 2633 spell_soundfold(slang, pgood, false, goodsound); 2634 2635 return soundalike_score(goodsound, pbad); 2636 } 2637 2638 /// structure used to store soundfolded words that add_sound_suggest() has 2639 /// handled already. 2640 typedef struct { 2641 int16_t sft_score; ///< lowest score used 2642 uint8_t sft_word[]; ///< soundfolded word 2643 } sftword_T; 2644 2645 static sftword_T dumsft; 2646 #define HIKEY2SFT(p) ((sftword_T *)((p) - (dumsft.sft_word - (uint8_t *)&dumsft))) 2647 #define HI2SFT(hi) HIKEY2SFT((hi)->hi_key) 2648 2649 /// Prepare for calling suggest_try_soundalike(). 2650 static void suggest_try_soundalike_prep(void) 2651 { 2652 // Do this for all languages that support sound folding and for which a 2653 // .sug file has been loaded. 2654 for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len; lpi++) { 2655 langp_T *lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 2656 slang_T *slang = lp->lp_slang; 2657 if (!GA_EMPTY(&slang->sl_sal) && slang->sl_sbyts != NULL) { 2658 // prepare the hashtable used by add_sound_suggest() 2659 hash_init(&slang->sl_sounddone); 2660 } 2661 } 2662 } 2663 2664 /// Find suggestions by comparing the word in a sound-a-like form. 2665 /// Note: This doesn't support postponed prefixes. 2666 static void suggest_try_soundalike(suginfo_T *su) 2667 { 2668 char salword[MAXWLEN]; 2669 2670 // Do this for all languages that support sound folding and for which a 2671 // .sug file has been loaded. 2672 for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len; lpi++) { 2673 langp_T *lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 2674 slang_T *slang = lp->lp_slang; 2675 if (!GA_EMPTY(&slang->sl_sal) && slang->sl_sbyts != NULL) { 2676 // soundfold the bad word 2677 spell_soundfold(slang, su->su_fbadword, true, salword); 2678 2679 // try all kinds of inserts/deletes/swaps/etc. 2680 // TODO(vim): also soundfold the next words, so that we can try joining 2681 // and splitting 2682 #ifdef SUGGEST_PROFILE 2683 prof_init(); 2684 #endif 2685 suggest_trie_walk(su, lp, salword, true); 2686 #ifdef SUGGEST_PROFILE 2687 prof_report("soundalike"); 2688 #endif 2689 } 2690 } 2691 } 2692 2693 /// Finish up after calling suggest_try_soundalike(). 2694 static void suggest_try_soundalike_finish(void) 2695 { 2696 // Do this for all languages that support sound folding and for which a 2697 // .sug file has been loaded. 2698 for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len; lpi++) { 2699 langp_T *lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); 2700 slang_T *slang = lp->lp_slang; 2701 if (!GA_EMPTY(&slang->sl_sal) && slang->sl_sbyts != NULL) { 2702 // Free the info about handled words. 2703 int todo = (int)slang->sl_sounddone.ht_used; 2704 for (hashitem_T *hi = slang->sl_sounddone.ht_array; todo > 0; hi++) { 2705 if (!HASHITEM_EMPTY(hi)) { 2706 xfree(HI2SFT(hi)); 2707 todo--; 2708 } 2709 } 2710 2711 // Clear the hashtable, it may also be used by another region. 2712 hash_clear(&slang->sl_sounddone); 2713 hash_init(&slang->sl_sounddone); 2714 } 2715 } 2716 } 2717 2718 /// A match with a soundfolded word is found. Add the good word(s) that 2719 /// produce this soundfolded word. 2720 /// 2721 /// @param score soundfold score 2722 static void add_sound_suggest(suginfo_T *su, char *goodword, int score, langp_T *lp) 2723 { 2724 slang_T *slang = lp->lp_slang; // language for sound folding 2725 char theword[MAXWLEN]; 2726 int i; 2727 int wlen; 2728 uint8_t *byts; 2729 int wc; 2730 int goodscore; 2731 sftword_T *sft; 2732 2733 // It's very well possible that the same soundfold word is found several 2734 // times with different scores. Since the following is quite slow only do 2735 // the words that have a better score than before. Use a hashtable to 2736 // remember the words that have been done. 2737 hash_T hash = hash_hash(goodword); 2738 const size_t goodword_len = strlen(goodword); 2739 hashitem_T *hi = hash_lookup(&slang->sl_sounddone, goodword, goodword_len, hash); 2740 if (HASHITEM_EMPTY(hi)) { 2741 sft = xmalloc(offsetof(sftword_T, sft_word) + goodword_len + 1); 2742 sft->sft_score = (int16_t)score; 2743 memcpy(sft->sft_word, goodword, goodword_len + 1); 2744 hash_add_item(&slang->sl_sounddone, hi, (char *)sft->sft_word, hash); 2745 } else { 2746 sft = HI2SFT(hi); 2747 if (score >= sft->sft_score) { 2748 return; 2749 } 2750 sft->sft_score = (int16_t)score; 2751 } 2752 2753 // Find the word nr in the soundfold tree. 2754 int sfwordnr = soundfold_find(slang, goodword); 2755 if (sfwordnr < 0) { 2756 internal_error("add_sound_suggest()"); 2757 return; 2758 } 2759 2760 // Go over the list of good words that produce this soundfold word 2761 char *nrline = ml_get_buf(slang->sl_sugbuf, (linenr_T)sfwordnr + 1); 2762 int orgnr = 0; 2763 while (*nrline != NUL) { 2764 // The wordnr was stored in a minimal nr of bytes as an offset to the 2765 // previous wordnr. 2766 orgnr += bytes2offset(&nrline); 2767 2768 byts = slang->sl_fbyts; 2769 idx_T *idxs = slang->sl_fidxs; 2770 2771 // Lookup the word "orgnr" one of the two tries. 2772 int n = 0; 2773 int wordcount = 0; 2774 for (wlen = 0; wlen < MAXWLEN - 3; wlen++) { 2775 i = 1; 2776 if (wordcount == orgnr && byts[n + 1] == NUL) { 2777 break; // found end of word 2778 } 2779 if (byts[n + 1] == NUL) { 2780 wordcount++; 2781 } 2782 2783 // skip over the NUL bytes 2784 for (; byts[n + i] == NUL; i++) { 2785 if (i > byts[n]) { // safety check 2786 STRCPY(theword + wlen, "BAD"); 2787 wlen += 3; 2788 goto badword; 2789 } 2790 } 2791 2792 // One of the siblings must have the word. 2793 for (; i < byts[n]; i++) { 2794 wc = idxs[idxs[n + i]]; // nr of words under this byte 2795 if (wordcount + wc > orgnr) { 2796 break; 2797 } 2798 wordcount += wc; 2799 } 2800 2801 theword[wlen] = (char)byts[n + i]; 2802 n = idxs[n + i]; 2803 } 2804 badword: 2805 theword[wlen] = NUL; 2806 2807 // Go over the possible flags and regions. 2808 for (; i <= byts[n] && byts[n + i] == NUL; i++) { 2809 char cword[MAXWLEN]; 2810 char *p; 2811 int flags = (int)idxs[n + i]; 2812 2813 // Skip words with the NOSUGGEST flag 2814 if (flags & WF_NOSUGGEST) { 2815 continue; 2816 } 2817 2818 if (flags & WF_KEEPCAP) { 2819 // Must find the word in the keep-case tree. 2820 find_keepcap_word(slang, theword, cword); 2821 p = cword; 2822 } else { 2823 flags |= su->su_badflags; 2824 if ((flags & WF_CAPMASK) != 0) { 2825 // Need to fix case according to "flags". 2826 make_case_word(theword, cword, flags); 2827 p = cword; 2828 } else { 2829 p = theword; 2830 } 2831 } 2832 2833 // Add the suggestion. 2834 if (sps_flags & SPS_DOUBLE) { 2835 // Add the suggestion if the score isn't too bad. 2836 if (score <= su->su_maxscore) { 2837 add_suggestion(su, &su->su_sga, p, su->su_badlen, 2838 score, 0, false, slang, false); 2839 } 2840 } else { 2841 // Add a penalty for words in another region. 2842 if ((flags & WF_REGION) 2843 && (((unsigned)flags >> 16) & (unsigned)lp->lp_region) == 0) { 2844 goodscore = SCORE_REGION; 2845 } else { 2846 goodscore = 0; 2847 } 2848 2849 // Add a small penalty for changing the first letter from 2850 // lower to upper case. Helps for "tath" -> "Kath", which is 2851 // less common than "tath" -> "path". Don't do it when the 2852 // letter is the same, that has already been counted. 2853 int gc = utf_ptr2char(p); 2854 if (SPELL_ISUPPER(gc)) { 2855 int bc = utf_ptr2char(su->su_badword); 2856 if (!SPELL_ISUPPER(bc) 2857 && SPELL_TOFOLD(bc) != SPELL_TOFOLD(gc)) { 2858 goodscore += SCORE_ICASE / 2; 2859 } 2860 } 2861 2862 // Compute the score for the good word. This only does letter 2863 // insert/delete/swap/replace. REP items are not considered, 2864 // which may make the score a bit higher. 2865 // Use a limit for the score to make it work faster. Use 2866 // MAXSCORE(), because RESCORE() will change the score. 2867 // If the limit is very high then the iterative method is 2868 // inefficient, using an array is quicker. 2869 int limit = MAXSCORE(su->su_sfmaxscore - goodscore, score); 2870 if (limit > SCORE_LIMITMAX) { 2871 goodscore += spell_edit_score(slang, su->su_badword, p); 2872 } else { 2873 goodscore += spell_edit_score_limit(slang, su->su_badword, p, limit); 2874 } 2875 2876 // When going over the limit don't bother to do the rest. 2877 if (goodscore < SCORE_MAXMAX) { 2878 // Give a bonus to words seen before. 2879 goodscore = score_wordcount_adj(slang, goodscore, p, false); 2880 2881 // Add the suggestion if the score isn't too bad. 2882 goodscore = RESCORE(goodscore, score); 2883 if (goodscore <= su->su_sfmaxscore) { 2884 add_suggestion(su, &su->su_ga, p, su->su_badlen, 2885 goodscore, score, true, slang, true); 2886 } 2887 } 2888 } 2889 } 2890 } 2891 } 2892 2893 /// Find word "word" in fold-case tree for "slang" and return the word number. 2894 static int soundfold_find(slang_T *slang, char *word) 2895 { 2896 idx_T arridx = 0; 2897 int wlen = 0; 2898 uint8_t *ptr = (uint8_t *)word; 2899 int wordnr = 0; 2900 2901 uint8_t *byts = slang->sl_sbyts; 2902 idx_T *idxs = slang->sl_sidxs; 2903 2904 while (true) { 2905 // First byte is the number of possible bytes. 2906 int len = byts[arridx++]; 2907 2908 // If the first possible byte is a zero the word could end here. 2909 // If the word ends we found the word. If not skip the NUL bytes. 2910 int c = ptr[wlen]; 2911 if (byts[arridx] == NUL) { 2912 if (c == NUL) { 2913 break; 2914 } 2915 2916 // Skip over the zeros, there can be several. 2917 while (len > 0 && byts[arridx] == NUL) { 2918 arridx++; 2919 len--; 2920 } 2921 if (len == 0) { 2922 return -1; // no children, word should have ended here 2923 } 2924 wordnr++; 2925 } 2926 2927 // If the word ends we didn't find it. 2928 if (c == NUL) { 2929 return -1; 2930 } 2931 2932 // Perform a binary search in the list of accepted bytes. 2933 if (c == TAB) { // <Tab> is handled like <Space> 2934 c = ' '; 2935 } 2936 while (byts[arridx] < c) { 2937 // The word count is in the first idxs[] entry of the child. 2938 wordnr += idxs[idxs[arridx]]; 2939 arridx++; 2940 if (--len == 0) { // end of the bytes, didn't find it 2941 return -1; 2942 } 2943 } 2944 if (byts[arridx] != c) { // didn't find the byte 2945 return -1; 2946 } 2947 2948 // Continue at the child (if there is one). 2949 arridx = idxs[arridx]; 2950 wlen++; 2951 2952 // One space in the good word may stand for several spaces in the 2953 // checked word. 2954 if (c == ' ') { 2955 while (ptr[wlen] == ' ' || ptr[wlen] == TAB) { 2956 wlen++; 2957 } 2958 } 2959 } 2960 2961 return wordnr; 2962 } 2963 2964 /// Returns true if "c1" and "c2" are similar characters according to the MAP 2965 /// lines in the .aff file. 2966 static bool similar_chars(slang_T *slang, int c1, int c2) 2967 { 2968 int m1, m2; 2969 char buf[MB_MAXCHAR + 1]; 2970 2971 if (c1 >= 256) { 2972 buf[utf_char2bytes(c1, buf)] = 0; 2973 hashitem_T *hi = hash_find(&slang->sl_map_hash, buf); 2974 if (HASHITEM_EMPTY(hi)) { 2975 m1 = 0; 2976 } else { 2977 m1 = utf_ptr2char(hi->hi_key + strlen(hi->hi_key) + 1); 2978 } 2979 } else { 2980 m1 = slang->sl_map_array[c1]; 2981 } 2982 if (m1 == 0) { 2983 return false; 2984 } 2985 2986 if (c2 >= 256) { 2987 buf[utf_char2bytes(c2, buf)] = 0; 2988 hashitem_T *hi = hash_find(&slang->sl_map_hash, buf); 2989 if (HASHITEM_EMPTY(hi)) { 2990 m2 = 0; 2991 } else { 2992 m2 = utf_ptr2char(hi->hi_key + strlen(hi->hi_key) + 1); 2993 } 2994 } else { 2995 m2 = slang->sl_map_array[c2]; 2996 } 2997 2998 return m1 == m2; 2999 } 3000 3001 /// Adds a suggestion to the list of suggestions. 3002 /// For a suggestion that is already in the list the lowest score is remembered. 3003 /// 3004 /// @param gap either su_ga or su_sga 3005 /// @param badlenarg len of bad word replaced with "goodword" 3006 /// @param had_bonus value for st_had_bonus 3007 /// @param slang language for sound folding 3008 /// @param maxsf su_maxscore applies to soundfold score, su_sfmaxscore to the total score. 3009 static void add_suggestion(suginfo_T *su, garray_T *gap, const char *goodword, int badlenarg, 3010 int score, int altscore, bool had_bonus, slang_T *slang, bool maxsf) 3011 { 3012 int goodlen; // len of goodword changed 3013 int badlen; // len of bad word changed 3014 suggest_T new_sug; 3015 3016 // Minimize "badlen" for consistency. Avoids that changing "the the" to 3017 // "thee the" is added next to changing the first "the" the "thee". 3018 const char *pgood = goodword + strlen(goodword); 3019 char *pbad = su->su_badptr + badlenarg; 3020 while (true) { 3021 goodlen = (int)(pgood - goodword); 3022 badlen = (int)(pbad - su->su_badptr); 3023 if (goodlen <= 0 || badlen <= 0) { 3024 break; 3025 } 3026 MB_PTR_BACK(goodword, pgood); 3027 MB_PTR_BACK(su->su_badptr, pbad); 3028 if (utf_ptr2char(pgood) != utf_ptr2char(pbad)) { 3029 break; 3030 } 3031 } 3032 3033 if (badlen == 0 && goodlen == 0) { 3034 // goodword doesn't change anything; may happen for "the the" changing 3035 // the first "the" to itself. 3036 return; 3037 } 3038 3039 int i; 3040 if (GA_EMPTY(gap)) { 3041 i = -1; 3042 } else { 3043 // Check if the word is already there. Also check the length that is 3044 // being replaced "thes," -> "these" is a different suggestion from 3045 // "thes" -> "these". 3046 suggest_T *stp = &SUG(*gap, 0); 3047 for (i = gap->ga_len; --i >= 0; stp++) { 3048 if (stp->st_wordlen == goodlen 3049 && stp->st_orglen == badlen 3050 && strncmp(stp->st_word, goodword, (size_t)goodlen) == 0) { 3051 // Found it. Remember the word with the lowest score. 3052 if (stp->st_slang == NULL) { 3053 stp->st_slang = slang; 3054 } 3055 3056 new_sug.st_score = score; 3057 new_sug.st_altscore = altscore; 3058 new_sug.st_had_bonus = had_bonus; 3059 3060 if (stp->st_had_bonus != had_bonus) { 3061 // Only one of the two had the soundalike score computed. 3062 // Need to do that for the other one now, otherwise the 3063 // scores can't be compared. This happens because 3064 // suggest_try_change() doesn't compute the soundalike 3065 // word to keep it fast, while some special methods set 3066 // the soundalike score to zero. 3067 if (had_bonus) { 3068 rescore_one(su, stp); 3069 } else { 3070 new_sug.st_word = stp->st_word; 3071 new_sug.st_wordlen = stp->st_wordlen; 3072 new_sug.st_slang = stp->st_slang; 3073 new_sug.st_orglen = badlen; 3074 rescore_one(su, &new_sug); 3075 } 3076 } 3077 3078 if (stp->st_score > new_sug.st_score) { 3079 stp->st_score = new_sug.st_score; 3080 stp->st_altscore = new_sug.st_altscore; 3081 stp->st_had_bonus = new_sug.st_had_bonus; 3082 } 3083 break; 3084 } 3085 } 3086 } 3087 3088 if (i < 0) { 3089 // Add a suggestion. 3090 suggest_T *stp = GA_APPEND_VIA_PTR(suggest_T, gap); 3091 stp->st_word = xmemdupz(goodword, (size_t)goodlen); 3092 stp->st_wordlen = goodlen; 3093 stp->st_score = score; 3094 stp->st_altscore = altscore; 3095 stp->st_had_bonus = had_bonus; 3096 stp->st_orglen = badlen; 3097 stp->st_slang = slang; 3098 3099 // If we have too many suggestions now, sort the list and keep 3100 // the best suggestions. 3101 if (gap->ga_len > SUG_MAX_COUNT(su)) { 3102 if (maxsf) { 3103 su->su_sfmaxscore = cleanup_suggestions(gap, su->su_sfmaxscore, SUG_CLEAN_COUNT(su)); 3104 } else { 3105 su->su_maxscore = cleanup_suggestions(gap, su->su_maxscore, SUG_CLEAN_COUNT(su)); 3106 } 3107 } 3108 } 3109 } 3110 3111 /// Suggestions may in fact be flagged as errors. Esp. for banned words and 3112 /// for split words, such as "the the". Remove these from the list here. 3113 /// 3114 /// @param gap either su_ga or su_sga 3115 static void check_suggestions(suginfo_T *su, garray_T *gap) 3116 { 3117 char longword[MAXWLEN + 1]; 3118 3119 if (gap->ga_len == 0) { 3120 return; 3121 } 3122 suggest_T *stp = &SUG(*gap, 0); 3123 for (int i = gap->ga_len - 1; i >= 0; i--) { 3124 // Need to append what follows to check for "the the". 3125 xstrlcpy(longword, stp[i].st_word, MAXWLEN + 1); 3126 int len = stp[i].st_wordlen; 3127 xstrlcpy(longword + len, su->su_badptr + stp[i].st_orglen, MAXWLEN + 1 - (size_t)len); 3128 hlf_T attr = HLF_COUNT; 3129 spell_check(curwin, longword, &attr, NULL, false); 3130 if (attr != HLF_COUNT) { 3131 // Remove this entry. 3132 xfree(stp[i].st_word); 3133 gap->ga_len--; 3134 if (i < gap->ga_len) { 3135 memmove(stp + i, stp + i + 1, sizeof(suggest_T) * (size_t)(gap->ga_len - i)); 3136 } 3137 } 3138 } 3139 } 3140 3141 /// Add a word to be banned. 3142 static void add_banned(suginfo_T *su, char *word) 3143 { 3144 hash_T hash = hash_hash(word); 3145 const size_t word_len = strlen(word); 3146 hashitem_T *hi = hash_lookup(&su->su_banned, word, word_len, hash); 3147 if (!HASHITEM_EMPTY(hi)) { // already present 3148 return; 3149 } 3150 char *s = xmemdupz(word, word_len); 3151 hash_add_item(&su->su_banned, hi, s, hash); 3152 } 3153 3154 /// Recompute the score for all suggestions if sound-folding is possible. This 3155 /// is slow, thus only done for the final results. 3156 static void rescore_suggestions(suginfo_T *su) 3157 { 3158 if (su->su_sallang != NULL) { 3159 for (int i = 0; i < su->su_ga.ga_len; i++) { 3160 rescore_one(su, &SUG(su->su_ga, i)); 3161 } 3162 } 3163 } 3164 3165 /// Recompute the score for one suggestion if sound-folding is possible. 3166 static void rescore_one(suginfo_T *su, suggest_T *stp) 3167 { 3168 slang_T *slang = stp->st_slang; 3169 char sal_badword[MAXWLEN]; 3170 3171 // Only rescore suggestions that have no sal score yet and do have a 3172 // language. 3173 if (slang != NULL && !GA_EMPTY(&slang->sl_sal) && !stp->st_had_bonus) { 3174 char *p; 3175 if (slang == su->su_sallang) { 3176 p = su->su_sal_badword; 3177 } else { 3178 spell_soundfold(slang, su->su_fbadword, true, sal_badword); 3179 p = sal_badword; 3180 } 3181 3182 stp->st_altscore = stp_sal_score(stp, su, slang, p); 3183 if (stp->st_altscore == SCORE_MAXMAX) { 3184 stp->st_altscore = SCORE_BIG; 3185 } 3186 stp->st_score = RESCORE(stp->st_score, stp->st_altscore); 3187 stp->st_had_bonus = true; 3188 } 3189 } 3190 3191 /// Function given to qsort() to sort the suggestions on st_score. 3192 /// First on "st_score", then "st_altscore" then alphabetically. 3193 static int sug_compare(const void *s1, const void *s2) 3194 { 3195 suggest_T *p1 = (suggest_T *)s1; 3196 suggest_T *p2 = (suggest_T *)s2; 3197 int n = p1->st_score == p2->st_score ? 0 : p1->st_score > p2->st_score ? 1 : -1; 3198 3199 if (n == 0) { 3200 n = p1->st_altscore == p2->st_altscore ? 0 : p1->st_altscore > p2->st_altscore ? 1 : -1; 3201 if (n == 0) { 3202 n = STRICMP(p1->st_word, p2->st_word); 3203 } 3204 } 3205 return n; 3206 } 3207 3208 /// Cleanup the suggestions: 3209 /// - Sort on score. 3210 /// - Remove words that won't be displayed. 3211 /// 3212 /// @param keep nr of suggestions to keep 3213 /// 3214 /// @return the maximum score in the list or "maxscore" unmodified. 3215 static int cleanup_suggestions(garray_T *gap, int maxscore, int keep) 3216 FUNC_ATTR_NONNULL_ALL 3217 { 3218 if (gap->ga_len <= 0) { 3219 return maxscore; 3220 } 3221 3222 // Sort the list. 3223 qsort(gap->ga_data, (size_t)gap->ga_len, sizeof(suggest_T), sug_compare); 3224 3225 // Truncate the list to the number of suggestions that will be displayed. 3226 if (gap->ga_len > keep) { 3227 suggest_T *const stp = &SUG(*gap, 0); 3228 3229 for (int i = keep; i < gap->ga_len; i++) { 3230 xfree(stp[i].st_word); 3231 } 3232 gap->ga_len = keep; 3233 if (keep >= 1) { 3234 return stp[keep - 1].st_score; 3235 } 3236 } 3237 return maxscore; 3238 } 3239 3240 /// Compute a score for two sound-a-like words. 3241 /// This permits up to two inserts/deletes/swaps/etc. to keep things fast. 3242 /// Instead of a generic loop we write out the code. That keeps it fast by 3243 /// avoiding checks that will not be possible. 3244 /// 3245 /// @param goodstart sound-folded good word 3246 /// @param badstart sound-folded bad word 3247 static int soundalike_score(char *goodstart, char *badstart) 3248 { 3249 char *goodsound = goodstart; 3250 char *badsound = badstart; 3251 int score = 0; 3252 3253 // Adding/inserting "*" at the start (word starts with vowel) shouldn't be 3254 // counted so much, vowels in the middle of the word aren't counted at all. 3255 if ((*badsound == '*' || *goodsound == '*') && *badsound != *goodsound) { 3256 if ((badsound[0] == NUL && goodsound[1] == NUL) 3257 || (goodsound[0] == NUL && badsound[1] == NUL)) { 3258 // changing word with vowel to word without a sound 3259 return SCORE_DEL; 3260 } 3261 if (badsound[0] == NUL || goodsound[0] == NUL) { 3262 // more than two changes 3263 return SCORE_MAXMAX; 3264 } 3265 3266 if (badsound[1] == goodsound[1] 3267 || (badsound[1] != NUL 3268 && goodsound[1] != NUL 3269 && badsound[2] == goodsound[2])) { 3270 // handle like a substitute 3271 } else { 3272 score = 2 * SCORE_DEL / 3; 3273 if (*badsound == '*') { 3274 badsound++; 3275 } else { 3276 goodsound++; 3277 } 3278 } 3279 } 3280 3281 int goodlen = (int)strlen(goodsound); 3282 int badlen = (int)strlen(badsound); 3283 3284 // Return quickly if the lengths are too different to be fixed by two 3285 // changes. 3286 int n = goodlen - badlen; 3287 if (n < -2 || n > 2) { 3288 return SCORE_MAXMAX; 3289 } 3290 3291 // n > 0 : goodsound is longest 3292 // n <= 0 : badsound is longest 3293 char *pl = n > 0 ? goodsound : badsound; 3294 char *ps = n > 0 ? badsound : goodsound; 3295 3296 // Skip over the identical part. 3297 while (*pl == *ps && *pl != NUL) { 3298 pl++; 3299 ps++; 3300 } 3301 3302 char *pl2, *ps2; 3303 3304 switch (n) { 3305 case -2: 3306 case 2: 3307 // Must delete two characters from "pl". 3308 pl++; // first delete 3309 while (*pl == *ps) { 3310 pl++; 3311 ps++; 3312 } 3313 // strings must be equal after second delete 3314 if (strcmp(pl + 1, ps) == 0) { 3315 return score + SCORE_DEL * 2; 3316 } 3317 3318 // Failed to compare. 3319 break; 3320 3321 case -1: 3322 case 1: 3323 // Minimal one delete from "pl" required. 3324 3325 // 1: delete 3326 pl2 = pl + 1; 3327 ps2 = ps; 3328 while (*pl2 == *ps2) { 3329 if (*pl2 == NUL) { // reached the end 3330 return score + SCORE_DEL; 3331 } 3332 pl2++; 3333 ps2++; 3334 } 3335 3336 // 2: delete then swap, then rest must be equal 3337 if (pl2[0] == ps2[1] && pl2[1] == ps2[0] 3338 && strcmp(pl2 + 2, ps2 + 2) == 0) { 3339 return score + SCORE_DEL + SCORE_SWAP; 3340 } 3341 3342 // 3: delete then substitute, then the rest must be equal 3343 if (strcmp(pl2 + 1, ps2 + 1) == 0) { 3344 return score + SCORE_DEL + SCORE_SUBST; 3345 } 3346 3347 // 4: first swap then delete 3348 if (pl[0] == ps[1] && pl[1] == ps[0]) { 3349 pl2 = pl + 2; // swap, skip two chars 3350 ps2 = ps + 2; 3351 while (*pl2 == *ps2) { 3352 pl2++; 3353 ps2++; 3354 } 3355 // delete a char and then strings must be equal 3356 if (strcmp(pl2 + 1, ps2) == 0) { 3357 return score + SCORE_SWAP + SCORE_DEL; 3358 } 3359 } 3360 3361 // 5: first substitute then delete 3362 pl2 = pl + 1; // substitute, skip one char 3363 ps2 = ps + 1; 3364 while (*pl2 == *ps2) { 3365 pl2++; 3366 ps2++; 3367 } 3368 // delete a char and then strings must be equal 3369 if (strcmp(pl2 + 1, ps2) == 0) { 3370 return score + SCORE_SUBST + SCORE_DEL; 3371 } 3372 3373 // Failed to compare. 3374 break; 3375 3376 case 0: 3377 // Lengths are equal, thus changes must result in same length: An 3378 // insert is only possible in combination with a delete. 3379 // 1: check if for identical strings 3380 if (*pl == NUL) { 3381 return score; 3382 } 3383 3384 // 2: swap 3385 if (pl[0] == ps[1] && pl[1] == ps[0]) { 3386 pl2 = pl + 2; // swap, skip two chars 3387 ps2 = ps + 2; 3388 while (*pl2 == *ps2) { 3389 if (*pl2 == NUL) { // reached the end 3390 return score + SCORE_SWAP; 3391 } 3392 pl2++; 3393 ps2++; 3394 } 3395 // 3: swap and swap again 3396 if (pl2[0] == ps2[1] && pl2[1] == ps2[0] 3397 && strcmp(pl2 + 2, ps2 + 2) == 0) { 3398 return score + SCORE_SWAP + SCORE_SWAP; 3399 } 3400 3401 // 4: swap and substitute 3402 if (strcmp(pl2 + 1, ps2 + 1) == 0) { 3403 return score + SCORE_SWAP + SCORE_SUBST; 3404 } 3405 } 3406 3407 // 5: substitute 3408 pl2 = pl + 1; 3409 ps2 = ps + 1; 3410 while (*pl2 == *ps2) { 3411 if (*pl2 == NUL) { // reached the end 3412 return score + SCORE_SUBST; 3413 } 3414 pl2++; 3415 ps2++; 3416 } 3417 3418 // 6: substitute and swap 3419 if (pl2[0] == ps2[1] && pl2[1] == ps2[0] 3420 && strcmp(pl2 + 2, ps2 + 2) == 0) { 3421 return score + SCORE_SUBST + SCORE_SWAP; 3422 } 3423 3424 // 7: substitute and substitute 3425 if (strcmp(pl2 + 1, ps2 + 1) == 0) { 3426 return score + SCORE_SUBST + SCORE_SUBST; 3427 } 3428 3429 // 8: insert then delete 3430 pl2 = pl; 3431 ps2 = ps + 1; 3432 while (*pl2 == *ps2) { 3433 pl2++; 3434 ps2++; 3435 } 3436 if (strcmp(pl2 + 1, ps2) == 0) { 3437 return score + SCORE_INS + SCORE_DEL; 3438 } 3439 3440 // 9: delete then insert 3441 pl2 = pl + 1; 3442 ps2 = ps; 3443 while (*pl2 == *ps2) { 3444 pl2++; 3445 ps2++; 3446 } 3447 if (strcmp(pl2, ps2 + 1) == 0) { 3448 return score + SCORE_INS + SCORE_DEL; 3449 } 3450 3451 // Failed to compare. 3452 break; 3453 } 3454 3455 return SCORE_MAXMAX; 3456 } 3457 3458 /// Compute the "edit distance" to turn "badword" into "goodword". The less 3459 /// deletes/inserts/substitutes/swaps are required the lower the score. 3460 /// 3461 /// The algorithm is described by Du and Chang, 1992. 3462 /// The implementation of the algorithm comes from Aspell editdist.cpp, 3463 /// edit_distance(). It has been converted from C++ to C and modified to 3464 /// support multi-byte characters. 3465 static int spell_edit_score(slang_T *slang, const char *badword, const char *goodword) 3466 { 3467 int wbadword[MAXWLEN]; 3468 int wgoodword[MAXWLEN]; 3469 3470 // Lengths with NUL. 3471 int badlen; 3472 int goodlen; 3473 { 3474 // Get the characters from the multi-byte strings and put them in an 3475 // int array for easy access. 3476 badlen = 0; 3477 for (const char *p = badword; *p != NUL;) { 3478 wbadword[badlen++] = mb_cptr2char_adv(&p); 3479 } 3480 wbadword[badlen++] = 0; 3481 goodlen = 0; 3482 for (const char *p = goodword; *p != NUL;) { 3483 wgoodword[goodlen++] = mb_cptr2char_adv(&p); 3484 } 3485 wgoodword[goodlen++] = 0; 3486 } 3487 3488 // We use "cnt" as an array: CNT(badword_idx, goodword_idx). 3489 #define CNT(a, b) cnt[(a) + (b) * (badlen + 1)] 3490 int *cnt = xmalloc(sizeof(int) * ((size_t)badlen + 1) * ((size_t)goodlen + 1)); 3491 3492 CNT(0, 0) = 0; 3493 for (int j = 1; j <= goodlen; j++) { 3494 CNT(0, j) = CNT(0, j - 1) + SCORE_INS; 3495 } 3496 3497 for (int i = 1; i <= badlen; i++) { 3498 CNT(i, 0) = CNT(i - 1, 0) + SCORE_DEL; 3499 for (int j = 1; j <= goodlen; j++) { 3500 int bc = wbadword[i - 1]; 3501 int gc = wgoodword[j - 1]; 3502 if (bc == gc) { 3503 CNT(i, j) = CNT(i - 1, j - 1); 3504 } else { 3505 // Use a better score when there is only a case difference. 3506 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc)) { 3507 CNT(i, j) = SCORE_ICASE + CNT(i - 1, j - 1); 3508 } else { 3509 // For a similar character use SCORE_SIMILAR. 3510 if (slang != NULL 3511 && slang->sl_has_map 3512 && similar_chars(slang, gc, bc)) { 3513 CNT(i, j) = SCORE_SIMILAR + CNT(i - 1, j - 1); 3514 } else { 3515 CNT(i, j) = SCORE_SUBST + CNT(i - 1, j - 1); 3516 } 3517 } 3518 3519 if (i > 1 && j > 1) { 3520 int pbc = wbadword[i - 2]; 3521 int pgc = wgoodword[j - 2]; 3522 if (bc == pgc && pbc == gc) { 3523 int t = SCORE_SWAP + CNT(i - 2, j - 2); 3524 CNT(i, j) = MIN(CNT(i, j), t); 3525 } 3526 } 3527 int t = SCORE_DEL + CNT(i - 1, j); 3528 CNT(i, j) = MIN(CNT(i, j), t); 3529 t = SCORE_INS + CNT(i, j - 1); 3530 CNT(i, j) = MIN(CNT(i, j), t); 3531 } 3532 } 3533 } 3534 3535 int i = CNT(badlen - 1, goodlen - 1); 3536 xfree(cnt); 3537 return i; 3538 } 3539 3540 typedef struct { 3541 int badi; 3542 int goodi; 3543 int score; 3544 } limitscore_T; 3545 3546 /// Like spell_edit_score(), but with a limit on the score to make it faster. 3547 /// May return SCORE_MAXMAX when the score is higher than "limit". 3548 /// 3549 /// This uses a stack for the edits still to be tried. 3550 /// The idea comes from Aspell leditdist.cpp. Rewritten in C and added support 3551 /// for multi-byte characters. 3552 static int spell_edit_score_limit(slang_T *slang, char *badword, char *goodword, int limit) 3553 { 3554 return spell_edit_score_limit_w(slang, badword, goodword, limit); 3555 } 3556 3557 /// Multi-byte version of spell_edit_score_limit(). 3558 /// Keep it in sync with the above! 3559 static int spell_edit_score_limit_w(slang_T *slang, const char *badword, const char *goodword, 3560 int limit) 3561 { 3562 limitscore_T stack[10]; // allow for over 3 * 2 edits 3563 int bc, gc; 3564 int score_off; 3565 int wbadword[MAXWLEN]; 3566 int wgoodword[MAXWLEN]; 3567 3568 // Get the characters from the multi-byte strings and put them in an 3569 // int array for easy access. 3570 int bi = 0; 3571 for (const char *p = badword; *p != NUL;) { 3572 wbadword[bi++] = mb_cptr2char_adv(&p); 3573 } 3574 wbadword[bi++] = 0; 3575 int gi = 0; 3576 for (const char *p = goodword; *p != NUL;) { 3577 wgoodword[gi++] = mb_cptr2char_adv(&p); 3578 } 3579 wgoodword[gi++] = 0; 3580 3581 // The idea is to go from start to end over the words. So long as 3582 // characters are equal just continue, this always gives the lowest score. 3583 // When there is a difference try several alternatives. Each alternative 3584 // increases "score" for the edit distance. Some of the alternatives are 3585 // pushed unto a stack and tried later, some are tried right away. At the 3586 // end of the word the score for one alternative is known. The lowest 3587 // possible score is stored in "minscore". 3588 int stackidx = 0; 3589 bi = 0; 3590 gi = 0; 3591 int score = 0; 3592 int minscore = limit + 1; 3593 3594 while (true) { 3595 // Skip over an equal part, score remains the same. 3596 while (true) { 3597 bc = wbadword[bi]; 3598 gc = wgoodword[gi]; 3599 3600 if (bc != gc) { // stop at a char that's different 3601 break; 3602 } 3603 if (bc == NUL) { // both words end 3604 if (score < minscore) { 3605 minscore = score; 3606 } 3607 goto pop; // do next alternative 3608 } 3609 bi++; 3610 gi++; 3611 } 3612 3613 if (gc == NUL) { // goodword ends, delete badword chars 3614 do { 3615 if ((score += SCORE_DEL) >= minscore) { 3616 goto pop; // do next alternative 3617 } 3618 } while (wbadword[++bi] != NUL); 3619 minscore = score; 3620 } else if (bc == NUL) { // badword ends, insert badword chars 3621 do { 3622 if ((score += SCORE_INS) >= minscore) { 3623 goto pop; // do next alternative 3624 } 3625 } while (wgoodword[++gi] != NUL); 3626 minscore = score; 3627 } else { // both words continue 3628 // If not close to the limit, perform a change. Only try changes 3629 // that may lead to a lower score than "minscore". 3630 // round 0: try deleting a char from badword 3631 // round 1: try inserting a char in badword 3632 for (int round = 0; round <= 1; round++) { 3633 score_off = score + (round == 0 ? SCORE_DEL : SCORE_INS); 3634 if (score_off < minscore) { 3635 if (score_off + SCORE_EDIT_MIN >= minscore) { 3636 // Near the limit, rest of the words must match. We 3637 // can check that right now, no need to push an item 3638 // onto the stack. 3639 int bi2 = bi + 1 - round; 3640 int gi2 = gi + round; 3641 while (wgoodword[gi2] == wbadword[bi2]) { 3642 if (wgoodword[gi2] == NUL) { 3643 minscore = score_off; 3644 break; 3645 } 3646 bi2++; 3647 gi2++; 3648 } 3649 } else { 3650 // try deleting a character from badword later 3651 stack[stackidx].badi = bi + 1 - round; 3652 stack[stackidx].goodi = gi + round; 3653 stack[stackidx].score = score_off; 3654 stackidx++; 3655 } 3656 } 3657 } 3658 3659 if (score + SCORE_SWAP < minscore) { 3660 // If swapping two characters makes a match then the 3661 // substitution is more expensive, thus there is no need to 3662 // try both. 3663 if (gc == wbadword[bi + 1] && bc == wgoodword[gi + 1]) { 3664 // Swap two characters, that is: skip them. 3665 gi += 2; 3666 bi += 2; 3667 score += SCORE_SWAP; 3668 continue; 3669 } 3670 } 3671 3672 // Substitute one character for another which is the same 3673 // thing as deleting a character from both goodword and badword. 3674 // Use a better score when there is only a case difference. 3675 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc)) { 3676 score += SCORE_ICASE; 3677 } else { 3678 // For a similar character use SCORE_SIMILAR. 3679 if (slang != NULL 3680 && slang->sl_has_map 3681 && similar_chars(slang, gc, bc)) { 3682 score += SCORE_SIMILAR; 3683 } else { 3684 score += SCORE_SUBST; 3685 } 3686 } 3687 3688 if (score < minscore) { 3689 // Do the substitution. 3690 gi++; 3691 bi++; 3692 continue; 3693 } 3694 } 3695 pop: 3696 // Get here to try the next alternative, pop it from the stack. 3697 if (stackidx == 0) { // stack is empty, finished 3698 break; 3699 } 3700 3701 // pop an item from the stack 3702 stackidx--; 3703 gi = stack[stackidx].goodi; 3704 bi = stack[stackidx].badi; 3705 score = stack[stackidx].score; 3706 } 3707 3708 // When the score goes over "limit" it may actually be much higher. 3709 // Return a very large number to avoid going below the limit when giving a 3710 // bonus. 3711 if (minscore > limit) { 3712 return SCORE_MAXMAX; 3713 } 3714 return minscore; 3715 }