affixmgr.cxx (152435B)
1 /* ***** BEGIN LICENSE BLOCK ***** 2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 * 4 * Copyright (C) 2002-2022 Németh László 5 * 6 * The contents of this file are subject to the Mozilla Public License Version 7 * 1.1 (the "License"); you may not use this file except in compliance with 8 * the License. You may obtain a copy of the License at 9 * http://www.mozilla.org/MPL/ 10 * 11 * Software distributed under the License is distributed on an "AS IS" basis, 12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 13 * for the specific language governing rights and limitations under the 14 * License. 15 * 16 * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. 17 * 18 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, 19 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, 20 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, 21 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, 22 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen 23 * 24 * Alternatively, the contents of this file may be used under the terms of 25 * either the GNU General Public License Version 2 or later (the "GPL"), or 26 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 * in which case the provisions of the GPL or the LGPL are applicable instead 28 * of those above. If you wish to allow use of your version of this file only 29 * under the terms of either the GPL or the LGPL, and not to allow others to 30 * use your version of this file under the terms of the MPL, indicate your 31 * decision by deleting the provisions above and replace them with the notice 32 * and other provisions required by the GPL or the LGPL. If you do not delete 33 * the provisions above, a recipient may use your version of this file under 34 * the terms of any one of the MPL, the GPL or the LGPL. 35 * 36 * ***** END LICENSE BLOCK ***** */ 37 /* 38 * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada 39 * And Contributors. All rights reserved. 40 * 41 * Redistribution and use in source and binary forms, with or without 42 * modification, are permitted provided that the following conditions 43 * are met: 44 * 45 * 1. Redistributions of source code must retain the above copyright 46 * notice, this list of conditions and the following disclaimer. 47 * 48 * 2. Redistributions in binary form must reproduce the above copyright 49 * notice, this list of conditions and the following disclaimer in the 50 * documentation and/or other materials provided with the distribution. 51 * 52 * 3. All modifications to the source code must be clearly marked as 53 * such. Binary redistributions based on modified source code 54 * must be clearly marked as modified versions in the documentation 55 * and/or other materials provided with the distribution. 56 * 57 * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS 58 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 59 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 60 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL 61 * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 62 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 63 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 64 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 65 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 66 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 67 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 68 * SUCH DAMAGE. 69 */ 70 71 #include <stdlib.h> 72 #include <string.h> 73 #include <stdio.h> 74 #include <ctype.h> 75 #include <time.h> 76 77 #include <algorithm> 78 #include <limits> 79 #include <string> 80 #include <vector> 81 82 #include "affixmgr.hxx" 83 #include "affentry.hxx" 84 #include "langnum.hxx" 85 86 #include "csutil.hxx" 87 88 AffixMgr::AffixMgr(const char* affpath, 89 const std::vector<HashMgr*>& ptr, 90 const char* key) 91 : alldic(ptr) 92 , pHMgr(ptr[0]) { 93 94 // register hash manager and load affix data from aff file 95 csconv = NULL; 96 utf8 = 0; 97 complexprefixes = 0; 98 parsedmaptable = false; 99 parsedbreaktable = false; 100 iconvtable = NULL; 101 oconvtable = NULL; 102 // allow simplified compound forms (see 3rd field of CHECKCOMPOUNDPATTERN) 103 simplifiedcpd = 0; 104 parsedcheckcpd = false; 105 parseddefcpd = false; 106 phone = NULL; 107 compoundflag = FLAG_NULL; // permits word in compound forms 108 compoundbegin = FLAG_NULL; // may be first word in compound forms 109 compoundmiddle = FLAG_NULL; // may be middle word in compound forms 110 compoundend = FLAG_NULL; // may be last word in compound forms 111 compoundroot = FLAG_NULL; // compound word signing flag 112 compoundpermitflag = FLAG_NULL; // compound permitting flag for suffixed word 113 compoundforbidflag = FLAG_NULL; // compound fordidden flag for suffixed word 114 compoundmoresuffixes = 0; // allow more suffixes within compound words 115 checkcompounddup = 0; // forbid double words in compounds 116 checkcompoundrep = 0; // forbid bad compounds (may be non-compound word with 117 // a REP substitution) 118 checkcompoundcase = 119 0; // forbid upper and lowercase combinations at word bounds 120 checkcompoundtriple = 0; // forbid compounds with triple letters 121 simplifiedtriple = 0; // allow simplified triple letters in compounds 122 // (Schiff+fahrt -> Schiffahrt) 123 forbiddenword = FORBIDDENWORD; // forbidden word signing flag 124 nosuggest = FLAG_NULL; // don't suggest words signed with NOSUGGEST flag 125 nongramsuggest = FLAG_NULL; 126 langnum = 0; // language code (see http://l10n.openoffice.org/languages.html) 127 needaffix = FLAG_NULL; // forbidden root, allowed only with suffixes 128 cpdwordmax = -1; // default: unlimited wordcount in compound words 129 cpdmin = -1; // undefined 130 cpdmaxsyllable = 0; // default: unlimited syllablecount in compound words 131 pfxappnd = NULL; // previous prefix for counting syllables of the prefix BUG 132 sfxappnd = NULL; // previous suffix for counting syllables of the suffix BUG 133 sfxextra = 0; // modifier for syllable count of sfxappnd BUG 134 checknum = 0; // checking numbers, and word with numbers 135 havecontclass = 0; // flags of possible continuing classes (double affix) 136 // LEMMA_PRESENT: not put root into the morphological output. Lemma presents 137 // in morhological description in dictionary file. It's often combined with 138 // PSEUDOROOT. 139 lemma_present = FLAG_NULL; 140 circumfix = FLAG_NULL; 141 onlyincompound = FLAG_NULL; 142 maxngramsugs = -1; // undefined 143 maxdiff = -1; // undefined 144 onlymaxdiff = 0; 145 maxcpdsugs = -1; // undefined 146 nosplitsugs = 0; 147 sugswithdots = 0; 148 keepcase = 0; 149 forceucase = 0; 150 warn = 0; 151 forbidwarn = 0; 152 checksharps = 0; 153 substandard = FLAG_NULL; 154 fullstrip = 0; 155 156 sfx = NULL; 157 pfx = NULL; 158 159 for (int i = 0; i < SETSIZE; i++) { 160 pStart[i] = NULL; 161 sStart[i] = NULL; 162 pFlag[i] = NULL; 163 sFlag[i] = NULL; 164 } 165 166 for (int j = 0; j < CONTSIZE; j++) { 167 contclasses[j] = 0; 168 } 169 170 if (parse_file(affpath, key)) { 171 HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n", affpath); 172 } 173 174 if (cpdmin == -1) 175 cpdmin = MINCPDLEN; 176 } 177 178 AffixMgr::~AffixMgr() { 179 // pass through linked prefix entries and clean up 180 for (int i = 0; i < SETSIZE; i++) { 181 pFlag[i] = NULL; 182 PfxEntry* ptr = pStart[i]; 183 PfxEntry* nptr = NULL; 184 while (ptr) { 185 nptr = ptr->getNext(); 186 delete (ptr); 187 ptr = nptr; 188 nptr = NULL; 189 } 190 } 191 192 // pass through linked suffix entries and clean up 193 for (int j = 0; j < SETSIZE; j++) { 194 sFlag[j] = NULL; 195 SfxEntry* ptr = sStart[j]; 196 SfxEntry* nptr = NULL; 197 while (ptr) { 198 nptr = ptr->getNext(); 199 delete (ptr); 200 ptr = nptr; 201 nptr = NULL; 202 } 203 sStart[j] = NULL; 204 } 205 206 delete iconvtable; 207 delete oconvtable; 208 delete phone; 209 210 FREE_FLAG(compoundflag); 211 FREE_FLAG(compoundbegin); 212 FREE_FLAG(compoundmiddle); 213 FREE_FLAG(compoundend); 214 FREE_FLAG(compoundpermitflag); 215 FREE_FLAG(compoundforbidflag); 216 FREE_FLAG(compoundroot); 217 FREE_FLAG(forbiddenword); 218 FREE_FLAG(nosuggest); 219 FREE_FLAG(nongramsuggest); 220 FREE_FLAG(needaffix); 221 FREE_FLAG(lemma_present); 222 FREE_FLAG(circumfix); 223 FREE_FLAG(onlyincompound); 224 225 cpdwordmax = 0; 226 pHMgr = NULL; 227 cpdmin = 0; 228 cpdmaxsyllable = 0; 229 free_utf_tbl(); 230 checknum = 0; 231 #ifdef MOZILLA_CLIENT 232 delete[] csconv; 233 #endif 234 } 235 236 void AffixMgr::finishFileMgr(FileMgr* afflst) { 237 delete afflst; 238 239 // convert affix trees to sorted list 240 process_pfx_tree_to_list(); 241 process_sfx_tree_to_list(); 242 } 243 244 // read in aff file and build up prefix and suffix entry objects 245 int AffixMgr::parse_file(const char* affpath, const char* key) { 246 247 // checking flag duplication 248 char dupflags[CONTSIZE]; 249 char dupflags_ini = 1; 250 251 // first line indicator for removing byte order mark 252 int firstline = 1; 253 254 // open the affix file 255 FileMgr* afflst = new FileMgr(affpath, key); 256 if (!afflst) { 257 HUNSPELL_WARNING( 258 stderr, "error: could not open affix description file %s\n", affpath); 259 return 1; 260 } 261 262 // step one is to parse the affix file building up the internal 263 // affix data structures 264 265 // read in each line ignoring any that do not 266 // start with a known line type indicator 267 std::string line; 268 while (afflst->getline(line)) { 269 mychomp(line); 270 271 /* remove byte order mark */ 272 if (firstline) { 273 firstline = 0; 274 // Affix file begins with byte order mark: possible incompatibility with 275 // old Hunspell versions 276 if (line.compare(0, 3, "\xEF\xBB\xBF", 3) == 0) { 277 line.erase(0, 3); 278 } 279 } 280 281 /* parse in the keyboard string */ 282 if (line.compare(0, 3, "KEY", 3) == 0) { 283 if (!parse_string(line, keystring, afflst->getlinenum())) { 284 finishFileMgr(afflst); 285 return 1; 286 } 287 } 288 289 /* parse in the try string */ 290 if (line.compare(0, 3, "TRY", 3) == 0) { 291 if (!parse_string(line, trystring, afflst->getlinenum())) { 292 finishFileMgr(afflst); 293 return 1; 294 } 295 } 296 297 /* parse in the name of the character set used by the .dict and .aff */ 298 if (line.compare(0, 3, "SET", 3) == 0) { 299 if (!parse_string(line, encoding, afflst->getlinenum())) { 300 finishFileMgr(afflst); 301 return 1; 302 } 303 if (encoding == "UTF-8") { 304 utf8 = 1; 305 #ifndef OPENOFFICEORG 306 #ifndef MOZILLA_CLIENT 307 initialize_utf_tbl(); 308 #endif 309 #endif 310 } 311 } 312 313 /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left 314 * writing system */ 315 if (line.compare(0, 15, "COMPLEXPREFIXES", 15) == 0) 316 complexprefixes = 1; 317 318 /* parse in the flag used by the controlled compound words */ 319 if (line.compare(0, 12, "COMPOUNDFLAG", 12) == 0) { 320 if (!parse_flag(line, &compoundflag, afflst)) { 321 finishFileMgr(afflst); 322 return 1; 323 } 324 } 325 326 /* parse in the flag used by compound words */ 327 if (line.compare(0, 13, "COMPOUNDBEGIN", 13) == 0) { 328 if (complexprefixes) { 329 if (!parse_flag(line, &compoundend, afflst)) { 330 finishFileMgr(afflst); 331 return 1; 332 } 333 } else { 334 if (!parse_flag(line, &compoundbegin, afflst)) { 335 finishFileMgr(afflst); 336 return 1; 337 } 338 } 339 } 340 341 /* parse in the flag used by compound words */ 342 if (line.compare(0, 14, "COMPOUNDMIDDLE", 14) == 0) { 343 if (!parse_flag(line, &compoundmiddle, afflst)) { 344 finishFileMgr(afflst); 345 return 1; 346 } 347 } 348 349 /* parse in the flag used by compound words */ 350 if (line.compare(0, 11, "COMPOUNDEND", 11) == 0) { 351 if (complexprefixes) { 352 if (!parse_flag(line, &compoundbegin, afflst)) { 353 finishFileMgr(afflst); 354 return 1; 355 } 356 } else { 357 if (!parse_flag(line, &compoundend, afflst)) { 358 finishFileMgr(afflst); 359 return 1; 360 } 361 } 362 } 363 364 /* parse in the data used by compound_check() method */ 365 if (line.compare(0, 15, "COMPOUNDWORDMAX", 15) == 0) { 366 if (!parse_num(line, &cpdwordmax, afflst)) { 367 finishFileMgr(afflst); 368 return 1; 369 } 370 } 371 372 /* parse in the flag sign compounds in dictionary */ 373 if (line.compare(0, 12, "COMPOUNDROOT", 12) == 0) { 374 if (!parse_flag(line, &compoundroot, afflst)) { 375 finishFileMgr(afflst); 376 return 1; 377 } 378 } 379 380 /* parse in the flag used by compound_check() method */ 381 if (line.compare(0, 18, "COMPOUNDPERMITFLAG", 18) == 0) { 382 if (!parse_flag(line, &compoundpermitflag, afflst)) { 383 finishFileMgr(afflst); 384 return 1; 385 } 386 } 387 388 /* parse in the flag used by compound_check() method */ 389 if (line.compare(0, 18, "COMPOUNDFORBIDFLAG", 18) == 0) { 390 if (!parse_flag(line, &compoundforbidflag, afflst)) { 391 finishFileMgr(afflst); 392 return 1; 393 } 394 } 395 396 if (line.compare(0, 20, "COMPOUNDMORESUFFIXES", 20) == 0) { 397 compoundmoresuffixes = 1; 398 } 399 400 if (line.compare(0, 16, "CHECKCOMPOUNDDUP", 16) == 0) { 401 checkcompounddup = 1; 402 } 403 404 if (line.compare(0, 16, "CHECKCOMPOUNDREP", 16) == 0) { 405 checkcompoundrep = 1; 406 } 407 408 if (line.compare(0, 19, "CHECKCOMPOUNDTRIPLE", 19) == 0) { 409 checkcompoundtriple = 1; 410 } 411 412 if (line.compare(0, 16, "SIMPLIFIEDTRIPLE", 16) == 0) { 413 simplifiedtriple = 1; 414 } 415 416 if (line.compare(0, 17, "CHECKCOMPOUNDCASE", 17) == 0) { 417 checkcompoundcase = 1; 418 } 419 420 if (line.compare(0, 9, "NOSUGGEST", 9) == 0) { 421 if (!parse_flag(line, &nosuggest, afflst)) { 422 finishFileMgr(afflst); 423 return 1; 424 } 425 } 426 427 if (line.compare(0, 14, "NONGRAMSUGGEST", 14) == 0) { 428 if (!parse_flag(line, &nongramsuggest, afflst)) { 429 finishFileMgr(afflst); 430 return 1; 431 } 432 } 433 434 /* parse in the flag used by forbidden words */ 435 if (line.compare(0, 13, "FORBIDDENWORD", 13) == 0) { 436 if (!parse_flag(line, &forbiddenword, afflst)) { 437 finishFileMgr(afflst); 438 return 1; 439 } 440 } 441 442 /* parse in the flag used by forbidden words (is deprecated) */ 443 if (line.compare(0, 13, "LEMMA_PRESENT", 13) == 0) { 444 if (!parse_flag(line, &lemma_present, afflst)) { 445 finishFileMgr(afflst); 446 return 1; 447 } 448 } 449 450 /* parse in the flag used by circumfixes */ 451 if (line.compare(0, 9, "CIRCUMFIX", 9) == 0) { 452 if (!parse_flag(line, &circumfix, afflst)) { 453 finishFileMgr(afflst); 454 return 1; 455 } 456 } 457 458 /* parse in the flag used by fogemorphemes */ 459 if (line.compare(0, 14, "ONLYINCOMPOUND", 14) == 0) { 460 if (!parse_flag(line, &onlyincompound, afflst)) { 461 finishFileMgr(afflst); 462 return 1; 463 } 464 } 465 466 /* parse in the flag used by `needaffixs' (is deprecated) */ 467 if (line.compare(0, 10, "PSEUDOROOT", 10) == 0) { 468 if (!parse_flag(line, &needaffix, afflst)) { 469 finishFileMgr(afflst); 470 return 1; 471 } 472 } 473 474 /* parse in the flag used by `needaffixs' */ 475 if (line.compare(0, 9, "NEEDAFFIX", 9) == 0) { 476 if (!parse_flag(line, &needaffix, afflst)) { 477 finishFileMgr(afflst); 478 return 1; 479 } 480 } 481 482 /* parse in the minimal length for words in compounds */ 483 if (line.compare(0, 11, "COMPOUNDMIN", 11) == 0) { 484 if (!parse_num(line, &cpdmin, afflst)) { 485 finishFileMgr(afflst); 486 return 1; 487 } 488 if (cpdmin < 1) 489 cpdmin = 1; 490 } 491 492 /* parse in the max. words and syllables in compounds */ 493 if (line.compare(0, 16, "COMPOUNDSYLLABLE", 16) == 0) { 494 if (!parse_cpdsyllable(line, afflst)) { 495 finishFileMgr(afflst); 496 return 1; 497 } 498 } 499 500 /* parse in the flag used by compound_check() method */ 501 if (line.compare(0, 11, "SYLLABLENUM", 11) == 0) { 502 if (!parse_string(line, cpdsyllablenum, afflst->getlinenum())) { 503 finishFileMgr(afflst); 504 return 1; 505 } 506 } 507 508 /* parse in the flag used by the controlled compound words */ 509 if (line.compare(0, 8, "CHECKNUM", 8) == 0) { 510 checknum = 1; 511 } 512 513 /* parse in the extra word characters */ 514 if (line.compare(0, 9, "WORDCHARS", 9) == 0) { 515 if (!parse_array(line, wordchars, wordchars_utf16, 516 utf8, afflst->getlinenum())) { 517 finishFileMgr(afflst); 518 return 1; 519 } 520 } 521 522 /* parse in the ignored characters (for example, Arabic optional diacretics 523 * charachters */ 524 if (line.compare(0, 6, "IGNORE", 6) == 0) { 525 if (!parse_array(line, ignorechars, ignorechars_utf16, 526 utf8, afflst->getlinenum())) { 527 finishFileMgr(afflst); 528 return 1; 529 } 530 } 531 532 /* parse in the input conversion table */ 533 if (line.compare(0, 5, "ICONV", 5) == 0) { 534 if (!parse_convtable(line, afflst, &iconvtable, "ICONV")) { 535 finishFileMgr(afflst); 536 return 1; 537 } 538 } 539 540 /* parse in the output conversion table */ 541 if (line.compare(0, 5, "OCONV", 5) == 0) { 542 if (!parse_convtable(line, afflst, &oconvtable, "OCONV")) { 543 finishFileMgr(afflst); 544 return 1; 545 } 546 } 547 548 /* parse in the phonetic translation table */ 549 if (line.compare(0, 5, "PHONE", 5) == 0) { 550 if (!parse_phonetable(line, afflst)) { 551 finishFileMgr(afflst); 552 return 1; 553 } 554 } 555 556 /* parse in the checkcompoundpattern table */ 557 if (line.compare(0, 20, "CHECKCOMPOUNDPATTERN", 20) == 0) { 558 if (!parse_checkcpdtable(line, afflst)) { 559 finishFileMgr(afflst); 560 return 1; 561 } 562 } 563 564 /* parse in the defcompound table */ 565 if (line.compare(0, 12, "COMPOUNDRULE", 12) == 0) { 566 if (!parse_defcpdtable(line, afflst)) { 567 finishFileMgr(afflst); 568 return 1; 569 } 570 } 571 572 /* parse in the related character map table */ 573 if (line.compare(0, 3, "MAP", 3) == 0) { 574 if (!parse_maptable(line, afflst)) { 575 finishFileMgr(afflst); 576 return 1; 577 } 578 } 579 580 /* parse in the word breakpoints table */ 581 if (line.compare(0, 5, "BREAK", 5) == 0) { 582 if (!parse_breaktable(line, afflst)) { 583 finishFileMgr(afflst); 584 return 1; 585 } 586 } 587 588 /* parse in the language for language specific codes */ 589 if (line.compare(0, 4, "LANG", 4) == 0) { 590 if (!parse_string(line, lang, afflst->getlinenum())) { 591 finishFileMgr(afflst); 592 return 1; 593 } 594 langnum = get_lang_num(lang); 595 } 596 597 if (line.compare(0, 7, "VERSION", 7) == 0) { 598 size_t startpos = line.find_first_not_of(" \t", 7); 599 if (startpos != std::string::npos) { 600 version = line.substr(startpos); 601 } 602 } 603 604 if (line.compare(0, 12, "MAXNGRAMSUGS", 12) == 0) { 605 if (!parse_num(line, &maxngramsugs, afflst)) { 606 finishFileMgr(afflst); 607 return 1; 608 } 609 } 610 611 if (line.compare(0, 11, "ONLYMAXDIFF", 11) == 0) 612 onlymaxdiff = 1; 613 614 if (line.compare(0, 7, "MAXDIFF", 7) == 0) { 615 if (!parse_num(line, &maxdiff, afflst)) { 616 finishFileMgr(afflst); 617 return 1; 618 } 619 } 620 621 if (line.compare(0, 10, "MAXCPDSUGS", 10) == 0) { 622 if (!parse_num(line, &maxcpdsugs, afflst)) { 623 finishFileMgr(afflst); 624 return 1; 625 } 626 } 627 628 if (line.compare(0, 11, "NOSPLITSUGS", 11) == 0) { 629 nosplitsugs = 1; 630 } 631 632 if (line.compare(0, 9, "FULLSTRIP", 9) == 0) { 633 fullstrip = 1; 634 } 635 636 if (line.compare(0, 12, "SUGSWITHDOTS", 12) == 0) { 637 sugswithdots = 1; 638 } 639 640 /* parse in the flag used by forbidden words */ 641 if (line.compare(0, 8, "KEEPCASE", 8) == 0) { 642 if (!parse_flag(line, &keepcase, afflst)) { 643 finishFileMgr(afflst); 644 return 1; 645 } 646 } 647 648 /* parse in the flag used by `forceucase' */ 649 if (line.compare(0, 10, "FORCEUCASE", 10) == 0) { 650 if (!parse_flag(line, &forceucase, afflst)) { 651 finishFileMgr(afflst); 652 return 1; 653 } 654 } 655 656 /* parse in the flag used by `warn' */ 657 if (line.compare(0, 4, "WARN", 4) == 0) { 658 if (!parse_flag(line, &warn, afflst)) { 659 finishFileMgr(afflst); 660 return 1; 661 } 662 } 663 664 if (line.compare(0, 10, "FORBIDWARN", 10) == 0) { 665 forbidwarn = 1; 666 } 667 668 /* parse in the flag used by the affix generator */ 669 if (line.compare(0, 11, "SUBSTANDARD", 11) == 0) { 670 if (!parse_flag(line, &substandard, afflst)) { 671 finishFileMgr(afflst); 672 return 1; 673 } 674 } 675 676 if (line.compare(0, 11, "CHECKSHARPS", 11) == 0) { 677 checksharps = 1; 678 } 679 680 /* parse this affix: P - prefix, S - suffix */ 681 // affix type 682 char ft = ' '; 683 if (line.compare(0, 3, "PFX", 3) == 0) 684 ft = complexprefixes ? 'S' : 'P'; 685 if (line.compare(0, 3, "SFX", 3) == 0) 686 ft = complexprefixes ? 'P' : 'S'; 687 if (ft != ' ') { 688 if (dupflags_ini) { 689 memset(dupflags, 0, sizeof(dupflags)); 690 dupflags_ini = 0; 691 } 692 if (!parse_affix(line, ft, afflst, dupflags)) { 693 finishFileMgr(afflst); 694 return 1; 695 } 696 } 697 } 698 699 finishFileMgr(afflst); 700 // affix trees are sorted now 701 702 // now we can speed up performance greatly taking advantage of the 703 // relationship between the affixes and the idea of "subsets". 704 705 // View each prefix as a potential leading subset of another and view 706 // each suffix (reversed) as a potential trailing subset of another. 707 708 // To illustrate this relationship if we know the prefix "ab" is found in the 709 // word to examine, only prefixes that "ab" is a leading subset of need be 710 // examined. 711 // Furthermore is "ab" is not present then none of the prefixes that "ab" is 712 // is a subset need be examined. 713 // The same argument goes for suffix string that are reversed. 714 715 // Then to top this off why not examine the first char of the word to quickly 716 // limit the set of prefixes to examine (i.e. the prefixes to examine must 717 // be leading supersets of the first character of the word (if they exist) 718 719 // To take advantage of this "subset" relationship, we need to add two links 720 // from entry. One to take next if the current prefix is found (call it 721 // nexteq) 722 // and one to take next if the current prefix is not found (call it nextne). 723 724 // Since we have built ordered lists, all that remains is to properly 725 // initialize 726 // the nextne and nexteq pointers that relate them 727 728 process_pfx_order(); 729 process_sfx_order(); 730 731 /* get encoding for CHECKCOMPOUNDCASE */ 732 if (!utf8) { 733 csconv = get_current_cs(get_encoding()); 734 for (int i = 0; i <= 255; i++) { 735 if ((csconv[i].cupper != csconv[i].clower) && 736 (wordchars.find((char)i) == std::string::npos)) { 737 wordchars.push_back((char)i); 738 } 739 } 740 741 } 742 743 // default BREAK definition 744 if (!parsedbreaktable) { 745 breaktable.push_back("-"); 746 breaktable.push_back("^-"); 747 breaktable.push_back("-$"); 748 parsedbreaktable = true; 749 } 750 return 0; 751 } 752 753 // we want to be able to quickly access prefix information 754 // both by prefix flag, and sorted by prefix string itself 755 // so we need to set up two indexes 756 757 int AffixMgr::build_pfxtree(PfxEntry* pfxptr) { 758 PfxEntry* ptr; 759 PfxEntry* pptr; 760 PfxEntry* ep = pfxptr; 761 762 // get the right starting points 763 const char* key = ep->getKey(); 764 const unsigned char flg = (unsigned char)(ep->getFlag() & 0x00FF); 765 766 // first index by flag which must exist 767 ptr = pFlag[flg]; 768 ep->setFlgNxt(ptr); 769 pFlag[flg] = ep; 770 771 // handle the special case of null affix string 772 if (strlen(key) == 0) { 773 // always inset them at head of list at element 0 774 ptr = pStart[0]; 775 ep->setNext(ptr); 776 pStart[0] = ep; 777 return 0; 778 } 779 780 // now handle the normal case 781 ep->setNextEQ(NULL); 782 ep->setNextNE(NULL); 783 784 unsigned char sp = *((const unsigned char*)key); 785 ptr = pStart[sp]; 786 787 // handle the first insert 788 if (!ptr) { 789 pStart[sp] = ep; 790 return 0; 791 } 792 793 // otherwise use binary tree insertion so that a sorted 794 // list can easily be generated later 795 pptr = NULL; 796 for (;;) { 797 pptr = ptr; 798 if (strcmp(ep->getKey(), ptr->getKey()) <= 0) { 799 ptr = ptr->getNextEQ(); 800 if (!ptr) { 801 pptr->setNextEQ(ep); 802 break; 803 } 804 } else { 805 ptr = ptr->getNextNE(); 806 if (!ptr) { 807 pptr->setNextNE(ep); 808 break; 809 } 810 } 811 } 812 return 0; 813 } 814 815 // we want to be able to quickly access suffix information 816 // both by suffix flag, and sorted by the reverse of the 817 // suffix string itself; so we need to set up two indexes 818 int AffixMgr::build_sfxtree(SfxEntry* sfxptr) { 819 820 sfxptr->initReverseWord(); 821 822 SfxEntry* ptr; 823 SfxEntry* pptr; 824 SfxEntry* ep = sfxptr; 825 826 /* get the right starting point */ 827 const char* key = ep->getKey(); 828 const unsigned char flg = (unsigned char)(ep->getFlag() & 0x00FF); 829 830 // first index by flag which must exist 831 ptr = sFlag[flg]; 832 ep->setFlgNxt(ptr); 833 sFlag[flg] = ep; 834 835 // next index by affix string 836 837 // handle the special case of null affix string 838 if (strlen(key) == 0) { 839 // always inset them at head of list at element 0 840 ptr = sStart[0]; 841 ep->setNext(ptr); 842 sStart[0] = ep; 843 return 0; 844 } 845 846 // now handle the normal case 847 ep->setNextEQ(NULL); 848 ep->setNextNE(NULL); 849 850 unsigned char sp = *((const unsigned char*)key); 851 ptr = sStart[sp]; 852 853 // handle the first insert 854 if (!ptr) { 855 sStart[sp] = ep; 856 return 0; 857 } 858 859 // otherwise use binary tree insertion so that a sorted 860 // list can easily be generated later 861 pptr = NULL; 862 for (;;) { 863 pptr = ptr; 864 if (strcmp(ep->getKey(), ptr->getKey()) <= 0) { 865 ptr = ptr->getNextEQ(); 866 if (!ptr) { 867 pptr->setNextEQ(ep); 868 break; 869 } 870 } else { 871 ptr = ptr->getNextNE(); 872 if (!ptr) { 873 pptr->setNextNE(ep); 874 break; 875 } 876 } 877 } 878 return 0; 879 } 880 881 // convert from binary tree to sorted list 882 int AffixMgr::process_pfx_tree_to_list() { 883 for (int i = 1; i < SETSIZE; i++) { 884 pStart[i] = process_pfx_in_order(pStart[i], NULL); 885 } 886 return 0; 887 } 888 889 PfxEntry* AffixMgr::process_pfx_in_order(PfxEntry* ptr, PfxEntry* nptr) { 890 if (ptr) { 891 nptr = process_pfx_in_order(ptr->getNextNE(), nptr); 892 ptr->setNext(nptr); 893 nptr = process_pfx_in_order(ptr->getNextEQ(), ptr); 894 } 895 return nptr; 896 } 897 898 // convert from binary tree to sorted list 899 int AffixMgr::process_sfx_tree_to_list() { 900 for (int i = 1; i < SETSIZE; i++) { 901 sStart[i] = process_sfx_in_order(sStart[i], NULL); 902 } 903 return 0; 904 } 905 906 SfxEntry* AffixMgr::process_sfx_in_order(SfxEntry* ptr, SfxEntry* nptr) { 907 if (ptr) { 908 nptr = process_sfx_in_order(ptr->getNextNE(), nptr); 909 ptr->setNext(nptr); 910 nptr = process_sfx_in_order(ptr->getNextEQ(), ptr); 911 } 912 return nptr; 913 } 914 915 // reinitialize the PfxEntry links NextEQ and NextNE to speed searching 916 // using the idea of leading subsets this time 917 int AffixMgr::process_pfx_order() { 918 PfxEntry* ptr; 919 920 // loop through each prefix list starting point 921 for (int i = 1; i < SETSIZE; i++) { 922 ptr = pStart[i]; 923 924 // look through the remainder of the list 925 // and find next entry with affix that 926 // the current one is not a subset of 927 // mark that as destination for NextNE 928 // use next in list that you are a subset 929 // of as NextEQ 930 931 for (; ptr != NULL; ptr = ptr->getNext()) { 932 PfxEntry* nptr = ptr->getNext(); 933 for (; nptr != NULL; nptr = nptr->getNext()) { 934 if (!isSubset(ptr->getKey(), nptr->getKey())) 935 break; 936 } 937 ptr->setNextNE(nptr); 938 ptr->setNextEQ(NULL); 939 if ((ptr->getNext()) && 940 isSubset(ptr->getKey(), (ptr->getNext())->getKey())) 941 ptr->setNextEQ(ptr->getNext()); 942 } 943 944 // now clean up by adding smart search termination strings: 945 // if you are already a superset of the previous prefix 946 // but not a subset of the next, search can end here 947 // so set NextNE properly 948 949 ptr = pStart[i]; 950 for (; ptr != NULL; ptr = ptr->getNext()) { 951 PfxEntry* nptr = ptr->getNext(); 952 PfxEntry* mptr = NULL; 953 for (; nptr != NULL; nptr = nptr->getNext()) { 954 if (!isSubset(ptr->getKey(), nptr->getKey())) 955 break; 956 mptr = nptr; 957 } 958 if (mptr) 959 mptr->setNextNE(NULL); 960 } 961 } 962 return 0; 963 } 964 965 // initialize the SfxEntry links NextEQ and NextNE to speed searching 966 // using the idea of leading subsets this time 967 int AffixMgr::process_sfx_order() { 968 SfxEntry* ptr; 969 970 // loop through each prefix list starting point 971 for (int i = 1; i < SETSIZE; i++) { 972 ptr = sStart[i]; 973 974 // look through the remainder of the list 975 // and find next entry with affix that 976 // the current one is not a subset of 977 // mark that as destination for NextNE 978 // use next in list that you are a subset 979 // of as NextEQ 980 981 for (; ptr != NULL; ptr = ptr->getNext()) { 982 SfxEntry* nptr = ptr->getNext(); 983 for (; nptr != NULL; nptr = nptr->getNext()) { 984 if (!isSubset(ptr->getKey(), nptr->getKey())) 985 break; 986 } 987 ptr->setNextNE(nptr); 988 ptr->setNextEQ(NULL); 989 if ((ptr->getNext()) && 990 isSubset(ptr->getKey(), (ptr->getNext())->getKey())) 991 ptr->setNextEQ(ptr->getNext()); 992 } 993 994 // now clean up by adding smart search termination strings: 995 // if you are already a superset of the previous suffix 996 // but not a subset of the next, search can end here 997 // so set NextNE properly 998 999 ptr = sStart[i]; 1000 for (; ptr != NULL; ptr = ptr->getNext()) { 1001 SfxEntry* nptr = ptr->getNext(); 1002 SfxEntry* mptr = NULL; 1003 for (; nptr != NULL; nptr = nptr->getNext()) { 1004 if (!isSubset(ptr->getKey(), nptr->getKey())) 1005 break; 1006 mptr = nptr; 1007 } 1008 if (mptr) 1009 mptr->setNextNE(NULL); 1010 } 1011 } 1012 return 0; 1013 } 1014 1015 // add flags to the result for dictionary debugging 1016 std::string& AffixMgr::debugflag(std::string& result, unsigned short flag) { 1017 char* st = encode_flag(flag); 1018 result.push_back(MSEP_FLD); 1019 result.append(MORPH_FLAG); 1020 if (st) { 1021 result.append(st); 1022 free(st); 1023 } 1024 return result; 1025 } 1026 1027 // calculate the character length of the condition 1028 int AffixMgr::condlen(const char* st) { 1029 int l = 0; 1030 bool group = false; 1031 for (; *st; st++) { 1032 if (*st == '[') { 1033 group = true; 1034 l++; 1035 } else if (*st == ']') 1036 group = false; 1037 else if (!group && (!utf8 || (!(*st & 0x80) || ((*st & 0xc0) == 0x80)))) 1038 l++; 1039 } 1040 return l; 1041 } 1042 1043 int AffixMgr::encodeit(AffEntry& entry, const char* cs) { 1044 if (strcmp(cs, ".") != 0) { 1045 entry.numconds = (char)condlen(cs); 1046 const size_t cslen = strlen(cs); 1047 const size_t short_part = std::min<size_t>(MAXCONDLEN, cslen); 1048 memcpy(entry.c.conds, cs, short_part); 1049 if (short_part < MAXCONDLEN) { 1050 //blank out the remaining space 1051 memset(entry.c.conds + short_part, 0, MAXCONDLEN - short_part); 1052 } else if (cs[MAXCONDLEN]) { 1053 //there is more conditions than fit in fixed space, so its 1054 //a long condition 1055 entry.opts |= aeLONGCOND; 1056 entry.c.l.conds2 = mystrdup(cs + MAXCONDLEN_1); 1057 if (!entry.c.l.conds2) 1058 return 1; 1059 } 1060 } else { 1061 entry.numconds = 0; 1062 entry.c.conds[0] = '\0'; 1063 } 1064 return 0; 1065 } 1066 1067 // return 1 if s1 is a leading subset of s2 (dots are for infixes) 1068 inline int AffixMgr::isSubset(const char* s1, const char* s2) { 1069 while (((*s1 == *s2) || (*s1 == '.')) && (*s1 != '\0')) { 1070 s1++; 1071 s2++; 1072 } 1073 return (*s1 == '\0'); 1074 } 1075 1076 // check word for prefixes 1077 struct hentry* AffixMgr::prefix_check(const char* word, 1078 int len, 1079 char in_compound, 1080 const FLAG needflag) { 1081 struct hentry* rv = NULL; 1082 1083 pfx = NULL; 1084 pfxappnd = NULL; 1085 sfxappnd = NULL; 1086 sfxextra = 0; 1087 1088 // first handle the special case of 0 length prefixes 1089 PfxEntry* pe = pStart[0]; 1090 while (pe) { 1091 if ( 1092 // fogemorpheme 1093 ((in_compound != IN_CPD_NOT) || 1094 !(pe->getCont() && 1095 (TESTAFF(pe->getCont(), onlyincompound, pe->getContLen())))) && 1096 // permit prefixes in compounds 1097 ((in_compound != IN_CPD_END) || 1098 (pe->getCont() && 1099 (TESTAFF(pe->getCont(), compoundpermitflag, pe->getContLen()))))) { 1100 // check prefix 1101 rv = pe->checkword(word, len, in_compound, needflag); 1102 if (rv) { 1103 pfx = pe; // BUG: pfx not stateless 1104 return rv; 1105 } 1106 } 1107 pe = pe->getNext(); 1108 } 1109 1110 // now handle the general case 1111 unsigned char sp = *((const unsigned char*)word); 1112 PfxEntry* pptr = pStart[sp]; 1113 1114 while (pptr) { 1115 if (isSubset(pptr->getKey(), word)) { 1116 if ( 1117 // fogemorpheme 1118 ((in_compound != IN_CPD_NOT) || 1119 !(pptr->getCont() && 1120 (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen())))) && 1121 // permit prefixes in compounds 1122 ((in_compound != IN_CPD_END) || 1123 (pptr->getCont() && (TESTAFF(pptr->getCont(), compoundpermitflag, 1124 pptr->getContLen()))))) { 1125 // check prefix 1126 rv = pptr->checkword(word, len, in_compound, needflag); 1127 if (rv) { 1128 pfx = pptr; // BUG: pfx not stateless 1129 return rv; 1130 } 1131 } 1132 pptr = pptr->getNextEQ(); 1133 } else { 1134 pptr = pptr->getNextNE(); 1135 } 1136 } 1137 1138 return NULL; 1139 } 1140 1141 // check word for prefixes and two-level suffixes 1142 struct hentry* AffixMgr::prefix_check_twosfx(const char* word, 1143 int len, 1144 char in_compound, 1145 const FLAG needflag) { 1146 struct hentry* rv = NULL; 1147 1148 pfx = NULL; 1149 sfxappnd = NULL; 1150 sfxextra = 0; 1151 1152 // first handle the special case of 0 length prefixes 1153 PfxEntry* pe = pStart[0]; 1154 1155 while (pe) { 1156 rv = pe->check_twosfx(word, len, in_compound, needflag); 1157 if (rv) 1158 return rv; 1159 pe = pe->getNext(); 1160 } 1161 1162 // now handle the general case 1163 unsigned char sp = *((const unsigned char*)word); 1164 PfxEntry* pptr = pStart[sp]; 1165 1166 while (pptr) { 1167 if (isSubset(pptr->getKey(), word)) { 1168 rv = pptr->check_twosfx(word, len, in_compound, needflag); 1169 if (rv) { 1170 pfx = pptr; 1171 return rv; 1172 } 1173 pptr = pptr->getNextEQ(); 1174 } else { 1175 pptr = pptr->getNextNE(); 1176 } 1177 } 1178 1179 return NULL; 1180 } 1181 1182 // check word for prefixes and morph 1183 std::string AffixMgr::prefix_check_morph(const char* word, 1184 int len, 1185 char in_compound, 1186 const FLAG needflag) { 1187 1188 std::string result; 1189 1190 pfx = NULL; 1191 sfxappnd = NULL; 1192 sfxextra = 0; 1193 1194 // first handle the special case of 0 length prefixes 1195 PfxEntry* pe = pStart[0]; 1196 while (pe) { 1197 std::string st = pe->check_morph(word, len, in_compound, needflag); 1198 if (!st.empty()) { 1199 result.append(st); 1200 } 1201 pe = pe->getNext(); 1202 } 1203 1204 // now handle the general case 1205 unsigned char sp = *((const unsigned char*)word); 1206 PfxEntry* pptr = pStart[sp]; 1207 1208 while (pptr) { 1209 if (isSubset(pptr->getKey(), word)) { 1210 std::string st = pptr->check_morph(word, len, in_compound, needflag); 1211 if (!st.empty()) { 1212 // fogemorpheme 1213 if ((in_compound != IN_CPD_NOT) || 1214 !((pptr->getCont() && (TESTAFF(pptr->getCont(), onlyincompound, 1215 pptr->getContLen()))))) { 1216 result.append(st); 1217 pfx = pptr; 1218 } 1219 } 1220 pptr = pptr->getNextEQ(); 1221 } else { 1222 pptr = pptr->getNextNE(); 1223 } 1224 } 1225 1226 return result; 1227 } 1228 1229 // check word for prefixes and morph and two-level suffixes 1230 std::string AffixMgr::prefix_check_twosfx_morph(const char* word, 1231 int len, 1232 char in_compound, 1233 const FLAG needflag) { 1234 std::string result; 1235 1236 pfx = NULL; 1237 sfxappnd = NULL; 1238 sfxextra = 0; 1239 1240 // first handle the special case of 0 length prefixes 1241 PfxEntry* pe = pStart[0]; 1242 while (pe) { 1243 std::string st = pe->check_twosfx_morph(word, len, in_compound, needflag); 1244 if (!st.empty()) { 1245 result.append(st); 1246 } 1247 pe = pe->getNext(); 1248 } 1249 1250 // now handle the general case 1251 unsigned char sp = *((const unsigned char*)word); 1252 PfxEntry* pptr = pStart[sp]; 1253 1254 while (pptr) { 1255 if (isSubset(pptr->getKey(), word)) { 1256 std::string st = pptr->check_twosfx_morph(word, len, in_compound, needflag); 1257 if (!st.empty()) { 1258 result.append(st); 1259 pfx = pptr; 1260 } 1261 pptr = pptr->getNextEQ(); 1262 } else { 1263 pptr = pptr->getNextNE(); 1264 } 1265 } 1266 1267 return result; 1268 } 1269 1270 // Is word a non-compound with a REP substitution (see checkcompoundrep)? 1271 int AffixMgr::cpdrep_check(const char* word, int wl) { 1272 1273 if ((wl < 2) || get_reptable().empty()) 1274 return 0; 1275 1276 for (size_t i = 0; i < get_reptable().size(); ++i) { 1277 // use only available mid patterns 1278 if (!get_reptable()[i].outstrings[0].empty()) { 1279 const char* r = word; 1280 const size_t lenp = get_reptable()[i].pattern.size(); 1281 // search every occurence of the pattern in the word 1282 while ((r = strstr(r, get_reptable()[i].pattern.c_str())) != NULL) { 1283 std::string candidate(word); 1284 candidate.replace(r - word, lenp, get_reptable()[i].outstrings[0]); 1285 if (candidate_check(candidate.c_str(), candidate.size())) 1286 return 1; 1287 ++r; // search for the next letter 1288 } 1289 } 1290 } 1291 1292 return 0; 1293 } 1294 1295 // forbid compound words, if they are in the dictionary as a 1296 // word pair separated by space 1297 int AffixMgr::cpdwordpair_check(const char * word, int wl) { 1298 if (wl > 2) { 1299 std::string candidate(word); 1300 for (size_t i = 1; i < candidate.size(); i++) { 1301 // go to end of the UTF-8 character 1302 if (utf8 && ((word[i] & 0xc0) == 0x80)) 1303 continue; 1304 candidate.insert(i, 1, ' '); 1305 if (candidate_check(candidate.c_str(), candidate.size())) 1306 return 1; 1307 candidate.erase(i, 1); 1308 } 1309 } 1310 1311 return 0; 1312 } 1313 1314 // forbid compoundings when there are special patterns at word bound 1315 int AffixMgr::cpdpat_check(const char* word, 1316 int pos, 1317 hentry* r1, 1318 hentry* r2, 1319 const char /*affixed*/) { 1320 for (size_t i = 0; i < checkcpdtable.size(); ++i) { 1321 size_t len; 1322 if (isSubset(checkcpdtable[i].pattern2.c_str(), word + pos) && 1323 (!r1 || !checkcpdtable[i].cond || 1324 (r1->astr && TESTAFF(r1->astr, checkcpdtable[i].cond, r1->alen))) && 1325 (!r2 || !checkcpdtable[i].cond2 || 1326 (r2->astr && TESTAFF(r2->astr, checkcpdtable[i].cond2, r2->alen))) && 1327 // zero length pattern => only TESTAFF 1328 // zero pattern (0/flag) => unmodified stem (zero affixes allowed) 1329 (checkcpdtable[i].pattern.empty() || 1330 ((checkcpdtable[i].pattern[0] == '0' && r1->blen <= pos && 1331 strncmp(word + pos - r1->blen, r1->word, r1->blen) == 0) || 1332 (checkcpdtable[i].pattern[0] != '0' && 1333 ((len = checkcpdtable[i].pattern.size()) != 0) && 1334 strncmp(word + pos - len, checkcpdtable[i].pattern.c_str(), len) == 0)))) { 1335 return 1; 1336 } 1337 } 1338 return 0; 1339 } 1340 1341 // forbid compounding with neighbouring upper and lower case characters at word 1342 // bounds 1343 int AffixMgr::cpdcase_check(const char* word, int pos) { 1344 if (utf8) { 1345 const char* p; 1346 for (p = word + pos - 1; (*p & 0xc0) == 0x80; p--) 1347 ; 1348 std::string pair(p); 1349 std::vector<w_char> pair_u; 1350 u8_u16(pair_u, pair); 1351 unsigned short a = pair_u.size() > 1 ? ((pair_u[1].h << 8) + pair_u[1].l) : 0; 1352 unsigned short b = !pair_u.empty() ? ((pair_u[0].h << 8) + pair_u[0].l) : 0; 1353 if (((unicodetoupper(a, langnum) == a) || 1354 (unicodetoupper(b, langnum) == b)) && 1355 (a != '-') && (b != '-')) 1356 return 1; 1357 } else { 1358 unsigned char a = *(word + pos - 1); 1359 unsigned char b = *(word + pos); 1360 if ((csconv[a].ccase || csconv[b].ccase) && (a != '-') && (b != '-')) 1361 return 1; 1362 } 1363 return 0; 1364 } 1365 1366 struct metachar_data { 1367 signed short btpp; // metacharacter (*, ?) position for backtracking 1368 signed short btwp; // word position for metacharacters 1369 int btnum; // number of matched characters in metacharacter 1370 }; 1371 1372 // check compound patterns 1373 int AffixMgr::defcpd_check(hentry*** words, 1374 short wnum, 1375 hentry* rv, 1376 hentry** def, 1377 char all) { 1378 int w = 0; 1379 1380 if (!*words) { 1381 w = 1; 1382 *words = def; 1383 } 1384 1385 if (!*words) { 1386 return 0; 1387 } 1388 1389 std::vector<metachar_data> btinfo(1); 1390 1391 short bt = 0; 1392 1393 (*words)[wnum] = rv; 1394 1395 // has the last word COMPOUNDRULE flag? 1396 if (rv->alen == 0) { 1397 (*words)[wnum] = NULL; 1398 if (w) 1399 *words = NULL; 1400 return 0; 1401 } 1402 int ok = 0; 1403 for (size_t i = 0; i < defcpdtable.size(); ++i) { 1404 for (size_t j = 0; j < defcpdtable[i].size(); ++j) { 1405 if (defcpdtable[i][j] != '*' && defcpdtable[i][j] != '?' && 1406 TESTAFF(rv->astr, defcpdtable[i][j], rv->alen)) { 1407 ok = 1; 1408 break; 1409 } 1410 } 1411 } 1412 if (ok == 0) { 1413 (*words)[wnum] = NULL; 1414 if (w) 1415 *words = NULL; 1416 return 0; 1417 } 1418 1419 for (size_t i = 0; i < defcpdtable.size(); ++i) { 1420 size_t pp = 0; // pattern position 1421 signed short wp = 0; // "words" position 1422 int ok2; 1423 ok = 1; 1424 ok2 = 1; 1425 do { 1426 while ((pp < defcpdtable[i].size()) && (wp <= wnum)) { 1427 if (((pp + 1) < defcpdtable[i].size()) && 1428 ((defcpdtable[i][pp + 1] == '*') || 1429 (defcpdtable[i][pp + 1] == '?'))) { 1430 int wend = (defcpdtable[i][pp + 1] == '?') ? wp : wnum; 1431 ok2 = 1; 1432 pp += 2; 1433 btinfo[bt].btpp = pp; 1434 btinfo[bt].btwp = wp; 1435 while (wp <= wend) { 1436 if (!(*words)[wp]->alen || 1437 !TESTAFF((*words)[wp]->astr, defcpdtable[i][pp - 2], 1438 (*words)[wp]->alen)) { 1439 ok2 = 0; 1440 break; 1441 } 1442 wp++; 1443 } 1444 if (wp <= wnum) 1445 ok2 = 0; 1446 btinfo[bt].btnum = wp - btinfo[bt].btwp; 1447 if (btinfo[bt].btnum > 0) { 1448 ++bt; 1449 btinfo.resize(bt+1); 1450 } 1451 if (ok2) 1452 break; 1453 } else { 1454 ok2 = 1; 1455 if (!(*words)[wp] || !(*words)[wp]->alen || 1456 !TESTAFF((*words)[wp]->astr, defcpdtable[i][pp], 1457 (*words)[wp]->alen)) { 1458 ok = 0; 1459 break; 1460 } 1461 pp++; 1462 wp++; 1463 if ((defcpdtable[i].size() == pp) && !(wp > wnum)) 1464 ok = 0; 1465 } 1466 } 1467 if (ok && ok2) { 1468 size_t r = pp; 1469 while ((defcpdtable[i].size() > r) && ((r + 1) < defcpdtable[i].size()) && 1470 ((defcpdtable[i][r + 1] == '*') || 1471 (defcpdtable[i][r + 1] == '?'))) 1472 r += 2; 1473 if (defcpdtable[i].size() <= r) 1474 return 1; 1475 } 1476 // backtrack 1477 if (bt) 1478 do { 1479 ok = 1; 1480 btinfo[bt - 1].btnum--; 1481 pp = btinfo[bt - 1].btpp; 1482 wp = btinfo[bt - 1].btwp + (signed short)btinfo[bt - 1].btnum; 1483 } while ((btinfo[bt - 1].btnum < 0) && --bt); 1484 } while (bt); 1485 1486 if (ok && ok2 && (!all || (defcpdtable[i].size() <= pp))) 1487 return 1; 1488 1489 // check zero ending 1490 while (ok && ok2 && (defcpdtable[i].size() > pp) && 1491 ((pp + 1) < defcpdtable[i].size()) && 1492 ((defcpdtable[i][pp + 1] == '*') || 1493 (defcpdtable[i][pp + 1] == '?'))) 1494 pp += 2; 1495 if (ok && ok2 && (defcpdtable[i].size() <= pp)) 1496 return 1; 1497 } 1498 (*words)[wnum] = NULL; 1499 if (w) 1500 *words = NULL; 1501 return 0; 1502 } 1503 1504 inline int AffixMgr::candidate_check(const char* word, int len) { 1505 1506 struct hentry* rv = lookup(word); 1507 if (rv) 1508 return 1; 1509 1510 // rv = prefix_check(word,len,1); 1511 // if (rv) return 1; 1512 1513 rv = affix_check(word, len); 1514 if (rv) 1515 return 1; 1516 return 0; 1517 } 1518 1519 // calculate number of syllable for compound-checking 1520 short AffixMgr::get_syllable(const std::string& word) { 1521 if (cpdmaxsyllable == 0) 1522 return 0; 1523 1524 short num = 0; 1525 1526 if (!utf8) { 1527 for (size_t i = 0; i < word.size(); ++i) { 1528 if (std::binary_search(cpdvowels.begin(), cpdvowels.end(), 1529 word[i])) { 1530 ++num; 1531 } 1532 } 1533 } else if (!cpdvowels_utf16.empty()) { 1534 std::vector<w_char> w; 1535 u8_u16(w, word); 1536 for (size_t i = 0; i < w.size(); ++i) { 1537 if (std::binary_search(cpdvowels_utf16.begin(), 1538 cpdvowels_utf16.end(), 1539 w[i])) { 1540 ++num; 1541 } 1542 } 1543 } 1544 1545 return num; 1546 } 1547 1548 void AffixMgr::setcminmax(int* cmin, int* cmax, const char* word, int len) { 1549 if (utf8) { 1550 int i; 1551 for (*cmin = 0, i = 0; (i < cpdmin) && *cmin < len; i++) { 1552 for ((*cmin)++; *cmin < len && (word[*cmin] & 0xc0) == 0x80; (*cmin)++) 1553 ; 1554 } 1555 for (*cmax = len, i = 0; (i < (cpdmin - 1)) && *cmax >= 0; i++) { 1556 for ((*cmax)--; *cmax >= 0 && (word[*cmax] & 0xc0) == 0x80; (*cmax)--) 1557 ; 1558 } 1559 } else { 1560 *cmin = cpdmin; 1561 *cmax = len - cpdmin + 1; 1562 } 1563 } 1564 1565 // check if compound word is correctly spelled 1566 // hu_mov_rule = spec. Hungarian rule (XXX) 1567 struct hentry* AffixMgr::compound_check(const std::string& word, 1568 short wordnum, 1569 short numsyllable, 1570 short maxwordnum, 1571 short wnum, 1572 hentry** words = NULL, 1573 hentry** rwords = NULL, 1574 char hu_mov_rule = 0, 1575 char is_sug = 0, 1576 int* info = NULL) { 1577 int i; 1578 short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2; 1579 struct hentry* rv = NULL; 1580 struct hentry* rv_first; 1581 std::string st; 1582 char ch = '\0'; 1583 int cmin; 1584 int cmax; 1585 int striple = 0; 1586 size_t scpd = 0; 1587 int soldi = 0; 1588 int oldcmin = 0; 1589 int oldcmax = 0; 1590 int oldlen = 0; 1591 int checkedstriple = 0; 1592 char affixed = 0; 1593 hentry** oldwords = words; 1594 size_t len = word.size(); 1595 1596 int checked_prefix; 1597 1598 // add a time limit to handle possible 1599 // combinatorical explosion of the overlapping words 1600 1601 HUNSPELL_THREAD_LOCAL clock_t timelimit; 1602 1603 if (wordnum == 0) { 1604 // get the start time, seeing as we're reusing this set to 0 1605 // to flag timeout, use clock() + 1 to avoid start clock() 1606 // of 0 as being a timeout 1607 timelimit = clock() + 1; 1608 } 1609 else if (timelimit != 0 && (clock() > timelimit + TIMELIMIT)) { 1610 timelimit = 0; 1611 } 1612 1613 setcminmax(&cmin, &cmax, word.c_str(), len); 1614 1615 st.assign(word); 1616 1617 for (i = cmin; i < cmax; i++) { 1618 // go to end of the UTF-8 character 1619 if (utf8) { 1620 for (; (st[i] & 0xc0) == 0x80; i++) 1621 ; 1622 if (i >= cmax) 1623 return NULL; 1624 } 1625 1626 words = oldwords; 1627 int onlycpdrule = (words) ? 1 : 0; 1628 1629 do { // onlycpdrule loop 1630 1631 oldnumsyllable = numsyllable; 1632 oldwordnum = wordnum; 1633 checked_prefix = 0; 1634 1635 do { // simplified checkcompoundpattern loop 1636 1637 if (timelimit == 0) 1638 return 0; 1639 1640 if (scpd > 0) { 1641 for (; scpd <= checkcpdtable.size() && 1642 (checkcpdtable[scpd - 1].pattern3.empty() || 1643 strncmp(word.c_str() + i, checkcpdtable[scpd - 1].pattern3.c_str(), 1644 checkcpdtable[scpd - 1].pattern3.size()) != 0); 1645 scpd++) 1646 ; 1647 1648 if (scpd > checkcpdtable.size()) 1649 break; // break simplified checkcompoundpattern loop 1650 st.replace(i, std::string::npos, checkcpdtable[scpd - 1].pattern); 1651 soldi = i; 1652 i += checkcpdtable[scpd - 1].pattern.size(); 1653 st.replace(i, std::string::npos, checkcpdtable[scpd - 1].pattern2); 1654 st.replace(i + checkcpdtable[scpd - 1].pattern2.size(), std::string::npos, 1655 word.substr(soldi + checkcpdtable[scpd - 1].pattern3.size())); 1656 1657 oldlen = len; 1658 len += checkcpdtable[scpd - 1].pattern.size() + 1659 checkcpdtable[scpd - 1].pattern2.size() - 1660 checkcpdtable[scpd - 1].pattern3.size(); 1661 oldcmin = cmin; 1662 oldcmax = cmax; 1663 setcminmax(&cmin, &cmax, st.c_str(), len); 1664 1665 cmax = len - cpdmin + 1; 1666 } 1667 1668 ch = st[i]; 1669 st[i] = '\0'; 1670 1671 sfx = NULL; 1672 pfx = NULL; 1673 1674 // FIRST WORD 1675 1676 affixed = 1; 1677 rv = lookup(st.c_str()); // perhaps without prefix 1678 1679 // forbid dictionary stems with COMPOUNDFORBIDFLAG in 1680 // compound words, overriding the effect of COMPOUNDPERMITFLAG 1681 if ((rv) && compoundforbidflag && 1682 TESTAFF(rv->astr, compoundforbidflag, rv->alen) && !hu_mov_rule) 1683 continue; 1684 1685 // search homonym with compound flag 1686 while ((rv) && !hu_mov_rule && 1687 ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || 1688 !((compoundflag && !words && !onlycpdrule && 1689 TESTAFF(rv->astr, compoundflag, rv->alen)) || 1690 (compoundbegin && !wordnum && !onlycpdrule && 1691 TESTAFF(rv->astr, compoundbegin, rv->alen)) || 1692 (compoundmiddle && wordnum && !words && !onlycpdrule && 1693 TESTAFF(rv->astr, compoundmiddle, rv->alen)) || 1694 (!defcpdtable.empty() && onlycpdrule && 1695 ((!words && !wordnum && 1696 defcpd_check(&words, wnum, rv, rwords, 0)) || 1697 (words && 1698 defcpd_check(&words, wnum, rv, rwords, 0))))) || 1699 (scpd != 0 && checkcpdtable[scpd - 1].cond != FLAG_NULL && 1700 !TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond, rv->alen)))) { 1701 rv = rv->next_homonym; 1702 } 1703 1704 if (rv) 1705 affixed = 0; 1706 1707 if (!rv) { 1708 if (onlycpdrule) 1709 break; 1710 if (compoundflag && 1711 !(rv = prefix_check(st.c_str(), i, 1712 hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, 1713 compoundflag))) { 1714 if (((rv = suffix_check( 1715 st.c_str(), i, 0, NULL, FLAG_NULL, compoundflag, 1716 hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || 1717 (compoundmoresuffixes && 1718 (rv = suffix_check_twosfx(st.c_str(), i, 0, NULL, compoundflag)))) && 1719 !hu_mov_rule && sfx->getCont() && 1720 ((compoundforbidflag && 1721 TESTAFF(sfx->getCont(), compoundforbidflag, 1722 sfx->getContLen())) || 1723 (compoundend && 1724 TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) { 1725 rv = NULL; 1726 } 1727 } 1728 1729 if (rv || 1730 (((wordnum == 0) && compoundbegin && 1731 ((rv = suffix_check( 1732 st.c_str(), i, 0, NULL, FLAG_NULL, compoundbegin, 1733 hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || 1734 (compoundmoresuffixes && 1735 (rv = suffix_check_twosfx( 1736 st.c_str(), i, 0, NULL, 1737 compoundbegin))) || // twofold suffixes + compound 1738 (rv = prefix_check(st.c_str(), i, 1739 hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, 1740 compoundbegin)))) || 1741 ((wordnum > 0) && compoundmiddle && 1742 ((rv = suffix_check( 1743 st.c_str(), i, 0, NULL, FLAG_NULL, compoundmiddle, 1744 hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || 1745 (compoundmoresuffixes && 1746 (rv = suffix_check_twosfx( 1747 st.c_str(), i, 0, NULL, 1748 compoundmiddle))) || // twofold suffixes + compound 1749 (rv = prefix_check(st.c_str(), i, 1750 hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, 1751 compoundmiddle)))))) 1752 checked_prefix = 1; 1753 // else check forbiddenwords and needaffix 1754 } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) || 1755 TESTAFF(rv->astr, needaffix, rv->alen) || 1756 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || 1757 (is_sug && nosuggest && 1758 TESTAFF(rv->astr, nosuggest, rv->alen)))) { 1759 st[i] = ch; 1760 // continue; 1761 break; 1762 } 1763 1764 // check non_compound flag in suffix and prefix 1765 if ((rv) && !hu_mov_rule && 1766 ((pfx && pfx->getCont() && 1767 TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())) || 1768 (sfx && sfx->getCont() && 1769 TESTAFF(sfx->getCont(), compoundforbidflag, 1770 sfx->getContLen())))) { 1771 rv = NULL; 1772 } 1773 1774 // check compoundend flag in suffix and prefix 1775 if ((rv) && !checked_prefix && compoundend && !hu_mov_rule && 1776 ((pfx && pfx->getCont() && 1777 TESTAFF(pfx->getCont(), compoundend, pfx->getContLen())) || 1778 (sfx && sfx->getCont() && 1779 TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) { 1780 rv = NULL; 1781 } 1782 1783 // check compoundmiddle flag in suffix and prefix 1784 if ((rv) && !checked_prefix && (wordnum == 0) && compoundmiddle && 1785 !hu_mov_rule && 1786 ((pfx && pfx->getCont() && 1787 TESTAFF(pfx->getCont(), compoundmiddle, pfx->getContLen())) || 1788 (sfx && sfx->getCont() && 1789 TESTAFF(sfx->getCont(), compoundmiddle, sfx->getContLen())))) { 1790 rv = NULL; 1791 } 1792 1793 // check forbiddenwords 1794 if ((rv) && (rv->astr) && 1795 (TESTAFF(rv->astr, forbiddenword, rv->alen) || 1796 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || 1797 (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) { 1798 return NULL; 1799 } 1800 1801 // increment word number, if the second root has a compoundroot flag 1802 if ((rv) && compoundroot && 1803 (TESTAFF(rv->astr, compoundroot, rv->alen))) { 1804 wordnum++; 1805 } 1806 1807 // first word is acceptable in compound words? 1808 if (((rv) && 1809 (checked_prefix || (words && words[wnum]) || 1810 (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) || 1811 ((oldwordnum == 0) && compoundbegin && 1812 TESTAFF(rv->astr, compoundbegin, rv->alen)) || 1813 ((oldwordnum > 0) && compoundmiddle && 1814 TESTAFF(rv->astr, compoundmiddle, rv->alen)) 1815 1816 // LANG_hu section: spec. Hungarian rule 1817 || ((langnum == LANG_hu) && hu_mov_rule && 1818 (TESTAFF( 1819 rv->astr, 'F', 1820 rv->alen) || // XXX hardwired Hungarian dictionary codes 1821 TESTAFF(rv->astr, 'G', rv->alen) || 1822 TESTAFF(rv->astr, 'H', rv->alen))) 1823 // END of LANG_hu section 1824 ) && 1825 ( 1826 // test CHECKCOMPOUNDPATTERN conditions 1827 scpd == 0 || checkcpdtable[scpd - 1].cond == FLAG_NULL || 1828 TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond, rv->alen)) && 1829 !((checkcompoundtriple && scpd == 0 && 1830 !words && // test triple letters 1831 (word[i - 1] == word[i]) && 1832 (((i > 1) && (word[i - 1] == word[i - 2])) || 1833 ((word[i - 1] == word[i + 1])) // may be word[i+1] == '\0' 1834 )) || 1835 (checkcompoundcase && scpd == 0 && !words && 1836 cpdcase_check(word.c_str(), i)))) 1837 // LANG_hu section: spec. Hungarian rule 1838 || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && 1839 (rv = affix_check(st.c_str(), i)) && 1840 (sfx && sfx->getCont() && 1841 ( // XXX hardwired Hungarian dic. codes 1842 TESTAFF(sfx->getCont(), (unsigned short)'x', 1843 sfx->getContLen()) || 1844 TESTAFF( 1845 sfx->getCont(), (unsigned short)'%', 1846 sfx->getContLen()))))) { // first word is ok condition 1847 1848 // LANG_hu section: spec. Hungarian rule 1849 if (langnum == LANG_hu) { 1850 // calculate syllable number of the word 1851 numsyllable += get_syllable(st.substr(0, i)); 1852 // + 1 word, if syllable number of the prefix > 1 (hungarian 1853 // convention) 1854 if (pfx && (get_syllable(pfx->getKey()) > 1)) 1855 wordnum++; 1856 } 1857 // END of LANG_hu section 1858 1859 // NEXT WORD(S) 1860 rv_first = rv; 1861 st[i] = ch; 1862 1863 do { // striple loop 1864 1865 // check simplifiedtriple 1866 if (simplifiedtriple) { 1867 if (striple) { 1868 checkedstriple = 1; 1869 i--; // check "fahrt" instead of "ahrt" in "Schiffahrt" 1870 } else if (i > 2 && word[i - 1] == word[i - 2]) 1871 striple = 1; 1872 } 1873 1874 rv = lookup(st.c_str() + i); // perhaps without prefix 1875 1876 // search homonym with compound flag 1877 while ((rv) && 1878 ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || 1879 !((compoundflag && !words && 1880 TESTAFF(rv->astr, compoundflag, rv->alen)) || 1881 (compoundend && !words && 1882 TESTAFF(rv->astr, compoundend, rv->alen)) || 1883 (!defcpdtable.empty() && words && 1884 defcpd_check(&words, wnum + 1, rv, NULL, 1))) || 1885 (scpd != 0 && checkcpdtable[scpd - 1].cond2 != FLAG_NULL && 1886 !TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2, 1887 rv->alen)))) { 1888 rv = rv->next_homonym; 1889 } 1890 1891 // check FORCEUCASE 1892 if (rv && forceucase && (rv) && 1893 (TESTAFF(rv->astr, forceucase, rv->alen)) && 1894 !(info && *info & SPELL_ORIGCAP)) 1895 rv = NULL; 1896 1897 if (rv && words && words[wnum + 1]) 1898 return rv_first; 1899 1900 oldnumsyllable2 = numsyllable; 1901 oldwordnum2 = wordnum; 1902 1903 // LANG_hu section: spec. Hungarian rule, XXX hardwired dictionary 1904 // code 1905 if ((rv) && (langnum == LANG_hu) && 1906 (TESTAFF(rv->astr, 'I', rv->alen)) && 1907 !(TESTAFF(rv->astr, 'J', rv->alen))) { 1908 numsyllable--; 1909 } 1910 // END of LANG_hu section 1911 1912 // increment word number, if the second root has a compoundroot flag 1913 if ((rv) && (compoundroot) && 1914 (TESTAFF(rv->astr, compoundroot, rv->alen))) { 1915 wordnum++; 1916 } 1917 1918 // check forbiddenwords 1919 if ((rv) && (rv->astr) && 1920 (TESTAFF(rv->astr, forbiddenword, rv->alen) || 1921 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || 1922 (is_sug && nosuggest && 1923 TESTAFF(rv->astr, nosuggest, rv->alen)))) 1924 return NULL; 1925 1926 // second word is acceptable, as a root? 1927 // hungarian conventions: compounding is acceptable, 1928 // when compound forms consist of 2 words, or if more, 1929 // then the syllable number of root words must be 6, or lesser. 1930 1931 if ((rv) && 1932 ((compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) || 1933 (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))) && 1934 (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) || 1935 ((cpdmaxsyllable != 0) && 1936 (numsyllable + get_syllable(std::string(HENTRY_WORD(rv), rv->blen)) <= 1937 cpdmaxsyllable))) && 1938 ( 1939 // test CHECKCOMPOUNDPATTERN 1940 checkcpdtable.empty() || scpd != 0 || 1941 !cpdpat_check(word.c_str(), i, rv_first, rv, 0)) && 1942 ((!checkcompounddup || (rv != rv_first))) 1943 // test CHECKCOMPOUNDPATTERN conditions 1944 && 1945 (scpd == 0 || checkcpdtable[scpd - 1].cond2 == FLAG_NULL || 1946 TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2, rv->alen))) { 1947 // forbid compound word, if it is a non-compound word with typical 1948 // fault 1949 if ((checkcompoundrep && cpdrep_check(word.c_str(), len)) || 1950 cpdwordpair_check(word.c_str(), len)) 1951 return NULL; 1952 return rv_first; 1953 } 1954 1955 numsyllable = oldnumsyllable2; 1956 wordnum = oldwordnum2; 1957 1958 // perhaps second word has prefix or/and suffix 1959 sfx = NULL; 1960 sfxflag = FLAG_NULL; 1961 rv = (compoundflag && !onlycpdrule) 1962 ? affix_check((word.c_str() + i), strlen(word.c_str() + i), compoundflag, 1963 IN_CPD_END) 1964 : NULL; 1965 if (!rv && compoundend && !onlycpdrule) { 1966 sfx = NULL; 1967 pfx = NULL; 1968 rv = affix_check((word.c_str() + i), strlen(word.c_str() + i), compoundend, 1969 IN_CPD_END); 1970 } 1971 1972 if (!rv && !defcpdtable.empty() && words) { 1973 rv = affix_check((word.c_str() + i), strlen(word.c_str() + i), 0, IN_CPD_END); 1974 if (rv && defcpd_check(&words, wnum + 1, rv, NULL, 1)) 1975 return rv_first; 1976 rv = NULL; 1977 } 1978 1979 // test CHECKCOMPOUNDPATTERN conditions (allowed forms) 1980 if (rv && 1981 !(scpd == 0 || checkcpdtable[scpd - 1].cond2 == FLAG_NULL || 1982 TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2, rv->alen))) 1983 rv = NULL; 1984 1985 // test CHECKCOMPOUNDPATTERN conditions (forbidden compounds) 1986 if (rv && !checkcpdtable.empty() && scpd == 0 && 1987 cpdpat_check(word.c_str(), i, rv_first, rv, affixed)) 1988 rv = NULL; 1989 1990 // check non_compound flag in suffix and prefix 1991 if ((rv) && ((pfx && pfx->getCont() && 1992 TESTAFF(pfx->getCont(), compoundforbidflag, 1993 pfx->getContLen())) || 1994 (sfx && sfx->getCont() && 1995 TESTAFF(sfx->getCont(), compoundforbidflag, 1996 sfx->getContLen())))) { 1997 rv = NULL; 1998 } 1999 2000 // check FORCEUCASE 2001 if (rv && forceucase && (rv) && 2002 (TESTAFF(rv->astr, forceucase, rv->alen)) && 2003 !(info && *info & SPELL_ORIGCAP)) 2004 rv = NULL; 2005 2006 // check forbiddenwords 2007 if ((rv) && (rv->astr) && 2008 (TESTAFF(rv->astr, forbiddenword, rv->alen) || 2009 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || 2010 (is_sug && nosuggest && 2011 TESTAFF(rv->astr, nosuggest, rv->alen)))) 2012 return NULL; 2013 2014 // pfxappnd = prefix of word+i, or NULL 2015 // calculate syllable number of prefix. 2016 // hungarian convention: when syllable number of prefix is more, 2017 // than 1, the prefix+word counts as two words. 2018 2019 if (langnum == LANG_hu) { 2020 // calculate syllable number of the word 2021 numsyllable += get_syllable(word.c_str() + i); 2022 2023 // - affix syllable num. 2024 // XXX only second suffix (inflections, not derivations) 2025 if (sfxappnd) { 2026 std::string tmp(sfxappnd); 2027 reverseword(tmp); 2028 numsyllable -= short(get_syllable(tmp) + sfxextra); 2029 } else { 2030 numsyllable -= short(sfxextra); 2031 } 2032 2033 // + 1 word, if syllable number of the prefix > 1 (hungarian 2034 // convention) 2035 if (pfx && (get_syllable(pfx->getKey()) > 1)) 2036 wordnum++; 2037 2038 // increment syllable num, if last word has a SYLLABLENUM flag 2039 // and the suffix is beginning `s' 2040 2041 if (!cpdsyllablenum.empty()) { 2042 switch (sfxflag) { 2043 case 'c': { 2044 numsyllable += 2; 2045 break; 2046 } 2047 case 'J': { 2048 numsyllable += 1; 2049 break; 2050 } 2051 case 'I': { 2052 if (rv && TESTAFF(rv->astr, 'J', rv->alen)) 2053 numsyllable += 1; 2054 break; 2055 } 2056 } 2057 } 2058 } 2059 2060 // increment word number, if the second word has a compoundroot flag 2061 if ((rv) && (compoundroot) && 2062 (TESTAFF(rv->astr, compoundroot, rv->alen))) { 2063 wordnum++; 2064 } 2065 // second word is acceptable, as a word with prefix or/and suffix? 2066 // hungarian conventions: compounding is acceptable, 2067 // when compound forms consist 2 word, otherwise 2068 // the syllable number of root words is 6, or lesser. 2069 if ((rv) && 2070 (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) || 2071 ((cpdmaxsyllable != 0) && (numsyllable <= cpdmaxsyllable))) && 2072 ((!checkcompounddup || (rv != rv_first)))) { 2073 // forbid compound word, if it is a non-compound word with typical 2074 // fault 2075 if ((checkcompoundrep && cpdrep_check(word.c_str(), len)) || 2076 cpdwordpair_check(word.c_str(), len)) 2077 return NULL; 2078 return rv_first; 2079 } 2080 2081 numsyllable = oldnumsyllable2; 2082 wordnum = oldwordnum2; 2083 2084 // perhaps second word is a compound word (recursive call) 2085 if (wordnum + 2 < maxwordnum) { 2086 rv = compound_check(st.substr(i), wordnum + 1, 2087 numsyllable, maxwordnum, wnum + 1, words, rwords, 0, 2088 is_sug, info); 2089 2090 if (rv && !checkcpdtable.empty() && 2091 ((scpd == 0 && 2092 cpdpat_check(word.c_str(), i, rv_first, rv, affixed)) || 2093 (scpd != 0 && 2094 !cpdpat_check(word.c_str(), i, rv_first, rv, affixed)))) 2095 rv = NULL; 2096 } else { 2097 rv = NULL; 2098 } 2099 if (rv) { 2100 // forbid compound word, if it is a non-compound word with typical 2101 // fault, or a dictionary word pair 2102 2103 if (cpdwordpair_check(word.c_str(), len)) 2104 return NULL; 2105 2106 if (checkcompoundrep || forbiddenword) { 2107 2108 if (checkcompoundrep && cpdrep_check(word.c_str(), len)) 2109 return NULL; 2110 2111 // check first part 2112 if (strncmp(rv->word, word.c_str() + i, rv->blen) == 0) { 2113 char r = st[i + rv->blen]; 2114 st[i + rv->blen] = '\0'; 2115 2116 if ((checkcompoundrep && cpdrep_check(st.c_str(), i + rv->blen)) || 2117 cpdwordpair_check(st.c_str(), i + rv->blen)) { 2118 st[ + i + rv->blen] = r; 2119 continue; 2120 } 2121 2122 if (forbiddenword) { 2123 struct hentry* rv2 = lookup(word.c_str()); 2124 if (!rv2) 2125 rv2 = affix_check(word.c_str(), len); 2126 if (rv2 && rv2->astr && 2127 TESTAFF(rv2->astr, forbiddenword, rv2->alen) && 2128 (strncmp(rv2->word, st.c_str(), i + rv->blen) == 0)) { 2129 return NULL; 2130 } 2131 } 2132 st[i + rv->blen] = r; 2133 } 2134 } 2135 return rv_first; 2136 } 2137 } while (striple && !checkedstriple); // end of striple loop 2138 2139 if (checkedstriple) { 2140 i++; 2141 checkedstriple = 0; 2142 striple = 0; 2143 } 2144 2145 } // first word is ok condition 2146 2147 if (soldi != 0) { 2148 i = soldi; 2149 soldi = 0; 2150 len = oldlen; 2151 cmin = oldcmin; 2152 cmax = oldcmax; 2153 } 2154 scpd++; 2155 2156 } while (!onlycpdrule && simplifiedcpd && 2157 scpd <= checkcpdtable.size()); // end of simplifiedcpd loop 2158 2159 scpd = 0; 2160 wordnum = oldwordnum; 2161 numsyllable = oldnumsyllable; 2162 2163 if (soldi != 0) { 2164 i = soldi; 2165 st.assign(word); // XXX add more optim. 2166 soldi = 0; 2167 } else 2168 st[i] = ch; 2169 2170 } while (!defcpdtable.empty() && oldwordnum == 0 && 2171 onlycpdrule++ < 1); // end of onlycpd loop 2172 } 2173 2174 return NULL; 2175 } 2176 2177 // check if compound word is correctly spelled 2178 // hu_mov_rule = spec. Hungarian rule (XXX) 2179 int AffixMgr::compound_check_morph(const char* word, 2180 int len, 2181 short wordnum, 2182 short numsyllable, 2183 short maxwordnum, 2184 short wnum, 2185 hentry** words, 2186 hentry** rwords, 2187 char hu_mov_rule, 2188 std::string& result, 2189 const std::string* partresult) { 2190 int i; 2191 short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2; 2192 int ok = 0; 2193 2194 struct hentry* rv = NULL; 2195 struct hentry* rv_first; 2196 std::string st; 2197 char ch; 2198 2199 int checked_prefix; 2200 std::string presult; 2201 2202 int cmin; 2203 int cmax; 2204 2205 char affixed = 0; 2206 hentry** oldwords = words; 2207 2208 // add a time limit to handle possible 2209 // combinatorical explosion of the overlapping words 2210 2211 HUNSPELL_THREAD_LOCAL clock_t timelimit; 2212 2213 if (wordnum == 0) { 2214 // get the start time, seeing as we're reusing this set to 0 2215 // to flag timeout, use clock() + 1 to avoid start clock() 2216 // of 0 as being a timeout 2217 timelimit = clock() + 1; 2218 } 2219 else if (timelimit != 0 && (clock() > timelimit + TIMELIMIT)) { 2220 timelimit = 0; 2221 } 2222 2223 setcminmax(&cmin, &cmax, word, len); 2224 2225 st.assign(word); 2226 2227 for (i = cmin; i < cmax; i++) { 2228 // go to end of the UTF-8 character 2229 if (utf8) { 2230 for (; (st[i] & 0xc0) == 0x80; i++) 2231 ; 2232 if (i >= cmax) 2233 return 0; 2234 } 2235 2236 words = oldwords; 2237 int onlycpdrule = (words) ? 1 : 0; 2238 2239 do { // onlycpdrule loop 2240 2241 if (timelimit == 0) 2242 return 0; 2243 2244 oldnumsyllable = numsyllable; 2245 oldwordnum = wordnum; 2246 checked_prefix = 0; 2247 2248 ch = st[i]; 2249 st[i] = '\0'; 2250 sfx = NULL; 2251 2252 // FIRST WORD 2253 2254 affixed = 1; 2255 2256 presult.clear(); 2257 if (partresult) 2258 presult.append(*partresult); 2259 2260 rv = lookup(st.c_str()); // perhaps without prefix 2261 2262 // forbid dictionary stems with COMPOUNDFORBIDFLAG in 2263 // compound words, overriding the effect of COMPOUNDPERMITFLAG 2264 if ((rv) && compoundforbidflag && 2265 TESTAFF(rv->astr, compoundforbidflag, rv->alen) && !hu_mov_rule) 2266 continue; 2267 2268 // search homonym with compound flag 2269 while ((rv) && !hu_mov_rule && 2270 ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || 2271 !((compoundflag && !words && !onlycpdrule && 2272 TESTAFF(rv->astr, compoundflag, rv->alen)) || 2273 (compoundbegin && !wordnum && !onlycpdrule && 2274 TESTAFF(rv->astr, compoundbegin, rv->alen)) || 2275 (compoundmiddle && wordnum && !words && !onlycpdrule && 2276 TESTAFF(rv->astr, compoundmiddle, rv->alen)) || 2277 (!defcpdtable.empty() && onlycpdrule && 2278 ((!words && !wordnum && 2279 defcpd_check(&words, wnum, rv, rwords, 0)) || 2280 (words && 2281 defcpd_check(&words, wnum, rv, rwords, 0))))))) { 2282 rv = rv->next_homonym; 2283 } 2284 2285 if (timelimit == 0) 2286 return 0; 2287 2288 if (rv) 2289 affixed = 0; 2290 2291 if (rv) { 2292 presult.push_back(MSEP_FLD); 2293 presult.append(MORPH_PART); 2294 presult.append(st.c_str()); 2295 if (!HENTRY_FIND(rv, MORPH_STEM)) { 2296 presult.push_back(MSEP_FLD); 2297 presult.append(MORPH_STEM); 2298 presult.append(st.c_str()); 2299 } 2300 if (HENTRY_DATA(rv)) { 2301 presult.push_back(MSEP_FLD); 2302 presult.append(HENTRY_DATA2(rv)); 2303 } 2304 } 2305 2306 if (!rv) { 2307 if (compoundflag && 2308 !(rv = 2309 prefix_check(st.c_str(), i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, 2310 compoundflag))) { 2311 if (((rv = suffix_check(st.c_str(), i, 0, NULL, FLAG_NULL, 2312 compoundflag, 2313 hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || 2314 (compoundmoresuffixes && 2315 (rv = suffix_check_twosfx(st.c_str(), i, 0, NULL, compoundflag)))) && 2316 !hu_mov_rule && sfx->getCont() && 2317 ((compoundforbidflag && 2318 TESTAFF(sfx->getCont(), compoundforbidflag, 2319 sfx->getContLen())) || 2320 (compoundend && 2321 TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) { 2322 rv = NULL; 2323 } 2324 } 2325 2326 if (rv || 2327 (((wordnum == 0) && compoundbegin && 2328 ((rv = suffix_check(st.c_str(), i, 0, NULL, FLAG_NULL, 2329 compoundbegin, 2330 hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || 2331 (compoundmoresuffixes && 2332 (rv = suffix_check_twosfx( 2333 st.c_str(), i, 0, NULL, 2334 compoundbegin))) || // twofold suffix+compound 2335 (rv = prefix_check(st.c_str(), i, 2336 hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, 2337 compoundbegin)))) || 2338 ((wordnum > 0) && compoundmiddle && 2339 ((rv = suffix_check(st.c_str(), i, 0, NULL, FLAG_NULL, 2340 compoundmiddle, 2341 hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || 2342 (compoundmoresuffixes && 2343 (rv = suffix_check_twosfx( 2344 st.c_str(), i, 0, NULL, 2345 compoundmiddle))) || // twofold suffix+compound 2346 (rv = prefix_check(st.c_str(), i, 2347 hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, 2348 compoundmiddle)))))) { 2349 std::string p; 2350 if (compoundflag) 2351 p = affix_check_morph(st.c_str(), i, compoundflag); 2352 if (p.empty()) { 2353 if ((wordnum == 0) && compoundbegin) { 2354 p = affix_check_morph(st.c_str(), i, compoundbegin); 2355 } else if ((wordnum > 0) && compoundmiddle) { 2356 p = affix_check_morph(st.c_str(), i, compoundmiddle); 2357 } 2358 } 2359 if (!p.empty()) { 2360 presult.push_back(MSEP_FLD); 2361 presult.append(MORPH_PART); 2362 presult.append(st.c_str()); 2363 line_uniq_app(p, MSEP_REC); 2364 presult.append(p); 2365 } 2366 checked_prefix = 1; 2367 } 2368 // else check forbiddenwords 2369 } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) || 2370 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || 2371 TESTAFF(rv->astr, needaffix, rv->alen))) { 2372 st[i] = ch; 2373 continue; 2374 } 2375 2376 // check non_compound flag in suffix and prefix 2377 if ((rv) && !hu_mov_rule && 2378 ((pfx && pfx->getCont() && 2379 TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())) || 2380 (sfx && sfx->getCont() && 2381 TESTAFF(sfx->getCont(), compoundforbidflag, sfx->getContLen())))) { 2382 continue; 2383 } 2384 2385 // check compoundend flag in suffix and prefix 2386 if ((rv) && !checked_prefix && compoundend && !hu_mov_rule && 2387 ((pfx && pfx->getCont() && 2388 TESTAFF(pfx->getCont(), compoundend, pfx->getContLen())) || 2389 (sfx && sfx->getCont() && 2390 TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) { 2391 continue; 2392 } 2393 2394 // check compoundmiddle flag in suffix and prefix 2395 if ((rv) && !checked_prefix && (wordnum == 0) && compoundmiddle && 2396 !hu_mov_rule && 2397 ((pfx && pfx->getCont() && 2398 TESTAFF(pfx->getCont(), compoundmiddle, pfx->getContLen())) || 2399 (sfx && sfx->getCont() && 2400 TESTAFF(sfx->getCont(), compoundmiddle, sfx->getContLen())))) { 2401 rv = NULL; 2402 } 2403 2404 // check forbiddenwords 2405 if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) || 2406 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))) 2407 continue; 2408 2409 // increment word number, if the second root has a compoundroot flag 2410 if ((rv) && (compoundroot) && 2411 (TESTAFF(rv->astr, compoundroot, rv->alen))) { 2412 wordnum++; 2413 } 2414 2415 // first word is acceptable in compound words? 2416 if (((rv) && 2417 (checked_prefix || (words && words[wnum]) || 2418 (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) || 2419 ((oldwordnum == 0) && compoundbegin && 2420 TESTAFF(rv->astr, compoundbegin, rv->alen)) || 2421 ((oldwordnum > 0) && compoundmiddle && 2422 TESTAFF(rv->astr, compoundmiddle, rv->alen)) 2423 // LANG_hu section: spec. Hungarian rule 2424 || ((langnum == LANG_hu) && // hu_mov_rule 2425 hu_mov_rule && (TESTAFF(rv->astr, 'F', rv->alen) || 2426 TESTAFF(rv->astr, 'G', rv->alen) || 2427 TESTAFF(rv->astr, 'H', rv->alen))) 2428 // END of LANG_hu section 2429 ) && 2430 !((checkcompoundtriple && !words && // test triple letters 2431 (word[i - 1] == word[i]) && 2432 (((i > 1) && (word[i - 1] == word[i - 2])) || 2433 ((word[i - 1] == word[i + 1])) // may be word[i+1] == '\0' 2434 )) || 2435 ( 2436 // test CHECKCOMPOUNDPATTERN 2437 !checkcpdtable.empty() && !words && 2438 cpdpat_check(word, i, rv, NULL, affixed)) || 2439 (checkcompoundcase && !words && cpdcase_check(word, i)))) 2440 // LANG_hu section: spec. Hungarian rule 2441 || 2442 ((!rv) && (langnum == LANG_hu) && hu_mov_rule && 2443 (rv = affix_check(st.c_str(), i)) && 2444 (sfx && sfx->getCont() && 2445 (TESTAFF(sfx->getCont(), (unsigned short)'x', sfx->getContLen()) || 2446 TESTAFF(sfx->getCont(), (unsigned short)'%', sfx->getContLen())))) 2447 // END of LANG_hu section 2448 ) { 2449 // LANG_hu section: spec. Hungarian rule 2450 if (langnum == LANG_hu) { 2451 // calculate syllable number of the word 2452 numsyllable += get_syllable(st.substr(0, i)); 2453 2454 // + 1 word, if syllable number of the prefix > 1 (hungarian 2455 // convention) 2456 if (pfx && (get_syllable(pfx->getKey()) > 1)) 2457 wordnum++; 2458 } 2459 // END of LANG_hu section 2460 2461 // NEXT WORD(S) 2462 rv_first = rv; 2463 rv = lookup((word + i)); // perhaps without prefix 2464 2465 // search homonym with compound flag 2466 while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || 2467 !((compoundflag && !words && 2468 TESTAFF(rv->astr, compoundflag, rv->alen)) || 2469 (compoundend && !words && 2470 TESTAFF(rv->astr, compoundend, rv->alen)) || 2471 (!defcpdtable.empty() && words && 2472 defcpd_check(&words, wnum + 1, rv, NULL, 1))))) { 2473 rv = rv->next_homonym; 2474 } 2475 2476 if (rv && words && words[wnum + 1]) { 2477 result.append(presult); 2478 result.push_back(MSEP_FLD); 2479 result.append(MORPH_PART); 2480 result.append(word + i); 2481 if (complexprefixes && HENTRY_DATA(rv)) 2482 result.append(HENTRY_DATA2(rv)); 2483 if (!HENTRY_FIND(rv, MORPH_STEM)) { 2484 result.push_back(MSEP_FLD); 2485 result.append(MORPH_STEM); 2486 result.append(HENTRY_WORD(rv)); 2487 } 2488 // store the pointer of the hash entry 2489 if (!complexprefixes && HENTRY_DATA(rv)) { 2490 result.push_back(MSEP_FLD); 2491 result.append(HENTRY_DATA2(rv)); 2492 } 2493 result.push_back(MSEP_REC); 2494 return 0; 2495 } 2496 2497 oldnumsyllable2 = numsyllable; 2498 oldwordnum2 = wordnum; 2499 2500 // LANG_hu section: spec. Hungarian rule 2501 if ((rv) && (langnum == LANG_hu) && 2502 (TESTAFF(rv->astr, 'I', rv->alen)) && 2503 !(TESTAFF(rv->astr, 'J', rv->alen))) { 2504 numsyllable--; 2505 } 2506 // END of LANG_hu section 2507 // increment word number, if the second root has a compoundroot flag 2508 if ((rv) && (compoundroot) && 2509 (TESTAFF(rv->astr, compoundroot, rv->alen))) { 2510 wordnum++; 2511 } 2512 2513 // check forbiddenwords 2514 if ((rv) && (rv->astr) && 2515 (TESTAFF(rv->astr, forbiddenword, rv->alen) || 2516 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))) { 2517 st[i] = ch; 2518 continue; 2519 } 2520 2521 // second word is acceptable, as a root? 2522 // hungarian conventions: compounding is acceptable, 2523 // when compound forms consist of 2 words, or if more, 2524 // then the syllable number of root words must be 6, or lesser. 2525 if ((rv) && 2526 ((compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) || 2527 (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))) && 2528 (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) || 2529 ((cpdmaxsyllable != 0) && 2530 (numsyllable + get_syllable(std::string(HENTRY_WORD(rv), rv->blen)) <= 2531 cpdmaxsyllable))) && 2532 ((!checkcompounddup || (rv != rv_first)))) { 2533 // bad compound word 2534 result.append(presult); 2535 result.push_back(MSEP_FLD); 2536 result.append(MORPH_PART); 2537 result.append(word + i); 2538 2539 if (HENTRY_DATA(rv)) { 2540 if (complexprefixes) 2541 result.append(HENTRY_DATA2(rv)); 2542 if (!HENTRY_FIND(rv, MORPH_STEM)) { 2543 result.push_back(MSEP_FLD); 2544 result.append(MORPH_STEM); 2545 result.append(HENTRY_WORD(rv)); 2546 } 2547 // store the pointer of the hash entry 2548 if (!complexprefixes) { 2549 result.push_back(MSEP_FLD); 2550 result.append(HENTRY_DATA2(rv)); 2551 } 2552 } 2553 result.push_back(MSEP_REC); 2554 ok = 1; 2555 } 2556 2557 numsyllable = oldnumsyllable2; 2558 wordnum = oldwordnum2; 2559 2560 // perhaps second word has prefix or/and suffix 2561 sfx = NULL; 2562 sfxflag = FLAG_NULL; 2563 2564 if (compoundflag && !onlycpdrule) 2565 rv = affix_check((word + i), strlen(word + i), compoundflag); 2566 else 2567 rv = NULL; 2568 2569 if (!rv && compoundend && !onlycpdrule) { 2570 sfx = NULL; 2571 pfx = NULL; 2572 rv = affix_check((word + i), strlen(word + i), compoundend); 2573 } 2574 2575 if (!rv && !defcpdtable.empty() && words) { 2576 rv = affix_check((word + i), strlen(word + i), 0, IN_CPD_END); 2577 if (rv && words && defcpd_check(&words, wnum + 1, rv, NULL, 1)) { 2578 std::string m; 2579 if (compoundflag) 2580 m = affix_check_morph((word + i), strlen(word + i), compoundflag); 2581 if (m.empty() && compoundend) { 2582 m = affix_check_morph((word + i), strlen(word + i), compoundend); 2583 } 2584 result.append(presult); 2585 if (!m.empty()) { 2586 result.push_back(MSEP_FLD); 2587 result.append(MORPH_PART); 2588 result.append(word + i); 2589 line_uniq_app(m, MSEP_REC); 2590 result.append(m); 2591 } 2592 result.push_back(MSEP_REC); 2593 ok = 1; 2594 } 2595 } 2596 2597 // check non_compound flag in suffix and prefix 2598 if ((rv) && 2599 ((pfx && pfx->getCont() && 2600 TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())) || 2601 (sfx && sfx->getCont() && 2602 TESTAFF(sfx->getCont(), compoundforbidflag, 2603 sfx->getContLen())))) { 2604 rv = NULL; 2605 } 2606 2607 // check forbiddenwords 2608 if ((rv) && (rv->astr) && 2609 (TESTAFF(rv->astr, forbiddenword, rv->alen) || 2610 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen)) && 2611 (!TESTAFF(rv->astr, needaffix, rv->alen))) { 2612 st[i] = ch; 2613 continue; 2614 } 2615 2616 if (langnum == LANG_hu) { 2617 // calculate syllable number of the word 2618 numsyllable += get_syllable(word + i); 2619 2620 // - affix syllable num. 2621 // XXX only second suffix (inflections, not derivations) 2622 if (sfxappnd) { 2623 std::string tmp(sfxappnd); 2624 reverseword(tmp); 2625 numsyllable -= short(get_syllable(tmp) + sfxextra); 2626 } else { 2627 numsyllable -= short(sfxextra); 2628 } 2629 2630 // + 1 word, if syllable number of the prefix > 1 (hungarian 2631 // convention) 2632 if (pfx && (get_syllable(pfx->getKey()) > 1)) 2633 wordnum++; 2634 2635 // increment syllable num, if last word has a SYLLABLENUM flag 2636 // and the suffix is beginning `s' 2637 2638 if (!cpdsyllablenum.empty()) { 2639 switch (sfxflag) { 2640 case 'c': { 2641 numsyllable += 2; 2642 break; 2643 } 2644 case 'J': { 2645 numsyllable += 1; 2646 break; 2647 } 2648 case 'I': { 2649 if (rv && TESTAFF(rv->astr, 'J', rv->alen)) 2650 numsyllable += 1; 2651 break; 2652 } 2653 } 2654 } 2655 } 2656 2657 // increment word number, if the second word has a compoundroot flag 2658 if ((rv) && (compoundroot) && 2659 (TESTAFF(rv->astr, compoundroot, rv->alen))) { 2660 wordnum++; 2661 } 2662 // second word is acceptable, as a word with prefix or/and suffix? 2663 // hungarian conventions: compounding is acceptable, 2664 // when compound forms consist 2 word, otherwise 2665 // the syllable number of root words is 6, or lesser. 2666 if ((rv) && 2667 (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) || 2668 ((cpdmaxsyllable != 0) && (numsyllable <= cpdmaxsyllable))) && 2669 ((!checkcompounddup || (rv != rv_first)))) { 2670 std::string m; 2671 if (compoundflag) 2672 m = affix_check_morph((word + i), strlen(word + i), compoundflag); 2673 if (m.empty() && compoundend) { 2674 m = affix_check_morph((word + i), strlen(word + i), compoundend); 2675 } 2676 result.append(presult); 2677 if (!m.empty()) { 2678 result.push_back(MSEP_FLD); 2679 result.append(MORPH_PART); 2680 result.append(word + i); 2681 line_uniq_app(m, MSEP_REC); 2682 result.push_back(MSEP_FLD); 2683 result.append(m); 2684 } 2685 result.push_back(MSEP_REC); 2686 ok = 1; 2687 } 2688 2689 numsyllable = oldnumsyllable2; 2690 wordnum = oldwordnum2; 2691 2692 // perhaps second word is a compound word (recursive call) 2693 if ((wordnum + 2 < maxwordnum) && (ok == 0)) { 2694 compound_check_morph((word + i), strlen(word + i), wordnum + 1, 2695 numsyllable, maxwordnum, wnum + 1, words, rwords, 0, 2696 result, &presult); 2697 } else { 2698 rv = NULL; 2699 } 2700 } 2701 st[i] = ch; 2702 wordnum = oldwordnum; 2703 numsyllable = oldnumsyllable; 2704 2705 } while (!defcpdtable.empty() && oldwordnum == 0 && 2706 onlycpdrule++ < 1); // end of onlycpd loop 2707 } 2708 return 0; 2709 } 2710 2711 2712 inline int AffixMgr::isRevSubset(const char* s1, 2713 const char* end_of_s2, 2714 int len) { 2715 while ((len > 0) && (*s1 != '\0') && ((*s1 == *end_of_s2) || (*s1 == '.'))) { 2716 s1++; 2717 end_of_s2--; 2718 len--; 2719 } 2720 return (*s1 == '\0'); 2721 } 2722 2723 // check word for suffixes 2724 struct hentry* AffixMgr::suffix_check(const char* word, 2725 int len, 2726 int sfxopts, 2727 PfxEntry* ppfx, 2728 const FLAG cclass, 2729 const FLAG needflag, 2730 char in_compound) { 2731 struct hentry* rv = NULL; 2732 PfxEntry* ep = ppfx; 2733 2734 // first handle the special case of 0 length suffixes 2735 SfxEntry* se = sStart[0]; 2736 2737 while (se) { 2738 if (!cclass || se->getCont()) { 2739 // suffixes are not allowed in beginning of compounds 2740 if ((((in_compound != IN_CPD_BEGIN)) || // && !cclass 2741 // except when signed with compoundpermitflag flag 2742 (se->getCont() && compoundpermitflag && 2743 TESTAFF(se->getCont(), compoundpermitflag, se->getContLen()))) && 2744 (!circumfix || 2745 // no circumfix flag in prefix and suffix 2746 ((!ppfx || !(ep->getCont()) || 2747 !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) && 2748 (!se->getCont() || 2749 !(TESTAFF(se->getCont(), circumfix, se->getContLen())))) || 2750 // circumfix flag in prefix AND suffix 2751 ((ppfx && (ep->getCont()) && 2752 TESTAFF(ep->getCont(), circumfix, ep->getContLen())) && 2753 (se->getCont() && 2754 (TESTAFF(se->getCont(), circumfix, se->getContLen()))))) && 2755 // fogemorpheme 2756 (in_compound || 2757 !(se->getCont() && 2758 (TESTAFF(se->getCont(), onlyincompound, se->getContLen())))) && 2759 // needaffix on prefix or first suffix 2760 (cclass || 2761 !(se->getCont() && 2762 TESTAFF(se->getCont(), needaffix, se->getContLen())) || 2763 (ppfx && 2764 !((ep->getCont()) && 2765 TESTAFF(ep->getCont(), needaffix, ep->getContLen()))))) { 2766 rv = se->checkword(word, len, sfxopts, ppfx, 2767 (FLAG)cclass, needflag, 2768 (in_compound ? 0 : onlyincompound)); 2769 if (rv) { 2770 sfx = se; // BUG: sfx not stateless 2771 return rv; 2772 } 2773 } 2774 } 2775 se = se->getNext(); 2776 } 2777 2778 // now handle the general case 2779 if (len == 0) 2780 return NULL; // FULLSTRIP 2781 unsigned char sp = *((const unsigned char*)(word + len - 1)); 2782 SfxEntry* sptr = sStart[sp]; 2783 2784 while (sptr) { 2785 if (isRevSubset(sptr->getKey(), word + len - 1, len)) { 2786 // suffixes are not allowed in beginning of compounds 2787 if ((((in_compound != IN_CPD_BEGIN)) || // && !cclass 2788 // except when signed with compoundpermitflag flag 2789 (sptr->getCont() && compoundpermitflag && 2790 TESTAFF(sptr->getCont(), compoundpermitflag, 2791 sptr->getContLen()))) && 2792 (!circumfix || 2793 // no circumfix flag in prefix and suffix 2794 ((!ppfx || !(ep->getCont()) || 2795 !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) && 2796 (!sptr->getCont() || 2797 !(TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())))) || 2798 // circumfix flag in prefix AND suffix 2799 ((ppfx && (ep->getCont()) && 2800 TESTAFF(ep->getCont(), circumfix, ep->getContLen())) && 2801 (sptr->getCont() && 2802 (TESTAFF(sptr->getCont(), circumfix, sptr->getContLen()))))) && 2803 // fogemorpheme 2804 (in_compound || 2805 !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, 2806 sptr->getContLen()))))) && 2807 // needaffix on prefix or first suffix 2808 (cclass || 2809 !(sptr->getCont() && 2810 TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) || 2811 (ppfx && 2812 !((ep->getCont()) && 2813 TESTAFF(ep->getCont(), needaffix, ep->getContLen()))))) 2814 if (in_compound != IN_CPD_END || ppfx || 2815 !(sptr->getCont() && 2816 TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))) { 2817 rv = sptr->checkword(word, len, sfxopts, ppfx, 2818 cclass, needflag, 2819 (in_compound ? 0 : onlyincompound)); 2820 if (rv) { 2821 sfx = sptr; // BUG: sfx not stateless 2822 sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless 2823 if (!sptr->getCont()) 2824 sfxappnd = sptr->getKey(); // BUG: sfxappnd not stateless 2825 // LANG_hu section: spec. Hungarian rule 2826 else if (langnum == LANG_hu && sptr->getKeyLen() && 2827 sptr->getKey()[0] == 'i' && sptr->getKey()[1] != 'y' && 2828 sptr->getKey()[1] != 't') { 2829 sfxextra = 1; 2830 } 2831 // END of LANG_hu section 2832 return rv; 2833 } 2834 } 2835 sptr = sptr->getNextEQ(); 2836 } else { 2837 sptr = sptr->getNextNE(); 2838 } 2839 } 2840 2841 return NULL; 2842 } 2843 2844 // check word for two-level suffixes 2845 struct hentry* AffixMgr::suffix_check_twosfx(const char* word, 2846 int len, 2847 int sfxopts, 2848 PfxEntry* ppfx, 2849 const FLAG needflag) { 2850 struct hentry* rv = NULL; 2851 2852 // first handle the special case of 0 length suffixes 2853 SfxEntry* se = sStart[0]; 2854 while (se) { 2855 if (contclasses[se->getFlag()]) { 2856 rv = se->check_twosfx(word, len, sfxopts, ppfx, needflag); 2857 if (rv) 2858 return rv; 2859 } 2860 se = se->getNext(); 2861 } 2862 2863 // now handle the general case 2864 if (len == 0) 2865 return NULL; // FULLSTRIP 2866 unsigned char sp = *((const unsigned char*)(word + len - 1)); 2867 SfxEntry* sptr = sStart[sp]; 2868 2869 while (sptr) { 2870 if (isRevSubset(sptr->getKey(), word + len - 1, len)) { 2871 if (contclasses[sptr->getFlag()]) { 2872 rv = sptr->check_twosfx(word, len, sfxopts, ppfx, needflag); 2873 if (rv) { 2874 sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless 2875 if (!sptr->getCont()) 2876 sfxappnd = sptr->getKey(); // BUG: sfxappnd not stateless 2877 return rv; 2878 } 2879 } 2880 sptr = sptr->getNextEQ(); 2881 } else { 2882 sptr = sptr->getNextNE(); 2883 } 2884 } 2885 2886 return NULL; 2887 } 2888 2889 // check word for two-level suffixes and morph 2890 std::string AffixMgr::suffix_check_twosfx_morph(const char* word, 2891 int len, 2892 int sfxopts, 2893 PfxEntry* ppfx, 2894 const FLAG needflag) { 2895 std::string result; 2896 std::string result2; 2897 std::string result3; 2898 2899 // first handle the special case of 0 length suffixes 2900 SfxEntry* se = sStart[0]; 2901 while (se) { 2902 if (contclasses[se->getFlag()]) { 2903 std::string st = se->check_twosfx_morph(word, len, sfxopts, ppfx, needflag); 2904 if (!st.empty()) { 2905 if (ppfx) { 2906 if (ppfx->getMorph()) { 2907 result.append(ppfx->getMorph()); 2908 result.push_back(MSEP_FLD); 2909 } else 2910 debugflag(result, ppfx->getFlag()); 2911 } 2912 result.append(st); 2913 if (se->getMorph()) { 2914 result.push_back(MSEP_FLD); 2915 result.append(se->getMorph()); 2916 } else 2917 debugflag(result, se->getFlag()); 2918 result.push_back(MSEP_REC); 2919 } 2920 } 2921 se = se->getNext(); 2922 } 2923 2924 // now handle the general case 2925 if (len == 0) 2926 return std::string(); // FULLSTRIP 2927 unsigned char sp = *((const unsigned char*)(word + len - 1)); 2928 SfxEntry* sptr = sStart[sp]; 2929 2930 while (sptr) { 2931 if (isRevSubset(sptr->getKey(), word + len - 1, len)) { 2932 if (contclasses[sptr->getFlag()]) { 2933 std::string st = sptr->check_twosfx_morph(word, len, sfxopts, ppfx, needflag); 2934 if (!st.empty()) { 2935 sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless 2936 if (!sptr->getCont()) 2937 sfxappnd = sptr->getKey(); // BUG: sfxappnd not stateless 2938 result2.assign(st); 2939 2940 result3.clear(); 2941 2942 if (sptr->getMorph()) { 2943 result3.push_back(MSEP_FLD); 2944 result3.append(sptr->getMorph()); 2945 } else 2946 debugflag(result3, sptr->getFlag()); 2947 strlinecat(result2, result3); 2948 result2.push_back(MSEP_REC); 2949 result.append(result2); 2950 } 2951 } 2952 sptr = sptr->getNextEQ(); 2953 } else { 2954 sptr = sptr->getNextNE(); 2955 } 2956 } 2957 2958 return result; 2959 } 2960 2961 std::string AffixMgr::suffix_check_morph(const char* word, 2962 int len, 2963 int sfxopts, 2964 PfxEntry* ppfx, 2965 const FLAG cclass, 2966 const FLAG needflag, 2967 char in_compound) { 2968 std::string result; 2969 2970 struct hentry* rv = NULL; 2971 2972 PfxEntry* ep = ppfx; 2973 2974 // first handle the special case of 0 length suffixes 2975 SfxEntry* se = sStart[0]; 2976 while (se) { 2977 if (!cclass || se->getCont()) { 2978 // suffixes are not allowed in beginning of compounds 2979 if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass 2980 // except when signed with compoundpermitflag flag 2981 (se->getCont() && compoundpermitflag && 2982 TESTAFF(se->getCont(), compoundpermitflag, se->getContLen()))) && 2983 (!circumfix || 2984 // no circumfix flag in prefix and suffix 2985 ((!ppfx || !(ep->getCont()) || 2986 !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) && 2987 (!se->getCont() || 2988 !(TESTAFF(se->getCont(), circumfix, se->getContLen())))) || 2989 // circumfix flag in prefix AND suffix 2990 ((ppfx && (ep->getCont()) && 2991 TESTAFF(ep->getCont(), circumfix, ep->getContLen())) && 2992 (se->getCont() && 2993 (TESTAFF(se->getCont(), circumfix, se->getContLen()))))) && 2994 // fogemorpheme 2995 (in_compound || 2996 !((se->getCont() && 2997 (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) && 2998 // needaffix on prefix or first suffix 2999 (cclass || 3000 !(se->getCont() && 3001 TESTAFF(se->getCont(), needaffix, se->getContLen())) || 3002 (ppfx && 3003 !((ep->getCont()) && 3004 TESTAFF(ep->getCont(), needaffix, ep->getContLen())))))) 3005 rv = se->checkword(word, len, sfxopts, ppfx, cclass, 3006 needflag, FLAG_NULL); 3007 while (rv) { 3008 if (ppfx) { 3009 if (ppfx->getMorph()) { 3010 result.append(ppfx->getMorph()); 3011 result.push_back(MSEP_FLD); 3012 } else 3013 debugflag(result, ppfx->getFlag()); 3014 } 3015 if (complexprefixes && HENTRY_DATA(rv)) 3016 result.append(HENTRY_DATA2(rv)); 3017 if (!HENTRY_FIND(rv, MORPH_STEM)) { 3018 result.push_back(MSEP_FLD); 3019 result.append(MORPH_STEM); 3020 result.append(HENTRY_WORD(rv)); 3021 } 3022 3023 if (!complexprefixes && HENTRY_DATA(rv)) { 3024 result.push_back(MSEP_FLD); 3025 result.append(HENTRY_DATA2(rv)); 3026 } 3027 if (se->getMorph()) { 3028 result.push_back(MSEP_FLD); 3029 result.append(se->getMorph()); 3030 } else 3031 debugflag(result, se->getFlag()); 3032 result.push_back(MSEP_REC); 3033 rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag); 3034 } 3035 } 3036 se = se->getNext(); 3037 } 3038 3039 // now handle the general case 3040 if (len == 0) 3041 return std::string(); // FULLSTRIP 3042 unsigned char sp = *((const unsigned char*)(word + len - 1)); 3043 SfxEntry* sptr = sStart[sp]; 3044 3045 while (sptr) { 3046 if (isRevSubset(sptr->getKey(), word + len - 1, len)) { 3047 // suffixes are not allowed in beginning of compounds 3048 if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass 3049 // except when signed with compoundpermitflag flag 3050 (sptr->getCont() && compoundpermitflag && 3051 TESTAFF(sptr->getCont(), compoundpermitflag, 3052 sptr->getContLen()))) && 3053 (!circumfix || 3054 // no circumfix flag in prefix and suffix 3055 ((!ppfx || !(ep->getCont()) || 3056 !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) && 3057 (!sptr->getCont() || 3058 !(TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())))) || 3059 // circumfix flag in prefix AND suffix 3060 ((ppfx && (ep->getCont()) && 3061 TESTAFF(ep->getCont(), circumfix, ep->getContLen())) && 3062 (sptr->getCont() && 3063 (TESTAFF(sptr->getCont(), circumfix, sptr->getContLen()))))) && 3064 // fogemorpheme 3065 (in_compound || 3066 !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, 3067 sptr->getContLen()))))) && 3068 // needaffix on first suffix 3069 (cclass || 3070 !(sptr->getCont() && 3071 TESTAFF(sptr->getCont(), needaffix, sptr->getContLen()))))) 3072 rv = sptr->checkword(word, len, sfxopts, ppfx, cclass, 3073 needflag, FLAG_NULL); 3074 while (rv) { 3075 if (ppfx) { 3076 if (ppfx->getMorph()) { 3077 result.append(ppfx->getMorph()); 3078 result.push_back(MSEP_FLD); 3079 } else 3080 debugflag(result, ppfx->getFlag()); 3081 } 3082 if (complexprefixes && HENTRY_DATA(rv)) 3083 result.append(HENTRY_DATA2(rv)); 3084 if (!HENTRY_FIND(rv, MORPH_STEM)) { 3085 result.push_back(MSEP_FLD); 3086 result.append(MORPH_STEM); 3087 result.append(HENTRY_WORD(rv)); 3088 } 3089 3090 if (!complexprefixes && HENTRY_DATA(rv)) { 3091 result.push_back(MSEP_FLD); 3092 result.append(HENTRY_DATA2(rv)); 3093 } 3094 3095 if (sptr->getMorph()) { 3096 result.push_back(MSEP_FLD); 3097 result.append(sptr->getMorph()); 3098 } else 3099 debugflag(result, sptr->getFlag()); 3100 result.push_back(MSEP_REC); 3101 rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag); 3102 } 3103 sptr = sptr->getNextEQ(); 3104 } else { 3105 sptr = sptr->getNextNE(); 3106 } 3107 } 3108 3109 return result; 3110 } 3111 3112 // check if word with affixes is correctly spelled 3113 struct hentry* AffixMgr::affix_check(const char* word, 3114 int len, 3115 const FLAG needflag, 3116 char in_compound) { 3117 3118 // check all prefixes (also crossed with suffixes if allowed) 3119 struct hentry* rv = prefix_check(word, len, in_compound, needflag); 3120 if (rv) 3121 return rv; 3122 3123 // if still not found check all suffixes 3124 rv = suffix_check(word, len, 0, NULL, FLAG_NULL, needflag, in_compound); 3125 3126 if (havecontclass) { 3127 sfx = NULL; 3128 pfx = NULL; 3129 3130 if (rv) 3131 return rv; 3132 // if still not found check all two-level suffixes 3133 rv = suffix_check_twosfx(word, len, 0, NULL, needflag); 3134 3135 if (rv) 3136 return rv; 3137 // if still not found check all two-level suffixes 3138 rv = prefix_check_twosfx(word, len, IN_CPD_NOT, needflag); 3139 } 3140 3141 return rv; 3142 } 3143 3144 // check if word with affixes is correctly spelled 3145 std::string AffixMgr::affix_check_morph(const char* word, 3146 int len, 3147 const FLAG needflag, 3148 char in_compound) { 3149 std::string result; 3150 3151 // check all prefixes (also crossed with suffixes if allowed) 3152 std::string st = prefix_check_morph(word, len, in_compound); 3153 if (!st.empty()) { 3154 result.append(st); 3155 } 3156 3157 // if still not found check all suffixes 3158 st = suffix_check_morph(word, len, 0, NULL, '\0', needflag, in_compound); 3159 if (!st.empty()) { 3160 result.append(st); 3161 } 3162 3163 if (havecontclass) { 3164 sfx = NULL; 3165 pfx = NULL; 3166 // if still not found check all two-level suffixes 3167 st = suffix_check_twosfx_morph(word, len, 0, NULL, needflag); 3168 if (!st.empty()) { 3169 result.append(st); 3170 } 3171 3172 // if still not found check all two-level suffixes 3173 st = prefix_check_twosfx_morph(word, len, IN_CPD_NOT, needflag); 3174 if (!st.empty()) { 3175 result.append(st); 3176 } 3177 } 3178 3179 return result; 3180 } 3181 3182 // morphcmp(): compare MORPH_DERI_SFX, MORPH_INFL_SFX and MORPH_TERM_SFX fields 3183 // in the first line of the inputs 3184 // return 0, if inputs equal 3185 // return 1, if inputs may equal with a secondary suffix 3186 // otherwise return -1 3187 static int morphcmp(const char* s, const char* t) { 3188 int se = 0; 3189 int te = 0; 3190 const char* sl; 3191 const char* tl; 3192 const char* olds; 3193 const char* oldt; 3194 if (!s || !t) 3195 return 1; 3196 olds = s; 3197 sl = strchr(s, '\n'); 3198 s = strstr(s, MORPH_DERI_SFX); 3199 if (!s || (sl && sl < s)) 3200 s = strstr(olds, MORPH_INFL_SFX); 3201 if (!s || (sl && sl < s)) { 3202 s = strstr(olds, MORPH_TERM_SFX); 3203 olds = NULL; 3204 } 3205 oldt = t; 3206 tl = strchr(t, '\n'); 3207 t = strstr(t, MORPH_DERI_SFX); 3208 if (!t || (tl && tl < t)) 3209 t = strstr(oldt, MORPH_INFL_SFX); 3210 if (!t || (tl && tl < t)) { 3211 t = strstr(oldt, MORPH_TERM_SFX); 3212 oldt = NULL; 3213 } 3214 while (s && t && (!sl || sl > s) && (!tl || tl > t)) { 3215 s += MORPH_TAG_LEN; 3216 t += MORPH_TAG_LEN; 3217 se = 0; 3218 te = 0; 3219 while ((*s == *t) && !se && !te) { 3220 s++; 3221 t++; 3222 switch (*s) { 3223 case ' ': 3224 case '\n': 3225 case '\t': 3226 case '\0': 3227 se = 1; 3228 } 3229 switch (*t) { 3230 case ' ': 3231 case '\n': 3232 case '\t': 3233 case '\0': 3234 te = 1; 3235 } 3236 } 3237 if (!se || !te) { 3238 // not terminal suffix difference 3239 if (olds) 3240 return -1; 3241 return 1; 3242 } 3243 olds = s; 3244 s = strstr(s, MORPH_DERI_SFX); 3245 if (!s || (sl && sl < s)) 3246 s = strstr(olds, MORPH_INFL_SFX); 3247 if (!s || (sl && sl < s)) { 3248 s = strstr(olds, MORPH_TERM_SFX); 3249 olds = NULL; 3250 } 3251 oldt = t; 3252 t = strstr(t, MORPH_DERI_SFX); 3253 if (!t || (tl && tl < t)) 3254 t = strstr(oldt, MORPH_INFL_SFX); 3255 if (!t || (tl && tl < t)) { 3256 t = strstr(oldt, MORPH_TERM_SFX); 3257 oldt = NULL; 3258 } 3259 } 3260 if (!s && !t && se && te) 3261 return 0; 3262 return 1; 3263 } 3264 3265 std::string AffixMgr::morphgen(const char* ts, 3266 int wl, 3267 const unsigned short* ap, 3268 unsigned short al, 3269 const char* morph, 3270 const char* targetmorph, 3271 int level) { 3272 // handle suffixes 3273 if (!morph) 3274 return std::string(); 3275 3276 // check substandard flag 3277 if (TESTAFF(ap, substandard, al)) 3278 return std::string(); 3279 3280 if (morphcmp(morph, targetmorph) == 0) 3281 return ts; 3282 3283 size_t stemmorphcatpos; 3284 std::string mymorph; 3285 3286 // use input suffix fields, if exist 3287 if (strstr(morph, MORPH_INFL_SFX) || strstr(morph, MORPH_DERI_SFX)) { 3288 mymorph.assign(morph); 3289 mymorph.push_back(MSEP_FLD); 3290 stemmorphcatpos = mymorph.size(); 3291 } else { 3292 stemmorphcatpos = std::string::npos; 3293 } 3294 3295 for (int i = 0; i < al; i++) { 3296 const unsigned char c = (unsigned char)(ap[i] & 0x00FF); 3297 SfxEntry* sptr = sFlag[c]; 3298 while (sptr) { 3299 if (sptr->getFlag() == ap[i] && sptr->getMorph() && 3300 ((sptr->getContLen() == 0) || 3301 // don't generate forms with substandard affixes 3302 !TESTAFF(sptr->getCont(), substandard, sptr->getContLen()))) { 3303 const char* stemmorph; 3304 if (stemmorphcatpos != std::string::npos) { 3305 mymorph.replace(stemmorphcatpos, std::string::npos, sptr->getMorph()); 3306 stemmorph = mymorph.c_str(); 3307 } else { 3308 stemmorph = sptr->getMorph(); 3309 } 3310 3311 int cmp = morphcmp(stemmorph, targetmorph); 3312 3313 if (cmp == 0) { 3314 std::string newword = sptr->add(ts, wl); 3315 if (!newword.empty()) { 3316 hentry* check = pHMgr->lookup(newword.c_str()); // XXX extra dic 3317 if (!check || !check->astr || 3318 !(TESTAFF(check->astr, forbiddenword, check->alen) || 3319 TESTAFF(check->astr, ONLYUPCASEFLAG, check->alen))) { 3320 return newword; 3321 } 3322 } 3323 } 3324 3325 // recursive call for secondary suffixes 3326 if ((level == 0) && (cmp == 1) && (sptr->getContLen() > 0) && 3327 !TESTAFF(sptr->getCont(), substandard, sptr->getContLen())) { 3328 std::string newword = sptr->add(ts, wl); 3329 if (!newword.empty()) { 3330 std::string newword2 = 3331 morphgen(newword.c_str(), newword.size(), sptr->getCont(), 3332 sptr->getContLen(), stemmorph, targetmorph, 1); 3333 3334 if (!newword2.empty()) { 3335 return newword2; 3336 } 3337 } 3338 } 3339 } 3340 sptr = sptr->getFlgNxt(); 3341 } 3342 } 3343 return std::string(); 3344 } 3345 3346 int AffixMgr::expand_rootword(struct guessword* wlst, 3347 int maxn, 3348 const char* ts, 3349 int wl, 3350 const unsigned short* ap, 3351 unsigned short al, 3352 const char* bad, 3353 int badl, 3354 const char* phon) { 3355 int nh = 0; 3356 // first add root word to list 3357 if ((nh < maxn) && 3358 !(al && ((needaffix && TESTAFF(ap, needaffix, al)) || 3359 (onlyincompound && TESTAFF(ap, onlyincompound, al))))) { 3360 wlst[nh].word = mystrdup(ts); 3361 if (!wlst[nh].word) 3362 return 0; 3363 wlst[nh].allow = false; 3364 wlst[nh].orig = NULL; 3365 nh++; 3366 // add special phonetic version 3367 if (phon && (nh < maxn)) { 3368 wlst[nh].word = mystrdup(phon); 3369 if (!wlst[nh].word) 3370 return nh - 1; 3371 wlst[nh].allow = false; 3372 wlst[nh].orig = mystrdup(ts); 3373 if (!wlst[nh].orig) 3374 return nh - 1; 3375 nh++; 3376 } 3377 } 3378 3379 // handle suffixes 3380 for (int i = 0; i < al; i++) { 3381 const unsigned char c = (unsigned char)(ap[i] & 0x00FF); 3382 SfxEntry* sptr = sFlag[c]; 3383 while (sptr) { 3384 if ((sptr->getFlag() == ap[i]) && 3385 (!sptr->getKeyLen() || 3386 ((badl > sptr->getKeyLen()) && 3387 (strcmp(sptr->getAffix(), bad + badl - sptr->getKeyLen()) == 0))) && 3388 // check needaffix flag 3389 !(sptr->getCont() && 3390 ((needaffix && 3391 TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) || 3392 (circumfix && 3393 TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())) || 3394 (onlyincompound && 3395 TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) { 3396 std::string newword = sptr->add(ts, wl); 3397 if (!newword.empty()) { 3398 if (nh < maxn) { 3399 wlst[nh].word = mystrdup(newword.c_str()); 3400 wlst[nh].allow = sptr->allowCross(); 3401 wlst[nh].orig = NULL; 3402 nh++; 3403 // add special phonetic version 3404 if (phon && (nh < maxn)) { 3405 std::string prefix(phon); 3406 std::string key(sptr->getKey()); 3407 reverseword(key); 3408 prefix.append(key); 3409 wlst[nh].word = mystrdup(prefix.c_str()); 3410 if (!wlst[nh].word) 3411 return nh - 1; 3412 wlst[nh].allow = false; 3413 wlst[nh].orig = mystrdup(newword.c_str()); 3414 if (!wlst[nh].orig) 3415 return nh - 1; 3416 nh++; 3417 } 3418 } 3419 } 3420 } 3421 sptr = sptr->getFlgNxt(); 3422 } 3423 } 3424 3425 int n = nh; 3426 3427 // handle cross products of prefixes and suffixes 3428 for (int j = 1; j < n; j++) 3429 if (wlst[j].allow) { 3430 for (int k = 0; k < al; k++) { 3431 const unsigned char c = (unsigned char)(ap[k] & 0x00FF); 3432 PfxEntry* cptr = pFlag[c]; 3433 while (cptr) { 3434 if ((cptr->getFlag() == ap[k]) && cptr->allowCross() && 3435 (!cptr->getKeyLen() || 3436 ((badl > cptr->getKeyLen()) && 3437 (strncmp(cptr->getKey(), bad, cptr->getKeyLen()) == 0)))) { 3438 int l1 = strlen(wlst[j].word); 3439 std::string newword = cptr->add(wlst[j].word, l1); 3440 if (!newword.empty()) { 3441 if (nh < maxn) { 3442 wlst[nh].word = mystrdup(newword.c_str()); 3443 wlst[nh].allow = cptr->allowCross(); 3444 wlst[nh].orig = NULL; 3445 nh++; 3446 } 3447 } 3448 } 3449 cptr = cptr->getFlgNxt(); 3450 } 3451 } 3452 } 3453 3454 // now handle pure prefixes 3455 for (int m = 0; m < al; m++) { 3456 const unsigned char c = (unsigned char)(ap[m] & 0x00FF); 3457 PfxEntry* ptr = pFlag[c]; 3458 while (ptr) { 3459 if ((ptr->getFlag() == ap[m]) && 3460 (!ptr->getKeyLen() || 3461 ((badl > ptr->getKeyLen()) && 3462 (strncmp(ptr->getKey(), bad, ptr->getKeyLen()) == 0))) && 3463 // check needaffix flag 3464 !(ptr->getCont() && 3465 ((needaffix && 3466 TESTAFF(ptr->getCont(), needaffix, ptr->getContLen())) || 3467 (circumfix && 3468 TESTAFF(ptr->getCont(), circumfix, ptr->getContLen())) || 3469 (onlyincompound && 3470 TESTAFF(ptr->getCont(), onlyincompound, ptr->getContLen()))))) { 3471 std::string newword = ptr->add(ts, wl); 3472 if (!newword.empty()) { 3473 if (nh < maxn) { 3474 wlst[nh].word = mystrdup(newword.c_str()); 3475 wlst[nh].allow = ptr->allowCross(); 3476 wlst[nh].orig = NULL; 3477 nh++; 3478 } 3479 } 3480 } 3481 ptr = ptr->getFlgNxt(); 3482 } 3483 } 3484 3485 return nh; 3486 } 3487 3488 // return replacing table 3489 const std::vector<replentry>& AffixMgr::get_reptable() const { 3490 return pHMgr->get_reptable(); 3491 } 3492 3493 // return iconv table 3494 RepList* AffixMgr::get_iconvtable() const { 3495 if (!iconvtable) 3496 return NULL; 3497 return iconvtable; 3498 } 3499 3500 // return oconv table 3501 RepList* AffixMgr::get_oconvtable() const { 3502 if (!oconvtable) 3503 return NULL; 3504 return oconvtable; 3505 } 3506 3507 // return replacing table 3508 struct phonetable* AffixMgr::get_phonetable() const { 3509 if (!phone) 3510 return NULL; 3511 return phone; 3512 } 3513 3514 // return character map table 3515 const std::vector<mapentry>& AffixMgr::get_maptable() const { 3516 return maptable; 3517 } 3518 3519 // return character map table 3520 const std::vector<std::string>& AffixMgr::get_breaktable() const { 3521 return breaktable; 3522 } 3523 3524 // return text encoding of dictionary 3525 const std::string& AffixMgr::get_encoding() { 3526 if (encoding.empty()) 3527 encoding = SPELL_ENCODING; 3528 return encoding; 3529 } 3530 3531 // return text encoding of dictionary 3532 int AffixMgr::get_langnum() const { 3533 return langnum; 3534 } 3535 3536 // return double prefix option 3537 int AffixMgr::get_complexprefixes() const { 3538 return complexprefixes; 3539 } 3540 3541 // return FULLSTRIP option 3542 int AffixMgr::get_fullstrip() const { 3543 return fullstrip; 3544 } 3545 3546 FLAG AffixMgr::get_keepcase() const { 3547 return keepcase; 3548 } 3549 3550 FLAG AffixMgr::get_forceucase() const { 3551 return forceucase; 3552 } 3553 3554 FLAG AffixMgr::get_warn() const { 3555 return warn; 3556 } 3557 3558 int AffixMgr::get_forbidwarn() const { 3559 return forbidwarn; 3560 } 3561 3562 int AffixMgr::get_checksharps() const { 3563 return checksharps; 3564 } 3565 3566 char* AffixMgr::encode_flag(unsigned short aflag) const { 3567 return pHMgr->encode_flag(aflag); 3568 } 3569 3570 // return the preferred ignore string for suggestions 3571 const char* AffixMgr::get_ignore() const { 3572 if (ignorechars.empty()) 3573 return NULL; 3574 return ignorechars.c_str(); 3575 } 3576 3577 // return the preferred ignore string for suggestions 3578 const std::vector<w_char>& AffixMgr::get_ignore_utf16() const { 3579 return ignorechars_utf16; 3580 } 3581 3582 // return the keyboard string for suggestions 3583 char* AffixMgr::get_key_string() { 3584 if (keystring.empty()) 3585 keystring = SPELL_KEYSTRING; 3586 return mystrdup(keystring.c_str()); 3587 } 3588 3589 // return the preferred try string for suggestions 3590 char* AffixMgr::get_try_string() const { 3591 if (trystring.empty()) 3592 return NULL; 3593 return mystrdup(trystring.c_str()); 3594 } 3595 3596 // return the preferred try string for suggestions 3597 const std::string& AffixMgr::get_wordchars() const { 3598 return wordchars; 3599 } 3600 3601 const std::vector<w_char>& AffixMgr::get_wordchars_utf16() const { 3602 return wordchars_utf16; 3603 } 3604 3605 // is there compounding? 3606 int AffixMgr::get_compound() const { 3607 return compoundflag || compoundbegin || !defcpdtable.empty(); 3608 } 3609 3610 // return the compound words control flag 3611 FLAG AffixMgr::get_compoundflag() const { 3612 return compoundflag; 3613 } 3614 3615 // return the forbidden words control flag 3616 FLAG AffixMgr::get_forbiddenword() const { 3617 return forbiddenword; 3618 } 3619 3620 // return the forbidden words control flag 3621 FLAG AffixMgr::get_nosuggest() const { 3622 return nosuggest; 3623 } 3624 3625 // return the forbidden words control flag 3626 FLAG AffixMgr::get_nongramsuggest() const { 3627 return nongramsuggest; 3628 } 3629 3630 // return the substandard root/affix control flag 3631 FLAG AffixMgr::get_substandard() const { 3632 return substandard; 3633 } 3634 3635 // return the forbidden words flag modify flag 3636 FLAG AffixMgr::get_needaffix() const { 3637 return needaffix; 3638 } 3639 3640 // return the onlyincompound flag 3641 FLAG AffixMgr::get_onlyincompound() const { 3642 return onlyincompound; 3643 } 3644 3645 // return the value of suffix 3646 const std::string& AffixMgr::get_version() const { 3647 return version; 3648 } 3649 3650 // utility method to look up root words in hash table 3651 struct hentry* AffixMgr::lookup(const char* word) { 3652 struct hentry* he = NULL; 3653 for (size_t i = 0; i < alldic.size() && !he; ++i) { 3654 he = alldic[i]->lookup(word); 3655 } 3656 return he; 3657 } 3658 3659 // return the value of suffix 3660 int AffixMgr::have_contclass() const { 3661 return havecontclass; 3662 } 3663 3664 // return utf8 3665 int AffixMgr::get_utf8() const { 3666 return utf8; 3667 } 3668 3669 int AffixMgr::get_maxngramsugs(void) const { 3670 return maxngramsugs; 3671 } 3672 3673 int AffixMgr::get_maxcpdsugs(void) const { 3674 return maxcpdsugs; 3675 } 3676 3677 int AffixMgr::get_maxdiff(void) const { 3678 return maxdiff; 3679 } 3680 3681 int AffixMgr::get_onlymaxdiff(void) const { 3682 return onlymaxdiff; 3683 } 3684 3685 // return nosplitsugs 3686 int AffixMgr::get_nosplitsugs(void) const { 3687 return nosplitsugs; 3688 } 3689 3690 // return sugswithdots 3691 int AffixMgr::get_sugswithdots(void) const { 3692 return sugswithdots; 3693 } 3694 3695 /* parse flag */ 3696 bool AffixMgr::parse_flag(const std::string& line, unsigned short* out, FileMgr* af) { 3697 if (*out != FLAG_NULL && !(*out >= DEFAULTFLAGS)) { 3698 HUNSPELL_WARNING( 3699 stderr, 3700 "error: line %d: multiple definitions of an affix file parameter\n", 3701 af->getlinenum()); 3702 return false; 3703 } 3704 std::string s; 3705 if (!parse_string(line, s, af->getlinenum())) 3706 return false; 3707 *out = pHMgr->decode_flag(s.c_str()); 3708 return true; 3709 } 3710 3711 /* parse num */ 3712 bool AffixMgr::parse_num(const std::string& line, int* out, FileMgr* af) { 3713 if (*out != -1) { 3714 HUNSPELL_WARNING( 3715 stderr, 3716 "error: line %d: multiple definitions of an affix file parameter\n", 3717 af->getlinenum()); 3718 return false; 3719 } 3720 std::string s; 3721 if (!parse_string(line, s, af->getlinenum())) 3722 return false; 3723 *out = atoi(s.c_str()); 3724 return true; 3725 } 3726 3727 /* parse in the max syllablecount of compound words and */ 3728 bool AffixMgr::parse_cpdsyllable(const std::string& line, FileMgr* af) { 3729 int i = 0; 3730 int np = 0; 3731 std::string::const_iterator iter = line.begin(); 3732 std::string::const_iterator start_piece = mystrsep(line, iter); 3733 while (start_piece != line.end()) { 3734 switch (i) { 3735 case 0: { 3736 np++; 3737 break; 3738 } 3739 case 1: { 3740 cpdmaxsyllable = atoi(std::string(start_piece, iter).c_str()); 3741 np++; 3742 break; 3743 } 3744 case 2: { 3745 if (!utf8) { 3746 cpdvowels.assign(start_piece, iter); 3747 std::sort(cpdvowels.begin(), cpdvowels.end()); 3748 } else { 3749 std::string piece(start_piece, iter); 3750 u8_u16(cpdvowels_utf16, piece); 3751 std::sort(cpdvowels_utf16.begin(), cpdvowels_utf16.end()); 3752 } 3753 np++; 3754 break; 3755 } 3756 default: 3757 break; 3758 } 3759 ++i; 3760 start_piece = mystrsep(line, iter); 3761 } 3762 if (np < 2) { 3763 HUNSPELL_WARNING(stderr, 3764 "error: line %d: missing compoundsyllable information\n", 3765 af->getlinenum()); 3766 return false; 3767 } 3768 if (np == 2) 3769 cpdvowels = "AEIOUaeiou"; 3770 return true; 3771 } 3772 3773 bool AffixMgr::parse_convtable(const std::string& line, 3774 FileMgr* af, 3775 RepList** rl, 3776 const std::string& keyword) { 3777 if (*rl) { 3778 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", 3779 af->getlinenum()); 3780 return false; 3781 } 3782 int i = 0; 3783 int np = 0; 3784 int numrl = 0; 3785 std::string::const_iterator iter = line.begin(); 3786 std::string::const_iterator start_piece = mystrsep(line, iter); 3787 while (start_piece != line.end()) { 3788 switch (i) { 3789 case 0: { 3790 np++; 3791 break; 3792 } 3793 case 1: { 3794 numrl = atoi(std::string(start_piece, iter).c_str()); 3795 if (numrl < 1) { 3796 HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n", 3797 af->getlinenum()); 3798 return false; 3799 } 3800 *rl = new RepList(numrl); 3801 if (!*rl) 3802 return false; 3803 np++; 3804 break; 3805 } 3806 default: 3807 break; 3808 } 3809 ++i; 3810 start_piece = mystrsep(line, iter); 3811 } 3812 if (np != 2) { 3813 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", 3814 af->getlinenum()); 3815 return false; 3816 } 3817 3818 /* now parse the num lines to read in the remainder of the table */ 3819 for (int j = 0; j < numrl; j++) { 3820 std::string nl; 3821 if (!af->getline(nl)) 3822 return false; 3823 mychomp(nl); 3824 i = 0; 3825 std::string pattern; 3826 std::string pattern2; 3827 iter = nl.begin(); 3828 start_piece = mystrsep(nl, iter); 3829 while (start_piece != nl.end()) { 3830 { 3831 switch (i) { 3832 case 0: { 3833 if (nl.compare(start_piece - nl.begin(), keyword.size(), keyword, 0, keyword.size()) != 0) { 3834 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", 3835 af->getlinenum()); 3836 delete *rl; 3837 *rl = NULL; 3838 return false; 3839 } 3840 break; 3841 } 3842 case 1: { 3843 pattern.assign(start_piece, iter); 3844 break; 3845 } 3846 case 2: { 3847 pattern2.assign(start_piece, iter); 3848 break; 3849 } 3850 default: 3851 break; 3852 } 3853 ++i; 3854 } 3855 start_piece = mystrsep(nl, iter); 3856 } 3857 if (pattern.empty() || pattern2.empty()) { 3858 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", 3859 af->getlinenum()); 3860 return false; 3861 } 3862 (*rl)->add(pattern, pattern2); 3863 } 3864 return true; 3865 } 3866 3867 /* parse in the typical fault correcting table */ 3868 bool AffixMgr::parse_phonetable(const std::string& line, FileMgr* af) { 3869 if (phone) { 3870 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", 3871 af->getlinenum()); 3872 return false; 3873 } 3874 int num = -1; 3875 int i = 0; 3876 int np = 0; 3877 std::string::const_iterator iter = line.begin(); 3878 std::string::const_iterator start_piece = mystrsep(line, iter); 3879 while (start_piece != line.end()) { 3880 switch (i) { 3881 case 0: { 3882 np++; 3883 break; 3884 } 3885 case 1: { 3886 num = atoi(std::string(start_piece, iter).c_str()); 3887 if (num < 1) { 3888 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", 3889 af->getlinenum()); 3890 return false; 3891 } 3892 phone = new phonetable; 3893 phone->utf8 = (char)utf8; 3894 np++; 3895 break; 3896 } 3897 default: 3898 break; 3899 } 3900 ++i; 3901 start_piece = mystrsep(line, iter); 3902 } 3903 if (np != 2) { 3904 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", 3905 af->getlinenum()); 3906 return false; 3907 } 3908 3909 /* now parse the phone->num lines to read in the remainder of the table */ 3910 for (int j = 0; j < num; ++j) { 3911 std::string nl; 3912 if (!af->getline(nl)) 3913 return false; 3914 mychomp(nl); 3915 i = 0; 3916 const size_t old_size = phone->rules.size(); 3917 iter = nl.begin(); 3918 start_piece = mystrsep(nl, iter); 3919 while (start_piece != nl.end()) { 3920 { 3921 switch (i) { 3922 case 0: { 3923 if (nl.compare(start_piece - nl.begin(), 5, "PHONE", 5) != 0) { 3924 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", 3925 af->getlinenum()); 3926 return false; 3927 } 3928 break; 3929 } 3930 case 1: { 3931 phone->rules.push_back(std::string(start_piece, iter)); 3932 break; 3933 } 3934 case 2: { 3935 phone->rules.push_back(std::string(start_piece, iter)); 3936 mystrrep(phone->rules.back(), "_", ""); 3937 break; 3938 } 3939 default: 3940 break; 3941 } 3942 ++i; 3943 } 3944 start_piece = mystrsep(nl, iter); 3945 } 3946 if (phone->rules.size() != old_size + 2) { 3947 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", 3948 af->getlinenum()); 3949 phone->rules.clear(); 3950 return false; 3951 } 3952 } 3953 phone->rules.push_back(""); 3954 phone->rules.push_back(""); 3955 init_phonet_hash(*phone); 3956 return true; 3957 } 3958 3959 /* parse in the checkcompoundpattern table */ 3960 bool AffixMgr::parse_checkcpdtable(const std::string& line, FileMgr* af) { 3961 if (parsedcheckcpd) { 3962 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", 3963 af->getlinenum()); 3964 return false; 3965 } 3966 parsedcheckcpd = true; 3967 int numcheckcpd = -1; 3968 int i = 0; 3969 int np = 0; 3970 std::string::const_iterator iter = line.begin(); 3971 std::string::const_iterator start_piece = mystrsep(line, iter); 3972 while (start_piece != line.end()) { 3973 switch (i) { 3974 case 0: { 3975 np++; 3976 break; 3977 } 3978 case 1: { 3979 numcheckcpd = atoi(std::string(start_piece, iter).c_str()); 3980 if (numcheckcpd < 1) { 3981 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", 3982 af->getlinenum()); 3983 return false; 3984 } 3985 checkcpdtable.reserve(numcheckcpd); 3986 np++; 3987 break; 3988 } 3989 default: 3990 break; 3991 } 3992 ++i; 3993 start_piece = mystrsep(line, iter); 3994 } 3995 if (np != 2) { 3996 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", 3997 af->getlinenum()); 3998 return false; 3999 } 4000 4001 /* now parse the numcheckcpd lines to read in the remainder of the table */ 4002 for (int j = 0; j < numcheckcpd; ++j) { 4003 std::string nl; 4004 if (!af->getline(nl)) 4005 return false; 4006 mychomp(nl); 4007 i = 0; 4008 checkcpdtable.push_back(patentry()); 4009 iter = nl.begin(); 4010 start_piece = mystrsep(nl, iter); 4011 while (start_piece != nl.end()) { 4012 switch (i) { 4013 case 0: { 4014 if (nl.compare(start_piece - nl.begin(), 20, "CHECKCOMPOUNDPATTERN", 20) != 0) { 4015 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", 4016 af->getlinenum()); 4017 return false; 4018 } 4019 break; 4020 } 4021 case 1: { 4022 checkcpdtable.back().pattern.assign(start_piece, iter); 4023 size_t slash_pos = checkcpdtable.back().pattern.find('/'); 4024 if (slash_pos != std::string::npos) { 4025 std::string chunk(checkcpdtable.back().pattern, slash_pos + 1); 4026 checkcpdtable.back().pattern.resize(slash_pos); 4027 checkcpdtable.back().cond = pHMgr->decode_flag(chunk.c_str()); 4028 } 4029 break; 4030 } 4031 case 2: { 4032 checkcpdtable.back().pattern2.assign(start_piece, iter); 4033 size_t slash_pos = checkcpdtable.back().pattern2.find('/'); 4034 if (slash_pos != std::string::npos) { 4035 std::string chunk(checkcpdtable.back().pattern2, slash_pos + 1); 4036 checkcpdtable.back().pattern2.resize(slash_pos); 4037 checkcpdtable.back().cond2 = pHMgr->decode_flag(chunk.c_str()); 4038 } 4039 break; 4040 } 4041 case 3: { 4042 checkcpdtable.back().pattern3.assign(start_piece, iter); 4043 simplifiedcpd = 1; 4044 break; 4045 } 4046 default: 4047 break; 4048 } 4049 i++; 4050 start_piece = mystrsep(nl, iter); 4051 } 4052 } 4053 return true; 4054 } 4055 4056 /* parse in the compound rule table */ 4057 bool AffixMgr::parse_defcpdtable(const std::string& line, FileMgr* af) { 4058 if (parseddefcpd) { 4059 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", 4060 af->getlinenum()); 4061 return false; 4062 } 4063 parseddefcpd = true; 4064 int numdefcpd = -1; 4065 int i = 0; 4066 int np = 0; 4067 std::string::const_iterator iter = line.begin(); 4068 std::string::const_iterator start_piece = mystrsep(line, iter); 4069 while (start_piece != line.end()) { 4070 switch (i) { 4071 case 0: { 4072 np++; 4073 break; 4074 } 4075 case 1: { 4076 numdefcpd = atoi(std::string(start_piece, iter).c_str()); 4077 if (numdefcpd < 1) { 4078 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", 4079 af->getlinenum()); 4080 return false; 4081 } 4082 defcpdtable.reserve(numdefcpd); 4083 np++; 4084 break; 4085 } 4086 default: 4087 break; 4088 } 4089 ++i; 4090 start_piece = mystrsep(line, iter); 4091 } 4092 if (np != 2) { 4093 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", 4094 af->getlinenum()); 4095 return false; 4096 } 4097 4098 /* now parse the numdefcpd lines to read in the remainder of the table */ 4099 for (int j = 0; j < numdefcpd; ++j) { 4100 std::string nl; 4101 if (!af->getline(nl)) 4102 return false; 4103 mychomp(nl); 4104 i = 0; 4105 defcpdtable.push_back(flagentry()); 4106 iter = nl.begin(); 4107 start_piece = mystrsep(nl, iter); 4108 while (start_piece != nl.end()) { 4109 switch (i) { 4110 case 0: { 4111 if (nl.compare(start_piece - nl.begin(), 12, "COMPOUNDRULE", 12) != 0) { 4112 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", 4113 af->getlinenum()); 4114 numdefcpd = 0; 4115 return false; 4116 } 4117 break; 4118 } 4119 case 1: { // handle parenthesized flags 4120 if (std::find(start_piece, iter, '(') != iter) { 4121 for (std::string::const_iterator k = start_piece; k != iter; ++k) { 4122 std::string::const_iterator chb = k; 4123 std::string::const_iterator che = k + 1; 4124 if (*k == '(') { 4125 std::string::const_iterator parpos = std::find(k, iter, ')'); 4126 if (parpos != iter) { 4127 chb = k + 1; 4128 che = parpos; 4129 k = parpos; 4130 } 4131 } 4132 4133 if (*chb == '*' || *chb == '?') { 4134 defcpdtable.back().push_back((FLAG)*chb); 4135 } else { 4136 pHMgr->decode_flags(defcpdtable.back(), std::string(chb, che), af); 4137 } 4138 } 4139 } else { 4140 pHMgr->decode_flags(defcpdtable.back(), std::string(start_piece, iter), af); 4141 } 4142 break; 4143 } 4144 default: 4145 break; 4146 } 4147 ++i; 4148 start_piece = mystrsep(nl, iter); 4149 } 4150 if (defcpdtable.back().empty()) { 4151 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", 4152 af->getlinenum()); 4153 return false; 4154 } 4155 } 4156 return true; 4157 } 4158 4159 /* parse in the character map table */ 4160 bool AffixMgr::parse_maptable(const std::string& line, FileMgr* af) { 4161 if (parsedmaptable) { 4162 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", 4163 af->getlinenum()); 4164 return false; 4165 } 4166 parsedmaptable = true; 4167 int nummap = -1; 4168 int i = 0; 4169 int np = 0; 4170 std::string::const_iterator iter = line.begin(); 4171 std::string::const_iterator start_piece = mystrsep(line, iter); 4172 while (start_piece != line.end()) { 4173 switch (i) { 4174 case 0: { 4175 np++; 4176 break; 4177 } 4178 case 1: { 4179 nummap = atoi(std::string(start_piece, iter).c_str()); 4180 if (nummap < 1) { 4181 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", 4182 af->getlinenum()); 4183 return false; 4184 } 4185 maptable.reserve(nummap); 4186 np++; 4187 break; 4188 } 4189 default: 4190 break; 4191 } 4192 ++i; 4193 start_piece = mystrsep(line, iter); 4194 } 4195 if (np != 2) { 4196 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", 4197 af->getlinenum()); 4198 return false; 4199 } 4200 4201 /* now parse the nummap lines to read in the remainder of the table */ 4202 for (int j = 0; j < nummap; ++j) { 4203 std::string nl; 4204 if (!af->getline(nl)) 4205 return false; 4206 mychomp(nl); 4207 i = 0; 4208 maptable.push_back(mapentry()); 4209 iter = nl.begin(); 4210 start_piece = mystrsep(nl, iter); 4211 while (start_piece != nl.end()) { 4212 switch (i) { 4213 case 0: { 4214 if (nl.compare(start_piece - nl.begin(), 3, "MAP", 3) != 0) { 4215 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", 4216 af->getlinenum()); 4217 nummap = 0; 4218 return false; 4219 } 4220 break; 4221 } 4222 case 1: { 4223 for (std::string::const_iterator k = start_piece; k != iter; ++k) { 4224 std::string::const_iterator chb = k; 4225 std::string::const_iterator che = k + 1; 4226 if (*k == '(') { 4227 std::string::const_iterator parpos = std::find(k, iter, ')'); 4228 if (parpos != iter) { 4229 chb = k + 1; 4230 che = parpos; 4231 k = parpos; 4232 } 4233 } else { 4234 if (utf8 && (*k & 0xc0) == 0xc0) { 4235 ++k; 4236 while (k != iter && (*k & 0xc0) == 0x80) 4237 ++k; 4238 che = k; 4239 --k; 4240 } 4241 } 4242 maptable.back().push_back(std::string(chb, che)); 4243 } 4244 break; 4245 } 4246 default: 4247 break; 4248 } 4249 ++i; 4250 start_piece = mystrsep(nl, iter); 4251 } 4252 if (maptable.back().empty()) { 4253 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", 4254 af->getlinenum()); 4255 return false; 4256 } 4257 } 4258 return true; 4259 } 4260 4261 /* parse in the word breakpoint table */ 4262 bool AffixMgr::parse_breaktable(const std::string& line, FileMgr* af) { 4263 if (parsedbreaktable) { 4264 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", 4265 af->getlinenum()); 4266 return false; 4267 } 4268 parsedbreaktable = true; 4269 int numbreak = -1; 4270 int i = 0; 4271 int np = 0; 4272 std::string::const_iterator iter = line.begin(); 4273 std::string::const_iterator start_piece = mystrsep(line, iter); 4274 while (start_piece != line.end()) { 4275 switch (i) { 4276 case 0: { 4277 np++; 4278 break; 4279 } 4280 case 1: { 4281 numbreak = atoi(std::string(start_piece, iter).c_str()); 4282 if (numbreak < 0) { 4283 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", 4284 af->getlinenum()); 4285 return false; 4286 } 4287 if (numbreak == 0) 4288 return true; 4289 breaktable.reserve(numbreak); 4290 np++; 4291 break; 4292 } 4293 default: 4294 break; 4295 } 4296 ++i; 4297 start_piece = mystrsep(line, iter); 4298 } 4299 if (np != 2) { 4300 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", 4301 af->getlinenum()); 4302 return false; 4303 } 4304 4305 /* now parse the numbreak lines to read in the remainder of the table */ 4306 for (int j = 0; j < numbreak; ++j) { 4307 std::string nl; 4308 if (!af->getline(nl)) 4309 return false; 4310 mychomp(nl); 4311 i = 0; 4312 iter = nl.begin(); 4313 start_piece = mystrsep(nl, iter); 4314 while (start_piece != nl.end()) { 4315 switch (i) { 4316 case 0: { 4317 if (nl.compare(start_piece - nl.begin(), 5, "BREAK", 5) != 0) { 4318 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", 4319 af->getlinenum()); 4320 numbreak = 0; 4321 return false; 4322 } 4323 break; 4324 } 4325 case 1: { 4326 breaktable.push_back(std::string(start_piece, iter)); 4327 break; 4328 } 4329 default: 4330 break; 4331 } 4332 ++i; 4333 start_piece = mystrsep(nl, iter); 4334 } 4335 } 4336 4337 if (breaktable.size() != static_cast<size_t>(numbreak)) { 4338 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", 4339 af->getlinenum()); 4340 return false; 4341 } 4342 4343 return true; 4344 } 4345 4346 void AffixMgr::reverse_condition(std::string& piece) { 4347 if (piece.empty()) 4348 return; 4349 4350 int neg = 0; 4351 for (std::string::reverse_iterator k = piece.rbegin(); k != piece.rend(); ++k) { 4352 switch (*k) { 4353 case '[': { 4354 if (neg) 4355 *(k - 1) = '['; 4356 else 4357 *k = ']'; 4358 break; 4359 } 4360 case ']': { 4361 *k = '['; 4362 if (neg) 4363 *(k - 1) = '^'; 4364 neg = 0; 4365 break; 4366 } 4367 case '^': { 4368 if (*(k - 1) == ']') 4369 neg = 1; 4370 else if (neg) 4371 *(k - 1) = *k; 4372 break; 4373 } 4374 default: { 4375 if (neg) 4376 *(k - 1) = *k; 4377 } 4378 } 4379 } 4380 } 4381 4382 class entries_container { 4383 std::vector<AffEntry*> entries; 4384 AffixMgr* m_mgr; 4385 char m_at; 4386 public: 4387 entries_container(char at, AffixMgr* mgr) 4388 : m_mgr(mgr) 4389 , m_at(at) { 4390 } 4391 void release() { 4392 entries.clear(); 4393 } 4394 void initialize(int numents, 4395 char opts, unsigned short aflag) { 4396 entries.reserve(numents); 4397 4398 if (m_at == 'P') { 4399 entries.push_back(new PfxEntry(m_mgr)); 4400 } else { 4401 entries.push_back(new SfxEntry(m_mgr)); 4402 } 4403 4404 entries.back()->opts = opts; 4405 entries.back()->aflag = aflag; 4406 } 4407 4408 AffEntry* add_entry(char opts) { 4409 if (m_at == 'P') { 4410 entries.push_back(new PfxEntry(m_mgr)); 4411 } else { 4412 entries.push_back(new SfxEntry(m_mgr)); 4413 } 4414 AffEntry* ret = entries.back(); 4415 ret->opts = entries[0]->opts & opts; 4416 return ret; 4417 } 4418 4419 AffEntry* first_entry() { 4420 return entries.empty() ? NULL : entries[0]; 4421 } 4422 4423 ~entries_container() { 4424 for (size_t i = 0; i < entries.size(); ++i) { 4425 delete entries[i]; 4426 } 4427 } 4428 4429 std::vector<AffEntry*>::iterator begin() { return entries.begin(); } 4430 std::vector<AffEntry*>::iterator end() { return entries.end(); } 4431 }; 4432 4433 bool AffixMgr::parse_affix(const std::string& line, 4434 const char at, 4435 FileMgr* af, 4436 char* dupflags) { 4437 int numents = 0; // number of AffEntry structures to parse 4438 4439 unsigned short aflag = 0; // affix char identifier 4440 4441 char ff = 0; 4442 entries_container affentries(at, this); 4443 4444 int i = 0; 4445 4446 // checking lines with bad syntax 4447 #ifdef DEBUG 4448 int basefieldnum = 0; 4449 #endif 4450 4451 // split affix header line into pieces 4452 4453 int np = 0; 4454 std::string::const_iterator iter = line.begin(); 4455 std::string::const_iterator start_piece = mystrsep(line, iter); 4456 while (start_piece != line.end()) { 4457 switch (i) { 4458 // piece 1 - is type of affix 4459 case 0: { 4460 np++; 4461 break; 4462 } 4463 4464 // piece 2 - is affix char 4465 case 1: { 4466 np++; 4467 aflag = pHMgr->decode_flag(std::string(start_piece, iter).c_str()); 4468 if (((at == 'S') && (dupflags[aflag] & dupSFX)) || 4469 ((at == 'P') && (dupflags[aflag] & dupPFX))) { 4470 HUNSPELL_WARNING( 4471 stderr, 4472 "error: line %d: multiple definitions of an affix flag\n", 4473 af->getlinenum()); 4474 } 4475 dupflags[aflag] += (char)((at == 'S') ? dupSFX : dupPFX); 4476 break; 4477 } 4478 // piece 3 - is cross product indicator 4479 case 2: { 4480 np++; 4481 if (*start_piece == 'Y') 4482 ff = aeXPRODUCT; 4483 break; 4484 } 4485 4486 // piece 4 - is number of affentries 4487 case 3: { 4488 np++; 4489 numents = atoi(std::string(start_piece, iter).c_str()); 4490 if ((numents <= 0) || ((std::numeric_limits<size_t>::max() / 4491 sizeof(AffEntry)) < static_cast<size_t>(numents))) { 4492 char* err = pHMgr->encode_flag(aflag); 4493 if (err) { 4494 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", 4495 af->getlinenum()); 4496 free(err); 4497 } 4498 return false; 4499 } 4500 4501 char opts = ff; 4502 if (utf8) 4503 opts |= aeUTF8; 4504 if (pHMgr->is_aliasf()) 4505 opts |= aeALIASF; 4506 if (pHMgr->is_aliasm()) 4507 opts |= aeALIASM; 4508 affentries.initialize(numents, opts, aflag); 4509 } 4510 4511 default: 4512 break; 4513 } 4514 ++i; 4515 start_piece = mystrsep(line, iter); 4516 } 4517 // check to make sure we parsed enough pieces 4518 if (np != 4) { 4519 char* err = pHMgr->encode_flag(aflag); 4520 if (err) { 4521 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", 4522 af->getlinenum()); 4523 free(err); 4524 } 4525 return false; 4526 } 4527 4528 // now parse numents affentries for this affix 4529 AffEntry* entry = affentries.first_entry(); 4530 for (int ent = 0; ent < numents; ++ent) { 4531 std::string nl; 4532 if (!af->getline(nl)) 4533 return false; 4534 mychomp(nl); 4535 4536 iter = nl.begin(); 4537 i = 0; 4538 np = 0; 4539 4540 // split line into pieces 4541 start_piece = mystrsep(nl, iter); 4542 while (start_piece != nl.end()) { 4543 switch (i) { 4544 // piece 1 - is type 4545 case 0: { 4546 np++; 4547 if (ent != 0) 4548 entry = affentries.add_entry((char)(aeXPRODUCT + aeUTF8 + aeALIASF + aeALIASM)); 4549 break; 4550 } 4551 4552 // piece 2 - is affix char 4553 case 1: { 4554 np++; 4555 std::string chunk(start_piece, iter); 4556 if (pHMgr->decode_flag(chunk.c_str()) != aflag) { 4557 char* err = pHMgr->encode_flag(aflag); 4558 if (err) { 4559 HUNSPELL_WARNING(stderr, 4560 "error: line %d: affix %s is corrupt\n", 4561 af->getlinenum(), err); 4562 free(err); 4563 } 4564 return false; 4565 } 4566 4567 if (ent != 0) { 4568 AffEntry* start_entry = affentries.first_entry(); 4569 entry->aflag = start_entry->aflag; 4570 } 4571 break; 4572 } 4573 4574 // piece 3 - is string to strip or 0 for null 4575 case 2: { 4576 np++; 4577 entry->strip = std::string(start_piece, iter); 4578 if (complexprefixes) { 4579 if (utf8) 4580 reverseword_utf(entry->strip); 4581 else 4582 reverseword(entry->strip); 4583 } 4584 if (entry->strip.compare("0") == 0) { 4585 entry->strip.clear(); 4586 } 4587 break; 4588 } 4589 4590 // piece 4 - is affix string or 0 for null 4591 case 3: { 4592 entry->morphcode = NULL; 4593 entry->contclass = NULL; 4594 entry->contclasslen = 0; 4595 np++; 4596 std::string::const_iterator dash = std::find(start_piece, iter, '/'); 4597 if (dash != iter) { 4598 entry->appnd = std::string(start_piece, dash); 4599 std::string dash_str(dash + 1, iter); 4600 4601 if (!ignorechars.empty() && !has_no_ignored_chars(entry->appnd, ignorechars)) { 4602 if (utf8) { 4603 remove_ignored_chars_utf(entry->appnd, ignorechars_utf16); 4604 } else { 4605 remove_ignored_chars(entry->appnd, ignorechars); 4606 } 4607 } 4608 4609 if (complexprefixes) { 4610 if (utf8) 4611 reverseword_utf(entry->appnd); 4612 else 4613 reverseword(entry->appnd); 4614 } 4615 4616 if (pHMgr->is_aliasf()) { 4617 int index = atoi(dash_str.c_str()); 4618 entry->contclasslen = (unsigned short)pHMgr->get_aliasf( 4619 index, &(entry->contclass), af); 4620 if (!entry->contclasslen) 4621 HUNSPELL_WARNING(stderr, 4622 "error: bad affix flag alias: \"%s\"\n", 4623 dash_str.c_str()); 4624 } else { 4625 entry->contclasslen = (unsigned short)pHMgr->decode_flags( 4626 &(entry->contclass), dash_str.c_str(), af); 4627 std::sort(entry->contclass, entry->contclass + entry->contclasslen); 4628 } 4629 4630 havecontclass = 1; 4631 for (unsigned short _i = 0; _i < entry->contclasslen; _i++) { 4632 contclasses[(entry->contclass)[_i]] = 1; 4633 } 4634 } else { 4635 entry->appnd = std::string(start_piece, iter); 4636 4637 if (!ignorechars.empty() && !has_no_ignored_chars(entry->appnd, ignorechars)) { 4638 if (utf8) { 4639 remove_ignored_chars_utf(entry->appnd, ignorechars_utf16); 4640 } else { 4641 remove_ignored_chars(entry->appnd, ignorechars); 4642 } 4643 } 4644 4645 if (complexprefixes) { 4646 if (utf8) 4647 reverseword_utf(entry->appnd); 4648 else 4649 reverseword(entry->appnd); 4650 } 4651 } 4652 4653 if (entry->appnd.compare("0") == 0) { 4654 entry->appnd.clear(); 4655 } 4656 break; 4657 } 4658 4659 // piece 5 - is the conditions descriptions 4660 case 4: { 4661 std::string chunk(start_piece, iter); 4662 np++; 4663 if (complexprefixes) { 4664 if (utf8) 4665 reverseword_utf(chunk); 4666 else 4667 reverseword(chunk); 4668 reverse_condition(chunk); 4669 } 4670 if (!entry->strip.empty() && chunk != "." && 4671 redundant_condition(at, entry->strip.c_str(), entry->strip.size(), chunk.c_str(), 4672 af->getlinenum())) 4673 chunk = "."; 4674 if (at == 'S') { 4675 reverseword(chunk); 4676 reverse_condition(chunk); 4677 } 4678 if (encodeit(*entry, chunk.c_str())) 4679 return false; 4680 break; 4681 } 4682 4683 case 5: { 4684 std::string chunk(start_piece, iter); 4685 np++; 4686 if (pHMgr->is_aliasm()) { 4687 int index = atoi(chunk.c_str()); 4688 entry->morphcode = pHMgr->get_aliasm(index); 4689 } else { 4690 if (complexprefixes) { // XXX - fix me for morph. gen. 4691 if (utf8) 4692 reverseword_utf(chunk); 4693 else 4694 reverseword(chunk); 4695 } 4696 // add the remaining of the line 4697 std::string::const_iterator end = nl.end(); 4698 if (iter != end) { 4699 chunk.append(iter, end); 4700 } 4701 entry->morphcode = mystrdup(chunk.c_str()); 4702 if (!entry->morphcode) 4703 return false; 4704 } 4705 break; 4706 } 4707 default: 4708 break; 4709 } 4710 i++; 4711 start_piece = mystrsep(nl, iter); 4712 } 4713 // check to make sure we parsed enough pieces 4714 if (np < 4) { 4715 char* err = pHMgr->encode_flag(aflag); 4716 if (err) { 4717 HUNSPELL_WARNING(stderr, "error: line %d: affix %s is corrupt\n", 4718 af->getlinenum(), err); 4719 free(err); 4720 } 4721 return false; 4722 } 4723 4724 #ifdef DEBUG 4725 // detect unnecessary fields, excepting comments 4726 if (basefieldnum) { 4727 int fieldnum = 4728 !(entry->morphcode) ? 5 : ((*(entry->morphcode) == '#') ? 5 : 6); 4729 if (fieldnum != basefieldnum) 4730 HUNSPELL_WARNING(stderr, "warning: line %d: bad field number\n", 4731 af->getlinenum()); 4732 } else { 4733 basefieldnum = 4734 !(entry->morphcode) ? 5 : ((*(entry->morphcode) == '#') ? 5 : 6); 4735 } 4736 #endif 4737 } 4738 4739 // now create SfxEntry or PfxEntry objects and use links to 4740 // build an ordered (sorted by affix string) list 4741 std::vector<AffEntry*>::iterator start = affentries.begin(); 4742 std::vector<AffEntry*>::iterator end = affentries.end(); 4743 for (std::vector<AffEntry*>::iterator affentry = start; affentry != end; ++affentry) { 4744 if (at == 'P') { 4745 build_pfxtree(static_cast<PfxEntry*>(*affentry)); 4746 } else { 4747 build_sfxtree(static_cast<SfxEntry*>(*affentry)); 4748 } 4749 } 4750 4751 //contents belong to AffixMgr now 4752 affentries.release(); 4753 4754 return true; 4755 } 4756 4757 int AffixMgr::redundant_condition(char ft, 4758 const char* strip, 4759 int stripl, 4760 const char* cond, 4761 int linenum) { 4762 int condl = strlen(cond); 4763 int i; 4764 int j; 4765 int neg; 4766 int in; 4767 if (ft == 'P') { // prefix 4768 if (strncmp(strip, cond, condl) == 0) 4769 return 1; 4770 if (utf8) { 4771 } else { 4772 for (i = 0, j = 0; (i < stripl) && (j < condl); i++, j++) { 4773 if (cond[j] != '[') { 4774 if (cond[j] != strip[i]) { 4775 HUNSPELL_WARNING(stderr, 4776 "warning: line %d: incompatible stripping " 4777 "characters and condition\n", 4778 linenum); 4779 return 0; 4780 } 4781 } else { 4782 neg = (cond[j + 1] == '^') ? 1 : 0; 4783 in = 0; 4784 do { 4785 j++; 4786 if (strip[i] == cond[j]) 4787 in = 1; 4788 } while ((j < (condl - 1)) && (cond[j] != ']')); 4789 if (j == (condl - 1) && (cond[j] != ']')) { 4790 HUNSPELL_WARNING(stderr, 4791 "error: line %d: missing ] in condition:\n%s\n", 4792 linenum, cond); 4793 return 0; 4794 } 4795 if ((!neg && !in) || (neg && in)) { 4796 HUNSPELL_WARNING(stderr, 4797 "warning: line %d: incompatible stripping " 4798 "characters and condition\n", 4799 linenum); 4800 return 0; 4801 } 4802 } 4803 } 4804 if (j >= condl) 4805 return 1; 4806 } 4807 } else { // suffix 4808 if ((stripl >= condl) && strcmp(strip + stripl - condl, cond) == 0) 4809 return 1; 4810 if (utf8) { 4811 } else { 4812 for (i = stripl - 1, j = condl - 1; (i >= 0) && (j >= 0); i--, j--) { 4813 if (cond[j] != ']') { 4814 if (cond[j] != strip[i]) { 4815 HUNSPELL_WARNING(stderr, 4816 "warning: line %d: incompatible stripping " 4817 "characters and condition\n", 4818 linenum); 4819 return 0; 4820 } 4821 } else { 4822 in = 0; 4823 do { 4824 j--; 4825 if (strip[i] == cond[j]) 4826 in = 1; 4827 } while ((j > 0) && (cond[j] != '[')); 4828 if ((j == 0) && (cond[j] != '[')) { 4829 HUNSPELL_WARNING(stderr, 4830 "error: line: %d: missing ] in condition:\n%s\n", 4831 linenum, cond); 4832 return 0; 4833 } 4834 neg = (cond[j + 1] == '^') ? 1 : 0; 4835 if ((!neg && !in) || (neg && in)) { 4836 HUNSPELL_WARNING(stderr, 4837 "warning: line %d: incompatible stripping " 4838 "characters and condition\n", 4839 linenum); 4840 return 0; 4841 } 4842 } 4843 } 4844 if (j < 0) 4845 return 1; 4846 } 4847 } 4848 return 0; 4849 } 4850 4851 std::vector<std::string> AffixMgr::get_suffix_words(short unsigned* suff, 4852 int len, 4853 const char* root_word) { 4854 std::vector<std::string> slst; 4855 short unsigned* start_ptr = suff; 4856 for (int j = 0; j < SETSIZE; j++) { 4857 SfxEntry* ptr = sStart[j]; 4858 while (ptr) { 4859 suff = start_ptr; 4860 for (int i = 0; i < len; i++) { 4861 if ((*suff) == ptr->getFlag()) { 4862 std::string nw(root_word); 4863 nw.append(ptr->getAffix()); 4864 hentry* ht = ptr->checkword(nw.c_str(), nw.size(), 0, NULL, 0, 0, 0); 4865 if (ht) { 4866 slst.push_back(nw); 4867 } 4868 } 4869 suff++; 4870 } 4871 ptr = ptr->getNext(); 4872 } 4873 } 4874 return slst; 4875 }