affentry.cxx (34175B)
1 /* ***** BEGIN LICENSE BLOCK ***** 2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 * 4 * Copyright (C) 2002-2022 Németh László 5 * 6 * The contents of this file are subject to the Mozilla Public License Version 7 * 1.1 (the "License"); you may not use this file except in compliance with 8 * the License. You may obtain a copy of the License at 9 * http://www.mozilla.org/MPL/ 10 * 11 * Software distributed under the License is distributed on an "AS IS" basis, 12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 13 * for the specific language governing rights and limitations under the 14 * License. 15 * 16 * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. 17 * 18 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, 19 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, 20 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, 21 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, 22 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen 23 * 24 * Alternatively, the contents of this file may be used under the terms of 25 * either the GNU General Public License Version 2 or later (the "GPL"), or 26 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 * in which case the provisions of the GPL or the LGPL are applicable instead 28 * of those above. If you wish to allow use of your version of this file only 29 * under the terms of either the GPL or the LGPL, and not to allow others to 30 * use your version of this file under the terms of the MPL, indicate your 31 * decision by deleting the provisions above and replace them with the notice 32 * and other provisions required by the GPL or the LGPL. If you do not delete 33 * the provisions above, a recipient may use your version of this file under 34 * the terms of any one of the MPL, the GPL or the LGPL. 35 * 36 * ***** END LICENSE BLOCK ***** */ 37 /* 38 * Copyright 2002 Kevin B. 
Hendricks, Stratford, Ontario, Canada 39 * And Contributors. All rights reserved. 40 * 41 * Redistribution and use in source and binary forms, with or without 42 * modification, are permitted provided that the following conditions 43 * are met: 44 * 45 * 1. Redistributions of source code must retain the above copyright 46 * notice, this list of conditions and the following disclaimer. 47 * 48 * 2. Redistributions in binary form must reproduce the above copyright 49 * notice, this list of conditions and the following disclaimer in the 50 * documentation and/or other materials provided with the distribution. 51 * 52 * 3. All modifications to the source code must be clearly marked as 53 * such. Binary redistributions based on modified source code 54 * must be clearly marked as modified versions in the documentation 55 * and/or other materials provided with the distribution. 56 * 57 * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS 58 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 59 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 60 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL 61 * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 62 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 63 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 64 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 65 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 66 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 67 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 68 * SUCH DAMAGE. 
69 */ 70 71 #include <stdlib.h> 72 #include <string.h> 73 #include <stdio.h> 74 #include <ctype.h> 75 76 #include "affentry.hxx" 77 #include "csutil.hxx" 78 79 AffEntry::~AffEntry() { 80 if (opts & aeLONGCOND) 81 free(c.l.conds2); 82 if (morphcode && !(opts & aeALIASM)) 83 free(morphcode); 84 if (contclass && !(opts & aeALIASF)) 85 free(contclass); 86 } 87 88 PfxEntry::PfxEntry(AffixMgr* pmgr) 89 // register affix manager 90 : pmyMgr(pmgr), 91 next(NULL), 92 nexteq(NULL), 93 nextne(NULL), 94 flgnxt(NULL) { 95 } 96 97 // add prefix to this word assuming conditions hold 98 std::string PfxEntry::add(const char* word, size_t len) { 99 std::string result; 100 if ((len > strip.size() || (len == 0 && pmyMgr->get_fullstrip())) && 101 (len >= numconds) && test_condition(word) && 102 (!strip.size() || (strncmp(word, strip.c_str(), strip.size()) == 0))) { 103 /* we have a match so add prefix */ 104 result.assign(appnd); 105 result.append(word + strip.size()); 106 } 107 return result; 108 } 109 110 inline char* PfxEntry::nextchar(char* p) { 111 if (p) { 112 p++; 113 if (opts & aeLONGCOND) { 114 // jump to the 2nd part of the condition 115 if (p == c.conds + MAXCONDLEN_1) 116 return c.l.conds2; 117 // end of the MAXCONDLEN length condition 118 } else if (p == c.conds + MAXCONDLEN) 119 return NULL; 120 return *p ? 
p : NULL; 121 } 122 return NULL; 123 } 124 125 inline int PfxEntry::test_condition(const char* st) { 126 const char* pos = NULL; // group with pos input position 127 bool neg = false; // complementer 128 bool ingroup = false; // character in the group 129 if (numconds == 0) 130 return 1; 131 char* p = c.conds; 132 while (1) { 133 switch (*p) { 134 case '\0': 135 return 1; 136 case '[': { 137 neg = false; 138 ingroup = false; 139 p = nextchar(p); 140 pos = st; 141 break; 142 } 143 case '^': { 144 p = nextchar(p); 145 neg = true; 146 break; 147 } 148 case ']': { 149 if (bool(neg) == bool(ingroup)) 150 return 0; 151 pos = NULL; 152 p = nextchar(p); 153 // skip the next character 154 if (!ingroup && *st) 155 for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++) 156 ; 157 if (*st == '\0' && p) 158 return 0; // word <= condition 159 break; 160 } 161 case '.': 162 if (!pos) { // dots are not metacharacters in groups: [.] 163 p = nextchar(p); 164 // skip the next character 165 for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++) 166 ; 167 if (*st == '\0' && p) 168 return 0; // word <= condition 169 break; 170 } 171 /* FALLTHROUGH */ 172 default: { 173 if (*st == *p) { 174 st++; 175 p = nextchar(p); 176 if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte 177 while (p && (*p & 0xc0) == 0x80) { // character 178 if (*p != *st) { 179 if (!pos) 180 return 0; 181 st = pos; 182 break; 183 } 184 p = nextchar(p); 185 st++; 186 } 187 if (pos && st != pos) { 188 ingroup = true; 189 while (p && *p != ']' && ((p = nextchar(p)) != NULL)) { 190 } 191 } 192 } else if (pos) { 193 ingroup = true; 194 while (p && *p != ']' && ((p = nextchar(p)) != NULL)) { 195 } 196 } 197 } else if (pos) { // group 198 p = nextchar(p); 199 } else 200 return 0; 201 } 202 } 203 if (!p) 204 return 1; 205 } 206 } 207 208 // check if this prefix entry matches 209 struct hentry* PfxEntry::checkword(const char* word, 210 int len, 211 char in_compound, 212 const FLAG needflag) { 213 struct hentry* he; 
// hash entry of root word or NULL 214 215 // on entry prefix is 0 length or already matches the beginning of the word. 216 // So if the remaining root word has positive length 217 // and if there are enough chars in root word and added back strip chars 218 // to meet the number of characters conditions, then test it 219 220 int tmpl = len - appnd.size(); // length of tmpword 221 222 if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) { 223 // generate new root word by removing prefix and adding 224 // back any characters that would have been stripped 225 226 std::string tmpword(strip); 227 tmpword.append(word + appnd.size(), tmpl); 228 229 // now make sure all of the conditions on characters 230 // are met. Please see the appendix at the end of 231 // this file for more info on exactly what is being 232 // tested 233 234 // if all conditions are met then check if resulting 235 // root word in the dictionary 236 237 if (test_condition(tmpword.c_str())) { 238 tmpl += strip.size(); 239 if ((he = pmyMgr->lookup(tmpword.c_str())) != NULL) { 240 do { 241 if (TESTAFF(he->astr, aflag, he->alen) && 242 // forbid single prefixes with needaffix flag 243 !TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) && 244 // needflag 245 ((!needflag) || TESTAFF(he->astr, needflag, he->alen) || 246 (contclass && TESTAFF(contclass, needflag, contclasslen)))) 247 return he; 248 he = he->next_homonym; // check homonyms 249 } while (he); 250 } 251 252 // prefix matched but no root word was found 253 // if aeXPRODUCT is allowed, try again but now 254 // ross checked combined with a suffix 255 256 // if ((opts & aeXPRODUCT) && in_compound) { 257 if ((opts & aeXPRODUCT)) { 258 he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, aeXPRODUCT, this, 259 FLAG_NULL, needflag, in_compound); 260 if (he) 261 return he; 262 } 263 } 264 } 265 return NULL; 266 } 267 268 // check if this prefix entry matches 269 struct hentry* PfxEntry::check_twosfx(const char* word, 270 int len, 271 char 
in_compound, 272 const FLAG needflag) { 273 // on entry prefix is 0 length or already matches the beginning of the word. 274 // So if the remaining root word has positive length 275 // and if there are enough chars in root word and added back strip chars 276 // to meet the number of characters conditions, then test it 277 278 int tmpl = len - appnd.size(); // length of tmpword 279 280 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && 281 (tmpl + strip.size() >= numconds)) { 282 // generate new root word by removing prefix and adding 283 // back any characters that would have been stripped 284 285 std::string tmpword(strip); 286 tmpword.append(word + appnd.size()); 287 288 // now make sure all of the conditions on characters 289 // are met. Please see the appendix at the end of 290 // this file for more info on exactly what is being 291 // tested 292 293 // if all conditions are met then check if resulting 294 // root word in the dictionary 295 296 if (test_condition(tmpword.c_str())) { 297 tmpl += strip.size(); 298 299 // prefix matched but no root word was found 300 // if aeXPRODUCT is allowed, try again but now 301 // cross checked combined with a suffix 302 303 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { 304 // hash entry of root word or NULL 305 struct hentry* he = pmyMgr->suffix_check_twosfx(tmpword.c_str(), tmpl, aeXPRODUCT, this, 306 needflag); 307 if (he) 308 return he; 309 } 310 } 311 } 312 return NULL; 313 } 314 315 // check if this prefix entry matches 316 std::string PfxEntry::check_twosfx_morph(const char* word, 317 int len, 318 char in_compound, 319 const FLAG needflag) { 320 std::string result; 321 // on entry prefix is 0 length or already matches the beginning of the word. 
322 // So if the remaining root word has positive length 323 // and if there are enough chars in root word and added back strip chars 324 // to meet the number of characters conditions, then test it 325 int tmpl = len - appnd.size(); // length of tmpword 326 327 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && 328 (tmpl + strip.size() >= numconds)) { 329 // generate new root word by removing prefix and adding 330 // back any characters that would have been stripped 331 332 std::string tmpword(strip); 333 tmpword.append(word + appnd.size()); 334 335 // now make sure all of the conditions on characters 336 // are met. Please see the appendix at the end of 337 // this file for more info on exactly what is being 338 // tested 339 340 // if all conditions are met then check if resulting 341 // root word in the dictionary 342 343 if (test_condition(tmpword.c_str())) { 344 tmpl += strip.size(); 345 346 // prefix matched but no root word was found 347 // if aeXPRODUCT is allowed, try again but now 348 // ross checked combined with a suffix 349 350 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { 351 result = pmyMgr->suffix_check_twosfx_morph(tmpword.c_str(), tmpl, 352 aeXPRODUCT, 353 this, needflag); 354 } 355 } 356 } 357 return result; 358 } 359 360 // check if this prefix entry matches 361 std::string PfxEntry::check_morph(const char* word, 362 int len, 363 char in_compound, 364 const FLAG needflag) { 365 std::string result; 366 367 // on entry prefix is 0 length or already matches the beginning of the word. 
368 // So if the remaining root word has positive length 369 // and if there are enough chars in root word and added back strip chars 370 // to meet the number of characters conditions, then test it 371 372 int tmpl = len - appnd.size(); // length of tmpword 373 374 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && 375 (tmpl + strip.size() >= numconds)) { 376 // generate new root word by removing prefix and adding 377 // back any characters that would have been stripped 378 379 std::string tmpword(strip); 380 tmpword.append(word + appnd.size()); 381 382 // now make sure all of the conditions on characters 383 // are met. Please see the appendix at the end of 384 // this file for more info on exactly what is being 385 // tested 386 387 // if all conditions are met then check if resulting 388 // root word in the dictionary 389 390 if (test_condition(tmpword.c_str())) { 391 tmpl += strip.size(); 392 struct hentry* he; // hash entry of root word or NULL 393 if ((he = pmyMgr->lookup(tmpword.c_str())) != NULL) { 394 do { 395 if (TESTAFF(he->astr, aflag, he->alen) && 396 // forbid single prefixes with needaffix flag 397 !TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) && 398 // needflag 399 ((!needflag) || TESTAFF(he->astr, needflag, he->alen) || 400 (contclass && TESTAFF(contclass, needflag, contclasslen)))) { 401 if (morphcode) { 402 result.push_back(MSEP_FLD); 403 result.append(morphcode); 404 } else 405 result.append(getKey()); 406 if (!HENTRY_FIND(he, MORPH_STEM)) { 407 result.push_back(MSEP_FLD); 408 result.append(MORPH_STEM); 409 result.append(HENTRY_WORD(he)); 410 } 411 // store the pointer of the hash entry 412 if (HENTRY_DATA(he)) { 413 result.push_back(MSEP_FLD); 414 result.append(HENTRY_DATA2(he)); 415 } else { 416 // return with debug information 417 char* flag = pmyMgr->encode_flag(getFlag()); 418 result.push_back(MSEP_FLD); 419 result.append(MORPH_FLAG); 420 result.append(flag); 421 free(flag); 422 } 423 result.push_back(MSEP_REC); 424 } 
425 he = he->next_homonym; 426 } while (he); 427 } 428 429 // prefix matched but no root word was found 430 // if aeXPRODUCT is allowed, try again but now 431 // ross checked combined with a suffix 432 433 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { 434 std::string st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, aeXPRODUCT, this, 435 FLAG_NULL, needflag); 436 if (!st.empty()) { 437 result.append(st); 438 } 439 } 440 } 441 } 442 443 return result; 444 } 445 446 SfxEntry::SfxEntry(AffixMgr* pmgr) 447 : pmyMgr(pmgr) // register affix manager 448 , 449 next(NULL), 450 nexteq(NULL), 451 nextne(NULL), 452 flgnxt(NULL), 453 l_morph(NULL), 454 r_morph(NULL), 455 eq_morph(NULL) { 456 } 457 458 // add suffix to this word assuming conditions hold 459 std::string SfxEntry::add(const char* word, size_t len) { 460 std::string result; 461 /* make sure all conditions match */ 462 if ((len > strip.size() || (len == 0 && pmyMgr->get_fullstrip())) && 463 (len >= numconds) && test_condition(word + len, word) && 464 (!strip.size() || 465 (strcmp(word + len - strip.size(), strip.c_str()) == 0))) { 466 result.assign(word); 467 /* we have a match so add suffix */ 468 result.replace(len - strip.size(), std::string::npos, appnd); 469 } 470 return result; 471 } 472 473 inline char* SfxEntry::nextchar(char* p) { 474 if (p) { 475 p++; 476 if (opts & aeLONGCOND) { 477 // jump to the 2nd part of the condition 478 if (p == c.l.conds1 + MAXCONDLEN_1) 479 return c.l.conds2; 480 // end of the MAXCONDLEN length condition 481 } else if (p == c.conds + MAXCONDLEN) 482 return NULL; 483 return *p ? 
p : NULL; 484 } 485 return NULL; 486 } 487 488 inline int SfxEntry::test_condition(const char* st, const char* beg) { 489 const char* pos = NULL; // group with pos input position 490 bool neg = false; // complementer 491 bool ingroup = false; // character in the group 492 if (numconds == 0) 493 return 1; 494 char* p = c.conds; 495 st--; 496 int i = 1; 497 while (1) { 498 switch (*p) { 499 case '\0': 500 return 1; 501 case '[': 502 p = nextchar(p); 503 pos = st; 504 break; 505 case '^': 506 p = nextchar(p); 507 neg = true; 508 break; 509 case ']': 510 if (!neg && !ingroup) 511 return 0; 512 i++; 513 // skip the next character 514 if (!ingroup) { 515 for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--) 516 ; 517 st--; 518 } 519 pos = NULL; 520 neg = false; 521 ingroup = false; 522 p = nextchar(p); 523 if (st < beg && p) 524 return 0; // word <= condition 525 break; 526 case '.': 527 if (!pos) { 528 // dots are not metacharacters in groups: [.] 529 p = nextchar(p); 530 // skip the next character 531 for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; 532 st--) 533 ; 534 if (st < beg) { // word <= condition 535 if (p) 536 return 0; 537 else 538 return 1; 539 } 540 if ((opts & aeUTF8) && (*st & 0x80)) { // head of the UTF-8 character 541 st--; 542 if (st < beg) { // word <= condition 543 if (p) 544 return 0; 545 else 546 return 1; 547 } 548 } 549 break; 550 } 551 /* FALLTHROUGH */ 552 default: { 553 if (*st == *p) { 554 p = nextchar(p); 555 if ((opts & aeUTF8) && (*st & 0x80)) { 556 st--; 557 while (p && (st >= beg)) { 558 if (*p != *st) { 559 if (!pos) 560 return 0; 561 st = pos; 562 break; 563 } 564 // first byte of the UTF-8 multibyte character 565 if ((*p & 0xc0) != 0x80) 566 break; 567 p = nextchar(p); 568 st--; 569 } 570 if (pos && st != pos) { 571 if (neg) 572 return 0; 573 else if (i == numconds) 574 return 1; 575 ingroup = true; 576 while (p && *p != ']' && ((p = nextchar(p)) != NULL)) { 577 } 578 st--; 579 } 580 if (p && *p != 
']') 581 p = nextchar(p); 582 } else if (pos) { 583 if (neg) 584 return 0; 585 else if (i == numconds) 586 return 1; 587 ingroup = true; 588 while (p && *p != ']' && ((p = nextchar(p)) != NULL)) { 589 } 590 // if (p && *p != ']') p = nextchar(p); 591 st--; 592 } 593 if (!pos) { 594 i++; 595 st--; 596 } 597 if (st < beg && p && *p != ']') 598 return 0; // word <= condition 599 } else if (pos) { // group 600 p = nextchar(p); 601 } else 602 return 0; 603 } 604 } 605 if (!p) 606 return 1; 607 } 608 } 609 610 // see if this suffix is present in the word 611 struct hentry* SfxEntry::checkword(const char* word, 612 int len, 613 int optflags, 614 PfxEntry* ppfx, 615 const FLAG cclass, 616 const FLAG needflag, 617 const FLAG badflag) { 618 struct hentry* he; // hash entry pointer 619 PfxEntry* ep = ppfx; 620 621 // if this suffix is being cross checked with a prefix 622 // but it does not support cross products skip it 623 624 if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0)) 625 return NULL; 626 627 // upon entry suffix is 0 length or already matches the end of the word. 
628 // So if the remaining root word has positive length 629 // and if there are enough chars in root word and added back strip chars 630 // to meet the number of characters conditions, then test it 631 632 int tmpl = len - appnd.size(); // length of tmpword 633 // the second condition is not enough for UTF-8 strings 634 // it checked in test_condition() 635 636 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && 637 (tmpl + strip.size() >= numconds)) { 638 // generate new root word by removing suffix and adding 639 // back any characters that would have been stripped or 640 // or null terminating the shorter string 641 642 std::string tmpstring(word, tmpl); 643 if (strip.size()) { 644 tmpstring.append(strip); 645 } 646 647 const char* tmpword = tmpstring.c_str(); 648 const char* endword = tmpword + tmpstring.size(); 649 650 // now make sure all of the conditions on characters 651 // are met. Please see the appendix at the end of 652 // this file for more info on exactly what is being 653 // tested 654 655 // if all conditions are met then check if resulting 656 // root word in the dictionary 657 658 if (test_condition(endword, tmpword)) { 659 #ifdef SZOSZABLYA_POSSIBLE_ROOTS 660 fprintf(stdout, "%s %s %c\n", word, tmpword, aflag); 661 #endif 662 if ((he = pmyMgr->lookup(tmpword)) != NULL) { 663 do { 664 // check conditional suffix (enabled by prefix) 665 if ((TESTAFF(he->astr, aflag, he->alen) || 666 (ep && ep->getCont() && 667 TESTAFF(ep->getCont(), aflag, ep->getContLen()))) && 668 (((optflags & aeXPRODUCT) == 0) || 669 (ep && TESTAFF(he->astr, ep->getFlag(), he->alen)) || 670 // enabled by prefix 671 ((contclass) && 672 (ep && TESTAFF(contclass, ep->getFlag(), contclasslen)))) && 673 // handle cont. 
class 674 ((!cclass) || 675 ((contclass) && TESTAFF(contclass, cclass, contclasslen))) && 676 // check only in compound homonyms (bad flags) 677 (!badflag || !TESTAFF(he->astr, badflag, he->alen)) && 678 // handle required flag 679 ((!needflag) || 680 (TESTAFF(he->astr, needflag, he->alen) || 681 ((contclass) && TESTAFF(contclass, needflag, contclasslen))))) 682 return he; 683 he = he->next_homonym; // check homonyms 684 } while (he); 685 } 686 } 687 } 688 return NULL; 689 } 690 691 // see if two-level suffix is present in the word 692 struct hentry* SfxEntry::check_twosfx(const char* word, 693 int len, 694 int optflags, 695 PfxEntry* ppfx, 696 const FLAG needflag) { 697 PfxEntry* ep = ppfx; 698 699 // if this suffix is being cross checked with a prefix 700 // but it does not support cross products skip it 701 702 if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0) 703 return NULL; 704 705 // upon entry suffix is 0 length or already matches the end of the word. 706 // So if the remaining root word has positive length 707 // and if there are enough chars in root word and added back strip chars 708 // to meet the number of characters conditions, then test it 709 710 int tmpl = len - appnd.size(); // length of tmpword 711 712 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && 713 (tmpl + strip.size() >= numconds)) { 714 // generate new root word by removing suffix and adding 715 // back any characters that would have been stripped or 716 // or null terminating the shorter string 717 718 std::string tmpword(word); 719 tmpword.resize(tmpl); 720 tmpword.append(strip); 721 tmpl += strip.size(); 722 723 const char* beg = tmpword.c_str(); 724 const char* end = beg + tmpl; 725 726 // now make sure all of the conditions on characters 727 // are met. 
Please see the appendix at the end of 728 // this file for more info on exactly what is being 729 // tested 730 731 // if all conditions are met then recall suffix_check 732 733 if (test_condition(end, beg)) { 734 struct hentry* he; // hash entry pointer 735 if (ppfx) { 736 // handle conditional suffix 737 if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) 738 he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, 0, NULL, 739 (FLAG)aflag, needflag, IN_CPD_NOT); 740 else 741 he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, optflags, ppfx, 742 (FLAG)aflag, needflag, IN_CPD_NOT); 743 } else { 744 he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, 0, NULL, 745 (FLAG)aflag, needflag, IN_CPD_NOT); 746 } 747 if (he) 748 return he; 749 } 750 } 751 return NULL; 752 } 753 754 // see if two-level suffix is present in the word 755 std::string SfxEntry::check_twosfx_morph(const char* word, 756 int len, 757 int optflags, 758 PfxEntry* ppfx, 759 const FLAG needflag) { 760 PfxEntry* ep = ppfx; 761 762 std::string result; 763 764 // if this suffix is being cross checked with a prefix 765 // but it does not support cross products skip it 766 767 if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0) 768 return result; 769 770 // upon entry suffix is 0 length or already matches the end of the word. 
771 // So if the remaining root word has positive length 772 // and if there are enough chars in root word and added back strip chars 773 // to meet the number of characters conditions, then test it 774 775 int tmpl = len - appnd.size(); // length of tmpword 776 777 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && 778 (tmpl + strip.size() >= numconds)) { 779 // generate new root word by removing suffix and adding 780 // back any characters that would have been stripped or 781 // or null terminating the shorter string 782 783 std::string tmpword(word); 784 tmpword.resize(tmpl); 785 tmpword.append(strip); 786 tmpl += strip.size(); 787 788 const char* beg = tmpword.c_str(); 789 const char* end = beg + tmpl; 790 791 // now make sure all of the conditions on characters 792 // are met. Please see the appendix at the end of 793 // this file for more info on exactly what is being 794 // tested 795 796 // if all conditions are met then recall suffix_check 797 798 if (test_condition(end, beg)) { 799 if (ppfx) { 800 // handle conditional suffix 801 if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) { 802 std::string st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, 0, NULL, aflag, 803 needflag); 804 if (!st.empty()) { 805 if (ppfx->getMorph()) { 806 result.append(ppfx->getMorph()); 807 result.push_back(MSEP_FLD); 808 } 809 result.append(st); 810 mychomp(result); 811 } 812 } else { 813 std::string st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, optflags, ppfx, aflag, 814 needflag); 815 if (!st.empty()) { 816 result.append(st); 817 mychomp(result); 818 } 819 } 820 } else { 821 std::string st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, 0, NULL, aflag, needflag); 822 if (!st.empty()) { 823 result.append(st); 824 mychomp(result); 825 } 826 } 827 } 828 } 829 return result; 830 } 831 832 // get next homonym with same affix 833 struct hentry* SfxEntry::get_next_homonym(struct hentry* he, 834 int optflags, 835 PfxEntry* ppfx, 836 const 
FLAG cclass, 837 const FLAG needflag) { 838 PfxEntry* ep = ppfx; 839 FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL; 840 841 while (he->next_homonym) { 842 he = he->next_homonym; 843 if ((TESTAFF(he->astr, aflag, he->alen) || 844 (ep && ep->getCont() && 845 TESTAFF(ep->getCont(), aflag, ep->getContLen()))) && 846 ((optflags & aeXPRODUCT) == 0 || TESTAFF(he->astr, eFlag, he->alen) || 847 // handle conditional suffix 848 ((contclass) && TESTAFF(contclass, eFlag, contclasslen))) && 849 // handle cont. class 850 ((!cclass) || 851 ((contclass) && TESTAFF(contclass, cclass, contclasslen))) && 852 // handle required flag 853 ((!needflag) || 854 (TESTAFF(he->astr, needflag, he->alen) || 855 ((contclass) && TESTAFF(contclass, needflag, contclasslen))))) 856 return he; 857 } 858 return NULL; 859 } 860 861 void SfxEntry::initReverseWord() { 862 rappnd = appnd; 863 reverseword(rappnd); 864 } 865 866 #if 0 867 868 Appendix: Understanding Affix Code 869 870 871 An affix is either a prefix or a suffix attached to root words to make 872 other words. 873 874 Basically a Prefix or a Suffix is set of AffEntry objects 875 which store information about the prefix or suffix along 876 with supporting routines to check if a word has a particular 877 prefix or suffix or a combination. 878 879 The structure affentry is defined as follows: 880 881 struct affentry 882 { 883 unsigned short aflag; // ID used to represent the affix 884 std::string strip; // string to strip before adding affix 885 std::string appnd; // the affix string to add 886 char numconds; // the number of conditions that must be met 887 char opts; // flag: aeXPRODUCT- combine both prefix and suffix 888 char conds[SETSIZE]; // array which encodes the conditions to be met 889 }; 890 891 892 Here is a suffix borrowed from the en_US.aff file. This file 893 is whitespace delimited. 

SFX D Y 4
SFX D   0     d          e
SFX D   y     ied        [^aeiou]y
SFX D   0     ed         [^ey]
SFX D   0     ed         [aeiou]y

This information can be interpreted as follows:

The first line has 4 fields

Field
-----
1     SFX - indicates this is a suffix
2     D   - is the name of the character flag which represents this suffix
3     Y   - indicates it can be combined with prefixes (cross product)
4     4   - indicates that sequence of 4 affentry structures are needed to
               properly store the affix information

The remaining lines describe the unique information for the 4 SfxEntry
objects that make up this affix.  Each line can be interpreted
as follows: (note fields 1 and 2 are as a check against line 1 info)

Field
-----
1     SFX         - indicates this is a suffix
2     D           - is the name of the character flag for this affix
3     y           - the string of chars to strip off before adding affix
                         (a 0 here indicates the NULL string)
4     ied         - the string of affix characters to add
5     [^aeiou]y   - the conditions which must be met before the affix
                    can be applied

Field 5 is interesting.  Since this is a suffix, field 5 tells us that
there are 2 conditions that must be met.  The first condition is that
the next to the last character in the word must *NOT* be any of the
following "a", "e", "i", "o" or "u".  The second condition is that
the last character of the word must end in "y".

So how can we encode this information concisely and be able to
test for both conditions in a fast manner?  The answer is found
by studying the wonderful ispell code of Geoff Kuenning, et al.
(now available under a normal BSD license).

If we set up a conds array of 256 bytes indexed (0 to 255) and access it
using a character (cast to an unsigned char) of a string, we have 8 bits
of information we can store about that character.
Specifically we
could use each bit to say if that character is allowed in any of the
last (or first for prefixes) 8 characters of the word.

Basically, each character at one end of the word (up to the number
of conditions) is used to index into the conds array and the resulting
value found there says whether that character is valid for a
specific character position in the word.

For prefixes, it does this by setting bit 0 if that char is valid
in the first position, bit 1 if valid in the second position, and so on.

If a bit is not set, then that char is not valid for that position in the
word.

If working with suffixes bit 0 is used for the character closest
to the front, bit 1 for the next character towards the end, ...,
with bit numconds-1 representing the last char at the end of the string.

Note: since entries in the conds[] are 8 bits, only 8 conditions
(read that only 8 character positions) can be examined at one
end of a word (the beginning for prefixes and the end for suffixes).

So to make this clearer, let's encode the conds array values for the
first two affentries for the suffix D described earlier.


  For the first affentry:
     numconds = 1             (only examine the last character)

     conds['e'] =  (1 << 0)   (the word must end in an E)
     all others are all 0

  For the second affentry:
     numconds = 2             (only examine the last two characters)

     conds[X] = conds[X] | (1 << 0)     (aeiou are not allowed)
         where X is all characters *but* a, e, i, o, or u


     conds['y'] = (1 << 1)     (the last char must be a y)
     all other bits for all other entries in the conds array are zero

#endif