[ tor-browser ].git.dasho

affentry.cxx (34175B)
      1 /* ***** BEGIN LICENSE BLOCK *****
      2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
      3 *
      4 * Copyright (C) 2002-2022 Németh László
      5 *
      6 * The contents of this file are subject to the Mozilla Public License Version
      7 * 1.1 (the "License"); you may not use this file except in compliance with
      8 * the License. You may obtain a copy of the License at
      9 * http://www.mozilla.org/MPL/
     10 *
     11 * Software distributed under the License is distributed on an "AS IS" basis,
     12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
     13 * for the specific language governing rights and limitations under the
     14 * License.
     15 *
     16 * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
     17 *
     18 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
     19 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
     20 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
     21 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
     22 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
     23 *
     24 * Alternatively, the contents of this file may be used under the terms of
     25 * either the GNU General Public License Version 2 or later (the "GPL"), or
     26 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
     27 * in which case the provisions of the GPL or the LGPL are applicable instead
     28 * of those above. If you wish to allow use of your version of this file only
     29 * under the terms of either the GPL or the LGPL, and not to allow others to
     30 * use your version of this file under the terms of the MPL, indicate your
     31 * decision by deleting the provisions above and replace them with the notice
     32 * and other provisions required by the GPL or the LGPL. If you do not delete
     33 * the provisions above, a recipient may use your version of this file under
     34 * the terms of any one of the MPL, the GPL or the LGPL.
     35 *
     36 * ***** END LICENSE BLOCK ***** */
     37 /*
     38 * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
     39 * And Contributors.  All rights reserved.
     40 *
     41 * Redistribution and use in source and binary forms, with or without
     42 * modification, are permitted provided that the following conditions
     43 * are met:
     44 *
     45 * 1. Redistributions of source code must retain the above copyright
     46 *    notice, this list of conditions and the following disclaimer.
     47 *
     48 * 2. Redistributions in binary form must reproduce the above copyright
     49 *    notice, this list of conditions and the following disclaimer in the
     50 *    documentation and/or other materials provided with the distribution.
     51 *
     52 * 3. All modifications to the source code must be clearly marked as
     53 *    such.  Binary redistributions based on modified source code
     54 *    must be clearly marked as modified versions in the documentation
     55 *    and/or other materials provided with the distribution.
     56 *
     57 * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
     58 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     59 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
     60 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
     61 * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
     62 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
     63 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     64 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     65 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     66 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     67 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     68 * SUCH DAMAGE.
     69 */
     70 
     71 #include <stdlib.h>
     72 #include <string.h>
     73 #include <stdio.h>
     74 #include <ctype.h>
     75 
     76 #include "affentry.hxx"
     77 #include "csutil.hxx"
     78 
     79 AffEntry::~AffEntry() {
     80  if (opts & aeLONGCOND)
     81    free(c.l.conds2);
     82  if (morphcode && !(opts & aeALIASM))
     83    free(morphcode);
     84  if (contclass && !(opts & aeALIASF))
     85    free(contclass);
     86 }
     87 
     88 PfxEntry::PfxEntry(AffixMgr* pmgr)
     89    // register affix manager
     90    : pmyMgr(pmgr),
     91      next(NULL),
     92      nexteq(NULL),
     93      nextne(NULL),
     94      flgnxt(NULL) {
     95 }
     96 
     97 // add prefix to this word assuming conditions hold
     98 std::string PfxEntry::add(const char* word, size_t len) {
     99  std::string result;
    100  if ((len > strip.size() || (len == 0 && pmyMgr->get_fullstrip())) &&
    101      (len >= numconds) && test_condition(word) &&
    102      (!strip.size() || (strncmp(word, strip.c_str(), strip.size()) == 0))) {
    103    /* we have a match so add prefix */
    104    result.assign(appnd);
    105    result.append(word + strip.size());
    106  }
    107  return result;
    108 }
    109 
    110 inline char* PfxEntry::nextchar(char* p) {
    111  if (p) {
    112    p++;
    113    if (opts & aeLONGCOND) {
    114      // jump to the 2nd part of the condition
    115      if (p == c.conds + MAXCONDLEN_1)
    116        return c.l.conds2;
    117      // end of the MAXCONDLEN length condition
    118    } else if (p == c.conds + MAXCONDLEN)
    119      return NULL;
    120    return *p ? p : NULL;
    121  }
    122  return NULL;
    123 }
    124 
    125 inline int PfxEntry::test_condition(const char* st) {
    126  const char* pos = NULL;  // group with pos input position
    127  bool neg = false;        // complementer
    128  bool ingroup = false;    // character in the group
    129  if (numconds == 0)
    130    return 1;
    131  char* p = c.conds;
    132  while (1) {
    133    switch (*p) {
    134      case '\0':
    135        return 1;
    136      case '[': {
    137        neg = false;
    138        ingroup = false;
    139        p = nextchar(p);
    140        pos = st;
    141        break;
    142      }
    143      case '^': {
    144        p = nextchar(p);
    145        neg = true;
    146        break;
    147      }
    148      case ']': {
    149        if (bool(neg) == bool(ingroup))
    150          return 0;
    151        pos = NULL;
    152        p = nextchar(p);
    153        // skip the next character
    154        if (!ingroup && *st)
    155          for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++)
    156            ;
    157        if (*st == '\0' && p)
    158          return 0;  // word <= condition
    159        break;
    160      }
    161      case '.':
    162        if (!pos) {  // dots are not metacharacters in groups: [.]
    163          p = nextchar(p);
    164          // skip the next character
    165          for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++)
    166            ;
    167          if (*st == '\0' && p)
    168            return 0;  // word <= condition
    169          break;
    170        }
    171      /* FALLTHROUGH */
    172      default: {
    173        if (*st == *p) {
    174          st++;
    175          p = nextchar(p);
    176          if ((opts & aeUTF8) && (*(st - 1) & 0x80)) {  // multibyte
    177            while (p && (*p & 0xc0) == 0x80) {          // character
    178              if (*p != *st) {
    179                if (!pos)
    180                  return 0;
    181                st = pos;
    182                break;
    183              }
    184              p = nextchar(p);
    185              st++;
    186            }
    187            if (pos && st != pos) {
    188              ingroup = true;
    189              while (p && *p != ']' && ((p = nextchar(p)) != NULL)) {
    190              }
    191            }
    192          } else if (pos) {
    193            ingroup = true;
    194            while (p && *p != ']' && ((p = nextchar(p)) != NULL)) {
    195            }
    196          }
    197        } else if (pos) {  // group
    198          p = nextchar(p);
    199        } else
    200          return 0;
    201      }
    202    }
    203    if (!p)
    204      return 1;
    205  }
    206 }
    207 
    208 // check if this prefix entry matches
    209 struct hentry* PfxEntry::checkword(const char* word,
    210                                   int len,
    211                                   char in_compound,
    212                                   const FLAG needflag) {
    213  struct hentry* he;  // hash entry of root word or NULL
    214 
    215  // on entry prefix is 0 length or already matches the beginning of the word.
    216  // So if the remaining root word has positive length
    217  // and if there are enough chars in root word and added back strip chars
    218  // to meet the number of characters conditions, then test it
    219 
    220  int tmpl = len - appnd.size(); // length of tmpword
    221 
    222  if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) {
    223    // generate new root word by removing prefix and adding
    224    // back any characters that would have been stripped
    225 
    226    std::string tmpword(strip);
    227    tmpword.append(word + appnd.size(), tmpl);
    228 
    229    // now make sure all of the conditions on characters
    230    // are met.  Please see the appendix at the end of
    231    // this file for more info on exactly what is being
    232    // tested
    233 
    234    // if all conditions are met then check if resulting
    235    // root word in the dictionary
    236 
    237    if (test_condition(tmpword.c_str())) {
    238      tmpl += strip.size();
    239      if ((he = pmyMgr->lookup(tmpword.c_str())) != NULL) {
    240        do {
    241          if (TESTAFF(he->astr, aflag, he->alen) &&
    242              // forbid single prefixes with needaffix flag
    243              !TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
    244              // needflag
    245              ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
    246               (contclass && TESTAFF(contclass, needflag, contclasslen))))
    247            return he;
    248          he = he->next_homonym;  // check homonyms
    249        } while (he);
    250      }
    251 
    252      // prefix matched but no root word was found
    253      // if aeXPRODUCT is allowed, try again but now
    254      // ross checked combined with a suffix
    255 
    256      // if ((opts & aeXPRODUCT) && in_compound) {
    257      if ((opts & aeXPRODUCT)) {
    258        he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, aeXPRODUCT, this,
    259                                  FLAG_NULL, needflag, in_compound);
    260        if (he)
    261          return he;
    262      }
    263    }
    264  }
    265  return NULL;
    266 }
    267 
    268 // check if this prefix entry matches
    269 struct hentry* PfxEntry::check_twosfx(const char* word,
    270                                      int len,
    271                                      char in_compound,
    272                                      const FLAG needflag) {
    273  // on entry prefix is 0 length or already matches the beginning of the word.
    274  // So if the remaining root word has positive length
    275  // and if there are enough chars in root word and added back strip chars
    276  // to meet the number of characters conditions, then test it
    277 
    278  int tmpl = len - appnd.size(); // length of tmpword
    279 
    280  if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
    281      (tmpl + strip.size() >= numconds)) {
    282    // generate new root word by removing prefix and adding
    283    // back any characters that would have been stripped
    284 
    285    std::string tmpword(strip);
    286    tmpword.append(word + appnd.size());
    287 
    288    // now make sure all of the conditions on characters
    289    // are met.  Please see the appendix at the end of
    290    // this file for more info on exactly what is being
    291    // tested
    292 
    293    // if all conditions are met then check if resulting
    294    // root word in the dictionary
    295 
    296    if (test_condition(tmpword.c_str())) {
    297      tmpl += strip.size();
    298 
    299      // prefix matched but no root word was found
    300      // if aeXPRODUCT is allowed, try again but now
    301      // cross checked combined with a suffix
    302 
    303      if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
    304        // hash entry of root word or NULL
    305        struct hentry* he = pmyMgr->suffix_check_twosfx(tmpword.c_str(), tmpl, aeXPRODUCT, this,
    306                                                        needflag);
    307        if (he)
    308          return he;
    309      }
    310    }
    311  }
    312  return NULL;
    313 }
    314 
    315 // check if this prefix entry matches
    316 std::string PfxEntry::check_twosfx_morph(const char* word,
    317                                         int len,
    318                                         char in_compound,
    319                                         const FLAG needflag) {
    320  std::string result;
    321  // on entry prefix is 0 length or already matches the beginning of the word.
    322  // So if the remaining root word has positive length
    323  // and if there are enough chars in root word and added back strip chars
    324  // to meet the number of characters conditions, then test it
    325  int tmpl = len - appnd.size(); // length of tmpword
    326 
    327  if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
    328      (tmpl + strip.size() >= numconds)) {
    329    // generate new root word by removing prefix and adding
    330    // back any characters that would have been stripped
    331 
    332    std::string tmpword(strip);
    333    tmpword.append(word + appnd.size());
    334 
    335    // now make sure all of the conditions on characters
    336    // are met.  Please see the appendix at the end of
    337    // this file for more info on exactly what is being
    338    // tested
    339 
    340    // if all conditions are met then check if resulting
    341    // root word in the dictionary
    342 
    343    if (test_condition(tmpword.c_str())) {
    344      tmpl += strip.size();
    345 
    346      // prefix matched but no root word was found
    347      // if aeXPRODUCT is allowed, try again but now
    348      // ross checked combined with a suffix
    349 
    350      if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
    351        result = pmyMgr->suffix_check_twosfx_morph(tmpword.c_str(), tmpl,
    352                                                   aeXPRODUCT,
    353                                                   this, needflag);
    354      }
    355    }
    356  }
    357  return result;
    358 }
    359 
    360 // check if this prefix entry matches
    361 std::string PfxEntry::check_morph(const char* word,
    362                                  int len,
    363                                  char in_compound,
    364                                  const FLAG needflag) {
    365  std::string result;
    366 
    367  // on entry prefix is 0 length or already matches the beginning of the word.
    368  // So if the remaining root word has positive length
    369  // and if there are enough chars in root word and added back strip chars
    370  // to meet the number of characters conditions, then test it
    371 
    372  int tmpl = len - appnd.size(); // length of tmpword
    373 
    374  if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
    375      (tmpl + strip.size() >= numconds)) {
    376    // generate new root word by removing prefix and adding
    377    // back any characters that would have been stripped
    378 
    379    std::string tmpword(strip);
    380    tmpword.append(word + appnd.size());
    381 
    382    // now make sure all of the conditions on characters
    383    // are met.  Please see the appendix at the end of
    384    // this file for more info on exactly what is being
    385    // tested
    386 
    387    // if all conditions are met then check if resulting
    388    // root word in the dictionary
    389 
    390    if (test_condition(tmpword.c_str())) {
    391      tmpl += strip.size();
    392      struct hentry* he;  // hash entry of root word or NULL
    393      if ((he = pmyMgr->lookup(tmpword.c_str())) != NULL) {
    394        do {
    395          if (TESTAFF(he->astr, aflag, he->alen) &&
    396              // forbid single prefixes with needaffix flag
    397              !TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
    398              // needflag
    399              ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
    400               (contclass && TESTAFF(contclass, needflag, contclasslen)))) {
    401            if (morphcode) {
    402              result.push_back(MSEP_FLD);
    403              result.append(morphcode);
    404            } else
    405              result.append(getKey());
    406            if (!HENTRY_FIND(he, MORPH_STEM)) {
    407              result.push_back(MSEP_FLD);
    408              result.append(MORPH_STEM);
    409              result.append(HENTRY_WORD(he));
    410            }
    411            // store the pointer of the hash entry
    412            if (HENTRY_DATA(he)) {
    413              result.push_back(MSEP_FLD);
    414              result.append(HENTRY_DATA2(he));
    415            } else {
    416              // return with debug information
    417              char* flag = pmyMgr->encode_flag(getFlag());
    418              result.push_back(MSEP_FLD);
    419              result.append(MORPH_FLAG);
    420              result.append(flag);
    421              free(flag);
    422            }
    423            result.push_back(MSEP_REC);
    424          }
    425          he = he->next_homonym;
    426        } while (he);
    427      }
    428 
    429      // prefix matched but no root word was found
    430      // if aeXPRODUCT is allowed, try again but now
    431      // ross checked combined with a suffix
    432 
    433      if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
    434        std::string st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, aeXPRODUCT, this,
    435                                                    FLAG_NULL, needflag);
    436        if (!st.empty()) {
    437          result.append(st);
    438        }
    439      }
    440    }
    441  }
    442 
    443  return result;
    444 }
    445 
    446 SfxEntry::SfxEntry(AffixMgr* pmgr)
    447    : pmyMgr(pmgr)  // register affix manager
    448      ,
    449      next(NULL),
    450      nexteq(NULL),
    451      nextne(NULL),
    452      flgnxt(NULL),
    453      l_morph(NULL),
    454      r_morph(NULL),
    455      eq_morph(NULL) {
    456 }
    457 
    458 // add suffix to this word assuming conditions hold
    459 std::string SfxEntry::add(const char* word, size_t len) {
    460  std::string result;
    461  /* make sure all conditions match */
    462  if ((len > strip.size() || (len == 0 && pmyMgr->get_fullstrip())) &&
    463      (len >= numconds) && test_condition(word + len, word) &&
    464      (!strip.size() ||
    465       (strcmp(word + len - strip.size(), strip.c_str()) == 0))) {
    466    result.assign(word);
    467    /* we have a match so add suffix */
    468    result.replace(len - strip.size(), std::string::npos, appnd);
    469  }
    470  return result;
    471 }
    472 
    473 inline char* SfxEntry::nextchar(char* p) {
    474  if (p) {
    475    p++;
    476    if (opts & aeLONGCOND) {
    477      // jump to the 2nd part of the condition
    478      if (p == c.l.conds1 + MAXCONDLEN_1)
    479        return c.l.conds2;
    480      // end of the MAXCONDLEN length condition
    481    } else if (p == c.conds + MAXCONDLEN)
    482      return NULL;
    483    return *p ? p : NULL;
    484  }
    485  return NULL;
    486 }
    487 
    488 inline int SfxEntry::test_condition(const char* st, const char* beg) {
    489  const char* pos = NULL;  // group with pos input position
    490  bool neg = false;        // complementer
    491  bool ingroup = false;    // character in the group
    492  if (numconds == 0)
    493    return 1;
    494  char* p = c.conds;
    495  st--;
    496  int i = 1;
    497  while (1) {
    498    switch (*p) {
    499      case '\0':
    500        return 1;
    501      case '[':
    502        p = nextchar(p);
    503        pos = st;
    504        break;
    505      case '^':
    506        p = nextchar(p);
    507        neg = true;
    508        break;
    509      case ']':
    510        if (!neg && !ingroup)
    511          return 0;
    512        i++;
    513        // skip the next character
    514        if (!ingroup) {
    515          for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--)
    516            ;
    517          st--;
    518        }
    519        pos = NULL;
    520        neg = false;
    521        ingroup = false;
    522        p = nextchar(p);
    523        if (st < beg && p)
    524          return 0;  // word <= condition
    525        break;
    526      case '.':
    527        if (!pos) {
    528          // dots are not metacharacters in groups: [.]
    529          p = nextchar(p);
    530          // skip the next character
    531          for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80;
    532               st--)
    533            ;
    534          if (st < beg) {  // word <= condition
    535            if (p)
    536              return 0;
    537            else
    538              return 1;
    539          }
    540          if ((opts & aeUTF8) && (*st & 0x80)) {  // head of the UTF-8 character
    541            st--;
    542            if (st < beg) {  // word <= condition
    543              if (p)
    544                return 0;
    545              else
    546                return 1;
    547            }
    548          }
    549          break;
    550        }
    551      /* FALLTHROUGH */
    552      default: {
    553        if (*st == *p) {
    554          p = nextchar(p);
    555          if ((opts & aeUTF8) && (*st & 0x80)) {
    556            st--;
    557            while (p && (st >= beg)) {
    558              if (*p != *st) {
    559                if (!pos)
    560                  return 0;
    561                st = pos;
    562                break;
    563              }
    564              // first byte of the UTF-8 multibyte character
    565              if ((*p & 0xc0) != 0x80)
    566                break;
    567              p = nextchar(p);
    568              st--;
    569            }
    570            if (pos && st != pos) {
    571              if (neg)
    572                return 0;
    573              else if (i == numconds)
    574                return 1;
    575              ingroup = true;
    576              while (p && *p != ']' && ((p = nextchar(p)) != NULL)) {
    577              }
    578              st--;
    579            }
    580            if (p && *p != ']')
    581              p = nextchar(p);
    582          } else if (pos) {
    583            if (neg)
    584              return 0;
    585            else if (i == numconds)
    586              return 1;
    587            ingroup = true;
    588            while (p && *p != ']' && ((p = nextchar(p)) != NULL)) {
    589            }
    590            //			if (p && *p != ']') p = nextchar(p);
    591            st--;
    592          }
    593          if (!pos) {
    594            i++;
    595            st--;
    596          }
    597          if (st < beg && p && *p != ']')
    598            return 0;      // word <= condition
    599        } else if (pos) {  // group
    600          p = nextchar(p);
    601        } else
    602          return 0;
    603      }
    604    }
    605    if (!p)
    606      return 1;
    607  }
    608 }
    609 
    610 // see if this suffix is present in the word
    611 struct hentry* SfxEntry::checkword(const char* word,
    612                                   int len,
    613                                   int optflags,
    614                                   PfxEntry* ppfx,
    615                                   const FLAG cclass,
    616                                   const FLAG needflag,
    617                                   const FLAG badflag) {
    618  struct hentry* he;  // hash entry pointer
    619  PfxEntry* ep = ppfx;
    620 
    621  // if this suffix is being cross checked with a prefix
    622  // but it does not support cross products skip it
    623 
    624  if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0))
    625    return NULL;
    626 
    627  // upon entry suffix is 0 length or already matches the end of the word.
    628  // So if the remaining root word has positive length
    629  // and if there are enough chars in root word and added back strip chars
    630  // to meet the number of characters conditions, then test it
    631 
    632  int tmpl = len - appnd.size(); // length of tmpword
    633  // the second condition is not enough for UTF-8 strings
    634  // it checked in test_condition()
    635 
    636  if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
    637      (tmpl + strip.size() >= numconds)) {
    638    // generate new root word by removing suffix and adding
    639    // back any characters that would have been stripped or
    640    // or null terminating the shorter string
    641 
    642    std::string tmpstring(word, tmpl);
    643    if (strip.size()) {
    644      tmpstring.append(strip);
    645    }
    646 
    647    const char* tmpword = tmpstring.c_str();
    648    const char* endword = tmpword + tmpstring.size();
    649 
    650    // now make sure all of the conditions on characters
    651    // are met.  Please see the appendix at the end of
    652    // this file for more info on exactly what is being
    653    // tested
    654 
    655    // if all conditions are met then check if resulting
    656    // root word in the dictionary
    657 
    658    if (test_condition(endword, tmpword)) {
    659 #ifdef SZOSZABLYA_POSSIBLE_ROOTS
    660      fprintf(stdout, "%s %s %c\n", word, tmpword, aflag);
    661 #endif
    662      if ((he = pmyMgr->lookup(tmpword)) != NULL) {
    663        do {
    664          // check conditional suffix (enabled by prefix)
    665          if ((TESTAFF(he->astr, aflag, he->alen) ||
    666               (ep && ep->getCont() &&
    667                TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
    668              (((optflags & aeXPRODUCT) == 0) ||
    669               (ep && TESTAFF(he->astr, ep->getFlag(), he->alen)) ||
    670               // enabled by prefix
    671               ((contclass) &&
    672                (ep && TESTAFF(contclass, ep->getFlag(), contclasslen)))) &&
    673              // handle cont. class
    674              ((!cclass) ||
    675               ((contclass) && TESTAFF(contclass, cclass, contclasslen))) &&
    676              // check only in compound homonyms (bad flags)
    677              (!badflag || !TESTAFF(he->astr, badflag, he->alen)) &&
    678              // handle required flag
    679              ((!needflag) ||
    680               (TESTAFF(he->astr, needflag, he->alen) ||
    681                ((contclass) && TESTAFF(contclass, needflag, contclasslen)))))
    682            return he;
    683          he = he->next_homonym;  // check homonyms
    684        } while (he);
    685      }
    686    }
    687  }
    688  return NULL;
    689 }
    690 
    691 // see if two-level suffix is present in the word
    692 struct hentry* SfxEntry::check_twosfx(const char* word,
    693                                      int len,
    694                                      int optflags,
    695                                      PfxEntry* ppfx,
    696                                      const FLAG needflag) {
    697  PfxEntry* ep = ppfx;
    698 
    699  // if this suffix is being cross checked with a prefix
    700  // but it does not support cross products skip it
    701 
    702  if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
    703    return NULL;
    704 
    705  // upon entry suffix is 0 length or already matches the end of the word.
    706  // So if the remaining root word has positive length
    707  // and if there are enough chars in root word and added back strip chars
    708  // to meet the number of characters conditions, then test it
    709 
    710  int tmpl = len - appnd.size(); // length of tmpword
    711 
    712  if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
    713      (tmpl + strip.size() >= numconds)) {
    714    // generate new root word by removing suffix and adding
    715    // back any characters that would have been stripped or
    716    // or null terminating the shorter string
    717 
    718    std::string tmpword(word);
    719    tmpword.resize(tmpl);
    720    tmpword.append(strip);
    721    tmpl += strip.size();
    722 
    723    const char* beg = tmpword.c_str();
    724    const char* end = beg + tmpl;
    725 
    726    // now make sure all of the conditions on characters
    727    // are met.  Please see the appendix at the end of
    728    // this file for more info on exactly what is being
    729    // tested
    730 
    731    // if all conditions are met then recall suffix_check
    732 
    733    if (test_condition(end, beg)) {
    734      struct hentry* he;  // hash entry pointer
    735      if (ppfx) {
    736        // handle conditional suffix
    737        if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
    738          he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, 0, NULL,
    739                                    (FLAG)aflag, needflag, IN_CPD_NOT);
    740        else
    741          he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, optflags, ppfx,
    742                                    (FLAG)aflag, needflag, IN_CPD_NOT);
    743      } else {
    744        he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, 0, NULL,
    745                                  (FLAG)aflag, needflag, IN_CPD_NOT);
    746      }
    747      if (he)
    748        return he;
    749    }
    750  }
    751  return NULL;
    752 }
    753 
    754 // see if two-level suffix is present in the word
    755 std::string SfxEntry::check_twosfx_morph(const char* word,
    756                                         int len,
    757                                         int optflags,
    758                                         PfxEntry* ppfx,
    759                                         const FLAG needflag) {
    760  PfxEntry* ep = ppfx;
    761 
    762  std::string result;
    763 
    764  // if this suffix is being cross checked with a prefix
    765  // but it does not support cross products skip it
    766 
    767  if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
    768    return result;
    769 
    770  // upon entry suffix is 0 length or already matches the end of the word.
    771  // So if the remaining root word has positive length
    772  // and if there are enough chars in root word and added back strip chars
    773  // to meet the number of characters conditions, then test it
    774 
    775  int tmpl = len - appnd.size(); // length of tmpword
    776 
    777  if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
    778      (tmpl + strip.size() >= numconds)) {
    779    // generate new root word by removing suffix and adding
    780    // back any characters that would have been stripped or
    781    // or null terminating the shorter string
    782 
    783    std::string tmpword(word);
    784    tmpword.resize(tmpl);
    785    tmpword.append(strip);
    786    tmpl += strip.size();
    787 
    788    const char* beg = tmpword.c_str();
    789    const char* end = beg + tmpl;
    790 
    791    // now make sure all of the conditions on characters
    792    // are met.  Please see the appendix at the end of
    793    // this file for more info on exactly what is being
    794    // tested
    795 
    796    // if all conditions are met then recall suffix_check
    797 
    798    if (test_condition(end, beg)) {
    799      if (ppfx) {
    800        // handle conditional suffix
    801        if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) {
    802          std::string st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, 0, NULL, aflag,
    803                                                      needflag);
    804          if (!st.empty()) {
    805            if (ppfx->getMorph()) {
    806              result.append(ppfx->getMorph());
    807              result.push_back(MSEP_FLD);
    808            }
    809            result.append(st);
    810            mychomp(result);
    811          }
    812        } else {
    813          std::string st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, optflags, ppfx, aflag,
    814                                                      needflag);
    815          if (!st.empty()) {
    816            result.append(st);
    817            mychomp(result);
    818          }
    819        }
    820      } else {
    821        std::string st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, 0, NULL, aflag, needflag);
    822        if (!st.empty()) {
    823          result.append(st);
    824          mychomp(result);
    825        }
    826      }
    827    }
    828  }
    829  return result;
    830 }
    831 
    832 // get next homonym with same affix
    833 struct hentry* SfxEntry::get_next_homonym(struct hentry* he,
    834                                          int optflags,
    835                                          PfxEntry* ppfx,
    836                                          const FLAG cclass,
    837                                          const FLAG needflag) {
    838  PfxEntry* ep = ppfx;
    839  FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL;
    840 
    841  while (he->next_homonym) {
    842    he = he->next_homonym;
    843    if ((TESTAFF(he->astr, aflag, he->alen) ||
    844         (ep && ep->getCont() &&
    845          TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
    846        ((optflags & aeXPRODUCT) == 0 || TESTAFF(he->astr, eFlag, he->alen) ||
    847         // handle conditional suffix
    848         ((contclass) && TESTAFF(contclass, eFlag, contclasslen))) &&
    849        // handle cont. class
    850        ((!cclass) ||
    851         ((contclass) && TESTAFF(contclass, cclass, contclasslen))) &&
    852        // handle required flag
    853        ((!needflag) ||
    854         (TESTAFF(he->astr, needflag, he->alen) ||
    855          ((contclass) && TESTAFF(contclass, needflag, contclasslen)))))
    856      return he;
    857  }
    858  return NULL;
    859 }
    860 
    861 void SfxEntry::initReverseWord() {
    862  rappnd = appnd;
    863  reverseword(rappnd);
    864 }
    865 
    866 #if 0
    867 
    868 Appendix:  Understanding Affix Code
    869 
    870 
    871 An affix is either a  prefix or a suffix attached to root words to make 
    872 other words.
    873 
Basically a Prefix or a Suffix is a set of AffEntry objects
    875 which store information about the prefix or suffix along 
    876 with supporting routines to check if a word has a particular 
    877 prefix or suffix or a combination.
    878 
    879 The structure affentry is defined as follows:
    880 
    881 struct affentry
    882 {
    883   unsigned short aflag;    // ID used to represent the affix
    884   std::string strip;       // string to strip before adding affix
    885   std::string appnd;       // the affix string to add
    886   char numconds;           // the number of conditions that must be met
    887   char opts;               // flag: aeXPRODUCT- combine both prefix and suffix 
    888   char   conds[SETSIZE];   // array which encodes the conditions to be met
    889 };
    890 
    891 
    892 Here is a suffix borrowed from the en_US.aff file.  This file 
    893 is whitespace delimited.
    894 
    895 SFX D Y 4 
SFX D   0     d          e
    897 SFX D   y     ied        [^aeiou]y
    898 SFX D   0     ed         [^ey]
    899 SFX D   0     ed         [aeiou]y
    900 
    901 This information can be interpreted as follows:
    902 
The first line has 4 fields:
    904 
    905 Field
    906 -----
    907 1     SFX - indicates this is a suffix
    908 2     D   - is the name of the character flag which represents this suffix
    909 3     Y   - indicates it can be combined with prefixes (cross product)
4     4   - indicates that a sequence of 4 affentry structures is needed to
              properly store the affix information
    912 
The remaining lines describe the unique information for the 4 SfxEntry
objects that make up this affix.  Each line can be interpreted
as follows (note that fields 1 and 2 serve as a check against the line 1 info):
    916 
    917 Field
    918 -----
    919 1     SFX         - indicates this is a suffix
    920 2     D           - is the name of the character flag for this affix
    921 3     y           - the string of chars to strip off before adding affix
    922                         (a 0 here indicates the NULL string)
    923 4     ied         - the string of affix characters to add
    924 5     [^aeiou]y   - the conditions which must be met before the affix
    925                    can be applied
    926 
    927 Field 5 is interesting.  Since this is a suffix, field 5 tells us that
    928 there are 2 conditions that must be met.  The first condition is that 
    929 the next to the last character in the word must *NOT* be any of the 
    930 following "a", "e", "i", "o" or "u".  The second condition is that
    931 the last character of the word must end in "y".
    932 
So how can we encode this information concisely and be able to
test for both conditions in a fast manner?  The answer is found
by studying the wonderful ispell code of Geoff Kuenning, et al.
(now available under a normal BSD license).
    937 
    938 If we set up a conds array of 256 bytes indexed (0 to 255) and access it
    939 using a character (cast to an unsigned char) of a string, we have 8 bits
    940 of information we can store about that character.  Specifically we
    941 could use each bit to say if that character is allowed in any of the 
    942 last (or first for prefixes) 8 characters of the word.
    943 
Basically, each character at one end of the word (up to the number
of conditions) is used to index into the conds array and the resulting
value found there says whether that character is valid for a
specific character position in the word.
    948 
    949 For prefixes, it does this by setting bit 0 if that char is valid 
    950 in the first position, bit 1 if valid in the second position, and so on. 
    951 
If a bit is not set, then that char is not valid for that position in the
word.
    954 
    955 If working with suffixes bit 0 is used for the character closest 
    956 to the front, bit 1 for the next character towards the end, ..., 
    957 with bit numconds-1 representing the last char at the end of the string. 
    958 
Note: since entries in the conds[] are 8 bits, only 8 conditions
(read that only 8 character positions) can be examined at one
end of a word (the beginning for prefixes and the end for suffixes).
    962 
So to make this clearer, let's encode the conds array values for the
first two affentries for the suffix D described earlier.
    965 
    966 
    967  For the first affentry:    
    968     numconds = 1             (only examine the last character)
    969 
    970     conds['e'] =  (1 << 0)   (the word must end in an E)
    971     all others are all 0
    972 
    973  For the second affentry:
    974     numconds = 2             (only examine the last two characters)     
    975 
    976     conds[X] = conds[X] | (1 << 0)     (aeiou are not allowed)
    977         where X is all characters *but* a, e, i, o, or u
    978         
    979 
    980     conds['y'] = (1 << 1)     (the last char must be a y)
    981     all other bits for all other entries in the conds array are zero
    982 
    983 #endif
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE