tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

affixmgr.cxx (152435B)


      1 /* ***** BEGIN LICENSE BLOCK *****
      2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
      3 *
      4 * Copyright (C) 2002-2022 Németh László
      5 *
      6 * The contents of this file are subject to the Mozilla Public License Version
      7 * 1.1 (the "License"); you may not use this file except in compliance with
      8 * the License. You may obtain a copy of the License at
      9 * http://www.mozilla.org/MPL/
     10 *
     11 * Software distributed under the License is distributed on an "AS IS" basis,
     12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
     13 * for the specific language governing rights and limitations under the
     14 * License.
     15 *
     16 * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
     17 *
     18 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
     19 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
     20 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
     21 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
     22 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
     23 *
     24 * Alternatively, the contents of this file may be used under the terms of
     25 * either the GNU General Public License Version 2 or later (the "GPL"), or
     26 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
     27 * in which case the provisions of the GPL or the LGPL are applicable instead
     28 * of those above. If you wish to allow use of your version of this file only
     29 * under the terms of either the GPL or the LGPL, and not to allow others to
     30 * use your version of this file under the terms of the MPL, indicate your
     31 * decision by deleting the provisions above and replace them with the notice
     32 * and other provisions required by the GPL or the LGPL. If you do not delete
     33 * the provisions above, a recipient may use your version of this file under
     34 * the terms of any one of the MPL, the GPL or the LGPL.
     35 *
     36 * ***** END LICENSE BLOCK ***** */
     37 /*
     38 * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
     39 * And Contributors.  All rights reserved.
     40 *
     41 * Redistribution and use in source and binary forms, with or without
     42 * modification, are permitted provided that the following conditions
     43 * are met:
     44 *
     45 * 1. Redistributions of source code must retain the above copyright
     46 *    notice, this list of conditions and the following disclaimer.
     47 *
     48 * 2. Redistributions in binary form must reproduce the above copyright
     49 *    notice, this list of conditions and the following disclaimer in the
     50 *    documentation and/or other materials provided with the distribution.
     51 *
     52 * 3. All modifications to the source code must be clearly marked as
     53 *    such.  Binary redistributions based on modified source code
     54 *    must be clearly marked as modified versions in the documentation
     55 *    and/or other materials provided with the distribution.
     56 *
     57 * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
     58 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     59 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
     60 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
     61 * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
     62 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
     63 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     64 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     65 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     66 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     67 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     68 * SUCH DAMAGE.
     69 */
     70 
     71 #include <stdlib.h>
     72 #include <string.h>
     73 #include <stdio.h>
     74 #include <ctype.h>
     75 #include <time.h>
     76 
     77 #include <algorithm>
     78 #include <limits>
     79 #include <string>
     80 #include <vector>
     81 
     82 #include "affixmgr.hxx"
     83 #include "affentry.hxx"
     84 #include "langnum.hxx"
     85 
     86 #include "csutil.hxx"
     87 
     88 AffixMgr::AffixMgr(const char* affpath,
     89                   const std::vector<HashMgr*>& ptr,
     90                   const char* key)
     91  : alldic(ptr)
     92  , pHMgr(ptr[0]) {
     93 
     94  // register hash manager and load affix data from aff file
     95  csconv = NULL;
     96  utf8 = 0;
     97  complexprefixes = 0;
     98  parsedmaptable = false;
     99  parsedbreaktable = false;
    100  iconvtable = NULL;
    101  oconvtable = NULL;
    102  // allow simplified compound forms (see 3rd field of CHECKCOMPOUNDPATTERN)
    103  simplifiedcpd = 0;
    104  parsedcheckcpd = false;
    105  parseddefcpd = false;
    106  phone = NULL;
    107  compoundflag = FLAG_NULL;        // permits word in compound forms
    108  compoundbegin = FLAG_NULL;       // may be first word in compound forms
    109  compoundmiddle = FLAG_NULL;      // may be middle word in compound forms
    110  compoundend = FLAG_NULL;         // may be last word in compound forms
    111  compoundroot = FLAG_NULL;        // compound word signing flag
    112  compoundpermitflag = FLAG_NULL;  // compound permitting flag for suffixed word
    113  compoundforbidflag = FLAG_NULL;  // compound fordidden flag for suffixed word
    114  compoundmoresuffixes = 0;        // allow more suffixes within compound words
    115  checkcompounddup = 0;            // forbid double words in compounds
    116  checkcompoundrep = 0;  // forbid bad compounds (may be non-compound word with
    117                         // a REP substitution)
    118  checkcompoundcase =
    119      0;  // forbid upper and lowercase combinations at word bounds
    120  checkcompoundtriple = 0;  // forbid compounds with triple letters
    121  simplifiedtriple = 0;     // allow simplified triple letters in compounds
    122                            // (Schiff+fahrt -> Schiffahrt)
    123  forbiddenword = FORBIDDENWORD;  // forbidden word signing flag
    124  nosuggest = FLAG_NULL;  // don't suggest words signed with NOSUGGEST flag
    125  nongramsuggest = FLAG_NULL;
    126  langnum = 0;  // language code (see http://l10n.openoffice.org/languages.html)
    127  needaffix = FLAG_NULL;  // forbidden root, allowed only with suffixes
    128  cpdwordmax = -1;        // default: unlimited wordcount in compound words
    129  cpdmin = -1;            // undefined
    130  cpdmaxsyllable = 0;     // default: unlimited syllablecount in compound words
    131  pfxappnd = NULL;  // previous prefix for counting syllables of the prefix BUG
    132  sfxappnd = NULL;  // previous suffix for counting syllables of the suffix BUG
    133  sfxextra = 0;     // modifier for syllable count of sfxappnd BUG
    134  checknum = 0;               // checking numbers, and word with numbers
    135  havecontclass = 0;  // flags of possible continuing classes (double affix)
    136  // LEMMA_PRESENT: not put root into the morphological output. Lemma presents
    137  // in morhological description in dictionary file. It's often combined with
    138  // PSEUDOROOT.
    139  lemma_present = FLAG_NULL;
    140  circumfix = FLAG_NULL;
    141  onlyincompound = FLAG_NULL;
    142  maxngramsugs = -1;  // undefined
    143  maxdiff = -1;       // undefined
    144  onlymaxdiff = 0;
    145  maxcpdsugs = -1;  // undefined
    146  nosplitsugs = 0;
    147  sugswithdots = 0;
    148  keepcase = 0;
    149  forceucase = 0;
    150  warn = 0;
    151  forbidwarn = 0;
    152  checksharps = 0;
    153  substandard = FLAG_NULL;
    154  fullstrip = 0;
    155 
    156  sfx = NULL;
    157  pfx = NULL;
    158 
    159  for (int i = 0; i < SETSIZE; i++) {
    160    pStart[i] = NULL;
    161    sStart[i] = NULL;
    162    pFlag[i] = NULL;
    163    sFlag[i] = NULL;
    164  }
    165 
    166  for (int j = 0; j < CONTSIZE; j++) {
    167    contclasses[j] = 0;
    168  }
    169 
    170  if (parse_file(affpath, key)) {
    171    HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n", affpath);
    172  }
    173 
    174  if (cpdmin == -1)
    175    cpdmin = MINCPDLEN;
    176 }
    177 
    178 AffixMgr::~AffixMgr() {
    179  // pass through linked prefix entries and clean up
    180  for (int i = 0; i < SETSIZE; i++) {
    181    pFlag[i] = NULL;
    182    PfxEntry* ptr = pStart[i];
    183    PfxEntry* nptr = NULL;
    184    while (ptr) {
    185      nptr = ptr->getNext();
    186      delete (ptr);
    187      ptr = nptr;
    188      nptr = NULL;
    189    }
    190  }
    191 
    192  // pass through linked suffix entries and clean up
    193  for (int j = 0; j < SETSIZE; j++) {
    194    sFlag[j] = NULL;
    195    SfxEntry* ptr = sStart[j];
    196    SfxEntry* nptr = NULL;
    197    while (ptr) {
    198      nptr = ptr->getNext();
    199      delete (ptr);
    200      ptr = nptr;
    201      nptr = NULL;
    202    }
    203    sStart[j] = NULL;
    204  }
    205 
    206  delete iconvtable;
    207  delete oconvtable;
    208  delete phone;
    209 
    210  FREE_FLAG(compoundflag);
    211  FREE_FLAG(compoundbegin);
    212  FREE_FLAG(compoundmiddle);
    213  FREE_FLAG(compoundend);
    214  FREE_FLAG(compoundpermitflag);
    215  FREE_FLAG(compoundforbidflag);
    216  FREE_FLAG(compoundroot);
    217  FREE_FLAG(forbiddenword);
    218  FREE_FLAG(nosuggest);
    219  FREE_FLAG(nongramsuggest);
    220  FREE_FLAG(needaffix);
    221  FREE_FLAG(lemma_present);
    222  FREE_FLAG(circumfix);
    223  FREE_FLAG(onlyincompound);
    224 
    225  cpdwordmax = 0;
    226  pHMgr = NULL;
    227  cpdmin = 0;
    228  cpdmaxsyllable = 0;
    229  free_utf_tbl();
    230  checknum = 0;
    231 #ifdef MOZILLA_CLIENT
    232  delete[] csconv;
    233 #endif
    234 }
    235 
    236 void AffixMgr::finishFileMgr(FileMgr* afflst) {
    237  delete afflst;
    238 
    239  // convert affix trees to sorted list
    240  process_pfx_tree_to_list();
    241  process_sfx_tree_to_list();
    242 }
    243 
    244 // read in aff file and build up prefix and suffix entry objects
    245 int AffixMgr::parse_file(const char* affpath, const char* key) {
    246 
    247  // checking flag duplication
    248  char dupflags[CONTSIZE];
    249  char dupflags_ini = 1;
    250 
    251  // first line indicator for removing byte order mark
    252  int firstline = 1;
    253 
    254  // open the affix file
    255  FileMgr* afflst = new FileMgr(affpath, key);
    256  if (!afflst) {
    257    HUNSPELL_WARNING(
    258        stderr, "error: could not open affix description file %s\n", affpath);
    259    return 1;
    260  }
    261 
    262  // step one is to parse the affix file building up the internal
    263  // affix data structures
    264 
    265  // read in each line ignoring any that do not
    266  // start with a known line type indicator
    267  std::string line;
    268  while (afflst->getline(line)) {
    269    mychomp(line);
    270 
    271    /* remove byte order mark */
    272    if (firstline) {
    273      firstline = 0;
    274      // Affix file begins with byte order mark: possible incompatibility with
    275      // old Hunspell versions
    276      if (line.compare(0, 3, "\xEF\xBB\xBF", 3) == 0) {
    277        line.erase(0, 3);
    278      }
    279    }
    280 
    281    /* parse in the keyboard string */
    282    if (line.compare(0, 3, "KEY", 3) == 0) {
    283      if (!parse_string(line, keystring, afflst->getlinenum())) {
    284        finishFileMgr(afflst);
    285        return 1;
    286      }
    287    }
    288 
    289    /* parse in the try string */
    290    if (line.compare(0, 3, "TRY", 3) == 0) {
    291      if (!parse_string(line, trystring, afflst->getlinenum())) {
    292        finishFileMgr(afflst);
    293        return 1;
    294      }
    295    }
    296 
    297    /* parse in the name of the character set used by the .dict and .aff */
    298    if (line.compare(0, 3, "SET", 3) == 0) {
    299      if (!parse_string(line, encoding, afflst->getlinenum())) {
    300        finishFileMgr(afflst);
    301        return 1;
    302      }
    303      if (encoding == "UTF-8") {
    304        utf8 = 1;
    305 #ifndef OPENOFFICEORG
    306 #ifndef MOZILLA_CLIENT
    307        initialize_utf_tbl();
    308 #endif
    309 #endif
    310      }
    311    }
    312 
    313    /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left
    314     * writing system */
    315    if (line.compare(0, 15, "COMPLEXPREFIXES", 15) == 0)
    316      complexprefixes = 1;
    317 
    318    /* parse in the flag used by the controlled compound words */
    319    if (line.compare(0, 12, "COMPOUNDFLAG", 12) == 0) {
    320      if (!parse_flag(line, &compoundflag, afflst)) {
    321        finishFileMgr(afflst);
    322        return 1;
    323      }
    324    }
    325 
    326    /* parse in the flag used by compound words */
    327    if (line.compare(0, 13, "COMPOUNDBEGIN", 13) == 0) {
    328      if (complexprefixes) {
    329        if (!parse_flag(line, &compoundend, afflst)) {
    330          finishFileMgr(afflst);
    331          return 1;
    332        }
    333      } else {
    334        if (!parse_flag(line, &compoundbegin, afflst)) {
    335          finishFileMgr(afflst);
    336          return 1;
    337        }
    338      }
    339    }
    340 
    341    /* parse in the flag used by compound words */
    342    if (line.compare(0, 14, "COMPOUNDMIDDLE", 14) == 0) {
    343      if (!parse_flag(line, &compoundmiddle, afflst)) {
    344        finishFileMgr(afflst);
    345        return 1;
    346      }
    347    }
    348 
    349    /* parse in the flag used by compound words */
    350    if (line.compare(0, 11, "COMPOUNDEND", 11) == 0) {
    351      if (complexprefixes) {
    352        if (!parse_flag(line, &compoundbegin, afflst)) {
    353          finishFileMgr(afflst);
    354          return 1;
    355        }
    356      } else {
    357        if (!parse_flag(line, &compoundend, afflst)) {
    358          finishFileMgr(afflst);
    359          return 1;
    360        }
    361      }
    362    }
    363 
    364    /* parse in the data used by compound_check() method */
    365    if (line.compare(0, 15, "COMPOUNDWORDMAX", 15) == 0) {
    366      if (!parse_num(line, &cpdwordmax, afflst)) {
    367        finishFileMgr(afflst);
    368        return 1;
    369      }
    370    }
    371 
    372    /* parse in the flag sign compounds in dictionary */
    373    if (line.compare(0, 12, "COMPOUNDROOT", 12) == 0) {
    374      if (!parse_flag(line, &compoundroot, afflst)) {
    375        finishFileMgr(afflst);
    376        return 1;
    377      }
    378    }
    379 
    380    /* parse in the flag used by compound_check() method */
    381    if (line.compare(0, 18, "COMPOUNDPERMITFLAG", 18) == 0) {
    382      if (!parse_flag(line, &compoundpermitflag, afflst)) {
    383        finishFileMgr(afflst);
    384        return 1;
    385      }
    386    }
    387 
    388    /* parse in the flag used by compound_check() method */
    389    if (line.compare(0, 18, "COMPOUNDFORBIDFLAG", 18) == 0) {
    390      if (!parse_flag(line, &compoundforbidflag, afflst)) {
    391        finishFileMgr(afflst);
    392        return 1;
    393      }
    394    }
    395 
    396    if (line.compare(0, 20, "COMPOUNDMORESUFFIXES", 20) == 0) {
    397      compoundmoresuffixes = 1;
    398    }
    399 
    400    if (line.compare(0, 16, "CHECKCOMPOUNDDUP", 16) == 0) {
    401      checkcompounddup = 1;
    402    }
    403 
    404    if (line.compare(0, 16, "CHECKCOMPOUNDREP", 16) == 0) {
    405      checkcompoundrep = 1;
    406    }
    407 
    408    if (line.compare(0, 19, "CHECKCOMPOUNDTRIPLE", 19) == 0) {
    409      checkcompoundtriple = 1;
    410    }
    411 
    412    if (line.compare(0, 16, "SIMPLIFIEDTRIPLE", 16) == 0) {
    413      simplifiedtriple = 1;
    414    }
    415 
    416    if (line.compare(0, 17, "CHECKCOMPOUNDCASE", 17) == 0) {
    417      checkcompoundcase = 1;
    418    }
    419 
    420    if (line.compare(0, 9, "NOSUGGEST", 9) == 0) {
    421      if (!parse_flag(line, &nosuggest, afflst)) {
    422        finishFileMgr(afflst);
    423        return 1;
    424      }
    425    }
    426 
    427    if (line.compare(0, 14, "NONGRAMSUGGEST", 14) == 0) {
    428      if (!parse_flag(line, &nongramsuggest, afflst)) {
    429        finishFileMgr(afflst);
    430        return 1;
    431      }
    432    }
    433 
    434    /* parse in the flag used by forbidden words */
    435    if (line.compare(0, 13, "FORBIDDENWORD", 13) == 0) {
    436      if (!parse_flag(line, &forbiddenword, afflst)) {
    437        finishFileMgr(afflst);
    438        return 1;
    439      }
    440    }
    441 
    442    /* parse in the flag used by forbidden words (is deprecated) */
    443    if (line.compare(0, 13, "LEMMA_PRESENT", 13) == 0) {
    444      if (!parse_flag(line, &lemma_present, afflst)) {
    445        finishFileMgr(afflst);
    446        return 1;
    447      }
    448    }
    449 
    450    /* parse in the flag used by circumfixes */
    451    if (line.compare(0, 9, "CIRCUMFIX", 9) == 0) {
    452      if (!parse_flag(line, &circumfix, afflst)) {
    453        finishFileMgr(afflst);
    454        return 1;
    455      }
    456    }
    457 
    458    /* parse in the flag used by fogemorphemes */
    459    if (line.compare(0, 14, "ONLYINCOMPOUND", 14) == 0) {
    460      if (!parse_flag(line, &onlyincompound, afflst)) {
    461        finishFileMgr(afflst);
    462        return 1;
    463      }
    464    }
    465 
    466    /* parse in the flag used by `needaffixs' (is deprecated) */
    467    if (line.compare(0, 10, "PSEUDOROOT", 10) == 0) {
    468      if (!parse_flag(line, &needaffix, afflst)) {
    469        finishFileMgr(afflst);
    470        return 1;
    471      }
    472    }
    473 
    474    /* parse in the flag used by `needaffixs' */
    475    if (line.compare(0, 9, "NEEDAFFIX", 9) == 0) {
    476      if (!parse_flag(line, &needaffix, afflst)) {
    477        finishFileMgr(afflst);
    478        return 1;
    479      }
    480    }
    481 
    482    /* parse in the minimal length for words in compounds */
    483    if (line.compare(0, 11, "COMPOUNDMIN", 11) == 0) {
    484      if (!parse_num(line, &cpdmin, afflst)) {
    485        finishFileMgr(afflst);
    486        return 1;
    487      }
    488      if (cpdmin < 1)
    489        cpdmin = 1;
    490    }
    491 
    492    /* parse in the max. words and syllables in compounds */
    493    if (line.compare(0, 16, "COMPOUNDSYLLABLE", 16) == 0) {
    494      if (!parse_cpdsyllable(line, afflst)) {
    495        finishFileMgr(afflst);
    496        return 1;
    497      }
    498    }
    499 
    500    /* parse in the flag used by compound_check() method */
    501    if (line.compare(0, 11, "SYLLABLENUM", 11) == 0) {
    502      if (!parse_string(line, cpdsyllablenum, afflst->getlinenum())) {
    503        finishFileMgr(afflst);
    504        return 1;
    505      }
    506    }
    507 
    508    /* parse in the flag used by the controlled compound words */
    509    if (line.compare(0, 8, "CHECKNUM", 8) == 0) {
    510      checknum = 1;
    511    }
    512 
    513    /* parse in the extra word characters */
    514    if (line.compare(0, 9, "WORDCHARS", 9) == 0) {
    515      if (!parse_array(line, wordchars, wordchars_utf16,
    516                       utf8, afflst->getlinenum())) {
    517        finishFileMgr(afflst);
    518        return 1;
    519      }
    520    }
    521 
    522    /* parse in the ignored characters (for example, Arabic optional diacretics
    523     * charachters */
    524    if (line.compare(0, 6, "IGNORE", 6) == 0) {
    525      if (!parse_array(line, ignorechars, ignorechars_utf16,
    526                       utf8, afflst->getlinenum())) {
    527        finishFileMgr(afflst);
    528        return 1;
    529      }
    530    }
    531 
    532    /* parse in the input conversion table */
    533    if (line.compare(0, 5, "ICONV", 5) == 0) {
    534      if (!parse_convtable(line, afflst, &iconvtable, "ICONV")) {
    535        finishFileMgr(afflst);
    536        return 1;
    537      }
    538    }
    539 
    540    /* parse in the output conversion table */
    541    if (line.compare(0, 5, "OCONV", 5) == 0) {
    542      if (!parse_convtable(line, afflst, &oconvtable, "OCONV")) {
    543        finishFileMgr(afflst);
    544        return 1;
    545      }
    546    }
    547 
    548    /* parse in the phonetic translation table */
    549    if (line.compare(0, 5, "PHONE", 5) == 0) {
    550      if (!parse_phonetable(line, afflst)) {
    551        finishFileMgr(afflst);
    552        return 1;
    553      }
    554    }
    555 
    556    /* parse in the checkcompoundpattern table */
    557    if (line.compare(0, 20, "CHECKCOMPOUNDPATTERN", 20) == 0) {
    558      if (!parse_checkcpdtable(line, afflst)) {
    559        finishFileMgr(afflst);
    560        return 1;
    561      }
    562    }
    563 
    564    /* parse in the defcompound table */
    565    if (line.compare(0, 12, "COMPOUNDRULE", 12) == 0) {
    566      if (!parse_defcpdtable(line, afflst)) {
    567        finishFileMgr(afflst);
    568        return 1;
    569      }
    570    }
    571 
    572    /* parse in the related character map table */
    573    if (line.compare(0, 3, "MAP", 3) == 0) {
    574      if (!parse_maptable(line, afflst)) {
    575        finishFileMgr(afflst);
    576        return 1;
    577      }
    578    }
    579 
    580    /* parse in the word breakpoints table */
    581    if (line.compare(0, 5, "BREAK", 5) == 0) {
    582      if (!parse_breaktable(line, afflst)) {
    583        finishFileMgr(afflst);
    584        return 1;
    585      }
    586    }
    587 
    588    /* parse in the language for language specific codes */
    589    if (line.compare(0, 4, "LANG", 4) == 0) {
    590      if (!parse_string(line, lang, afflst->getlinenum())) {
    591        finishFileMgr(afflst);
    592        return 1;
    593      }
    594      langnum = get_lang_num(lang);
    595    }
    596 
    597    if (line.compare(0, 7, "VERSION", 7) == 0) {
    598      size_t startpos = line.find_first_not_of(" \t", 7);
    599      if (startpos != std::string::npos) {
    600          version = line.substr(startpos);
    601      }
    602    }
    603 
    604    if (line.compare(0, 12, "MAXNGRAMSUGS", 12) == 0) {
    605      if (!parse_num(line, &maxngramsugs, afflst)) {
    606        finishFileMgr(afflst);
    607        return 1;
    608      }
    609    }
    610 
    611    if (line.compare(0, 11, "ONLYMAXDIFF", 11) == 0)
    612      onlymaxdiff = 1;
    613 
    614    if (line.compare(0, 7, "MAXDIFF", 7) == 0) {
    615      if (!parse_num(line, &maxdiff, afflst)) {
    616        finishFileMgr(afflst);
    617        return 1;
    618      }
    619    }
    620 
    621    if (line.compare(0, 10, "MAXCPDSUGS", 10) == 0) {
    622      if (!parse_num(line, &maxcpdsugs, afflst)) {
    623        finishFileMgr(afflst);
    624        return 1;
    625      }
    626    }
    627 
    628    if (line.compare(0, 11, "NOSPLITSUGS", 11) == 0) {
    629      nosplitsugs = 1;
    630    }
    631 
    632    if (line.compare(0, 9, "FULLSTRIP", 9) == 0) {
    633      fullstrip = 1;
    634    }
    635 
    636    if (line.compare(0, 12, "SUGSWITHDOTS", 12) == 0) {
    637      sugswithdots = 1;
    638    }
    639 
    640    /* parse in the flag used by forbidden words */
    641    if (line.compare(0, 8, "KEEPCASE", 8) == 0) {
    642      if (!parse_flag(line, &keepcase, afflst)) {
    643        finishFileMgr(afflst);
    644        return 1;
    645      }
    646    }
    647 
    648    /* parse in the flag used by `forceucase' */
    649    if (line.compare(0, 10, "FORCEUCASE", 10) == 0) {
    650      if (!parse_flag(line, &forceucase, afflst)) {
    651        finishFileMgr(afflst);
    652        return 1;
    653      }
    654    }
    655 
    656    /* parse in the flag used by `warn' */
    657    if (line.compare(0, 4, "WARN", 4) == 0) {
    658      if (!parse_flag(line, &warn, afflst)) {
    659        finishFileMgr(afflst);
    660        return 1;
    661      }
    662    }
    663 
    664    if (line.compare(0, 10, "FORBIDWARN", 10) == 0) {
    665      forbidwarn = 1;
    666    }
    667 
    668    /* parse in the flag used by the affix generator */
    669    if (line.compare(0, 11, "SUBSTANDARD", 11) == 0) {
    670      if (!parse_flag(line, &substandard, afflst)) {
    671        finishFileMgr(afflst);
    672        return 1;
    673      }
    674    }
    675 
    676    if (line.compare(0, 11, "CHECKSHARPS", 11) == 0) {
    677      checksharps = 1;
    678    }
    679 
    680    /* parse this affix: P - prefix, S - suffix */
    681    // affix type
    682    char ft = ' ';
    683    if (line.compare(0, 3, "PFX", 3) == 0)
    684      ft = complexprefixes ? 'S' : 'P';
    685    if (line.compare(0, 3, "SFX", 3) == 0)
    686      ft = complexprefixes ? 'P' : 'S';
    687    if (ft != ' ') {
    688      if (dupflags_ini) {
    689        memset(dupflags, 0, sizeof(dupflags));
    690        dupflags_ini = 0;
    691      }
    692      if (!parse_affix(line, ft, afflst, dupflags)) {
    693        finishFileMgr(afflst);
    694        return 1;
    695      }
    696    }
    697  }
    698 
    699  finishFileMgr(afflst);
    700  // affix trees are sorted now
    701 
    702  // now we can speed up performance greatly taking advantage of the
    703  // relationship between the affixes and the idea of "subsets".
    704 
    705  // View each prefix as a potential leading subset of another and view
    706  // each suffix (reversed) as a potential trailing subset of another.
    707 
    708  // To illustrate this relationship if we know the prefix "ab" is found in the
    709  // word to examine, only prefixes that "ab" is a leading subset of need be
    710  // examined.
    711  // Furthermore is "ab" is not present then none of the prefixes that "ab" is
    712  // is a subset need be examined.
    713  // The same argument goes for suffix string that are reversed.
    714 
    715  // Then to top this off why not examine the first char of the word to quickly
    716  // limit the set of prefixes to examine (i.e. the prefixes to examine must
    717  // be leading supersets of the first character of the word (if they exist)
    718 
    719  // To take advantage of this "subset" relationship, we need to add two links
    720  // from entry.  One to take next if the current prefix is found (call it
    721  // nexteq)
    722  // and one to take next if the current prefix is not found (call it nextne).
    723 
    724  // Since we have built ordered lists, all that remains is to properly
    725  // initialize
    726  // the nextne and nexteq pointers that relate them
    727 
    728  process_pfx_order();
    729  process_sfx_order();
    730 
    731  /* get encoding for CHECKCOMPOUNDCASE */
    732  if (!utf8) {
    733    csconv = get_current_cs(get_encoding());
    734    for (int i = 0; i <= 255; i++) {
    735      if ((csconv[i].cupper != csconv[i].clower) &&
    736          (wordchars.find((char)i) == std::string::npos)) {
    737        wordchars.push_back((char)i);
    738      }
    739    }
    740 
    741  }
    742 
    743  // default BREAK definition
    744  if (!parsedbreaktable) {
    745    breaktable.push_back("-");
    746    breaktable.push_back("^-");
    747    breaktable.push_back("-$");
    748    parsedbreaktable = true;
    749  }
    750  return 0;
    751 }
    752 
    753 // we want to be able to quickly access prefix information
    754 // both by prefix flag, and sorted by prefix string itself
    755 // so we need to set up two indexes
    756 
    757 int AffixMgr::build_pfxtree(PfxEntry* pfxptr) {
    758  PfxEntry* ptr;
    759  PfxEntry* pptr;
    760  PfxEntry* ep = pfxptr;
    761 
    762  // get the right starting points
    763  const char* key = ep->getKey();
    764  const unsigned char flg = (unsigned char)(ep->getFlag() & 0x00FF);
    765 
    766  // first index by flag which must exist
    767  ptr = pFlag[flg];
    768  ep->setFlgNxt(ptr);
    769  pFlag[flg] = ep;
    770 
    771  // handle the special case of null affix string
    772  if (strlen(key) == 0) {
    773    // always inset them at head of list at element 0
    774    ptr = pStart[0];
    775    ep->setNext(ptr);
    776    pStart[0] = ep;
    777    return 0;
    778  }
    779 
    780  // now handle the normal case
    781  ep->setNextEQ(NULL);
    782  ep->setNextNE(NULL);
    783 
    784  unsigned char sp = *((const unsigned char*)key);
    785  ptr = pStart[sp];
    786 
    787  // handle the first insert
    788  if (!ptr) {
    789    pStart[sp] = ep;
    790    return 0;
    791  }
    792 
    793  // otherwise use binary tree insertion so that a sorted
    794  // list can easily be generated later
    795  pptr = NULL;
    796  for (;;) {
    797    pptr = ptr;
    798    if (strcmp(ep->getKey(), ptr->getKey()) <= 0) {
    799      ptr = ptr->getNextEQ();
    800      if (!ptr) {
    801        pptr->setNextEQ(ep);
    802        break;
    803      }
    804    } else {
    805      ptr = ptr->getNextNE();
    806      if (!ptr) {
    807        pptr->setNextNE(ep);
    808        break;
    809      }
    810    }
    811  }
    812  return 0;
    813 }
    814 
    815 // we want to be able to quickly access suffix information
    816 // both by suffix flag, and sorted by the reverse of the
    817 // suffix string itself; so we need to set up two indexes
    818 int AffixMgr::build_sfxtree(SfxEntry* sfxptr) {
    819 
    820  sfxptr->initReverseWord();
    821 
    822  SfxEntry* ptr;
    823  SfxEntry* pptr;
    824  SfxEntry* ep = sfxptr;
    825 
    826  /* get the right starting point */
    827  const char* key = ep->getKey();
    828  const unsigned char flg = (unsigned char)(ep->getFlag() & 0x00FF);
    829 
    830  // first index by flag which must exist
    831  ptr = sFlag[flg];
    832  ep->setFlgNxt(ptr);
    833  sFlag[flg] = ep;
    834 
    835  // next index by affix string
    836 
    837  // handle the special case of null affix string
    838  if (strlen(key) == 0) {
    839    // always inset them at head of list at element 0
    840    ptr = sStart[0];
    841    ep->setNext(ptr);
    842    sStart[0] = ep;
    843    return 0;
    844  }
    845 
    846  // now handle the normal case
    847  ep->setNextEQ(NULL);
    848  ep->setNextNE(NULL);
    849 
    850  unsigned char sp = *((const unsigned char*)key);
    851  ptr = sStart[sp];
    852 
    853  // handle the first insert
    854  if (!ptr) {
    855    sStart[sp] = ep;
    856    return 0;
    857  }
    858 
    859  // otherwise use binary tree insertion so that a sorted
    860  // list can easily be generated later
    861  pptr = NULL;
    862  for (;;) {
    863    pptr = ptr;
    864    if (strcmp(ep->getKey(), ptr->getKey()) <= 0) {
    865      ptr = ptr->getNextEQ();
    866      if (!ptr) {
    867        pptr->setNextEQ(ep);
    868        break;
    869      }
    870    } else {
    871      ptr = ptr->getNextNE();
    872      if (!ptr) {
    873        pptr->setNextNE(ep);
    874        break;
    875      }
    876    }
    877  }
    878  return 0;
    879 }
    880 
    881 // convert from binary tree to sorted list
    882 int AffixMgr::process_pfx_tree_to_list() {
    883  for (int i = 1; i < SETSIZE; i++) {
    884    pStart[i] = process_pfx_in_order(pStart[i], NULL);
    885  }
    886  return 0;
    887 }
    888 
    889 PfxEntry* AffixMgr::process_pfx_in_order(PfxEntry* ptr, PfxEntry* nptr) {
    890  if (ptr) {
    891    nptr = process_pfx_in_order(ptr->getNextNE(), nptr);
    892    ptr->setNext(nptr);
    893    nptr = process_pfx_in_order(ptr->getNextEQ(), ptr);
    894  }
    895  return nptr;
    896 }
    897 
    898 // convert from binary tree to sorted list
    899 int AffixMgr::process_sfx_tree_to_list() {
    900  for (int i = 1; i < SETSIZE; i++) {
    901    sStart[i] = process_sfx_in_order(sStart[i], NULL);
    902  }
    903  return 0;
    904 }
    905 
    906 SfxEntry* AffixMgr::process_sfx_in_order(SfxEntry* ptr, SfxEntry* nptr) {
    907  if (ptr) {
    908    nptr = process_sfx_in_order(ptr->getNextNE(), nptr);
    909    ptr->setNext(nptr);
    910    nptr = process_sfx_in_order(ptr->getNextEQ(), ptr);
    911  }
    912  return nptr;
    913 }
    914 
    915 // reinitialize the PfxEntry links NextEQ and NextNE to speed searching
    916 // using the idea of leading subsets this time
    917 int AffixMgr::process_pfx_order() {
    918  PfxEntry* ptr;
    919 
    920  // loop through each prefix list starting point
    921  for (int i = 1; i < SETSIZE; i++) {
    922    ptr = pStart[i];
    923 
    924    // look through the remainder of the list
    925    //  and find next entry with affix that
    926    // the current one is not a subset of
    927    // mark that as destination for NextNE
    928    // use next in list that you are a subset
    929    // of as NextEQ
    930 
    931    for (; ptr != NULL; ptr = ptr->getNext()) {
    932      PfxEntry* nptr = ptr->getNext();
    933      for (; nptr != NULL; nptr = nptr->getNext()) {
    934        if (!isSubset(ptr->getKey(), nptr->getKey()))
    935          break;
    936      }
    937      ptr->setNextNE(nptr);
    938      ptr->setNextEQ(NULL);
    939      if ((ptr->getNext()) &&
    940          isSubset(ptr->getKey(), (ptr->getNext())->getKey()))
    941        ptr->setNextEQ(ptr->getNext());
    942    }
    943 
    944    // now clean up by adding smart search termination strings:
    945    // if you are already a superset of the previous prefix
    946    // but not a subset of the next, search can end here
    947    // so set NextNE properly
    948 
    949    ptr = pStart[i];
    950    for (; ptr != NULL; ptr = ptr->getNext()) {
    951      PfxEntry* nptr = ptr->getNext();
    952      PfxEntry* mptr = NULL;
    953      for (; nptr != NULL; nptr = nptr->getNext()) {
    954        if (!isSubset(ptr->getKey(), nptr->getKey()))
    955          break;
    956        mptr = nptr;
    957      }
    958      if (mptr)
    959        mptr->setNextNE(NULL);
    960    }
    961  }
    962  return 0;
    963 }
    964 
    965 // initialize the SfxEntry links NextEQ and NextNE to speed searching
    966 // using the idea of leading subsets this time
    967 int AffixMgr::process_sfx_order() {
    968  SfxEntry* ptr;
    969 
    970  // loop through each prefix list starting point
    971  for (int i = 1; i < SETSIZE; i++) {
    972    ptr = sStart[i];
    973 
    974    // look through the remainder of the list
    975    //  and find next entry with affix that
    976    // the current one is not a subset of
    977    // mark that as destination for NextNE
    978    // use next in list that you are a subset
    979    // of as NextEQ
    980 
    981    for (; ptr != NULL; ptr = ptr->getNext()) {
    982      SfxEntry* nptr = ptr->getNext();
    983      for (; nptr != NULL; nptr = nptr->getNext()) {
    984        if (!isSubset(ptr->getKey(), nptr->getKey()))
    985          break;
    986      }
    987      ptr->setNextNE(nptr);
    988      ptr->setNextEQ(NULL);
    989      if ((ptr->getNext()) &&
    990          isSubset(ptr->getKey(), (ptr->getNext())->getKey()))
    991        ptr->setNextEQ(ptr->getNext());
    992    }
    993 
    994    // now clean up by adding smart search termination strings:
    995    // if you are already a superset of the previous suffix
    996    // but not a subset of the next, search can end here
    997    // so set NextNE properly
    998 
    999    ptr = sStart[i];
   1000    for (; ptr != NULL; ptr = ptr->getNext()) {
   1001      SfxEntry* nptr = ptr->getNext();
   1002      SfxEntry* mptr = NULL;
   1003      for (; nptr != NULL; nptr = nptr->getNext()) {
   1004        if (!isSubset(ptr->getKey(), nptr->getKey()))
   1005          break;
   1006        mptr = nptr;
   1007      }
   1008      if (mptr)
   1009        mptr->setNextNE(NULL);
   1010    }
   1011  }
   1012  return 0;
   1013 }
   1014 
   1015 // add flags to the result for dictionary debugging
   1016 std::string& AffixMgr::debugflag(std::string& result, unsigned short flag) {
   1017  char* st = encode_flag(flag);
   1018  result.push_back(MSEP_FLD);
   1019  result.append(MORPH_FLAG);
   1020  if (st) {
   1021    result.append(st);
   1022    free(st);
   1023  }
   1024  return result;
   1025 }
   1026 
   1027 // calculate the character length of the condition
   1028 int AffixMgr::condlen(const char* st) {
   1029  int l = 0;
   1030  bool group = false;
   1031  for (; *st; st++) {
   1032    if (*st == '[') {
   1033      group = true;
   1034      l++;
   1035    } else if (*st == ']')
   1036      group = false;
   1037    else if (!group && (!utf8 || (!(*st & 0x80) || ((*st & 0xc0) == 0x80))))
   1038      l++;
   1039  }
   1040  return l;
   1041 }
   1042 
   1043 int AffixMgr::encodeit(AffEntry& entry, const char* cs) {
   1044  if (strcmp(cs, ".") != 0) {
   1045    entry.numconds = (char)condlen(cs);
   1046    const size_t cslen = strlen(cs);
   1047    const size_t short_part = std::min<size_t>(MAXCONDLEN, cslen);
   1048    memcpy(entry.c.conds, cs, short_part);
   1049    if (short_part < MAXCONDLEN) {
   1050      //blank out the remaining space
   1051      memset(entry.c.conds + short_part, 0, MAXCONDLEN - short_part);
   1052    } else if (cs[MAXCONDLEN]) {
   1053      //there is more conditions than fit in fixed space, so its
   1054      //a long condition
   1055      entry.opts |= aeLONGCOND;
   1056      entry.c.l.conds2 = mystrdup(cs + MAXCONDLEN_1);
   1057      if (!entry.c.l.conds2)
   1058        return 1;
   1059    }
   1060  } else {
   1061    entry.numconds = 0;
   1062    entry.c.conds[0] = '\0';
   1063  }
   1064  return 0;
   1065 }
   1066 
   1067 // return 1 if s1 is a leading subset of s2 (dots are for infixes)
   1068 inline int AffixMgr::isSubset(const char* s1, const char* s2) {
   1069  while (((*s1 == *s2) || (*s1 == '.')) && (*s1 != '\0')) {
   1070    s1++;
   1071    s2++;
   1072  }
   1073  return (*s1 == '\0');
   1074 }
   1075 
   1076 // check word for prefixes
   1077 struct hentry* AffixMgr::prefix_check(const char* word,
   1078                                      int len,
   1079                                      char in_compound,
   1080                                      const FLAG needflag) {
   1081  struct hentry* rv = NULL;
   1082 
   1083  pfx = NULL;
   1084  pfxappnd = NULL;
   1085  sfxappnd = NULL;
   1086  sfxextra = 0;
   1087 
   1088  // first handle the special case of 0 length prefixes
   1089  PfxEntry* pe = pStart[0];
   1090  while (pe) {
   1091    if (
   1092        // fogemorpheme
   1093        ((in_compound != IN_CPD_NOT) ||
   1094         !(pe->getCont() &&
   1095           (TESTAFF(pe->getCont(), onlyincompound, pe->getContLen())))) &&
   1096        // permit prefixes in compounds
   1097        ((in_compound != IN_CPD_END) ||
   1098         (pe->getCont() &&
   1099          (TESTAFF(pe->getCont(), compoundpermitflag, pe->getContLen()))))) {
   1100      // check prefix
   1101      rv = pe->checkword(word, len, in_compound, needflag);
   1102      if (rv) {
   1103        pfx = pe;  // BUG: pfx not stateless
   1104        return rv;
   1105      }
   1106    }
   1107    pe = pe->getNext();
   1108  }
   1109 
   1110  // now handle the general case
   1111  unsigned char sp = *((const unsigned char*)word);
   1112  PfxEntry* pptr = pStart[sp];
   1113 
   1114  while (pptr) {
   1115    if (isSubset(pptr->getKey(), word)) {
   1116      if (
   1117          // fogemorpheme
   1118          ((in_compound != IN_CPD_NOT) ||
   1119           !(pptr->getCont() &&
   1120             (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen())))) &&
   1121          // permit prefixes in compounds
   1122          ((in_compound != IN_CPD_END) ||
   1123           (pptr->getCont() && (TESTAFF(pptr->getCont(), compoundpermitflag,
   1124                                        pptr->getContLen()))))) {
   1125        // check prefix
   1126        rv = pptr->checkword(word, len, in_compound, needflag);
   1127        if (rv) {
   1128          pfx = pptr;  // BUG: pfx not stateless
   1129          return rv;
   1130        }
   1131      }
   1132      pptr = pptr->getNextEQ();
   1133    } else {
   1134      pptr = pptr->getNextNE();
   1135    }
   1136  }
   1137 
   1138  return NULL;
   1139 }
   1140 
   1141 // check word for prefixes and two-level suffixes
   1142 struct hentry* AffixMgr::prefix_check_twosfx(const char* word,
   1143                                             int len,
   1144                                             char in_compound,
   1145                                             const FLAG needflag) {
   1146  struct hentry* rv = NULL;
   1147 
   1148  pfx = NULL;
   1149  sfxappnd = NULL;
   1150  sfxextra = 0;
   1151 
   1152  // first handle the special case of 0 length prefixes
   1153  PfxEntry* pe = pStart[0];
   1154 
   1155  while (pe) {
   1156    rv = pe->check_twosfx(word, len, in_compound, needflag);
   1157    if (rv)
   1158      return rv;
   1159    pe = pe->getNext();
   1160  }
   1161 
   1162  // now handle the general case
   1163  unsigned char sp = *((const unsigned char*)word);
   1164  PfxEntry* pptr = pStart[sp];
   1165 
   1166  while (pptr) {
   1167    if (isSubset(pptr->getKey(), word)) {
   1168      rv = pptr->check_twosfx(word, len, in_compound, needflag);
   1169      if (rv) {
   1170        pfx = pptr;
   1171        return rv;
   1172      }
   1173      pptr = pptr->getNextEQ();
   1174    } else {
   1175      pptr = pptr->getNextNE();
   1176    }
   1177  }
   1178 
   1179  return NULL;
   1180 }
   1181 
   1182 // check word for prefixes and morph
   1183 std::string AffixMgr::prefix_check_morph(const char* word,
   1184                                         int len,
   1185                                         char in_compound,
   1186                                         const FLAG needflag) {
   1187 
   1188  std::string result;
   1189 
   1190  pfx = NULL;
   1191  sfxappnd = NULL;
   1192  sfxextra = 0;
   1193 
   1194  // first handle the special case of 0 length prefixes
   1195  PfxEntry* pe = pStart[0];
   1196  while (pe) {
   1197    std::string st = pe->check_morph(word, len, in_compound, needflag);
   1198    if (!st.empty()) {
   1199      result.append(st);
   1200    }
   1201    pe = pe->getNext();
   1202  }
   1203 
   1204  // now handle the general case
   1205  unsigned char sp = *((const unsigned char*)word);
   1206  PfxEntry* pptr = pStart[sp];
   1207 
   1208  while (pptr) {
   1209    if (isSubset(pptr->getKey(), word)) {
   1210      std::string st = pptr->check_morph(word, len, in_compound, needflag);
   1211      if (!st.empty()) {
   1212        // fogemorpheme
   1213        if ((in_compound != IN_CPD_NOT) ||
   1214            !((pptr->getCont() && (TESTAFF(pptr->getCont(), onlyincompound,
   1215                                           pptr->getContLen()))))) {
   1216          result.append(st);
   1217          pfx = pptr;
   1218        }
   1219      }
   1220      pptr = pptr->getNextEQ();
   1221    } else {
   1222      pptr = pptr->getNextNE();
   1223    }
   1224  }
   1225 
   1226  return result;
   1227 }
   1228 
   1229 // check word for prefixes and morph and two-level suffixes
   1230 std::string AffixMgr::prefix_check_twosfx_morph(const char* word,
   1231                                                int len,
   1232                                                char in_compound,
   1233                                                const FLAG needflag) {
   1234  std::string result;
   1235 
   1236  pfx = NULL;
   1237  sfxappnd = NULL;
   1238  sfxextra = 0;
   1239 
   1240  // first handle the special case of 0 length prefixes
   1241  PfxEntry* pe = pStart[0];
   1242  while (pe) {
   1243    std::string st = pe->check_twosfx_morph(word, len, in_compound, needflag);
   1244    if (!st.empty()) {
   1245      result.append(st);
   1246    }
   1247    pe = pe->getNext();
   1248  }
   1249 
   1250  // now handle the general case
   1251  unsigned char sp = *((const unsigned char*)word);
   1252  PfxEntry* pptr = pStart[sp];
   1253 
   1254  while (pptr) {
   1255    if (isSubset(pptr->getKey(), word)) {
   1256      std::string st = pptr->check_twosfx_morph(word, len, in_compound, needflag);
   1257      if (!st.empty()) {
   1258        result.append(st);
   1259        pfx = pptr;
   1260      }
   1261      pptr = pptr->getNextEQ();
   1262    } else {
   1263      pptr = pptr->getNextNE();
   1264    }
   1265  }
   1266 
   1267  return result;
   1268 }
   1269 
   1270 // Is word a non-compound with a REP substitution (see checkcompoundrep)?
   1271 int AffixMgr::cpdrep_check(const char* word, int wl) {
   1272 
   1273  if ((wl < 2) || get_reptable().empty())
   1274    return 0;
   1275 
   1276  for (size_t i = 0; i < get_reptable().size(); ++i) {
   1277    // use only available mid patterns
   1278    if (!get_reptable()[i].outstrings[0].empty()) {
   1279      const char* r = word;
   1280      const size_t lenp = get_reptable()[i].pattern.size();
   1281      // search every occurence of the pattern in the word
   1282      while ((r = strstr(r, get_reptable()[i].pattern.c_str())) != NULL) {
   1283        std::string candidate(word);
   1284        candidate.replace(r - word, lenp, get_reptable()[i].outstrings[0]);
   1285        if (candidate_check(candidate.c_str(), candidate.size()))
   1286          return 1;
   1287        ++r;  // search for the next letter
   1288      }
   1289    }
   1290  }
   1291 
   1292 return 0;
   1293 }
   1294 
   1295 // forbid compound words, if they are in the dictionary as a
   1296 // word pair separated by space
   1297 int AffixMgr::cpdwordpair_check(const char * word, int wl) {
   1298  if (wl > 2) {
   1299    std::string candidate(word);
   1300    for (size_t i = 1; i < candidate.size(); i++) {
   1301      // go to end of the UTF-8 character
   1302      if (utf8 && ((word[i] & 0xc0) == 0x80))
   1303          continue;
   1304      candidate.insert(i, 1, ' ');
   1305      if (candidate_check(candidate.c_str(), candidate.size()))
   1306        return 1;
   1307      candidate.erase(i, 1);
   1308    }
   1309  }
   1310 
   1311  return 0;
   1312 }
   1313 
   1314 // forbid compoundings when there are special patterns at word bound
   1315 int AffixMgr::cpdpat_check(const char* word,
   1316                           int pos,
   1317                           hentry* r1,
   1318                           hentry* r2,
   1319                           const char /*affixed*/) {
   1320  for (size_t i = 0; i < checkcpdtable.size(); ++i) {
   1321    size_t len;
   1322    if (isSubset(checkcpdtable[i].pattern2.c_str(), word + pos) &&
   1323        (!r1 || !checkcpdtable[i].cond ||
   1324         (r1->astr && TESTAFF(r1->astr, checkcpdtable[i].cond, r1->alen))) &&
   1325        (!r2 || !checkcpdtable[i].cond2 ||
   1326         (r2->astr && TESTAFF(r2->astr, checkcpdtable[i].cond2, r2->alen))) &&
   1327        // zero length pattern => only TESTAFF
   1328        // zero pattern (0/flag) => unmodified stem (zero affixes allowed)
   1329        (checkcpdtable[i].pattern.empty() ||
   1330         ((checkcpdtable[i].pattern[0] == '0' && r1->blen <= pos &&
   1331           strncmp(word + pos - r1->blen, r1->word, r1->blen) == 0) ||
   1332          (checkcpdtable[i].pattern[0] != '0' &&
   1333           ((len = checkcpdtable[i].pattern.size()) != 0) &&
   1334           strncmp(word + pos - len, checkcpdtable[i].pattern.c_str(), len) == 0)))) {
   1335      return 1;
   1336    }
   1337  }
   1338  return 0;
   1339 }
   1340 
   1341 // forbid compounding with neighbouring upper and lower case characters at word
   1342 // bounds
   1343 int AffixMgr::cpdcase_check(const char* word, int pos) {
   1344  if (utf8) {
   1345    const char* p;
   1346    for (p = word + pos - 1; (*p & 0xc0) == 0x80; p--)
   1347      ;
   1348    std::string pair(p);
   1349    std::vector<w_char> pair_u;
   1350    u8_u16(pair_u, pair);
   1351    unsigned short a = pair_u.size() > 1 ? ((pair_u[1].h << 8) + pair_u[1].l) : 0;
   1352    unsigned short b = !pair_u.empty() ? ((pair_u[0].h << 8) + pair_u[0].l) : 0;
   1353    if (((unicodetoupper(a, langnum) == a) ||
   1354         (unicodetoupper(b, langnum) == b)) &&
   1355        (a != '-') && (b != '-'))
   1356      return 1;
   1357  } else {
   1358    unsigned char a = *(word + pos - 1);
   1359    unsigned char b = *(word + pos);
   1360    if ((csconv[a].ccase || csconv[b].ccase) && (a != '-') && (b != '-'))
   1361      return 1;
   1362  }
   1363  return 0;
   1364 }
   1365 
   1366 struct metachar_data {
   1367  signed short btpp;  // metacharacter (*, ?) position for backtracking
   1368  signed short btwp;  // word position for metacharacters
   1369  int btnum;          // number of matched characters in metacharacter
   1370 };
   1371 
   1372 // check compound patterns
   1373 int AffixMgr::defcpd_check(hentry*** words,
   1374                           short wnum,
   1375                           hentry* rv,
   1376                           hentry** def,
   1377                           char all) {
   1378  int w = 0;
   1379 
   1380  if (!*words) {
   1381    w = 1;
   1382    *words = def;
   1383  }
   1384 
   1385  if (!*words) {
   1386    return 0;
   1387  }
   1388 
   1389  std::vector<metachar_data> btinfo(1);
   1390 
   1391  short bt = 0;
   1392 
   1393  (*words)[wnum] = rv;
   1394 
   1395  // has the last word COMPOUNDRULE flag?
   1396  if (rv->alen == 0) {
   1397    (*words)[wnum] = NULL;
   1398    if (w)
   1399      *words = NULL;
   1400    return 0;
   1401  }
   1402  int ok = 0;
   1403  for (size_t i = 0; i < defcpdtable.size(); ++i) {
   1404    for (size_t j = 0; j < defcpdtable[i].size(); ++j) {
   1405      if (defcpdtable[i][j] != '*' && defcpdtable[i][j] != '?' &&
   1406          TESTAFF(rv->astr, defcpdtable[i][j], rv->alen)) {
   1407        ok = 1;
   1408        break;
   1409      }
   1410    }
   1411  }
   1412  if (ok == 0) {
   1413    (*words)[wnum] = NULL;
   1414    if (w)
   1415      *words = NULL;
   1416    return 0;
   1417  }
   1418 
   1419  for (size_t i = 0; i < defcpdtable.size(); ++i) {
   1420    size_t pp = 0;  // pattern position
   1421    signed short wp = 0;  // "words" position
   1422    int ok2;
   1423    ok = 1;
   1424    ok2 = 1;
   1425    do {
   1426      while ((pp < defcpdtable[i].size()) && (wp <= wnum)) {
   1427        if (((pp + 1) < defcpdtable[i].size()) &&
   1428            ((defcpdtable[i][pp + 1] == '*') ||
   1429             (defcpdtable[i][pp + 1] == '?'))) {
   1430          int wend = (defcpdtable[i][pp + 1] == '?') ? wp : wnum;
   1431          ok2 = 1;
   1432          pp += 2;
   1433          btinfo[bt].btpp = pp;
   1434          btinfo[bt].btwp = wp;
   1435          while (wp <= wend) {
   1436            if (!(*words)[wp]->alen ||
   1437                !TESTAFF((*words)[wp]->astr, defcpdtable[i][pp - 2],
   1438                         (*words)[wp]->alen)) {
   1439              ok2 = 0;
   1440              break;
   1441            }
   1442            wp++;
   1443          }
   1444          if (wp <= wnum)
   1445            ok2 = 0;
   1446          btinfo[bt].btnum = wp - btinfo[bt].btwp;
   1447          if (btinfo[bt].btnum > 0) {
   1448            ++bt;
   1449            btinfo.resize(bt+1);
   1450          }
   1451          if (ok2)
   1452            break;
   1453        } else {
   1454          ok2 = 1;
   1455          if (!(*words)[wp] || !(*words)[wp]->alen ||
   1456              !TESTAFF((*words)[wp]->astr, defcpdtable[i][pp],
   1457                       (*words)[wp]->alen)) {
   1458            ok = 0;
   1459            break;
   1460          }
   1461          pp++;
   1462          wp++;
   1463          if ((defcpdtable[i].size() == pp) && !(wp > wnum))
   1464            ok = 0;
   1465        }
   1466      }
   1467      if (ok && ok2) {
   1468        size_t r = pp;
   1469        while ((defcpdtable[i].size() > r) && ((r + 1) < defcpdtable[i].size()) &&
   1470               ((defcpdtable[i][r + 1] == '*') ||
   1471                (defcpdtable[i][r + 1] == '?')))
   1472          r += 2;
   1473        if (defcpdtable[i].size() <= r)
   1474          return 1;
   1475      }
   1476      // backtrack
   1477      if (bt)
   1478        do {
   1479          ok = 1;
   1480          btinfo[bt - 1].btnum--;
   1481          pp = btinfo[bt - 1].btpp;
   1482          wp = btinfo[bt - 1].btwp + (signed short)btinfo[bt - 1].btnum;
   1483        } while ((btinfo[bt - 1].btnum < 0) && --bt);
   1484    } while (bt);
   1485 
   1486    if (ok && ok2 && (!all || (defcpdtable[i].size() <= pp)))
   1487      return 1;
   1488 
   1489    // check zero ending
   1490    while (ok && ok2 && (defcpdtable[i].size() > pp) &&
   1491           ((pp + 1) < defcpdtable[i].size()) &&
   1492           ((defcpdtable[i][pp + 1] == '*') ||
   1493            (defcpdtable[i][pp + 1] == '?')))
   1494      pp += 2;
   1495    if (ok && ok2 && (defcpdtable[i].size() <= pp))
   1496      return 1;
   1497  }
   1498  (*words)[wnum] = NULL;
   1499  if (w)
   1500    *words = NULL;
   1501  return 0;
   1502 }
   1503 
   1504 inline int AffixMgr::candidate_check(const char* word, int len) {
   1505 
   1506  struct hentry* rv = lookup(word);
   1507  if (rv)
   1508    return 1;
   1509 
   1510  //  rv = prefix_check(word,len,1);
   1511  //  if (rv) return 1;
   1512 
   1513  rv = affix_check(word, len);
   1514  if (rv)
   1515    return 1;
   1516  return 0;
   1517 }
   1518 
   1519 // calculate number of syllable for compound-checking
   1520 short AffixMgr::get_syllable(const std::string& word) {
   1521  if (cpdmaxsyllable == 0)
   1522    return 0;
   1523 
   1524  short num = 0;
   1525 
   1526  if (!utf8) {
   1527    for (size_t i = 0; i < word.size(); ++i) {
   1528      if (std::binary_search(cpdvowels.begin(), cpdvowels.end(),
   1529                             word[i])) {
   1530        ++num;
   1531      }
   1532    }
   1533  } else if (!cpdvowels_utf16.empty()) {
   1534    std::vector<w_char> w;
   1535    u8_u16(w, word);
   1536    for (size_t i = 0; i < w.size(); ++i) {
   1537      if (std::binary_search(cpdvowels_utf16.begin(),
   1538                             cpdvowels_utf16.end(),
   1539                             w[i])) {
   1540        ++num;
   1541      }
   1542    }
   1543  }
   1544 
   1545  return num;
   1546 }
   1547 
   1548 void AffixMgr::setcminmax(int* cmin, int* cmax, const char* word, int len) {
   1549  if (utf8) {
   1550    int i;
   1551    for (*cmin = 0, i = 0; (i < cpdmin) && *cmin < len; i++) {
   1552      for ((*cmin)++; *cmin < len && (word[*cmin] & 0xc0) == 0x80; (*cmin)++)
   1553        ;
   1554    }
   1555    for (*cmax = len, i = 0; (i < (cpdmin - 1)) && *cmax >= 0; i++) {
   1556      for ((*cmax)--; *cmax >= 0 && (word[*cmax] & 0xc0) == 0x80; (*cmax)--)
   1557        ;
   1558    }
   1559  } else {
   1560    *cmin = cpdmin;
   1561    *cmax = len - cpdmin + 1;
   1562  }
   1563 }
   1564 
   1565 // check if compound word is correctly spelled
   1566 // hu_mov_rule = spec. Hungarian rule (XXX)
   1567 struct hentry* AffixMgr::compound_check(const std::string& word,
   1568                                        short wordnum,
   1569                                        short numsyllable,
   1570                                        short maxwordnum,
   1571                                        short wnum,
   1572                                        hentry** words = NULL,
   1573                                        hentry** rwords = NULL,
   1574                                        char hu_mov_rule = 0,
   1575                                        char is_sug = 0,
   1576                                        int* info = NULL) {
   1577  int i;
   1578  short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
   1579  struct hentry* rv = NULL;
   1580  struct hentry* rv_first;
   1581  std::string st;
   1582  char ch = '\0';
   1583  int cmin;
   1584  int cmax;
   1585  int striple = 0;
   1586  size_t scpd = 0;
   1587  int soldi = 0;
   1588  int oldcmin = 0;
   1589  int oldcmax = 0;
   1590  int oldlen = 0;
   1591  int checkedstriple = 0;
   1592  char affixed = 0;
   1593  hentry** oldwords = words;
   1594  size_t len = word.size();
   1595 
   1596  int checked_prefix;
   1597 
   1598  // add a time limit to handle possible
   1599  // combinatorical explosion of the overlapping words
   1600 
   1601  HUNSPELL_THREAD_LOCAL clock_t timelimit;
   1602 
   1603  if (wordnum == 0) {
   1604      // get the start time, seeing as we're reusing this set to 0
   1605      // to flag timeout, use clock() + 1 to avoid start clock()
   1606      // of 0 as being a timeout
   1607      timelimit = clock() + 1;
   1608  }
   1609  else if (timelimit != 0 && (clock() > timelimit + TIMELIMIT)) {
   1610      timelimit = 0;
   1611  }
   1612 
   1613  setcminmax(&cmin, &cmax, word.c_str(), len);
   1614 
   1615  st.assign(word);
   1616 
   1617  for (i = cmin; i < cmax; i++) {
   1618    // go to end of the UTF-8 character
   1619    if (utf8) {
   1620      for (; (st[i] & 0xc0) == 0x80; i++)
   1621        ;
   1622      if (i >= cmax)
   1623        return NULL;
   1624    }
   1625 
   1626    words = oldwords;
   1627    int onlycpdrule = (words) ? 1 : 0;
   1628 
   1629    do {  // onlycpdrule loop
   1630 
   1631      oldnumsyllable = numsyllable;
   1632      oldwordnum = wordnum;
   1633      checked_prefix = 0;
   1634 
   1635      do {  // simplified checkcompoundpattern loop
   1636 
   1637        if (timelimit == 0)
   1638          return 0;
   1639 
   1640        if (scpd > 0) {
   1641          for (; scpd <= checkcpdtable.size() &&
   1642                 (checkcpdtable[scpd - 1].pattern3.empty() ||
   1643                  strncmp(word.c_str() + i, checkcpdtable[scpd - 1].pattern3.c_str(),
   1644                          checkcpdtable[scpd - 1].pattern3.size()) != 0);
   1645               scpd++)
   1646            ;
   1647 
   1648          if (scpd > checkcpdtable.size())
   1649            break;  // break simplified checkcompoundpattern loop
   1650          st.replace(i, std::string::npos, checkcpdtable[scpd - 1].pattern);
   1651          soldi = i;
   1652          i += checkcpdtable[scpd - 1].pattern.size();
   1653          st.replace(i, std::string::npos, checkcpdtable[scpd - 1].pattern2);
   1654          st.replace(i + checkcpdtable[scpd - 1].pattern2.size(), std::string::npos,
   1655                 word.substr(soldi + checkcpdtable[scpd - 1].pattern3.size()));
   1656 
   1657          oldlen = len;
   1658          len += checkcpdtable[scpd - 1].pattern.size() +
   1659                 checkcpdtable[scpd - 1].pattern2.size() -
   1660                 checkcpdtable[scpd - 1].pattern3.size();
   1661          oldcmin = cmin;
   1662          oldcmax = cmax;
   1663          setcminmax(&cmin, &cmax, st.c_str(), len);
   1664 
   1665          cmax = len - cpdmin + 1;
   1666        }
   1667 
   1668        ch = st[i];
   1669        st[i] = '\0';
   1670 
   1671        sfx = NULL;
   1672        pfx = NULL;
   1673 
   1674        // FIRST WORD
   1675 
   1676        affixed = 1;
   1677        rv = lookup(st.c_str());  // perhaps without prefix
   1678 
   1679        // forbid dictionary stems with COMPOUNDFORBIDFLAG in
   1680        // compound words, overriding the effect of COMPOUNDPERMITFLAG
   1681        if ((rv) && compoundforbidflag &&
   1682                TESTAFF(rv->astr, compoundforbidflag, rv->alen) && !hu_mov_rule)
   1683            continue;
   1684 
   1685        // search homonym with compound flag
   1686        while ((rv) && !hu_mov_rule &&
   1687               ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
   1688                !((compoundflag && !words && !onlycpdrule &&
   1689                   TESTAFF(rv->astr, compoundflag, rv->alen)) ||
   1690                  (compoundbegin && !wordnum && !onlycpdrule &&
   1691                   TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
   1692                  (compoundmiddle && wordnum && !words && !onlycpdrule &&
   1693                   TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
   1694                  (!defcpdtable.empty() && onlycpdrule &&
   1695                   ((!words && !wordnum &&
   1696                     defcpd_check(&words, wnum, rv, rwords, 0)) ||
   1697                    (words &&
   1698                     defcpd_check(&words, wnum, rv, rwords, 0))))) ||
   1699                (scpd != 0 && checkcpdtable[scpd - 1].cond != FLAG_NULL &&
   1700                 !TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond, rv->alen)))) {
   1701          rv = rv->next_homonym;
   1702        }
   1703 
   1704        if (rv)
   1705          affixed = 0;
   1706 
   1707        if (!rv) {
   1708          if (onlycpdrule)
   1709            break;
   1710          if (compoundflag &&
   1711              !(rv = prefix_check(st.c_str(), i,
   1712                                  hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
   1713                                  compoundflag))) {
   1714            if (((rv = suffix_check(
   1715                      st.c_str(), i, 0, NULL, FLAG_NULL, compoundflag,
   1716                      hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
   1717                 (compoundmoresuffixes &&
   1718                  (rv = suffix_check_twosfx(st.c_str(), i, 0, NULL, compoundflag)))) &&
   1719                !hu_mov_rule && sfx->getCont() &&
   1720                ((compoundforbidflag &&
   1721                  TESTAFF(sfx->getCont(), compoundforbidflag,
   1722                          sfx->getContLen())) ||
   1723                 (compoundend &&
   1724                  TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) {
   1725              rv = NULL;
   1726            }
   1727          }
   1728 
   1729          if (rv ||
   1730              (((wordnum == 0) && compoundbegin &&
   1731                ((rv = suffix_check(
   1732                      st.c_str(), i, 0, NULL, FLAG_NULL, compoundbegin,
   1733                      hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
   1734                 (compoundmoresuffixes &&
   1735                  (rv = suffix_check_twosfx(
   1736                       st.c_str(), i, 0, NULL,
   1737                       compoundbegin))) ||  // twofold suffixes + compound
   1738                 (rv = prefix_check(st.c_str(), i,
   1739                                    hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
   1740                                    compoundbegin)))) ||
   1741               ((wordnum > 0) && compoundmiddle &&
   1742                ((rv = suffix_check(
   1743                      st.c_str(), i, 0, NULL, FLAG_NULL, compoundmiddle,
   1744                      hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
   1745                 (compoundmoresuffixes &&
   1746                  (rv = suffix_check_twosfx(
   1747                       st.c_str(), i, 0, NULL,
   1748                       compoundmiddle))) ||  // twofold suffixes + compound
   1749                 (rv = prefix_check(st.c_str(), i,
   1750                                    hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
   1751                                    compoundmiddle))))))
   1752            checked_prefix = 1;
   1753          // else check forbiddenwords and needaffix
   1754        } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
   1755                                TESTAFF(rv->astr, needaffix, rv->alen) ||
   1756                                TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
   1757                                (is_sug && nosuggest &&
   1758                                 TESTAFF(rv->astr, nosuggest, rv->alen)))) {
   1759          st[i] = ch;
   1760          // continue;
   1761          break;
   1762        }
   1763 
   1764        // check non_compound flag in suffix and prefix
   1765        if ((rv) && !hu_mov_rule &&
   1766            ((pfx && pfx->getCont() &&
   1767              TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())) ||
   1768             (sfx && sfx->getCont() &&
   1769              TESTAFF(sfx->getCont(), compoundforbidflag,
   1770                      sfx->getContLen())))) {
   1771          rv = NULL;
   1772        }
   1773 
   1774        // check compoundend flag in suffix and prefix
   1775        if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
   1776            ((pfx && pfx->getCont() &&
   1777              TESTAFF(pfx->getCont(), compoundend, pfx->getContLen())) ||
   1778             (sfx && sfx->getCont() &&
   1779              TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) {
   1780          rv = NULL;
   1781        }
   1782 
   1783        // check compoundmiddle flag in suffix and prefix
   1784        if ((rv) && !checked_prefix && (wordnum == 0) && compoundmiddle &&
   1785            !hu_mov_rule &&
   1786            ((pfx && pfx->getCont() &&
   1787              TESTAFF(pfx->getCont(), compoundmiddle, pfx->getContLen())) ||
   1788             (sfx && sfx->getCont() &&
   1789              TESTAFF(sfx->getCont(), compoundmiddle, sfx->getContLen())))) {
   1790          rv = NULL;
   1791        }
   1792 
   1793        // check forbiddenwords
   1794        if ((rv) && (rv->astr) &&
   1795            (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
   1796             TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
   1797             (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) {
   1798          return NULL;
   1799        }
   1800 
   1801        // increment word number, if the second root has a compoundroot flag
   1802        if ((rv) && compoundroot &&
   1803            (TESTAFF(rv->astr, compoundroot, rv->alen))) {
   1804          wordnum++;
   1805        }
   1806 
   1807        // first word is acceptable in compound words?
   1808        if (((rv) &&
   1809             (checked_prefix || (words && words[wnum]) ||
   1810              (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
   1811              ((oldwordnum == 0) && compoundbegin &&
   1812               TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
   1813              ((oldwordnum > 0) && compoundmiddle &&
   1814               TESTAFF(rv->astr, compoundmiddle, rv->alen))
   1815 
   1816              // LANG_hu section: spec. Hungarian rule
   1817              || ((langnum == LANG_hu) && hu_mov_rule &&
   1818                  (TESTAFF(
   1819                       rv->astr, 'F',
   1820                       rv->alen) ||  // XXX hardwired Hungarian dictionary codes
   1821                   TESTAFF(rv->astr, 'G', rv->alen) ||
   1822                   TESTAFF(rv->astr, 'H', rv->alen)))
   1823              // END of LANG_hu section
   1824              ) &&
   1825             (
   1826                 // test CHECKCOMPOUNDPATTERN conditions
   1827                 scpd == 0 || checkcpdtable[scpd - 1].cond == FLAG_NULL ||
   1828                 TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond, rv->alen)) &&
   1829             !((checkcompoundtriple && scpd == 0 &&
   1830                !words &&  // test triple letters
   1831                (word[i - 1] == word[i]) &&
   1832                (((i > 1) && (word[i - 1] == word[i - 2])) ||
   1833                 ((word[i - 1] == word[i + 1]))  // may be word[i+1] == '\0'
   1834                 )) ||
   1835               (checkcompoundcase && scpd == 0 && !words &&
   1836                cpdcase_check(word.c_str(), i))))
   1837            // LANG_hu section: spec. Hungarian rule
   1838            || ((!rv) && (langnum == LANG_hu) && hu_mov_rule &&
   1839                (rv = affix_check(st.c_str(), i)) &&
   1840                (sfx && sfx->getCont() &&
   1841                 (  // XXX hardwired Hungarian dic. codes
   1842                     TESTAFF(sfx->getCont(), (unsigned short)'x',
   1843                             sfx->getContLen()) ||
   1844                     TESTAFF(
   1845                         sfx->getCont(), (unsigned short)'%',
   1846                         sfx->getContLen()))))) {  // first word is ok condition
   1847 
   1848          // LANG_hu section: spec. Hungarian rule
   1849          if (langnum == LANG_hu) {
   1850            // calculate syllable number of the word
   1851            numsyllable += get_syllable(st.substr(0, i));
   1852            // + 1 word, if syllable number of the prefix > 1 (hungarian
   1853            // convention)
   1854            if (pfx && (get_syllable(pfx->getKey()) > 1))
   1855              wordnum++;
   1856          }
   1857          // END of LANG_hu section
   1858 
   1859          // NEXT WORD(S)
   1860          rv_first = rv;
   1861          st[i] = ch;
   1862 
   1863          do {  // striple loop
   1864 
   1865            // check simplifiedtriple
   1866            if (simplifiedtriple) {
   1867              if (striple) {
   1868                checkedstriple = 1;
   1869                i--;  // check "fahrt" instead of "ahrt" in "Schiffahrt"
   1870              } else if (i > 2 && word[i - 1] == word[i - 2])
   1871                striple = 1;
   1872            }
   1873 
   1874            rv = lookup(st.c_str() + i);  // perhaps without prefix
   1875 
   1876            // search homonym with compound flag
   1877            while ((rv) &&
   1878                   ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
   1879                    !((compoundflag && !words &&
   1880                       TESTAFF(rv->astr, compoundflag, rv->alen)) ||
   1881                      (compoundend && !words &&
   1882                       TESTAFF(rv->astr, compoundend, rv->alen)) ||
   1883                      (!defcpdtable.empty() && words &&
   1884                       defcpd_check(&words, wnum + 1, rv, NULL, 1))) ||
   1885                    (scpd != 0 && checkcpdtable[scpd - 1].cond2 != FLAG_NULL &&
   1886                     !TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2,
   1887                              rv->alen)))) {
   1888              rv = rv->next_homonym;
   1889            }
   1890 
   1891            // check FORCEUCASE
   1892            if (rv && forceucase && (rv) &&
   1893                (TESTAFF(rv->astr, forceucase, rv->alen)) &&
   1894                !(info && *info & SPELL_ORIGCAP))
   1895              rv = NULL;
   1896 
   1897            if (rv && words && words[wnum + 1])
   1898              return rv_first;
   1899 
   1900            oldnumsyllable2 = numsyllable;
   1901            oldwordnum2 = wordnum;
   1902 
   1903            // LANG_hu section: spec. Hungarian rule, XXX hardwired dictionary
   1904            // code
   1905            if ((rv) && (langnum == LANG_hu) &&
   1906                (TESTAFF(rv->astr, 'I', rv->alen)) &&
   1907                !(TESTAFF(rv->astr, 'J', rv->alen))) {
   1908              numsyllable--;
   1909            }
   1910            // END of LANG_hu section
   1911 
   1912            // increment word number, if the second root has a compoundroot flag
   1913            if ((rv) && (compoundroot) &&
   1914                (TESTAFF(rv->astr, compoundroot, rv->alen))) {
   1915              wordnum++;
   1916            }
   1917 
   1918            // check forbiddenwords
   1919            if ((rv) && (rv->astr) &&
   1920                (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
   1921                 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
   1922                 (is_sug && nosuggest &&
   1923                  TESTAFF(rv->astr, nosuggest, rv->alen))))
   1924              return NULL;
   1925 
   1926            // second word is acceptable, as a root?
   1927            // hungarian conventions: compounding is acceptable,
   1928            // when compound forms consist of 2 words, or if more,
   1929            // then the syllable number of root words must be 6, or lesser.
   1930 
   1931            if ((rv) &&
   1932                ((compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
   1933                 (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))) &&
   1934                (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
   1935                 ((cpdmaxsyllable != 0) &&
   1936                  (numsyllable + get_syllable(std::string(HENTRY_WORD(rv), rv->blen)) <=
   1937                   cpdmaxsyllable))) &&
   1938                (
   1939                    // test CHECKCOMPOUNDPATTERN
   1940                    checkcpdtable.empty() || scpd != 0 ||
   1941                    !cpdpat_check(word.c_str(), i, rv_first, rv, 0)) &&
   1942                ((!checkcompounddup || (rv != rv_first)))
   1943                // test CHECKCOMPOUNDPATTERN conditions
   1944                &&
   1945                (scpd == 0 || checkcpdtable[scpd - 1].cond2 == FLAG_NULL ||
   1946                 TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2, rv->alen))) {
   1947              // forbid compound word, if it is a non-compound word with typical
   1948              // fault
   1949              if ((checkcompoundrep && cpdrep_check(word.c_str(), len)) ||
   1950                      cpdwordpair_check(word.c_str(), len))
   1951                return NULL;
   1952              return rv_first;
   1953            }
   1954 
   1955            numsyllable = oldnumsyllable2;
   1956            wordnum = oldwordnum2;
   1957 
   1958            // perhaps second word has prefix or/and suffix
   1959            sfx = NULL;
   1960            sfxflag = FLAG_NULL;
   1961            rv = (compoundflag && !onlycpdrule)
   1962                     ? affix_check((word.c_str() + i), strlen(word.c_str() + i), compoundflag,
   1963                                   IN_CPD_END)
   1964                     : NULL;
   1965            if (!rv && compoundend && !onlycpdrule) {
   1966              sfx = NULL;
   1967              pfx = NULL;
   1968              rv = affix_check((word.c_str() + i), strlen(word.c_str() + i), compoundend,
   1969                               IN_CPD_END);
   1970            }
   1971 
   1972            if (!rv && !defcpdtable.empty() && words) {
   1973              rv = affix_check((word.c_str() + i), strlen(word.c_str() + i), 0, IN_CPD_END);
   1974              if (rv && defcpd_check(&words, wnum + 1, rv, NULL, 1))
   1975                return rv_first;
   1976              rv = NULL;
   1977            }
   1978 
   1979            // test CHECKCOMPOUNDPATTERN conditions (allowed forms)
   1980            if (rv &&
   1981                !(scpd == 0 || checkcpdtable[scpd - 1].cond2 == FLAG_NULL ||
   1982                  TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2, rv->alen)))
   1983              rv = NULL;
   1984 
   1985            // test CHECKCOMPOUNDPATTERN conditions (forbidden compounds)
   1986            if (rv && !checkcpdtable.empty() && scpd == 0 &&
   1987                cpdpat_check(word.c_str(), i, rv_first, rv, affixed))
   1988              rv = NULL;
   1989 
   1990            // check non_compound flag in suffix and prefix
   1991            if ((rv) && ((pfx && pfx->getCont() &&
   1992                          TESTAFF(pfx->getCont(), compoundforbidflag,
   1993                                  pfx->getContLen())) ||
   1994                         (sfx && sfx->getCont() &&
   1995                          TESTAFF(sfx->getCont(), compoundforbidflag,
   1996                                  sfx->getContLen())))) {
   1997              rv = NULL;
   1998            }
   1999 
   2000            // check FORCEUCASE
   2001            if (rv && forceucase && (rv) &&
   2002                (TESTAFF(rv->astr, forceucase, rv->alen)) &&
   2003                !(info && *info & SPELL_ORIGCAP))
   2004              rv = NULL;
   2005 
   2006            // check forbiddenwords
   2007            if ((rv) && (rv->astr) &&
   2008                (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
   2009                 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
   2010                 (is_sug && nosuggest &&
   2011                  TESTAFF(rv->astr, nosuggest, rv->alen))))
   2012              return NULL;
   2013 
   2014            // pfxappnd = prefix of word+i, or NULL
   2015            // calculate syllable number of prefix.
   2016            // hungarian convention: when syllable number of prefix is more,
   2017            // than 1, the prefix+word counts as two words.
   2018 
   2019            if (langnum == LANG_hu) {
   2020              // calculate syllable number of the word
   2021              numsyllable += get_syllable(word.c_str() + i);
   2022 
   2023              // - affix syllable num.
   2024              // XXX only second suffix (inflections, not derivations)
   2025              if (sfxappnd) {
   2026                std::string tmp(sfxappnd);
   2027                reverseword(tmp);
   2028                numsyllable -= short(get_syllable(tmp) + sfxextra);
   2029              } else {
   2030                numsyllable -= short(sfxextra);
   2031              }
   2032 
   2033              // + 1 word, if syllable number of the prefix > 1 (hungarian
   2034              // convention)
   2035              if (pfx && (get_syllable(pfx->getKey()) > 1))
   2036                wordnum++;
   2037 
   2038              // increment syllable num, if last word has a SYLLABLENUM flag
   2039              // and the suffix is beginning `s'
   2040 
   2041              if (!cpdsyllablenum.empty()) {
   2042                switch (sfxflag) {
   2043                  case 'c': {
   2044                    numsyllable += 2;
   2045                    break;
   2046                  }
   2047                  case 'J': {
   2048                    numsyllable += 1;
   2049                    break;
   2050                  }
   2051                  case 'I': {
   2052                    if (rv && TESTAFF(rv->astr, 'J', rv->alen))
   2053                      numsyllable += 1;
   2054                    break;
   2055                  }
   2056                }
   2057              }
   2058            }
   2059 
   2060            // increment word number, if the second word has a compoundroot flag
   2061            if ((rv) && (compoundroot) &&
   2062                (TESTAFF(rv->astr, compoundroot, rv->alen))) {
   2063              wordnum++;
   2064            }
   2065            // second word is acceptable, as a word with prefix or/and suffix?
   2066            // hungarian conventions: compounding is acceptable,
   2067            // when compound forms consist 2 word, otherwise
   2068            // the syllable number of root words is 6, or lesser.
   2069            if ((rv) &&
   2070                (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
   2071                 ((cpdmaxsyllable != 0) && (numsyllable <= cpdmaxsyllable))) &&
   2072                ((!checkcompounddup || (rv != rv_first)))) {
   2073              // forbid compound word, if it is a non-compound word with typical
   2074              // fault
   2075              if ((checkcompoundrep && cpdrep_check(word.c_str(), len)) ||
   2076                      cpdwordpair_check(word.c_str(), len))
   2077                return NULL;
   2078              return rv_first;
   2079            }
   2080 
   2081            numsyllable = oldnumsyllable2;
   2082            wordnum = oldwordnum2;
   2083 
   2084            // perhaps second word is a compound word (recursive call)
   2085            if (wordnum + 2 < maxwordnum) {
   2086              rv = compound_check(st.substr(i), wordnum + 1,
   2087                                  numsyllable, maxwordnum, wnum + 1, words, rwords, 0,
   2088                                  is_sug, info);
   2089 
   2090              if (rv && !checkcpdtable.empty() &&
   2091                  ((scpd == 0 &&
   2092                    cpdpat_check(word.c_str(), i, rv_first, rv, affixed)) ||
   2093                   (scpd != 0 &&
   2094                    !cpdpat_check(word.c_str(), i, rv_first, rv, affixed))))
   2095                rv = NULL;
   2096            } else {
   2097              rv = NULL;
   2098            }
   2099            if (rv) {
   2100              // forbid compound word, if it is a non-compound word with typical
   2101              // fault, or a dictionary word pair
   2102 
   2103              if (cpdwordpair_check(word.c_str(), len))
   2104                  return NULL;
   2105 
   2106              if (checkcompoundrep || forbiddenword) {
   2107 
   2108                if (checkcompoundrep && cpdrep_check(word.c_str(), len))
   2109                  return NULL;
   2110 
   2111                // check first part
   2112                if (strncmp(rv->word, word.c_str() + i, rv->blen) == 0) {
   2113                  char r = st[i + rv->blen];
   2114                  st[i + rv->blen] = '\0';
   2115 
   2116                  if ((checkcompoundrep && cpdrep_check(st.c_str(), i + rv->blen)) ||
   2117                      cpdwordpair_check(st.c_str(), i + rv->blen)) {
   2118                    st[ + i + rv->blen] = r;
   2119                    continue;
   2120                  }
   2121 
   2122                  if (forbiddenword) {
   2123                    struct hentry* rv2 = lookup(word.c_str());
   2124                    if (!rv2)
   2125                      rv2 = affix_check(word.c_str(), len);
   2126                    if (rv2 && rv2->astr &&
   2127                        TESTAFF(rv2->astr, forbiddenword, rv2->alen) &&
   2128                        (strncmp(rv2->word, st.c_str(), i + rv->blen) == 0)) {
   2129                      return NULL;
   2130                    }
   2131                  }
   2132                  st[i + rv->blen] = r;
   2133                }
   2134              }
   2135              return rv_first;
   2136            }
   2137          } while (striple && !checkedstriple);  // end of striple loop
   2138 
   2139          if (checkedstriple) {
   2140            i++;
   2141            checkedstriple = 0;
   2142            striple = 0;
   2143          }
   2144 
   2145        }  // first word is ok condition
   2146 
   2147        if (soldi != 0) {
   2148          i = soldi;
   2149          soldi = 0;
   2150          len = oldlen;
   2151          cmin = oldcmin;
   2152          cmax = oldcmax;
   2153        }
   2154        scpd++;
   2155 
   2156      } while (!onlycpdrule && simplifiedcpd &&
   2157               scpd <= checkcpdtable.size());  // end of simplifiedcpd loop
   2158 
   2159      scpd = 0;
   2160      wordnum = oldwordnum;
   2161      numsyllable = oldnumsyllable;
   2162 
   2163      if (soldi != 0) {
   2164        i = soldi;
   2165        st.assign(word);  // XXX add more optim.
   2166        soldi = 0;
   2167      } else
   2168        st[i] = ch;
   2169 
   2170    } while (!defcpdtable.empty() && oldwordnum == 0 &&
   2171             onlycpdrule++ < 1);  // end of onlycpd loop
   2172  }
   2173 
   2174  return NULL;
   2175 }
   2176 
   2177 // check a compound word and collect its morphological description
   2178 // hu_mov_rule = special Hungarian rule (XXX)
   2179 int AffixMgr::compound_check_morph(const char* word,
   2180                                   int len,
   2181                                   short wordnum,
   2182                                   short numsyllable,
   2183                                   short maxwordnum,
   2184                                   short wnum,
   2185                                   hentry** words,
   2186                                   hentry** rwords,
   2187                                   char hu_mov_rule,
   2188                                   std::string& result,
   2189                                   const std::string* partresult) {
   2190  int i;
   2191  short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
   2192  int ok = 0;
   2193 
   2194  struct hentry* rv = NULL;
   2195  struct hentry* rv_first;
   2196  std::string st;
   2197  char ch;
   2198 
   2199  int checked_prefix;
   2200  std::string presult;
   2201 
   2202  int cmin;
   2203  int cmax;
   2204 
   2205  char affixed = 0;
   2206  hentry** oldwords = words;
   2207 
   2208  // add a time limit to handle possible
   2209  // combinatorical explosion of the overlapping words
   2210 
   2211  HUNSPELL_THREAD_LOCAL clock_t timelimit;
   2212 
   2213  if (wordnum == 0) {
   2214      // get the start time, seeing as we're reusing this set to 0
   2215      // to flag timeout, use clock() + 1 to avoid start clock()
   2216      // of 0 as being a timeout
   2217      timelimit = clock() + 1;
   2218  }
   2219  else if (timelimit != 0 && (clock() > timelimit + TIMELIMIT)) {
   2220      timelimit = 0;
   2221  }
   2222 
   2223  setcminmax(&cmin, &cmax, word, len);
   2224 
   2225  st.assign(word);
   2226 
   2227  for (i = cmin; i < cmax; i++) {
   2228    // go to end of the UTF-8 character
   2229    if (utf8) {
   2230      for (; (st[i] & 0xc0) == 0x80; i++)
   2231        ;
   2232      if (i >= cmax)
   2233        return 0;
   2234    }
   2235 
   2236    words = oldwords;
   2237    int onlycpdrule = (words) ? 1 : 0;
   2238 
   2239    do {  // onlycpdrule loop
   2240 
   2241      if (timelimit == 0)
   2242        return 0;
   2243 
   2244      oldnumsyllable = numsyllable;
   2245      oldwordnum = wordnum;
   2246      checked_prefix = 0;
   2247 
   2248      ch = st[i];
   2249      st[i] = '\0';
   2250      sfx = NULL;
   2251 
   2252      // FIRST WORD
   2253 
   2254      affixed = 1;
   2255 
   2256      presult.clear();
   2257      if (partresult)
   2258        presult.append(*partresult);
   2259 
   2260      rv = lookup(st.c_str());  // perhaps without prefix
   2261 
   2262      // forbid dictionary stems with COMPOUNDFORBIDFLAG in
   2263      // compound words, overriding the effect of COMPOUNDPERMITFLAG
   2264      if ((rv) && compoundforbidflag &&
   2265              TESTAFF(rv->astr, compoundforbidflag, rv->alen) && !hu_mov_rule)
   2266          continue;
   2267 
   2268      // search homonym with compound flag
   2269      while ((rv) && !hu_mov_rule &&
   2270             ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
   2271              !((compoundflag && !words && !onlycpdrule &&
   2272                 TESTAFF(rv->astr, compoundflag, rv->alen)) ||
   2273                (compoundbegin && !wordnum && !onlycpdrule &&
   2274                 TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
   2275                (compoundmiddle && wordnum && !words && !onlycpdrule &&
   2276                 TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
   2277                (!defcpdtable.empty() && onlycpdrule &&
   2278                 ((!words && !wordnum &&
   2279                   defcpd_check(&words, wnum, rv, rwords, 0)) ||
   2280                  (words &&
   2281                   defcpd_check(&words, wnum, rv, rwords, 0))))))) {
   2282        rv = rv->next_homonym;
   2283      }
   2284 
   2285      if (timelimit == 0)
   2286        return 0;
   2287 
   2288      if (rv)
   2289        affixed = 0;
   2290 
   2291      if (rv) {
   2292        presult.push_back(MSEP_FLD);
   2293        presult.append(MORPH_PART);
   2294        presult.append(st.c_str());
   2295        if (!HENTRY_FIND(rv, MORPH_STEM)) {
   2296          presult.push_back(MSEP_FLD);
   2297          presult.append(MORPH_STEM);
   2298          presult.append(st.c_str());
   2299        }
   2300        if (HENTRY_DATA(rv)) {
   2301          presult.push_back(MSEP_FLD);
   2302          presult.append(HENTRY_DATA2(rv));
   2303        }
   2304      }
   2305 
   2306      if (!rv) {
   2307        if (compoundflag &&
   2308            !(rv =
   2309                  prefix_check(st.c_str(), i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
   2310                               compoundflag))) {
   2311          if (((rv = suffix_check(st.c_str(), i, 0, NULL, FLAG_NULL,
   2312                                  compoundflag,
   2313                                  hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
   2314               (compoundmoresuffixes &&
   2315                (rv = suffix_check_twosfx(st.c_str(), i, 0, NULL, compoundflag)))) &&
   2316              !hu_mov_rule && sfx->getCont() &&
   2317              ((compoundforbidflag &&
   2318                TESTAFF(sfx->getCont(), compoundforbidflag,
   2319                        sfx->getContLen())) ||
   2320               (compoundend &&
   2321                TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) {
   2322            rv = NULL;
   2323          }
   2324        }
   2325 
   2326        if (rv ||
   2327            (((wordnum == 0) && compoundbegin &&
   2328              ((rv = suffix_check(st.c_str(), i, 0, NULL, FLAG_NULL,
   2329                                  compoundbegin,
   2330                                  hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
   2331               (compoundmoresuffixes &&
   2332                (rv = suffix_check_twosfx(
   2333                     st.c_str(), i, 0, NULL,
   2334                     compoundbegin))) ||  // twofold suffix+compound
   2335               (rv = prefix_check(st.c_str(), i,
   2336                                  hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
   2337                                  compoundbegin)))) ||
   2338             ((wordnum > 0) && compoundmiddle &&
   2339              ((rv = suffix_check(st.c_str(), i, 0, NULL, FLAG_NULL,
   2340                                  compoundmiddle,
   2341                                  hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
   2342               (compoundmoresuffixes &&
   2343                (rv = suffix_check_twosfx(
   2344                     st.c_str(), i, 0, NULL,
   2345                     compoundmiddle))) ||  // twofold suffix+compound
   2346               (rv = prefix_check(st.c_str(), i,
   2347                                  hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
   2348                                  compoundmiddle)))))) {
   2349          std::string p;
   2350          if (compoundflag)
   2351            p = affix_check_morph(st.c_str(), i, compoundflag);
   2352          if (p.empty()) {
   2353            if ((wordnum == 0) && compoundbegin) {
   2354              p = affix_check_morph(st.c_str(), i, compoundbegin);
   2355            } else if ((wordnum > 0) && compoundmiddle) {
   2356              p = affix_check_morph(st.c_str(), i, compoundmiddle);
   2357            }
   2358          }
   2359          if (!p.empty()) {
   2360            presult.push_back(MSEP_FLD);
   2361            presult.append(MORPH_PART);
   2362            presult.append(st.c_str());
   2363            line_uniq_app(p, MSEP_REC);
   2364            presult.append(p);
   2365          }
   2366          checked_prefix = 1;
   2367        }
   2368        // else check forbiddenwords
   2369      } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
   2370                              TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
   2371                              TESTAFF(rv->astr, needaffix, rv->alen))) {
   2372        st[i] = ch;
   2373        continue;
   2374      }
   2375 
   2376      // check non_compound flag in suffix and prefix
   2377      if ((rv) && !hu_mov_rule &&
   2378          ((pfx && pfx->getCont() &&
   2379            TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())) ||
   2380           (sfx && sfx->getCont() &&
   2381            TESTAFF(sfx->getCont(), compoundforbidflag, sfx->getContLen())))) {
   2382        continue;
   2383      }
   2384 
   2385      // check compoundend flag in suffix and prefix
   2386      if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
   2387          ((pfx && pfx->getCont() &&
   2388            TESTAFF(pfx->getCont(), compoundend, pfx->getContLen())) ||
   2389           (sfx && sfx->getCont() &&
   2390            TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) {
   2391        continue;
   2392      }
   2393 
   2394      // check compoundmiddle flag in suffix and prefix
   2395      if ((rv) && !checked_prefix && (wordnum == 0) && compoundmiddle &&
   2396          !hu_mov_rule &&
   2397          ((pfx && pfx->getCont() &&
   2398            TESTAFF(pfx->getCont(), compoundmiddle, pfx->getContLen())) ||
   2399           (sfx && sfx->getCont() &&
   2400            TESTAFF(sfx->getCont(), compoundmiddle, sfx->getContLen())))) {
   2401        rv = NULL;
   2402      }
   2403 
   2404      // check forbiddenwords
   2405      if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
   2406                                 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen)))
   2407        continue;
   2408 
   2409      // increment word number, if the second root has a compoundroot flag
   2410      if ((rv) && (compoundroot) &&
   2411          (TESTAFF(rv->astr, compoundroot, rv->alen))) {
   2412        wordnum++;
   2413      }
   2414 
   2415      // first word is acceptable in compound words?
   2416      if (((rv) &&
   2417           (checked_prefix || (words && words[wnum]) ||
   2418            (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
   2419            ((oldwordnum == 0) && compoundbegin &&
   2420             TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
   2421            ((oldwordnum > 0) && compoundmiddle &&
   2422             TESTAFF(rv->astr, compoundmiddle, rv->alen))
   2423            // LANG_hu section: spec. Hungarian rule
   2424            || ((langnum == LANG_hu) &&  // hu_mov_rule
   2425                hu_mov_rule && (TESTAFF(rv->astr, 'F', rv->alen) ||
   2426                                TESTAFF(rv->astr, 'G', rv->alen) ||
   2427                                TESTAFF(rv->astr, 'H', rv->alen)))
   2428            // END of LANG_hu section
   2429            ) &&
   2430           !((checkcompoundtriple && !words &&  // test triple letters
   2431              (word[i - 1] == word[i]) &&
   2432              (((i > 1) && (word[i - 1] == word[i - 2])) ||
   2433               ((word[i - 1] == word[i + 1]))  // may be word[i+1] == '\0'
   2434               )) ||
   2435             (
   2436                 // test CHECKCOMPOUNDPATTERN
   2437                 !checkcpdtable.empty() && !words &&
   2438                 cpdpat_check(word, i, rv, NULL, affixed)) ||
   2439             (checkcompoundcase && !words && cpdcase_check(word, i))))
   2440          // LANG_hu section: spec. Hungarian rule
   2441          ||
   2442          ((!rv) && (langnum == LANG_hu) && hu_mov_rule &&
   2443           (rv = affix_check(st.c_str(), i)) &&
   2444           (sfx && sfx->getCont() &&
   2445            (TESTAFF(sfx->getCont(), (unsigned short)'x', sfx->getContLen()) ||
   2446             TESTAFF(sfx->getCont(), (unsigned short)'%', sfx->getContLen()))))
   2447          // END of LANG_hu section
   2448          ) {
   2449        // LANG_hu section: spec. Hungarian rule
   2450        if (langnum == LANG_hu) {
   2451          // calculate syllable number of the word
   2452          numsyllable += get_syllable(st.substr(0, i));
   2453 
   2454          // + 1 word, if syllable number of the prefix > 1 (hungarian
   2455          // convention)
   2456          if (pfx && (get_syllable(pfx->getKey()) > 1))
   2457            wordnum++;
   2458        }
   2459        // END of LANG_hu section
   2460 
   2461        // NEXT WORD(S)
   2462        rv_first = rv;
   2463        rv = lookup((word + i));  // perhaps without prefix
   2464 
   2465        // search homonym with compound flag
   2466        while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
   2467                        !((compoundflag && !words &&
   2468                           TESTAFF(rv->astr, compoundflag, rv->alen)) ||
   2469                          (compoundend && !words &&
   2470                           TESTAFF(rv->astr, compoundend, rv->alen)) ||
   2471                          (!defcpdtable.empty() && words &&
   2472                           defcpd_check(&words, wnum + 1, rv, NULL, 1))))) {
   2473          rv = rv->next_homonym;
   2474        }
   2475 
   2476        if (rv && words && words[wnum + 1]) {
   2477          result.append(presult);
   2478          result.push_back(MSEP_FLD);
   2479          result.append(MORPH_PART);
   2480          result.append(word + i);
   2481          if (complexprefixes && HENTRY_DATA(rv))
   2482            result.append(HENTRY_DATA2(rv));
   2483          if (!HENTRY_FIND(rv, MORPH_STEM)) {
   2484            result.push_back(MSEP_FLD);
   2485            result.append(MORPH_STEM);
   2486            result.append(HENTRY_WORD(rv));
   2487          }
   2488          // store the pointer of the hash entry
   2489          if (!complexprefixes && HENTRY_DATA(rv)) {
   2490            result.push_back(MSEP_FLD);
   2491            result.append(HENTRY_DATA2(rv));
   2492          }
   2493          result.push_back(MSEP_REC);
   2494          return 0;
   2495        }
   2496 
   2497        oldnumsyllable2 = numsyllable;
   2498        oldwordnum2 = wordnum;
   2499 
   2500        // LANG_hu section: spec. Hungarian rule
   2501        if ((rv) && (langnum == LANG_hu) &&
   2502            (TESTAFF(rv->astr, 'I', rv->alen)) &&
   2503            !(TESTAFF(rv->astr, 'J', rv->alen))) {
   2504          numsyllable--;
   2505        }
   2506        // END of LANG_hu section
   2507        // increment word number, if the second root has a compoundroot flag
   2508        if ((rv) && (compoundroot) &&
   2509            (TESTAFF(rv->astr, compoundroot, rv->alen))) {
   2510          wordnum++;
   2511        }
   2512 
   2513        // check forbiddenwords
   2514        if ((rv) && (rv->astr) &&
   2515            (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
   2516             TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))) {
   2517          st[i] = ch;
   2518          continue;
   2519        }
   2520 
   2521        // second word is acceptable, as a root?
   2522        // hungarian conventions: compounding is acceptable,
   2523        // when compound forms consist of 2 words, or if more,
   2524        // then the syllable number of root words must be 6, or lesser.
   2525        if ((rv) &&
   2526            ((compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
   2527             (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))) &&
   2528            (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
   2529             ((cpdmaxsyllable != 0) &&
   2530              (numsyllable + get_syllable(std::string(HENTRY_WORD(rv), rv->blen)) <=
   2531               cpdmaxsyllable))) &&
   2532            ((!checkcompounddup || (rv != rv_first)))) {
   2533          // bad compound word
   2534          result.append(presult);
   2535          result.push_back(MSEP_FLD);
   2536          result.append(MORPH_PART);
   2537          result.append(word + i);
   2538 
   2539          if (HENTRY_DATA(rv)) {
   2540            if (complexprefixes)
   2541              result.append(HENTRY_DATA2(rv));
   2542            if (!HENTRY_FIND(rv, MORPH_STEM)) {
   2543              result.push_back(MSEP_FLD);
   2544              result.append(MORPH_STEM);
   2545              result.append(HENTRY_WORD(rv));
   2546            }
   2547            // store the pointer of the hash entry
   2548            if (!complexprefixes) {
   2549              result.push_back(MSEP_FLD);
   2550              result.append(HENTRY_DATA2(rv));
   2551            }
   2552          }
   2553          result.push_back(MSEP_REC);
   2554          ok = 1;
   2555        }
   2556 
   2557        numsyllable = oldnumsyllable2;
   2558        wordnum = oldwordnum2;
   2559 
   2560        // perhaps second word has prefix or/and suffix
   2561        sfx = NULL;
   2562        sfxflag = FLAG_NULL;
   2563 
   2564        if (compoundflag && !onlycpdrule)
   2565          rv = affix_check((word + i), strlen(word + i), compoundflag);
   2566        else
   2567          rv = NULL;
   2568 
   2569        if (!rv && compoundend && !onlycpdrule) {
   2570          sfx = NULL;
   2571          pfx = NULL;
   2572          rv = affix_check((word + i), strlen(word + i), compoundend);
   2573        }
   2574 
   2575        if (!rv && !defcpdtable.empty() && words) {
   2576          rv = affix_check((word + i), strlen(word + i), 0, IN_CPD_END);
   2577          if (rv && words && defcpd_check(&words, wnum + 1, rv, NULL, 1)) {
   2578            std::string m;
   2579            if (compoundflag)
   2580              m = affix_check_morph((word + i), strlen(word + i), compoundflag);
   2581            if (m.empty() && compoundend) {
   2582              m = affix_check_morph((word + i), strlen(word + i), compoundend);
   2583            }
   2584            result.append(presult);
   2585            if (!m.empty()) {
   2586              result.push_back(MSEP_FLD);
   2587              result.append(MORPH_PART);
   2588              result.append(word + i);
   2589              line_uniq_app(m, MSEP_REC);
   2590              result.append(m);
   2591            }
   2592            result.push_back(MSEP_REC);
   2593            ok = 1;
   2594          }
   2595        }
   2596 
   2597        // check non_compound flag in suffix and prefix
   2598        if ((rv) &&
   2599            ((pfx && pfx->getCont() &&
   2600              TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())) ||
   2601             (sfx && sfx->getCont() &&
   2602              TESTAFF(sfx->getCont(), compoundforbidflag,
   2603                      sfx->getContLen())))) {
   2604          rv = NULL;
   2605        }
   2606 
   2607        // check forbiddenwords
   2608        if ((rv) && (rv->astr) &&
   2609            (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
   2610             TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen)) &&
   2611            (!TESTAFF(rv->astr, needaffix, rv->alen))) {
   2612          st[i] = ch;
   2613          continue;
   2614        }
   2615 
   2616        if (langnum == LANG_hu) {
   2617          // calculate syllable number of the word
   2618          numsyllable += get_syllable(word + i);
   2619 
   2620          // - affix syllable num.
   2621          // XXX only second suffix (inflections, not derivations)
   2622          if (sfxappnd) {
   2623            std::string tmp(sfxappnd);
   2624            reverseword(tmp);
   2625            numsyllable -= short(get_syllable(tmp) + sfxextra);
   2626          } else {
   2627            numsyllable -= short(sfxextra);
   2628          }
   2629 
   2630          // + 1 word, if syllable number of the prefix > 1 (hungarian
   2631          // convention)
   2632          if (pfx && (get_syllable(pfx->getKey()) > 1))
   2633            wordnum++;
   2634 
   2635          // increment syllable num, if last word has a SYLLABLENUM flag
   2636          // and the suffix is beginning `s'
   2637 
   2638          if (!cpdsyllablenum.empty()) {
   2639            switch (sfxflag) {
   2640              case 'c': {
   2641                numsyllable += 2;
   2642                break;
   2643              }
   2644              case 'J': {
   2645                numsyllable += 1;
   2646                break;
   2647              }
   2648              case 'I': {
   2649                if (rv && TESTAFF(rv->astr, 'J', rv->alen))
   2650                  numsyllable += 1;
   2651                break;
   2652              }
   2653            }
   2654          }
   2655        }
   2656 
   2657        // increment word number, if the second word has a compoundroot flag
   2658        if ((rv) && (compoundroot) &&
   2659            (TESTAFF(rv->astr, compoundroot, rv->alen))) {
   2660          wordnum++;
   2661        }
   2662        // second word is acceptable, as a word with prefix or/and suffix?
   2663        // hungarian conventions: compounding is acceptable,
   2664        // when compound forms consist 2 word, otherwise
   2665        // the syllable number of root words is 6, or lesser.
   2666        if ((rv) &&
   2667            (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
   2668             ((cpdmaxsyllable != 0) && (numsyllable <= cpdmaxsyllable))) &&
   2669            ((!checkcompounddup || (rv != rv_first)))) {
   2670          std::string m;
   2671          if (compoundflag)
   2672            m = affix_check_morph((word + i), strlen(word + i), compoundflag);
   2673          if (m.empty() && compoundend) {
   2674            m = affix_check_morph((word + i), strlen(word + i), compoundend);
   2675          }
   2676          result.append(presult);
   2677          if (!m.empty()) {
   2678            result.push_back(MSEP_FLD);
   2679            result.append(MORPH_PART);
   2680            result.append(word + i);
   2681            line_uniq_app(m, MSEP_REC);
   2682            result.push_back(MSEP_FLD);
   2683            result.append(m);
   2684          }
   2685          result.push_back(MSEP_REC);
   2686          ok = 1;
   2687        }
   2688 
   2689        numsyllable = oldnumsyllable2;
   2690        wordnum = oldwordnum2;
   2691 
   2692        // perhaps second word is a compound word (recursive call)
   2693        if ((wordnum + 2 < maxwordnum) && (ok == 0)) {
   2694          compound_check_morph((word + i), strlen(word + i), wordnum + 1,
   2695                               numsyllable, maxwordnum, wnum + 1, words, rwords, 0,
   2696                               result, &presult);
   2697        } else {
   2698          rv = NULL;
   2699        }
   2700      }
   2701      st[i] = ch;
   2702      wordnum = oldwordnum;
   2703      numsyllable = oldnumsyllable;
   2704 
   2705    } while (!defcpdtable.empty() && oldwordnum == 0 &&
   2706             onlycpdrule++ < 1);  // end of onlycpd loop
   2707  }
   2708  return 0;
   2709 }
   2710 
   2711 
   2712 inline int AffixMgr::isRevSubset(const char* s1,
   2713                                 const char* end_of_s2,
   2714                                 int len) {
   2715  while ((len > 0) && (*s1 != '\0') && ((*s1 == *end_of_s2) || (*s1 == '.'))) {
   2716    s1++;
   2717    end_of_s2--;
   2718    len--;
   2719  }
   2720  return (*s1 == '\0');
   2721 }
   2722 
   2723 // check word for suffixes
   2724 struct hentry* AffixMgr::suffix_check(const char* word,
   2725                                      int len,
   2726                                      int sfxopts,
   2727                                      PfxEntry* ppfx,
   2728                                      const FLAG cclass,
   2729                                      const FLAG needflag,
   2730                                      char in_compound) {
   2731  struct hentry* rv = NULL;
   2732  PfxEntry* ep = ppfx;
   2733 
   2734  // first handle the special case of 0 length suffixes
   2735  SfxEntry* se = sStart[0];
   2736 
   2737  while (se) {
   2738    if (!cclass || se->getCont()) {
   2739      // suffixes are not allowed in beginning of compounds
   2740      if ((((in_compound != IN_CPD_BEGIN)) ||  // && !cclass
   2741           // except when signed with compoundpermitflag flag
   2742           (se->getCont() && compoundpermitflag &&
   2743            TESTAFF(se->getCont(), compoundpermitflag, se->getContLen()))) &&
   2744          (!circumfix ||
   2745           // no circumfix flag in prefix and suffix
   2746           ((!ppfx || !(ep->getCont()) ||
   2747             !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
   2748            (!se->getCont() ||
   2749             !(TESTAFF(se->getCont(), circumfix, se->getContLen())))) ||
   2750           // circumfix flag in prefix AND suffix
   2751           ((ppfx && (ep->getCont()) &&
   2752             TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
   2753            (se->getCont() &&
   2754             (TESTAFF(se->getCont(), circumfix, se->getContLen()))))) &&
   2755          // fogemorpheme
   2756          (in_compound ||
   2757           !(se->getCont() &&
   2758             (TESTAFF(se->getCont(), onlyincompound, se->getContLen())))) &&
   2759          // needaffix on prefix or first suffix
   2760          (cclass ||
   2761           !(se->getCont() &&
   2762             TESTAFF(se->getCont(), needaffix, se->getContLen())) ||
   2763           (ppfx &&
   2764            !((ep->getCont()) &&
   2765              TESTAFF(ep->getCont(), needaffix, ep->getContLen()))))) {
   2766        rv = se->checkword(word, len, sfxopts, ppfx,
   2767                           (FLAG)cclass, needflag,
   2768                           (in_compound ? 0 : onlyincompound));
   2769        if (rv) {
   2770          sfx = se;  // BUG: sfx not stateless
   2771          return rv;
   2772        }
   2773      }
   2774    }
   2775    se = se->getNext();
   2776  }
   2777 
   2778  // now handle the general case
   2779  if (len == 0)
   2780    return NULL;  // FULLSTRIP
   2781  unsigned char sp = *((const unsigned char*)(word + len - 1));
   2782  SfxEntry* sptr = sStart[sp];
   2783 
   2784  while (sptr) {
   2785    if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
   2786      // suffixes are not allowed in beginning of compounds
   2787      if ((((in_compound != IN_CPD_BEGIN)) ||  // && !cclass
   2788           // except when signed with compoundpermitflag flag
   2789           (sptr->getCont() && compoundpermitflag &&
   2790            TESTAFF(sptr->getCont(), compoundpermitflag,
   2791                    sptr->getContLen()))) &&
   2792          (!circumfix ||
   2793           // no circumfix flag in prefix and suffix
   2794           ((!ppfx || !(ep->getCont()) ||
   2795             !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
   2796            (!sptr->getCont() ||
   2797             !(TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())))) ||
   2798           // circumfix flag in prefix AND suffix
   2799           ((ppfx && (ep->getCont()) &&
   2800             TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
   2801            (sptr->getCont() &&
   2802             (TESTAFF(sptr->getCont(), circumfix, sptr->getContLen()))))) &&
   2803          // fogemorpheme
   2804          (in_compound ||
   2805           !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound,
   2806                                          sptr->getContLen()))))) &&
   2807          // needaffix on prefix or first suffix
   2808          (cclass ||
   2809           !(sptr->getCont() &&
   2810             TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) ||
   2811           (ppfx &&
   2812            !((ep->getCont()) &&
   2813              TESTAFF(ep->getCont(), needaffix, ep->getContLen())))))
   2814        if (in_compound != IN_CPD_END || ppfx ||
   2815            !(sptr->getCont() &&
   2816              TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))) {
   2817          rv = sptr->checkword(word, len, sfxopts, ppfx,
   2818                               cclass, needflag,
   2819                               (in_compound ? 0 : onlyincompound));
   2820          if (rv) {
   2821            sfx = sptr;                 // BUG: sfx not stateless
   2822            sfxflag = sptr->getFlag();  // BUG: sfxflag not stateless
   2823            if (!sptr->getCont())
   2824              sfxappnd = sptr->getKey();  // BUG: sfxappnd not stateless
   2825            // LANG_hu section: spec. Hungarian rule
   2826            else if (langnum == LANG_hu && sptr->getKeyLen() &&
   2827                     sptr->getKey()[0] == 'i' && sptr->getKey()[1] != 'y' &&
   2828                     sptr->getKey()[1] != 't') {
   2829              sfxextra = 1;
   2830            }
   2831            // END of LANG_hu section
   2832            return rv;
   2833          }
   2834        }
   2835      sptr = sptr->getNextEQ();
   2836    } else {
   2837      sptr = sptr->getNextNE();
   2838    }
   2839  }
   2840 
   2841  return NULL;
   2842 }
   2843 
   2844 // check word for two-level suffixes
   2845 struct hentry* AffixMgr::suffix_check_twosfx(const char* word,
   2846                                             int len,
   2847                                             int sfxopts,
   2848                                             PfxEntry* ppfx,
   2849                                             const FLAG needflag) {
   2850  struct hentry* rv = NULL;
   2851 
   2852  // first handle the special case of 0 length suffixes
   2853  SfxEntry* se = sStart[0];
   2854  while (se) {
   2855    if (contclasses[se->getFlag()]) {
   2856      rv = se->check_twosfx(word, len, sfxopts, ppfx, needflag);
   2857      if (rv)
   2858        return rv;
   2859    }
   2860    se = se->getNext();
   2861  }
   2862 
   2863  // now handle the general case
   2864  if (len == 0)
   2865    return NULL;  // FULLSTRIP
   2866  unsigned char sp = *((const unsigned char*)(word + len - 1));
   2867  SfxEntry* sptr = sStart[sp];
   2868 
   2869  while (sptr) {
   2870    if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
   2871      if (contclasses[sptr->getFlag()]) {
   2872        rv = sptr->check_twosfx(word, len, sfxopts, ppfx, needflag);
   2873        if (rv) {
   2874          sfxflag = sptr->getFlag();  // BUG: sfxflag not stateless
   2875          if (!sptr->getCont())
   2876            sfxappnd = sptr->getKey();  // BUG: sfxappnd not stateless
   2877          return rv;
   2878        }
   2879      }
   2880      sptr = sptr->getNextEQ();
   2881    } else {
   2882      sptr = sptr->getNextNE();
   2883    }
   2884  }
   2885 
   2886  return NULL;
   2887 }
   2888 
   2889 // check word for two-level suffixes and morph
   2890 std::string AffixMgr::suffix_check_twosfx_morph(const char* word,
   2891                                                int len,
   2892                                                int sfxopts,
   2893                                                PfxEntry* ppfx,
   2894                                                const FLAG needflag) {
   2895  std::string result;
   2896  std::string result2;
   2897  std::string result3;
   2898 
   2899  // first handle the special case of 0 length suffixes
   2900  SfxEntry* se = sStart[0];
   2901  while (se) {
   2902    if (contclasses[se->getFlag()]) {
   2903      std::string st = se->check_twosfx_morph(word, len, sfxopts, ppfx, needflag);
   2904      if (!st.empty()) {
   2905        if (ppfx) {
   2906          if (ppfx->getMorph()) {
   2907            result.append(ppfx->getMorph());
   2908            result.push_back(MSEP_FLD);
   2909          } else
   2910            debugflag(result, ppfx->getFlag());
   2911        }
   2912        result.append(st);
   2913        if (se->getMorph()) {
   2914          result.push_back(MSEP_FLD);
   2915          result.append(se->getMorph());
   2916        } else
   2917          debugflag(result, se->getFlag());
   2918        result.push_back(MSEP_REC);
   2919      }
   2920    }
   2921    se = se->getNext();
   2922  }
   2923 
   2924  // now handle the general case
   2925  if (len == 0)
   2926    return std::string();  // FULLSTRIP
   2927  unsigned char sp = *((const unsigned char*)(word + len - 1));
   2928  SfxEntry* sptr = sStart[sp];
   2929 
   2930  while (sptr) {
   2931    if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
   2932      if (contclasses[sptr->getFlag()]) {
   2933        std::string st = sptr->check_twosfx_morph(word, len, sfxopts, ppfx, needflag);
   2934        if (!st.empty()) {
   2935          sfxflag = sptr->getFlag();  // BUG: sfxflag not stateless
   2936          if (!sptr->getCont())
   2937            sfxappnd = sptr->getKey();  // BUG: sfxappnd not stateless
   2938          result2.assign(st);
   2939 
   2940          result3.clear();
   2941 
   2942          if (sptr->getMorph()) {
   2943            result3.push_back(MSEP_FLD);
   2944            result3.append(sptr->getMorph());
   2945          } else
   2946            debugflag(result3, sptr->getFlag());
   2947          strlinecat(result2, result3);
   2948          result2.push_back(MSEP_REC);
   2949          result.append(result2);
   2950        }
   2951      }
   2952      sptr = sptr->getNextEQ();
   2953    } else {
   2954      sptr = sptr->getNextNE();
   2955    }
   2956  }
   2957 
   2958  return result;
   2959 }
   2960 
   2961 std::string AffixMgr::suffix_check_morph(const char* word,
   2962                                         int len,
   2963                                         int sfxopts,
   2964                                         PfxEntry* ppfx,
   2965                                         const FLAG cclass,
   2966                                         const FLAG needflag,
   2967                                         char in_compound) {
   2968  std::string result;
   2969 
   2970  struct hentry* rv = NULL;
   2971 
   2972  PfxEntry* ep = ppfx;
   2973 
   2974  // first handle the special case of 0 length suffixes
   2975  SfxEntry* se = sStart[0];
   2976  while (se) {
   2977    if (!cclass || se->getCont()) {
   2978      // suffixes are not allowed in beginning of compounds
   2979      if (((((in_compound != IN_CPD_BEGIN)) ||  // && !cclass
   2980            // except when signed with compoundpermitflag flag
   2981            (se->getCont() && compoundpermitflag &&
   2982             TESTAFF(se->getCont(), compoundpermitflag, se->getContLen()))) &&
   2983           (!circumfix ||
   2984            // no circumfix flag in prefix and suffix
   2985            ((!ppfx || !(ep->getCont()) ||
   2986              !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
   2987             (!se->getCont() ||
   2988              !(TESTAFF(se->getCont(), circumfix, se->getContLen())))) ||
   2989            // circumfix flag in prefix AND suffix
   2990            ((ppfx && (ep->getCont()) &&
   2991              TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
   2992             (se->getCont() &&
   2993              (TESTAFF(se->getCont(), circumfix, se->getContLen()))))) &&
   2994           // fogemorpheme
   2995           (in_compound ||
   2996            !((se->getCont() &&
   2997               (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) &&
   2998           // needaffix on prefix or first suffix
   2999           (cclass ||
   3000            !(se->getCont() &&
   3001              TESTAFF(se->getCont(), needaffix, se->getContLen())) ||
   3002            (ppfx &&
   3003             !((ep->getCont()) &&
   3004               TESTAFF(ep->getCont(), needaffix, ep->getContLen()))))))
   3005        rv = se->checkword(word, len, sfxopts, ppfx, cclass,
   3006                           needflag, FLAG_NULL);
   3007      while (rv) {
   3008        if (ppfx) {
   3009          if (ppfx->getMorph()) {
   3010            result.append(ppfx->getMorph());
   3011            result.push_back(MSEP_FLD);
   3012          } else
   3013            debugflag(result, ppfx->getFlag());
   3014        }
   3015        if (complexprefixes && HENTRY_DATA(rv))
   3016          result.append(HENTRY_DATA2(rv));
   3017        if (!HENTRY_FIND(rv, MORPH_STEM)) {
   3018          result.push_back(MSEP_FLD);
   3019          result.append(MORPH_STEM);
   3020          result.append(HENTRY_WORD(rv));
   3021        }
   3022 
   3023        if (!complexprefixes && HENTRY_DATA(rv)) {
   3024          result.push_back(MSEP_FLD);
   3025          result.append(HENTRY_DATA2(rv));
   3026        }
   3027        if (se->getMorph()) {
   3028          result.push_back(MSEP_FLD);
   3029          result.append(se->getMorph());
   3030        } else
   3031          debugflag(result, se->getFlag());
   3032        result.push_back(MSEP_REC);
   3033        rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
   3034      }
   3035    }
   3036    se = se->getNext();
   3037  }
   3038 
   3039  // now handle the general case
   3040  if (len == 0)
   3041    return std::string();  // FULLSTRIP
   3042  unsigned char sp = *((const unsigned char*)(word + len - 1));
   3043  SfxEntry* sptr = sStart[sp];
   3044 
   3045  while (sptr) {
   3046    if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
   3047      // suffixes are not allowed in beginning of compounds
   3048      if (((((in_compound != IN_CPD_BEGIN)) ||  // && !cclass
   3049            // except when signed with compoundpermitflag flag
   3050            (sptr->getCont() && compoundpermitflag &&
   3051             TESTAFF(sptr->getCont(), compoundpermitflag,
   3052                     sptr->getContLen()))) &&
   3053           (!circumfix ||
   3054            // no circumfix flag in prefix and suffix
   3055            ((!ppfx || !(ep->getCont()) ||
   3056              !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
   3057             (!sptr->getCont() ||
   3058              !(TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())))) ||
   3059            // circumfix flag in prefix AND suffix
   3060            ((ppfx && (ep->getCont()) &&
   3061              TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
   3062             (sptr->getCont() &&
   3063              (TESTAFF(sptr->getCont(), circumfix, sptr->getContLen()))))) &&
   3064           // fogemorpheme
   3065           (in_compound ||
   3066            !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound,
   3067                                           sptr->getContLen()))))) &&
   3068           // needaffix on first suffix
   3069           (cclass ||
   3070            !(sptr->getCont() &&
   3071              TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())))))
   3072        rv = sptr->checkword(word, len, sfxopts, ppfx, cclass,
   3073                             needflag, FLAG_NULL);
   3074      while (rv) {
   3075        if (ppfx) {
   3076          if (ppfx->getMorph()) {
   3077            result.append(ppfx->getMorph());
   3078            result.push_back(MSEP_FLD);
   3079          } else
   3080            debugflag(result, ppfx->getFlag());
   3081        }
   3082        if (complexprefixes && HENTRY_DATA(rv))
   3083          result.append(HENTRY_DATA2(rv));
   3084        if (!HENTRY_FIND(rv, MORPH_STEM)) {
   3085          result.push_back(MSEP_FLD);
   3086          result.append(MORPH_STEM);
   3087          result.append(HENTRY_WORD(rv));
   3088        }
   3089 
   3090        if (!complexprefixes && HENTRY_DATA(rv)) {
   3091          result.push_back(MSEP_FLD);
   3092          result.append(HENTRY_DATA2(rv));
   3093        }
   3094 
   3095        if (sptr->getMorph()) {
   3096          result.push_back(MSEP_FLD);
   3097          result.append(sptr->getMorph());
   3098        } else
   3099          debugflag(result, sptr->getFlag());
   3100        result.push_back(MSEP_REC);
   3101        rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
   3102      }
   3103      sptr = sptr->getNextEQ();
   3104    } else {
   3105      sptr = sptr->getNextNE();
   3106    }
   3107  }
   3108 
   3109  return result;
   3110 }
   3111 
   3112 // check if word with affixes is correctly spelled
   3113 struct hentry* AffixMgr::affix_check(const char* word,
   3114                                     int len,
   3115                                     const FLAG needflag,
   3116                                     char in_compound) {
   3117 
   3118  // check all prefixes (also crossed with suffixes if allowed)
   3119  struct hentry* rv = prefix_check(word, len, in_compound, needflag);
   3120  if (rv)
   3121    return rv;
   3122 
   3123  // if still not found check all suffixes
   3124  rv = suffix_check(word, len, 0, NULL, FLAG_NULL, needflag, in_compound);
   3125 
   3126  if (havecontclass) {
   3127    sfx = NULL;
   3128    pfx = NULL;
   3129 
   3130    if (rv)
   3131      return rv;
   3132    // if still not found check all two-level suffixes
   3133    rv = suffix_check_twosfx(word, len, 0, NULL, needflag);
   3134 
   3135    if (rv)
   3136      return rv;
   3137    // if still not found check all two-level suffixes
   3138    rv = prefix_check_twosfx(word, len, IN_CPD_NOT, needflag);
   3139  }
   3140 
   3141  return rv;
   3142 }
   3143 
   3144 // check if word with affixes is correctly spelled
   3145 std::string AffixMgr::affix_check_morph(const char* word,
   3146                                  int len,
   3147                                  const FLAG needflag,
   3148                                  char in_compound) {
   3149  std::string result;
   3150 
   3151  // check all prefixes (also crossed with suffixes if allowed)
   3152  std::string st = prefix_check_morph(word, len, in_compound);
   3153  if (!st.empty()) {
   3154    result.append(st);
   3155  }
   3156 
   3157  // if still not found check all suffixes
   3158  st = suffix_check_morph(word, len, 0, NULL, '\0', needflag, in_compound);
   3159  if (!st.empty()) {
   3160    result.append(st);
   3161  }
   3162 
   3163  if (havecontclass) {
   3164    sfx = NULL;
   3165    pfx = NULL;
   3166    // if still not found check all two-level suffixes
   3167    st = suffix_check_twosfx_morph(word, len, 0, NULL, needflag);
   3168    if (!st.empty()) {
   3169      result.append(st);
   3170    }
   3171 
   3172    // if still not found check all two-level suffixes
   3173    st = prefix_check_twosfx_morph(word, len, IN_CPD_NOT, needflag);
   3174    if (!st.empty()) {
   3175      result.append(st);
   3176    }
   3177  }
   3178 
   3179  return result;
   3180 }
   3181 
   3182 // morphcmp(): compare MORPH_DERI_SFX, MORPH_INFL_SFX and MORPH_TERM_SFX fields
   3183 // in the first line of the inputs
   3184 // return 0, if inputs equal
   3185 // return 1, if inputs may equal with a secondary suffix
   3186 // otherwise return -1
   3187 static int morphcmp(const char* s, const char* t) {
   3188  int se = 0;
   3189  int te = 0;
   3190  const char* sl;
   3191  const char* tl;
   3192  const char* olds;
   3193  const char* oldt;
   3194  if (!s || !t)
   3195    return 1;
   3196  olds = s;
   3197  sl = strchr(s, '\n');
   3198  s = strstr(s, MORPH_DERI_SFX);
   3199  if (!s || (sl && sl < s))
   3200    s = strstr(olds, MORPH_INFL_SFX);
   3201  if (!s || (sl && sl < s)) {
   3202    s = strstr(olds, MORPH_TERM_SFX);
   3203    olds = NULL;
   3204  }
   3205  oldt = t;
   3206  tl = strchr(t, '\n');
   3207  t = strstr(t, MORPH_DERI_SFX);
   3208  if (!t || (tl && tl < t))
   3209    t = strstr(oldt, MORPH_INFL_SFX);
   3210  if (!t || (tl && tl < t)) {
   3211    t = strstr(oldt, MORPH_TERM_SFX);
   3212    oldt = NULL;
   3213  }
   3214  while (s && t && (!sl || sl > s) && (!tl || tl > t)) {
   3215    s += MORPH_TAG_LEN;
   3216    t += MORPH_TAG_LEN;
   3217    se = 0;
   3218    te = 0;
   3219    while ((*s == *t) && !se && !te) {
   3220      s++;
   3221      t++;
   3222      switch (*s) {
   3223        case ' ':
   3224        case '\n':
   3225        case '\t':
   3226        case '\0':
   3227          se = 1;
   3228      }
   3229      switch (*t) {
   3230        case ' ':
   3231        case '\n':
   3232        case '\t':
   3233        case '\0':
   3234          te = 1;
   3235      }
   3236    }
   3237    if (!se || !te) {
   3238      // not terminal suffix difference
   3239      if (olds)
   3240        return -1;
   3241      return 1;
   3242    }
   3243    olds = s;
   3244    s = strstr(s, MORPH_DERI_SFX);
   3245    if (!s || (sl && sl < s))
   3246      s = strstr(olds, MORPH_INFL_SFX);
   3247    if (!s || (sl && sl < s)) {
   3248      s = strstr(olds, MORPH_TERM_SFX);
   3249      olds = NULL;
   3250    }
   3251    oldt = t;
   3252    t = strstr(t, MORPH_DERI_SFX);
   3253    if (!t || (tl && tl < t))
   3254      t = strstr(oldt, MORPH_INFL_SFX);
   3255    if (!t || (tl && tl < t)) {
   3256      t = strstr(oldt, MORPH_TERM_SFX);
   3257      oldt = NULL;
   3258    }
   3259  }
   3260  if (!s && !t && se && te)
   3261    return 0;
   3262  return 1;
   3263 }
   3264 
   3265 std::string AffixMgr::morphgen(const char* ts,
   3266                               int wl,
   3267                               const unsigned short* ap,
   3268                               unsigned short al,
   3269                               const char* morph,
   3270                               const char* targetmorph,
   3271                         int level) {
   3272  // handle suffixes
   3273  if (!morph)
   3274    return std::string();
   3275 
   3276  // check substandard flag
   3277  if (TESTAFF(ap, substandard, al))
   3278    return std::string();
   3279 
   3280  if (morphcmp(morph, targetmorph) == 0)
   3281    return ts;
   3282 
   3283  size_t stemmorphcatpos;
   3284  std::string mymorph;
   3285 
   3286  // use input suffix fields, if exist
   3287  if (strstr(morph, MORPH_INFL_SFX) || strstr(morph, MORPH_DERI_SFX)) {
   3288    mymorph.assign(morph);
   3289    mymorph.push_back(MSEP_FLD);
   3290    stemmorphcatpos = mymorph.size();
   3291  } else {
   3292    stemmorphcatpos = std::string::npos;
   3293  }
   3294 
   3295  for (int i = 0; i < al; i++) {
   3296    const unsigned char c = (unsigned char)(ap[i] & 0x00FF);
   3297    SfxEntry* sptr = sFlag[c];
   3298    while (sptr) {
   3299      if (sptr->getFlag() == ap[i] && sptr->getMorph() &&
   3300          ((sptr->getContLen() == 0) ||
   3301           // don't generate forms with substandard affixes
   3302           !TESTAFF(sptr->getCont(), substandard, sptr->getContLen()))) {
   3303        const char* stemmorph;
   3304        if (stemmorphcatpos != std::string::npos) {
   3305          mymorph.replace(stemmorphcatpos, std::string::npos, sptr->getMorph());
   3306          stemmorph = mymorph.c_str();
   3307        } else {
   3308          stemmorph = sptr->getMorph();
   3309        }
   3310 
   3311        int cmp = morphcmp(stemmorph, targetmorph);
   3312 
   3313        if (cmp == 0) {
   3314          std::string newword = sptr->add(ts, wl);
   3315          if (!newword.empty()) {
   3316            hentry* check = pHMgr->lookup(newword.c_str());  // XXX extra dic
   3317            if (!check || !check->astr ||
   3318                !(TESTAFF(check->astr, forbiddenword, check->alen) ||
   3319                  TESTAFF(check->astr, ONLYUPCASEFLAG, check->alen))) {
   3320              return newword;
   3321            }
   3322          }
   3323        }
   3324 
   3325        // recursive call for secondary suffixes
   3326        if ((level == 0) && (cmp == 1) && (sptr->getContLen() > 0) &&
   3327            !TESTAFF(sptr->getCont(), substandard, sptr->getContLen())) {
   3328          std::string newword = sptr->add(ts, wl);
   3329          if (!newword.empty()) {
   3330            std::string newword2 =
   3331                morphgen(newword.c_str(), newword.size(), sptr->getCont(),
   3332                         sptr->getContLen(), stemmorph, targetmorph, 1);
   3333 
   3334            if (!newword2.empty()) {
   3335              return newword2;
   3336            }
   3337          }
   3338        }
   3339      }
   3340      sptr = sptr->getFlgNxt();
   3341    }
   3342  }
   3343  return std::string();
   3344 }
   3345 
   3346 int AffixMgr::expand_rootword(struct guessword* wlst,
   3347                              int maxn,
   3348                              const char* ts,
   3349                              int wl,
   3350                              const unsigned short* ap,
   3351                              unsigned short al,
   3352                              const char* bad,
   3353                              int badl,
   3354                              const char* phon) {
   3355  int nh = 0;
   3356  // first add root word to list
   3357  if ((nh < maxn) &&
   3358      !(al && ((needaffix && TESTAFF(ap, needaffix, al)) ||
   3359               (onlyincompound && TESTAFF(ap, onlyincompound, al))))) {
   3360    wlst[nh].word = mystrdup(ts);
   3361    if (!wlst[nh].word)
   3362      return 0;
   3363    wlst[nh].allow = false;
   3364    wlst[nh].orig = NULL;
   3365    nh++;
   3366    // add special phonetic version
   3367    if (phon && (nh < maxn)) {
   3368      wlst[nh].word = mystrdup(phon);
   3369      if (!wlst[nh].word)
   3370        return nh - 1;
   3371      wlst[nh].allow = false;
   3372      wlst[nh].orig = mystrdup(ts);
   3373      if (!wlst[nh].orig)
   3374        return nh - 1;
   3375      nh++;
   3376    }
   3377  }
   3378 
   3379  // handle suffixes
   3380  for (int i = 0; i < al; i++) {
   3381    const unsigned char c = (unsigned char)(ap[i] & 0x00FF);
   3382    SfxEntry* sptr = sFlag[c];
   3383    while (sptr) {
   3384      if ((sptr->getFlag() == ap[i]) &&
   3385          (!sptr->getKeyLen() ||
   3386           ((badl > sptr->getKeyLen()) &&
   3387            (strcmp(sptr->getAffix(), bad + badl - sptr->getKeyLen()) == 0))) &&
   3388          // check needaffix flag
   3389          !(sptr->getCont() &&
   3390            ((needaffix &&
   3391              TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) ||
   3392             (circumfix &&
   3393              TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())) ||
   3394             (onlyincompound &&
   3395              TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) {
   3396        std::string newword = sptr->add(ts, wl);
   3397        if (!newword.empty()) {
   3398          if (nh < maxn) {
   3399            wlst[nh].word = mystrdup(newword.c_str());
   3400            wlst[nh].allow = sptr->allowCross();
   3401            wlst[nh].orig = NULL;
   3402            nh++;
   3403            // add special phonetic version
   3404            if (phon && (nh < maxn)) {
   3405              std::string prefix(phon);
   3406              std::string key(sptr->getKey());
   3407              reverseword(key);
   3408              prefix.append(key);
   3409              wlst[nh].word = mystrdup(prefix.c_str());
   3410              if (!wlst[nh].word)
   3411                return nh - 1;
   3412              wlst[nh].allow = false;
   3413              wlst[nh].orig = mystrdup(newword.c_str());
   3414              if (!wlst[nh].orig)
   3415                return nh - 1;
   3416              nh++;
   3417            }
   3418          }
   3419        }
   3420      }
   3421      sptr = sptr->getFlgNxt();
   3422    }
   3423  }
   3424 
   3425  int n = nh;
   3426 
   3427  // handle cross products of prefixes and suffixes
   3428  for (int j = 1; j < n; j++)
   3429    if (wlst[j].allow) {
   3430      for (int k = 0; k < al; k++) {
   3431        const unsigned char c = (unsigned char)(ap[k] & 0x00FF);
   3432        PfxEntry* cptr = pFlag[c];
   3433        while (cptr) {
   3434          if ((cptr->getFlag() == ap[k]) && cptr->allowCross() &&
   3435              (!cptr->getKeyLen() ||
   3436               ((badl > cptr->getKeyLen()) &&
   3437                (strncmp(cptr->getKey(), bad, cptr->getKeyLen()) == 0)))) {
   3438            int l1 = strlen(wlst[j].word);
   3439            std::string newword = cptr->add(wlst[j].word, l1);
   3440            if (!newword.empty()) {
   3441              if (nh < maxn) {
   3442                wlst[nh].word = mystrdup(newword.c_str());
   3443                wlst[nh].allow = cptr->allowCross();
   3444                wlst[nh].orig = NULL;
   3445                nh++;
   3446              }
   3447            }
   3448          }
   3449          cptr = cptr->getFlgNxt();
   3450        }
   3451      }
   3452    }
   3453 
   3454  // now handle pure prefixes
   3455  for (int m = 0; m < al; m++) {
   3456    const unsigned char c = (unsigned char)(ap[m] & 0x00FF);
   3457    PfxEntry* ptr = pFlag[c];
   3458    while (ptr) {
   3459      if ((ptr->getFlag() == ap[m]) &&
   3460          (!ptr->getKeyLen() ||
   3461           ((badl > ptr->getKeyLen()) &&
   3462            (strncmp(ptr->getKey(), bad, ptr->getKeyLen()) == 0))) &&
   3463          // check needaffix flag
   3464          !(ptr->getCont() &&
   3465            ((needaffix &&
   3466              TESTAFF(ptr->getCont(), needaffix, ptr->getContLen())) ||
   3467             (circumfix &&
   3468              TESTAFF(ptr->getCont(), circumfix, ptr->getContLen())) ||
   3469             (onlyincompound &&
   3470              TESTAFF(ptr->getCont(), onlyincompound, ptr->getContLen()))))) {
   3471        std::string newword = ptr->add(ts, wl);
   3472        if (!newword.empty()) {
   3473          if (nh < maxn) {
   3474            wlst[nh].word = mystrdup(newword.c_str());
   3475            wlst[nh].allow = ptr->allowCross();
   3476            wlst[nh].orig = NULL;
   3477            nh++;
   3478          }
   3479        }
   3480      }
   3481      ptr = ptr->getFlgNxt();
   3482    }
   3483  }
   3484 
   3485  return nh;
   3486 }
   3487 
// return replacing table (forwarded from pHMgr->get_reptable())
const std::vector<replentry>& AffixMgr::get_reptable() const {
  return pHMgr->get_reptable();
}
   3492 
   3493 // return iconv table
   3494 RepList* AffixMgr::get_iconvtable() const {
   3495  if (!iconvtable)
   3496    return NULL;
   3497  return iconvtable;
   3498 }
   3499 
   3500 // return oconv table
   3501 RepList* AffixMgr::get_oconvtable() const {
   3502  if (!oconvtable)
   3503    return NULL;
   3504  return oconvtable;
   3505 }
   3506 
   3507 // return replacing table
   3508 struct phonetable* AffixMgr::get_phonetable() const {
   3509  if (!phone)
   3510    return NULL;
   3511  return phone;
   3512 }
   3513 
// return character map table (reference to the maptable member)
const std::vector<mapentry>& AffixMgr::get_maptable() const {
  return maptable;
}
   3518 
// return break table (reference to the breaktable member)
const std::vector<std::string>& AffixMgr::get_breaktable() const {
  return breaktable;
}
   3523 
// return text encoding of dictionary; if none has been set yet,
// SPELL_ENCODING is stored in the encoding member and returned
const std::string& AffixMgr::get_encoding() {
  if (encoding.empty())
    encoding = SPELL_ENCODING;
  return encoding;
}
   3530 
// return the language number (langnum member)
int AffixMgr::get_langnum() const {
  return langnum;
}
   3535 
// return double prefix option (complexprefixes member)
int AffixMgr::get_complexprefixes() const {
  return complexprefixes;
}
   3540 
// return FULLSTRIP option (fullstrip member)
int AffixMgr::get_fullstrip() const {
  return fullstrip;
}
   3545 
// return the keepcase flag
FLAG AffixMgr::get_keepcase() const {
  return keepcase;
}
   3549 
// return the forceucase flag
FLAG AffixMgr::get_forceucase() const {
  return forceucase;
}
   3553 
// return the warn flag
FLAG AffixMgr::get_warn() const {
  return warn;
}
   3557 
// return the forbidwarn option
int AffixMgr::get_forbidwarn() const {
  return forbidwarn;
}
   3561 
// return the checksharps option
int AffixMgr::get_checksharps() const {
  return checksharps;
}
   3565 
// encode a flag by forwarding to pHMgr->encode_flag()
// NOTE(review): ownership of the returned char* is not visible here --
// confirm against HashMgr::encode_flag
char* AffixMgr::encode_flag(unsigned short aflag) const {
  return pHMgr->encode_flag(aflag);
}
   3569 
   3570 // return the preferred ignore string for suggestions
   3571 const char* AffixMgr::get_ignore() const {
   3572  if (ignorechars.empty())
   3573    return NULL;
   3574  return ignorechars.c_str();
   3575 }
   3576 
// return the ignore characters as a vector of w_char
// (reference to the ignorechars_utf16 member)
const std::vector<w_char>& AffixMgr::get_ignore_utf16() const {
  return ignorechars_utf16;
}
   3581 
// return the keyboard string for suggestions; if none has been set yet,
// SPELL_KEYSTRING is stored in the keystring member first.
// The result comes from mystrdup() -- presumably a copy the caller has to
// release; confirm against mystrdup
char* AffixMgr::get_key_string() {
  if (keystring.empty())
    keystring = SPELL_KEYSTRING;
  return mystrdup(keystring.c_str());
}
   3588 
   3589 // return the preferred try string for suggestions
   3590 char* AffixMgr::get_try_string() const {
   3591  if (trystring.empty())
   3592    return NULL;
   3593  return mystrdup(trystring.c_str());
   3594 }
   3595 
// return the word characters string (reference to the wordchars member)
const std::string& AffixMgr::get_wordchars() const {
  return wordchars;
}
   3600 
// return the word characters as a vector of w_char
// (reference to the wordchars_utf16 member)
const std::vector<w_char>& AffixMgr::get_wordchars_utf16() const {
  return wordchars_utf16;
}
   3604 
// is there compounding? True if compoundflag or compoundbegin is set, or if
// the compound rule table (defcpdtable) is not empty
int AffixMgr::get_compound() const {
  return compoundflag || compoundbegin || !defcpdtable.empty();
}
   3609 
// return the compound words control flag
FLAG AffixMgr::get_compoundflag() const {
  return compoundflag;
}
   3614 
// return the forbidden words control flag
FLAG AffixMgr::get_forbiddenword() const {
  return forbiddenword;
}
   3619 
// return the nosuggest flag
FLAG AffixMgr::get_nosuggest() const {
  return nosuggest;
}
   3624 
// return the nongramsuggest flag
FLAG AffixMgr::get_nongramsuggest() const {
  return nongramsuggest;
}
   3629 
// return the substandard root/affix control flag
FLAG AffixMgr::get_substandard() const {
  return substandard;
}
   3634 
// return the needaffix flag
FLAG AffixMgr::get_needaffix() const {
  return needaffix;
}
   3639 
// return the onlyincompound flag
FLAG AffixMgr::get_onlyincompound() const {
  return onlyincompound;
}
   3644 
// return the version string (reference to the version member)
const std::string& AffixMgr::get_version() const {
  return version;
}
   3649 
   3650 // utility method to look up root words in hash table
   3651 struct hentry* AffixMgr::lookup(const char* word) {
   3652  struct hentry* he = NULL;
   3653  for (size_t i = 0; i < alldic.size() && !he; ++i) {
   3654    he = alldic[i]->lookup(word);
   3655  }
   3656  return he;
   3657 }
   3658 
// return the havecontclass setting
int AffixMgr::have_contclass() const {
  return havecontclass;
}
   3663 
// return utf8 setting
int AffixMgr::get_utf8() const {
  return utf8;
}
   3668 
// return maxngramsugs
int AffixMgr::get_maxngramsugs(void) const {
  return maxngramsugs;
}
   3672 
// return maxcpdsugs
int AffixMgr::get_maxcpdsugs(void) const {
  return maxcpdsugs;
}
   3676 
// return maxdiff
int AffixMgr::get_maxdiff(void) const {
  return maxdiff;
}
   3680 
// return onlymaxdiff
int AffixMgr::get_onlymaxdiff(void) const {
  return onlymaxdiff;
}
   3684 
// return nosplitsugs
int AffixMgr::get_nosplitsugs(void) const {
  return nosplitsugs;
}
   3689 
// return sugswithdots
int AffixMgr::get_sugswithdots(void) const {
  return sugswithdots;
}
   3694 
   3695 /* parse flag */
   3696 bool AffixMgr::parse_flag(const std::string& line, unsigned short* out, FileMgr* af) {
   3697  if (*out != FLAG_NULL && !(*out >= DEFAULTFLAGS)) {
   3698    HUNSPELL_WARNING(
   3699        stderr,
   3700        "error: line %d: multiple definitions of an affix file parameter\n",
   3701        af->getlinenum());
   3702    return false;
   3703  }
   3704  std::string s;
   3705  if (!parse_string(line, s, af->getlinenum()))
   3706    return false;
   3707  *out = pHMgr->decode_flag(s.c_str());
   3708  return true;
   3709 }
   3710 
   3711 /* parse num */
   3712 bool AffixMgr::parse_num(const std::string& line, int* out, FileMgr* af) {
   3713  if (*out != -1) {
   3714    HUNSPELL_WARNING(
   3715        stderr,
   3716        "error: line %d: multiple definitions of an affix file parameter\n",
   3717        af->getlinenum());
   3718    return false;
   3719  }
   3720  std::string s;
   3721  if (!parse_string(line, s, af->getlinenum()))
   3722    return false;
   3723  *out = atoi(s.c_str());
   3724  return true;
   3725 }
   3726 
   3727 /* parse in the max syllablecount of compound words and  */
   3728 bool AffixMgr::parse_cpdsyllable(const std::string& line, FileMgr* af) {
   3729  int i = 0;
   3730  int np = 0;
   3731  std::string::const_iterator iter = line.begin();
   3732  std::string::const_iterator start_piece = mystrsep(line, iter);
   3733  while (start_piece != line.end()) {
   3734    switch (i) {
   3735      case 0: {
   3736        np++;
   3737        break;
   3738      }
   3739      case 1: {
   3740        cpdmaxsyllable = atoi(std::string(start_piece, iter).c_str());
   3741        np++;
   3742        break;
   3743      }
   3744      case 2: {
   3745        if (!utf8) {
   3746          cpdvowels.assign(start_piece, iter);
   3747          std::sort(cpdvowels.begin(), cpdvowels.end());
   3748        } else {
   3749          std::string piece(start_piece, iter);
   3750          u8_u16(cpdvowels_utf16, piece);
   3751          std::sort(cpdvowels_utf16.begin(), cpdvowels_utf16.end());
   3752        }
   3753        np++;
   3754        break;
   3755      }
   3756      default:
   3757        break;
   3758    }
   3759    ++i;
   3760    start_piece = mystrsep(line, iter);
   3761  }
   3762  if (np < 2) {
   3763    HUNSPELL_WARNING(stderr,
   3764                     "error: line %d: missing compoundsyllable information\n",
   3765                     af->getlinenum());
   3766    return false;
   3767  }
   3768  if (np == 2)
   3769    cpdvowels = "AEIOUaeiou";
   3770  return true;
   3771 }
   3772 
   3773 bool AffixMgr::parse_convtable(const std::string& line,
   3774                              FileMgr* af,
   3775                              RepList** rl,
   3776                              const std::string& keyword) {
   3777  if (*rl) {
   3778    HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
   3779                     af->getlinenum());
   3780    return false;
   3781  }
   3782  int i = 0;
   3783  int np = 0;
   3784  int numrl = 0;
   3785  std::string::const_iterator iter = line.begin();
   3786  std::string::const_iterator start_piece = mystrsep(line, iter);
   3787  while (start_piece != line.end()) {
   3788    switch (i) {
   3789      case 0: {
   3790        np++;
   3791        break;
   3792      }
   3793      case 1: {
   3794        numrl = atoi(std::string(start_piece, iter).c_str());
   3795        if (numrl < 1) {
   3796          HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n",
   3797                           af->getlinenum());
   3798          return false;
   3799        }
   3800        *rl = new RepList(numrl);
   3801        if (!*rl)
   3802          return false;
   3803        np++;
   3804        break;
   3805      }
   3806      default:
   3807        break;
   3808    }
   3809    ++i;
   3810    start_piece = mystrsep(line, iter);
   3811  }
   3812  if (np != 2) {
   3813    HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
   3814                     af->getlinenum());
   3815    return false;
   3816  }
   3817 
   3818  /* now parse the num lines to read in the remainder of the table */
   3819  for (int j = 0; j < numrl; j++) {
   3820    std::string nl;
   3821    if (!af->getline(nl))
   3822      return false;
   3823    mychomp(nl);
   3824    i = 0;
   3825    std::string pattern;
   3826    std::string pattern2;
   3827    iter = nl.begin();
   3828    start_piece = mystrsep(nl, iter);
   3829    while (start_piece != nl.end()) {
   3830      {
   3831        switch (i) {
   3832          case 0: {
   3833            if (nl.compare(start_piece - nl.begin(), keyword.size(), keyword, 0, keyword.size()) != 0) {
   3834              HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
   3835                               af->getlinenum());
   3836              delete *rl;
   3837              *rl = NULL;
   3838              return false;
   3839            }
   3840            break;
   3841          }
   3842          case 1: {
   3843            pattern.assign(start_piece, iter);
   3844            break;
   3845          }
   3846          case 2: {
   3847            pattern2.assign(start_piece, iter);
   3848            break;
   3849          }
   3850          default:
   3851            break;
   3852        }
   3853        ++i;
   3854      }
   3855      start_piece = mystrsep(nl, iter);
   3856    }
   3857    if (pattern.empty() || pattern2.empty()) {
   3858      HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
   3859                       af->getlinenum());
   3860      return false;
   3861    }
   3862    (*rl)->add(pattern, pattern2);
   3863  }
   3864  return true;
   3865 }
   3866 
   3867 /* parse in the typical fault correcting table */
   3868 bool AffixMgr::parse_phonetable(const std::string& line, FileMgr* af) {
   3869  if (phone) {
   3870    HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
   3871                     af->getlinenum());
   3872    return false;
   3873  }
   3874  int num = -1;
   3875  int i = 0;
   3876  int np = 0;
   3877  std::string::const_iterator iter = line.begin();
   3878  std::string::const_iterator start_piece = mystrsep(line, iter);
   3879  while (start_piece != line.end()) {
   3880    switch (i) {
   3881      case 0: {
   3882        np++;
   3883        break;
   3884      }
   3885      case 1: {
   3886        num = atoi(std::string(start_piece, iter).c_str());
   3887        if (num < 1) {
   3888          HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
   3889                           af->getlinenum());
   3890          return false;
   3891        }
   3892        phone = new phonetable;
   3893        phone->utf8 = (char)utf8;
   3894        np++;
   3895        break;
   3896      }
   3897      default:
   3898        break;
   3899    }
   3900    ++i;
   3901    start_piece = mystrsep(line, iter);
   3902  }
   3903  if (np != 2) {
   3904    HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
   3905                     af->getlinenum());
   3906    return false;
   3907  }
   3908 
   3909  /* now parse the phone->num lines to read in the remainder of the table */
   3910  for (int j = 0; j < num; ++j) {
   3911    std::string nl;
   3912    if (!af->getline(nl))
   3913      return false;
   3914    mychomp(nl);
   3915    i = 0;
   3916    const size_t old_size = phone->rules.size();
   3917    iter = nl.begin();
   3918    start_piece = mystrsep(nl, iter);
   3919    while (start_piece != nl.end()) {
   3920      {
   3921        switch (i) {
   3922          case 0: {
   3923            if (nl.compare(start_piece - nl.begin(), 5, "PHONE", 5) != 0) {
   3924              HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
   3925                               af->getlinenum());
   3926              return false;
   3927            }
   3928            break;
   3929          }
   3930          case 1: {
   3931            phone->rules.push_back(std::string(start_piece, iter));
   3932            break;
   3933          }
   3934          case 2: {
   3935            phone->rules.push_back(std::string(start_piece, iter));
   3936            mystrrep(phone->rules.back(), "_", "");
   3937            break;
   3938          }
   3939          default:
   3940            break;
   3941        }
   3942        ++i;
   3943      }
   3944      start_piece = mystrsep(nl, iter);
   3945    }
   3946    if (phone->rules.size() != old_size + 2) {
   3947      HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
   3948                       af->getlinenum());
   3949      phone->rules.clear();
   3950      return false;
   3951    }
   3952  }
   3953  phone->rules.push_back("");
   3954  phone->rules.push_back("");
   3955  init_phonet_hash(*phone);
   3956  return true;
   3957 }
   3958 
   3959 /* parse in the checkcompoundpattern table */
   3960 bool AffixMgr::parse_checkcpdtable(const std::string& line, FileMgr* af) {
   3961  if (parsedcheckcpd) {
   3962    HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
   3963                     af->getlinenum());
   3964    return false;
   3965  }
   3966  parsedcheckcpd = true;
   3967  int numcheckcpd = -1;
   3968  int i = 0;
   3969  int np = 0;
   3970  std::string::const_iterator iter = line.begin();
   3971  std::string::const_iterator start_piece = mystrsep(line, iter);
   3972  while (start_piece != line.end()) {
   3973    switch (i) {
   3974      case 0: {
   3975        np++;
   3976        break;
   3977      }
   3978      case 1: {
   3979        numcheckcpd = atoi(std::string(start_piece, iter).c_str());
   3980        if (numcheckcpd < 1) {
   3981          HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
   3982                           af->getlinenum());
   3983          return false;
   3984        }
   3985        checkcpdtable.reserve(numcheckcpd);
   3986        np++;
   3987        break;
   3988      }
   3989      default:
   3990        break;
   3991    }
   3992    ++i;
   3993    start_piece = mystrsep(line, iter);
   3994  }
   3995  if (np != 2) {
   3996    HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
   3997                     af->getlinenum());
   3998    return false;
   3999  }
   4000 
   4001  /* now parse the numcheckcpd lines to read in the remainder of the table */
   4002  for (int j = 0; j < numcheckcpd; ++j) {
   4003    std::string nl;
   4004    if (!af->getline(nl))
   4005      return false;
   4006    mychomp(nl);
   4007    i = 0;
   4008    checkcpdtable.push_back(patentry());
   4009    iter = nl.begin();
   4010    start_piece = mystrsep(nl, iter);
   4011    while (start_piece != nl.end()) {
   4012      switch (i) {
   4013        case 0: {
   4014          if (nl.compare(start_piece - nl.begin(), 20, "CHECKCOMPOUNDPATTERN", 20) != 0) {
   4015            HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
   4016                             af->getlinenum());
   4017            return false;
   4018          }
   4019          break;
   4020        }
   4021        case 1: {
   4022          checkcpdtable.back().pattern.assign(start_piece, iter);
   4023          size_t slash_pos = checkcpdtable.back().pattern.find('/');
   4024          if (slash_pos != std::string::npos) {
   4025            std::string chunk(checkcpdtable.back().pattern, slash_pos + 1);
   4026            checkcpdtable.back().pattern.resize(slash_pos);
   4027            checkcpdtable.back().cond = pHMgr->decode_flag(chunk.c_str());
   4028          }
   4029          break;
   4030        }
   4031        case 2: {
   4032          checkcpdtable.back().pattern2.assign(start_piece, iter);
   4033          size_t slash_pos = checkcpdtable.back().pattern2.find('/');
   4034          if (slash_pos != std::string::npos) {
   4035            std::string chunk(checkcpdtable.back().pattern2, slash_pos + 1);
   4036            checkcpdtable.back().pattern2.resize(slash_pos);
   4037            checkcpdtable.back().cond2 = pHMgr->decode_flag(chunk.c_str());
   4038          }
   4039          break;
   4040        }
   4041        case 3: {
   4042          checkcpdtable.back().pattern3.assign(start_piece, iter);
   4043          simplifiedcpd = 1;
   4044          break;
   4045        }
   4046        default:
   4047          break;
   4048      }
   4049      i++;
   4050      start_piece = mystrsep(nl, iter);
   4051    }
   4052  }
   4053  return true;
   4054 }
   4055 
   4056 /* parse in the compound rule table */
   4057 bool AffixMgr::parse_defcpdtable(const std::string& line, FileMgr* af) {
   4058  if (parseddefcpd) {
   4059    HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
   4060                     af->getlinenum());
   4061    return false;
   4062  }
   4063  parseddefcpd = true;
   4064  int numdefcpd = -1;
   4065  int i = 0;
   4066  int np = 0;
   4067  std::string::const_iterator iter = line.begin();
   4068  std::string::const_iterator start_piece = mystrsep(line, iter);
   4069  while (start_piece != line.end()) {
   4070    switch (i) {
   4071      case 0: {
   4072        np++;
   4073        break;
   4074      }
   4075      case 1: {
   4076        numdefcpd = atoi(std::string(start_piece, iter).c_str());
   4077        if (numdefcpd < 1) {
   4078          HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
   4079                           af->getlinenum());
   4080          return false;
   4081        }
   4082        defcpdtable.reserve(numdefcpd);
   4083        np++;
   4084        break;
   4085      }
   4086      default:
   4087        break;
   4088    }
   4089    ++i;
   4090    start_piece = mystrsep(line, iter);
   4091  }
   4092  if (np != 2) {
   4093    HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
   4094                     af->getlinenum());
   4095    return false;
   4096  }
   4097 
   4098  /* now parse the numdefcpd lines to read in the remainder of the table */
   4099  for (int j = 0; j < numdefcpd; ++j) {
   4100    std::string nl;
   4101    if (!af->getline(nl))
   4102      return false;
   4103    mychomp(nl);
   4104    i = 0;
   4105    defcpdtable.push_back(flagentry());
   4106    iter = nl.begin();
   4107    start_piece = mystrsep(nl, iter);
   4108    while (start_piece != nl.end()) {
   4109      switch (i) {
   4110        case 0: {
   4111          if (nl.compare(start_piece - nl.begin(), 12, "COMPOUNDRULE", 12) != 0) {
   4112            HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
   4113                             af->getlinenum());
   4114            numdefcpd = 0;
   4115            return false;
   4116          }
   4117          break;
   4118        }
   4119        case 1: {  // handle parenthesized flags
   4120          if (std::find(start_piece, iter, '(') != iter) {
   4121            for (std::string::const_iterator k = start_piece; k != iter; ++k) {
   4122              std::string::const_iterator chb = k;
   4123              std::string::const_iterator che = k + 1;
   4124              if (*k == '(') {
   4125                std::string::const_iterator parpos = std::find(k, iter, ')');
   4126                if (parpos != iter) {
   4127                  chb = k + 1;
   4128                  che = parpos;
   4129                  k = parpos;
   4130                }
   4131              }
   4132 
   4133              if (*chb == '*' || *chb == '?') {
   4134                defcpdtable.back().push_back((FLAG)*chb);
   4135              } else {
   4136                pHMgr->decode_flags(defcpdtable.back(), std::string(chb, che), af);
   4137              }
   4138            }
   4139          } else {
   4140            pHMgr->decode_flags(defcpdtable.back(), std::string(start_piece, iter), af);
   4141          }
   4142          break;
   4143        }
   4144        default:
   4145          break;
   4146      }
   4147      ++i;
   4148      start_piece = mystrsep(nl, iter);
   4149    }
   4150    if (defcpdtable.back().empty()) {
   4151      HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
   4152                       af->getlinenum());
   4153      return false;
   4154    }
   4155  }
   4156  return true;
   4157 }
   4158 
   4159 /* parse in the character map table */
   4160 bool AffixMgr::parse_maptable(const std::string& line, FileMgr* af) {
   4161  if (parsedmaptable) {
   4162    HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
   4163                     af->getlinenum());
   4164    return false;
   4165  }
   4166  parsedmaptable = true;
   4167  int nummap = -1;
   4168  int i = 0;
   4169  int np = 0;
   4170  std::string::const_iterator iter = line.begin();
   4171  std::string::const_iterator start_piece = mystrsep(line, iter);
   4172  while (start_piece != line.end()) {
   4173    switch (i) {
   4174      case 0: {
   4175        np++;
   4176        break;
   4177      }
   4178      case 1: {
   4179        nummap = atoi(std::string(start_piece, iter).c_str());
   4180        if (nummap < 1) {
   4181          HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
   4182                           af->getlinenum());
   4183          return false;
   4184        }
   4185        maptable.reserve(nummap);
   4186        np++;
   4187        break;
   4188      }
   4189      default:
   4190        break;
   4191    }
   4192    ++i;
   4193    start_piece = mystrsep(line, iter);
   4194  }
   4195  if (np != 2) {
   4196    HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
   4197                     af->getlinenum());
   4198    return false;
   4199  }
   4200 
   4201  /* now parse the nummap lines to read in the remainder of the table */
   4202  for (int j = 0; j < nummap; ++j) {
   4203    std::string nl;
   4204    if (!af->getline(nl))
   4205      return false;
   4206    mychomp(nl);
   4207    i = 0;
   4208    maptable.push_back(mapentry());
   4209    iter = nl.begin();
   4210    start_piece = mystrsep(nl, iter);
   4211    while (start_piece != nl.end()) {
   4212      switch (i) {
   4213        case 0: {
   4214          if (nl.compare(start_piece - nl.begin(), 3, "MAP", 3) != 0) {
   4215            HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
   4216                             af->getlinenum());
   4217            nummap = 0;
   4218            return false;
   4219          }
   4220          break;
   4221        }
   4222        case 1: {
   4223          for (std::string::const_iterator k = start_piece; k != iter; ++k) {
   4224            std::string::const_iterator chb = k;
   4225            std::string::const_iterator che = k + 1;
   4226            if (*k == '(') {
   4227              std::string::const_iterator parpos = std::find(k, iter, ')');
   4228              if (parpos != iter) {
   4229                chb = k + 1;
   4230                che = parpos;
   4231                k = parpos;
   4232              }
   4233            } else {
   4234              if (utf8 && (*k & 0xc0) == 0xc0) {
   4235                ++k;
   4236                while (k != iter && (*k & 0xc0) == 0x80)
   4237                    ++k;
   4238                che = k;
   4239                --k;
   4240              }
   4241            }
   4242            maptable.back().push_back(std::string(chb, che));
   4243          }
   4244          break;
   4245        }
   4246        default:
   4247          break;
   4248      }
   4249      ++i;
   4250      start_piece = mystrsep(nl, iter);
   4251    }
   4252    if (maptable.back().empty()) {
   4253      HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
   4254                       af->getlinenum());
   4255      return false;
   4256    }
   4257  }
   4258  return true;
   4259 }
   4260 
   4261 /* parse in the word breakpoint table */
   4262 bool AffixMgr::parse_breaktable(const std::string& line, FileMgr* af) {
   4263  if (parsedbreaktable) {
   4264    HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
   4265                     af->getlinenum());
   4266    return false;
   4267  }
   4268  parsedbreaktable = true;
   4269  int numbreak = -1;
   4270  int i = 0;
   4271  int np = 0;
   4272  std::string::const_iterator iter = line.begin();
   4273  std::string::const_iterator start_piece = mystrsep(line, iter);
   4274  while (start_piece != line.end()) {
   4275    switch (i) {
   4276      case 0: {
   4277        np++;
   4278        break;
   4279      }
   4280      case 1: {
   4281        numbreak = atoi(std::string(start_piece, iter).c_str());
   4282        if (numbreak < 0) {
   4283          HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
   4284                           af->getlinenum());
   4285          return false;
   4286        }
   4287        if (numbreak == 0)
   4288          return true;
   4289        breaktable.reserve(numbreak);
   4290        np++;
   4291        break;
   4292      }
   4293      default:
   4294        break;
   4295    }
   4296    ++i;
   4297    start_piece = mystrsep(line, iter);
   4298  }
   4299  if (np != 2) {
   4300    HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
   4301                     af->getlinenum());
   4302    return false;
   4303  }
   4304 
   4305  /* now parse the numbreak lines to read in the remainder of the table */
   4306  for (int j = 0; j < numbreak; ++j) {
   4307    std::string nl;
   4308    if (!af->getline(nl))
   4309      return false;
   4310    mychomp(nl);
   4311    i = 0;
   4312    iter = nl.begin();
   4313    start_piece = mystrsep(nl, iter);
   4314    while (start_piece != nl.end()) {
   4315      switch (i) {
   4316        case 0: {
   4317          if (nl.compare(start_piece - nl.begin(), 5, "BREAK", 5) != 0) {
   4318            HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
   4319                             af->getlinenum());
   4320            numbreak = 0;
   4321            return false;
   4322          }
   4323          break;
   4324        }
   4325        case 1: {
   4326          breaktable.push_back(std::string(start_piece, iter));
   4327          break;
   4328        }
   4329        default:
   4330          break;
   4331      }
   4332      ++i;
   4333      start_piece = mystrsep(nl, iter);
   4334    }
   4335  }
   4336 
   4337  if (breaktable.size() != static_cast<size_t>(numbreak)) {
   4338    HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
   4339                     af->getlinenum());
   4340    return false;
   4341  }
   4342 
   4343  return true;
   4344 }
   4345 
   4346 void AffixMgr::reverse_condition(std::string& piece) {
   4347  if (piece.empty())
   4348      return;
   4349 
   4350  int neg = 0;
   4351  for (std::string::reverse_iterator k = piece.rbegin(); k != piece.rend(); ++k) {
   4352    switch (*k) {
   4353      case '[': {
   4354        if (neg)
   4355          *(k - 1) = '[';
   4356        else
   4357          *k = ']';
   4358        break;
   4359      }
   4360      case ']': {
   4361        *k = '[';
   4362        if (neg)
   4363          *(k - 1) = '^';
   4364        neg = 0;
   4365        break;
   4366      }
   4367      case '^': {
   4368        if (*(k - 1) == ']')
   4369          neg = 1;
   4370        else if (neg)
   4371          *(k - 1) = *k;
   4372        break;
   4373      }
   4374      default: {
   4375        if (neg)
   4376          *(k - 1) = *k;
   4377      }
   4378    }
   4379  }
   4380 }
   4381 
   4382 class entries_container {
   4383  std::vector<AffEntry*> entries;
   4384  AffixMgr* m_mgr;
   4385  char m_at;
   4386 public:
   4387  entries_container(char at, AffixMgr* mgr)
   4388    : m_mgr(mgr)
   4389    , m_at(at) {
   4390  }
   4391  void release() {
   4392    entries.clear();
   4393  }
   4394  void initialize(int numents,
   4395                  char opts, unsigned short aflag) {
   4396    entries.reserve(numents);
   4397 
   4398    if (m_at == 'P') {
   4399      entries.push_back(new PfxEntry(m_mgr));
   4400    } else {
   4401      entries.push_back(new SfxEntry(m_mgr));
   4402    }
   4403 
   4404    entries.back()->opts = opts;
   4405    entries.back()->aflag = aflag;
   4406  }
   4407 
   4408  AffEntry* add_entry(char opts) {
   4409    if (m_at == 'P') {
   4410      entries.push_back(new PfxEntry(m_mgr));
   4411    } else {
   4412      entries.push_back(new SfxEntry(m_mgr));
   4413    }
   4414    AffEntry* ret = entries.back();
   4415    ret->opts = entries[0]->opts & opts;
   4416    return ret;
   4417  }
   4418 
   4419  AffEntry* first_entry() {
   4420    return entries.empty() ? NULL : entries[0];
   4421  }
   4422 
   4423  ~entries_container() {
   4424    for (size_t i = 0; i < entries.size(); ++i) {
   4425        delete entries[i];
   4426    }
   4427  }
   4428 
   4429  std::vector<AffEntry*>::iterator begin() { return entries.begin(); }
   4430  std::vector<AffEntry*>::iterator end() { return entries.end(); }
   4431 };
   4432 
   4433 bool AffixMgr::parse_affix(const std::string& line,
   4434                          const char at,
   4435                          FileMgr* af,
   4436                          char* dupflags) {
   4437  int numents = 0;  // number of AffEntry structures to parse
   4438 
   4439  unsigned short aflag = 0;  // affix char identifier
   4440 
   4441  char ff = 0;
   4442  entries_container affentries(at, this);
   4443 
   4444  int i = 0;
   4445 
   4446 // checking lines with bad syntax
   4447 #ifdef DEBUG
   4448  int basefieldnum = 0;
   4449 #endif
   4450 
   4451  // split affix header line into pieces
   4452 
   4453  int np = 0;
   4454  std::string::const_iterator iter = line.begin();
   4455  std::string::const_iterator start_piece = mystrsep(line, iter);
   4456  while (start_piece != line.end()) {
   4457    switch (i) {
   4458      // piece 1 - is type of affix
   4459      case 0: {
   4460        np++;
   4461        break;
   4462      }
   4463 
   4464      // piece 2 - is affix char
   4465      case 1: {
   4466        np++;
   4467        aflag = pHMgr->decode_flag(std::string(start_piece, iter).c_str());
   4468        if (((at == 'S') && (dupflags[aflag] & dupSFX)) ||
   4469            ((at == 'P') && (dupflags[aflag] & dupPFX))) {
   4470          HUNSPELL_WARNING(
   4471              stderr,
   4472              "error: line %d: multiple definitions of an affix flag\n",
   4473              af->getlinenum());
   4474        }
   4475        dupflags[aflag] += (char)((at == 'S') ? dupSFX : dupPFX);
   4476        break;
   4477      }
   4478      // piece 3 - is cross product indicator
   4479      case 2: {
   4480        np++;
   4481        if (*start_piece == 'Y')
   4482          ff = aeXPRODUCT;
   4483        break;
   4484      }
   4485 
   4486      // piece 4 - is number of affentries
   4487      case 3: {
   4488        np++;
   4489        numents = atoi(std::string(start_piece, iter).c_str());
   4490        if ((numents <= 0) || ((std::numeric_limits<size_t>::max() /
   4491                                sizeof(AffEntry)) < static_cast<size_t>(numents))) {
   4492          char* err = pHMgr->encode_flag(aflag);
   4493          if (err) {
   4494            HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
   4495                             af->getlinenum());
   4496            free(err);
   4497          }
   4498          return false;
   4499        }
   4500 
   4501        char opts = ff;
   4502        if (utf8)
   4503          opts |= aeUTF8;
   4504        if (pHMgr->is_aliasf())
   4505          opts |= aeALIASF;
   4506        if (pHMgr->is_aliasm())
   4507          opts |= aeALIASM;
   4508        affentries.initialize(numents, opts, aflag);
   4509      }
   4510 
   4511      default:
   4512        break;
   4513    }
   4514    ++i;
   4515    start_piece = mystrsep(line, iter);
   4516  }
   4517  // check to make sure we parsed enough pieces
   4518  if (np != 4) {
   4519    char* err = pHMgr->encode_flag(aflag);
   4520    if (err) {
   4521      HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
   4522                       af->getlinenum());
   4523      free(err);
   4524    }
   4525    return false;
   4526  }
   4527 
   4528  // now parse numents affentries for this affix
   4529  AffEntry* entry = affentries.first_entry();
   4530  for (int ent = 0; ent < numents; ++ent) {
   4531    std::string nl;
   4532    if (!af->getline(nl))
   4533      return false;
   4534    mychomp(nl);
   4535 
   4536    iter = nl.begin();
   4537    i = 0;
   4538    np = 0;
   4539 
   4540    // split line into pieces
   4541    start_piece = mystrsep(nl, iter);
   4542    while (start_piece != nl.end()) {
   4543      switch (i) {
   4544        // piece 1 - is type
   4545        case 0: {
   4546          np++;
   4547          if (ent != 0)
   4548            entry = affentries.add_entry((char)(aeXPRODUCT + aeUTF8 + aeALIASF + aeALIASM));
   4549          break;
   4550        }
   4551 
   4552        // piece 2 - is affix char
   4553        case 1: {
   4554          np++;
   4555          std::string chunk(start_piece, iter);
   4556          if (pHMgr->decode_flag(chunk.c_str()) != aflag) {
   4557            char* err = pHMgr->encode_flag(aflag);
   4558            if (err) {
   4559              HUNSPELL_WARNING(stderr,
   4560                               "error: line %d: affix %s is corrupt\n",
   4561                               af->getlinenum(), err);
   4562              free(err);
   4563            }
   4564            return false;
   4565          }
   4566 
   4567          if (ent != 0) {
   4568            AffEntry* start_entry = affentries.first_entry();
   4569            entry->aflag = start_entry->aflag;
   4570          }
   4571          break;
   4572        }
   4573 
   4574        // piece 3 - is string to strip or 0 for null
   4575        case 2: {
   4576          np++;
   4577          entry->strip = std::string(start_piece, iter);
   4578          if (complexprefixes) {
   4579            if (utf8)
   4580              reverseword_utf(entry->strip);
   4581            else
   4582              reverseword(entry->strip);
   4583          }
   4584          if (entry->strip.compare("0") == 0) {
   4585            entry->strip.clear();
   4586          }
   4587          break;
   4588        }
   4589 
   4590        // piece 4 - is affix string or 0 for null
   4591        case 3: {
   4592          entry->morphcode = NULL;
   4593          entry->contclass = NULL;
   4594          entry->contclasslen = 0;
   4595          np++;
   4596          std::string::const_iterator dash = std::find(start_piece, iter, '/');
   4597          if (dash != iter) {
   4598            entry->appnd = std::string(start_piece, dash);
   4599            std::string dash_str(dash + 1, iter);
   4600 
   4601            if (!ignorechars.empty() && !has_no_ignored_chars(entry->appnd, ignorechars)) {
   4602              if (utf8) {
   4603                remove_ignored_chars_utf(entry->appnd, ignorechars_utf16);
   4604              } else {
   4605                remove_ignored_chars(entry->appnd, ignorechars);
   4606              }
   4607            }
   4608 
   4609            if (complexprefixes) {
   4610              if (utf8)
   4611                reverseword_utf(entry->appnd);
   4612              else
   4613                reverseword(entry->appnd);
   4614            }
   4615 
   4616            if (pHMgr->is_aliasf()) {
   4617              int index = atoi(dash_str.c_str());
   4618              entry->contclasslen = (unsigned short)pHMgr->get_aliasf(
   4619                  index, &(entry->contclass), af);
   4620              if (!entry->contclasslen)
   4621                HUNSPELL_WARNING(stderr,
   4622                                 "error: bad affix flag alias: \"%s\"\n",
   4623                                 dash_str.c_str());
   4624            } else {
   4625              entry->contclasslen = (unsigned short)pHMgr->decode_flags(
   4626                  &(entry->contclass), dash_str.c_str(), af);
   4627              std::sort(entry->contclass, entry->contclass + entry->contclasslen);
   4628            }
   4629 
   4630            havecontclass = 1;
   4631            for (unsigned short _i = 0; _i < entry->contclasslen; _i++) {
   4632              contclasses[(entry->contclass)[_i]] = 1;
   4633            }
   4634          } else {
   4635            entry->appnd = std::string(start_piece, iter);
   4636 
   4637            if (!ignorechars.empty() && !has_no_ignored_chars(entry->appnd, ignorechars)) {
   4638              if (utf8) {
   4639                remove_ignored_chars_utf(entry->appnd, ignorechars_utf16);
   4640              } else {
   4641                remove_ignored_chars(entry->appnd, ignorechars);
   4642              }
   4643            }
   4644 
   4645            if (complexprefixes) {
   4646              if (utf8)
   4647                reverseword_utf(entry->appnd);
   4648              else
   4649                reverseword(entry->appnd);
   4650            }
   4651          }
   4652 
   4653          if (entry->appnd.compare("0") == 0) {
   4654            entry->appnd.clear();
   4655          }
   4656          break;
   4657        }
   4658 
   4659        // piece 5 - is the conditions descriptions
   4660        case 4: {
   4661          std::string chunk(start_piece, iter);
   4662          np++;
   4663          if (complexprefixes) {
   4664            if (utf8)
   4665              reverseword_utf(chunk);
   4666            else
   4667              reverseword(chunk);
   4668            reverse_condition(chunk);
   4669          }
   4670          if (!entry->strip.empty() && chunk != "." &&
   4671              redundant_condition(at, entry->strip.c_str(), entry->strip.size(), chunk.c_str(),
   4672                                  af->getlinenum()))
   4673            chunk = ".";
   4674          if (at == 'S') {
   4675            reverseword(chunk);
   4676            reverse_condition(chunk);
   4677          }
   4678          if (encodeit(*entry, chunk.c_str()))
   4679            return false;
   4680          break;
   4681        }
   4682 
   4683        case 5: {
   4684          std::string chunk(start_piece, iter);
   4685          np++;
   4686          if (pHMgr->is_aliasm()) {
   4687            int index = atoi(chunk.c_str());
   4688            entry->morphcode = pHMgr->get_aliasm(index);
   4689          } else {
   4690            if (complexprefixes) {  // XXX - fix me for morph. gen.
   4691              if (utf8)
   4692                reverseword_utf(chunk);
   4693              else
   4694                reverseword(chunk);
   4695            }
   4696            // add the remaining of the line
   4697            std::string::const_iterator end = nl.end();
   4698            if (iter != end) {
   4699              chunk.append(iter, end);
   4700            }
   4701            entry->morphcode = mystrdup(chunk.c_str());
   4702            if (!entry->morphcode)
   4703              return false;
   4704          }
   4705          break;
   4706        }
   4707        default:
   4708          break;
   4709      }
   4710      i++;
   4711      start_piece = mystrsep(nl, iter);
   4712    }
   4713    // check to make sure we parsed enough pieces
   4714    if (np < 4) {
   4715      char* err = pHMgr->encode_flag(aflag);
   4716      if (err) {
   4717        HUNSPELL_WARNING(stderr, "error: line %d: affix %s is corrupt\n",
   4718                         af->getlinenum(), err);
   4719        free(err);
   4720      }
   4721      return false;
   4722    }
   4723 
   4724 #ifdef DEBUG
   4725    // detect unnecessary fields, excepting comments
   4726    if (basefieldnum) {
   4727      int fieldnum =
   4728          !(entry->morphcode) ? 5 : ((*(entry->morphcode) == '#') ? 5 : 6);
   4729      if (fieldnum != basefieldnum)
   4730        HUNSPELL_WARNING(stderr, "warning: line %d: bad field number\n",
   4731                         af->getlinenum());
   4732    } else {
   4733      basefieldnum =
   4734          !(entry->morphcode) ? 5 : ((*(entry->morphcode) == '#') ? 5 : 6);
   4735    }
   4736 #endif
   4737  }
   4738 
   4739  // now create SfxEntry or PfxEntry objects and use links to
   4740  // build an ordered (sorted by affix string) list
   4741  std::vector<AffEntry*>::iterator start = affentries.begin();
   4742  std::vector<AffEntry*>::iterator end = affentries.end();
   4743  for (std::vector<AffEntry*>::iterator affentry = start; affentry != end; ++affentry) {
   4744    if (at == 'P') {
   4745      build_pfxtree(static_cast<PfxEntry*>(*affentry));
   4746    } else {
   4747      build_sfxtree(static_cast<SfxEntry*>(*affentry));
   4748    }
   4749  }
   4750 
   4751  //contents belong to AffixMgr now
   4752  affentries.release();
   4753 
   4754  return true;
   4755 }
   4756 
   4757 int AffixMgr::redundant_condition(char ft,
   4758                                  const char* strip,
   4759                                  int stripl,
   4760                                  const char* cond,
   4761                                  int linenum) {
   4762  int condl = strlen(cond);
   4763  int i;
   4764  int j;
   4765  int neg;
   4766  int in;
   4767  if (ft == 'P') {  // prefix
   4768    if (strncmp(strip, cond, condl) == 0)
   4769      return 1;
   4770    if (utf8) {
   4771    } else {
   4772      for (i = 0, j = 0; (i < stripl) && (j < condl); i++, j++) {
   4773        if (cond[j] != '[') {
   4774          if (cond[j] != strip[i]) {
   4775            HUNSPELL_WARNING(stderr,
   4776                             "warning: line %d: incompatible stripping "
   4777                             "characters and condition\n",
   4778                             linenum);
   4779            return 0;
   4780          }
   4781        } else {
   4782          neg = (cond[j + 1] == '^') ? 1 : 0;
   4783          in = 0;
   4784          do {
   4785            j++;
   4786            if (strip[i] == cond[j])
   4787              in = 1;
   4788          } while ((j < (condl - 1)) && (cond[j] != ']'));
   4789          if (j == (condl - 1) && (cond[j] != ']')) {
   4790            HUNSPELL_WARNING(stderr,
   4791                             "error: line %d: missing ] in condition:\n%s\n",
   4792                             linenum, cond);
   4793            return 0;
   4794          }
   4795          if ((!neg && !in) || (neg && in)) {
   4796            HUNSPELL_WARNING(stderr,
   4797                             "warning: line %d: incompatible stripping "
   4798                             "characters and condition\n",
   4799                             linenum);
   4800            return 0;
   4801          }
   4802        }
   4803      }
   4804      if (j >= condl)
   4805        return 1;
   4806    }
   4807  } else {  // suffix
   4808    if ((stripl >= condl) && strcmp(strip + stripl - condl, cond) == 0)
   4809      return 1;
   4810    if (utf8) {
   4811    } else {
   4812      for (i = stripl - 1, j = condl - 1; (i >= 0) && (j >= 0); i--, j--) {
   4813        if (cond[j] != ']') {
   4814          if (cond[j] != strip[i]) {
   4815            HUNSPELL_WARNING(stderr,
   4816                             "warning: line %d: incompatible stripping "
   4817                             "characters and condition\n",
   4818                             linenum);
   4819            return 0;
   4820          }
   4821        } else {
   4822          in = 0;
   4823          do {
   4824            j--;
   4825            if (strip[i] == cond[j])
   4826              in = 1;
   4827          } while ((j > 0) && (cond[j] != '['));
   4828          if ((j == 0) && (cond[j] != '[')) {
   4829            HUNSPELL_WARNING(stderr,
   4830                             "error: line: %d: missing ] in condition:\n%s\n",
   4831                             linenum, cond);
   4832            return 0;
   4833          }
   4834          neg = (cond[j + 1] == '^') ? 1 : 0;
   4835          if ((!neg && !in) || (neg && in)) {
   4836            HUNSPELL_WARNING(stderr,
   4837                             "warning: line %d: incompatible stripping "
   4838                             "characters and condition\n",
   4839                             linenum);
   4840            return 0;
   4841          }
   4842        }
   4843      }
   4844      if (j < 0)
   4845        return 1;
   4846    }
   4847  }
   4848  return 0;
   4849 }
   4850 
   4851 std::vector<std::string> AffixMgr::get_suffix_words(short unsigned* suff,
   4852                               int len,
   4853                               const char* root_word) {
   4854  std::vector<std::string> slst;
   4855  short unsigned* start_ptr = suff;
   4856  for (int j = 0; j < SETSIZE; j++) {
   4857    SfxEntry* ptr = sStart[j];
   4858    while (ptr) {
   4859      suff = start_ptr;
   4860      for (int i = 0; i < len; i++) {
   4861        if ((*suff) == ptr->getFlag()) {
   4862          std::string nw(root_word);
   4863          nw.append(ptr->getAffix());
   4864          hentry* ht = ptr->checkword(nw.c_str(), nw.size(), 0, NULL, 0, 0, 0);
   4865          if (ht) {
   4866            slst.push_back(nw);
   4867          }
   4868        }
   4869        suff++;
   4870      }
   4871      ptr = ptr->getNext();
   4872    }
   4873  }
   4874  return slst;
   4875 }