affixmgr.hxx (14700B)
1 /* ***** BEGIN LICENSE BLOCK ***** 2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 * 4 * Copyright (C) 2002-2022 Németh László 5 * 6 * The contents of this file are subject to the Mozilla Public License Version 7 * 1.1 (the "License"); you may not use this file except in compliance with 8 * the License. You may obtain a copy of the License at 9 * http://www.mozilla.org/MPL/ 10 * 11 * Software distributed under the License is distributed on an "AS IS" basis, 12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 13 * for the specific language governing rights and limitations under the 14 * License. 15 * 16 * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. 17 * 18 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, 19 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, 20 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, 21 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, 22 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen 23 * 24 * Alternatively, the contents of this file may be used under the terms of 25 * either the GNU General Public License Version 2 or later (the "GPL"), or 26 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 * in which case the provisions of the GPL or the LGPL are applicable instead 28 * of those above. If you wish to allow use of your version of this file only 29 * under the terms of either the GPL or the LGPL, and not to allow others to 30 * use your version of this file under the terms of the MPL, indicate your 31 * decision by deleting the provisions above and replace them with the notice 32 * and other provisions required by the GPL or the LGPL. If you do not delete 33 * the provisions above, a recipient may use your version of this file under 34 * the terms of any one of the MPL, the GPL or the LGPL. 35 * 36 * ***** END LICENSE BLOCK ***** */ 37 /* 38 * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada 39 * And Contributors. All rights reserved. 40 * 41 * Redistribution and use in source and binary forms, with or without 42 * modification, are permitted provided that the following conditions 43 * are met: 44 * 45 * 1. Redistributions of source code must retain the above copyright 46 * notice, this list of conditions and the following disclaimer. 47 * 48 * 2. Redistributions in binary form must reproduce the above copyright 49 * notice, this list of conditions and the following disclaimer in the 50 * documentation and/or other materials provided with the distribution. 51 * 52 * 3. All modifications to the source code must be clearly marked as 53 * such. Binary redistributions based on modified source code 54 * must be clearly marked as modified versions in the documentation 55 * and/or other materials provided with the distribution. 56 * 57 * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS 58 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 59 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 60 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL 61 * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 62 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 63 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 64 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 65 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 66 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 67 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 68 * SUCH DAMAGE. 69 */ 70 71 #ifndef AFFIXMGR_HXX_ 72 #define AFFIXMGR_HXX_ 73 74 #include <stdio.h> 75 76 #include <string> 77 #include <vector> 78 79 #include "atypes.hxx" 80 #include "baseaffix.hxx" 81 #include "hashmgr.hxx" 82 #include "phonet.hxx" 83 #include "replist.hxx" 84 85 // check flag duplication 86 #define dupSFX (1 << 0) 87 #define dupPFX (1 << 1) 88 89 class PfxEntry; 90 class SfxEntry; 91 92 class AffixMgr { 93 PfxEntry* pStart[SETSIZE]; 94 SfxEntry* sStart[SETSIZE]; 95 PfxEntry* pFlag[SETSIZE]; 96 SfxEntry* sFlag[SETSIZE]; 97 const std::vector<HashMgr*>& alldic; 98 const HashMgr* pHMgr; 99 std::string keystring; 100 std::string trystring; 101 std::string encoding; 102 struct cs_info* csconv; 103 int utf8; 104 int complexprefixes; 105 FLAG compoundflag; 106 FLAG compoundbegin; 107 FLAG compoundmiddle; 108 FLAG compoundend; 109 FLAG compoundroot; 110 FLAG compoundforbidflag; 111 FLAG compoundpermitflag; 112 int compoundmoresuffixes; 113 int checkcompounddup; 114 int checkcompoundrep; 115 int checkcompoundcase; 116 int checkcompoundtriple; 117 int simplifiedtriple; 118 FLAG forbiddenword; 119 FLAG nosuggest; 120 FLAG nongramsuggest; 121 FLAG needaffix; 122 int cpdmin; 123 RepList* iconvtable; 124 RepList* oconvtable; 125 bool parsedmaptable; 126 std::vector<mapentry> maptable; 127 bool parsedbreaktable; 128 std::vector<std::string> breaktable; 129 bool parsedcheckcpd; 130 std::vector<patentry> checkcpdtable; 131 int simplifiedcpd; 132 bool parseddefcpd; 133 std::vector<flagentry> defcpdtable; 134 phonetable* phone; 135 int maxngramsugs; 136 int maxcpdsugs; 137 int maxdiff; 138 int onlymaxdiff; 139 int nosplitsugs; 140 int sugswithdots; 141 int cpdwordmax; 142 int cpdmaxsyllable; 143 std::string cpdvowels; // vowels (for calculating of Hungarian compounding limit, 144 std::vector<w_char> cpdvowels_utf16; //vowels for UTF-8 encoding 145 std::string cpdsyllablenum; // syllable count incrementing flag 146 const char* pfxappnd; // BUG: not stateless 147 const char* sfxappnd; // BUG: not stateless 148 int sfxextra; // BUG: not stateless 149 FLAG sfxflag; // BUG: not stateless 150 char* derived; // BUG: not stateless 151 SfxEntry* sfx; // BUG: not stateless 152 PfxEntry* pfx; // BUG: not stateless 153 int checknum; 154 std::string wordchars; // letters + spec. word characters 155 std::vector<w_char> wordchars_utf16; 156 std::string ignorechars; // letters + spec. word characters 157 std::vector<w_char> ignorechars_utf16; 158 std::string version; // affix and dictionary file version string 159 std::string lang; // language 160 int langnum; 161 FLAG lemma_present; 162 FLAG circumfix; 163 FLAG onlyincompound; 164 FLAG keepcase; 165 FLAG forceucase; 166 FLAG warn; 167 int forbidwarn; 168 FLAG substandard; 169 int checksharps; 170 int fullstrip; 171 172 int havecontclass; // boolean variable 173 char contclasses[CONTSIZE]; // flags of possible continuing classes (twofold 174 // affix) 175 176 public: 177 AffixMgr(const char* affpath, const std::vector<HashMgr*>& ptr, const char* key = NULL); 178 ~AffixMgr(); 179 struct hentry* affix_check(const char* word, 180 int len, 181 const unsigned short needflag = (unsigned short)0, 182 char in_compound = IN_CPD_NOT); 183 struct hentry* prefix_check(const char* word, 184 int len, 185 char in_compound, 186 const FLAG needflag = FLAG_NULL); 187 inline int isSubset(const char* s1, const char* s2); 188 struct hentry* prefix_check_twosfx(const char* word, 189 int len, 190 char in_compound, 191 const FLAG needflag = FLAG_NULL); 192 inline int isRevSubset(const char* s1, const char* end_of_s2, int len); 193 struct hentry* suffix_check(const char* word, 194 int len, 195 int sfxopts, 196 PfxEntry* ppfx, 197 const FLAG cclass = FLAG_NULL, 198 const FLAG needflag = FLAG_NULL, 199 char in_compound = IN_CPD_NOT); 200 struct hentry* suffix_check_twosfx(const char* word, 201 int len, 202 int sfxopts, 203 PfxEntry* ppfx, 204 const FLAG needflag = FLAG_NULL); 205 206 std::string affix_check_morph(const char* word, 207 int len, 208 const FLAG needflag = FLAG_NULL, 209 char in_compound = IN_CPD_NOT); 210 std::string prefix_check_morph(const char* word, 211 int len, 212 char in_compound, 213 const FLAG needflag = FLAG_NULL); 214 std::string suffix_check_morph(const char* word, 215 int len, 216 int sfxopts, 217 PfxEntry* ppfx, 218 const FLAG cclass = FLAG_NULL, 219 const FLAG needflag = FLAG_NULL, 220 char in_compound = IN_CPD_NOT); 221 222 std::string prefix_check_twosfx_morph(const char* word, 223 int len, 224 char in_compound, 225 const FLAG needflag = FLAG_NULL); 226 std::string suffix_check_twosfx_morph(const char* word, 227 int len, 228 int sfxopts, 229 PfxEntry* ppfx, 230 const FLAG needflag = FLAG_NULL); 231 232 std::string morphgen(const char* ts, 233 int wl, 234 const unsigned short* ap, 235 unsigned short al, 236 const char* morph, 237 const char* targetmorph, 238 int level); 239 240 int expand_rootword(struct guessword* wlst, 241 int maxn, 242 const char* ts, 243 int wl, 244 const unsigned short* ap, 245 unsigned short al, 246 const char* bad, 247 int, 248 const char*); 249 250 short get_syllable(const std::string& word); 251 int cpdrep_check(const char* word, int len); 252 int cpdwordpair_check(const char * word, int len); 253 int cpdpat_check(const char* word, 254 int len, 255 hentry* r1, 256 hentry* r2, 257 const char affixed); 258 int defcpd_check(hentry*** words, 259 short wnum, 260 hentry* rv, 261 hentry** rwords, 262 char all); 263 int cpdcase_check(const char* word, int len); 264 inline int candidate_check(const char* word, int len); 265 void setcminmax(int* cmin, int* cmax, const char* word, int len); 266 struct hentry* compound_check(const std::string& word, 267 short wordnum, 268 short numsyllable, 269 short maxwordnum, 270 short wnum, 271 hentry** words, 272 hentry** rwords, 273 char hu_mov_rule, 274 char is_sug, 275 int* info); 276 277 int compound_check_morph(const char* word, 278 int len, 279 short wordnum, 280 short numsyllable, 281 short maxwordnum, 282 short wnum, 283 hentry** words, 284 hentry** rwords, 285 char hu_mov_rule, 286 std::string& result, 287 const std::string* partresult); 288 289 std::vector<std::string> get_suffix_words(short unsigned* suff, 290 int len, 291 const char* root_word); 292 293 struct hentry* lookup(const char* word); 294 const std::vector<replentry>& get_reptable() const; 295 RepList* get_iconvtable() const; 296 RepList* get_oconvtable() const; 297 struct phonetable* get_phonetable() const; 298 const std::vector<mapentry>& get_maptable() const; 299 const std::vector<std::string>& get_breaktable() const; 300 const std::string& get_encoding(); 301 int get_langnum() const; 302 char* get_key_string(); 303 char* get_try_string() const; 304 const std::string& get_wordchars() const; 305 const std::vector<w_char>& get_wordchars_utf16() const; 306 const char* get_ignore() const; 307 const std::vector<w_char>& get_ignore_utf16() const; 308 int get_compound() const; 309 FLAG get_compoundflag() const; 310 FLAG get_forbiddenword() const; 311 FLAG get_nosuggest() const; 312 FLAG get_nongramsuggest() const; 313 FLAG get_substandard() const; 314 FLAG get_needaffix() const; 315 FLAG get_onlyincompound() const; 316 const char* get_derived() const; 317 const std::string& get_version() const; 318 int have_contclass() const; 319 int get_utf8() const; 320 int get_complexprefixes() const; 321 char* get_suffixed(char) const; 322 int get_maxngramsugs() const; 323 int get_maxcpdsugs() const; 324 int get_maxdiff() const; 325 int get_onlymaxdiff() const; 326 int get_nosplitsugs() const; 327 int get_sugswithdots(void) const; 328 FLAG get_keepcase(void) const; 329 FLAG get_forceucase(void) const; 330 FLAG get_warn(void) const; 331 int get_forbidwarn(void) const; 332 int get_checksharps(void) const; 333 char* encode_flag(unsigned short aflag) const; 334 int get_fullstrip() const; 335 336 private: 337 int parse_file(const char* affpath, const char* key); 338 bool parse_flag(const std::string& line, unsigned short* out, FileMgr* af); 339 bool parse_num(const std::string& line, int* out, FileMgr* af); 340 bool parse_cpdsyllable(const std::string& line, FileMgr* af); 341 bool parse_convtable(const std::string& line, 342 FileMgr* af, 343 RepList** rl, 344 const std::string& keyword); 345 bool parse_phonetable(const std::string& line, FileMgr* af); 346 bool parse_maptable(const std::string& line, FileMgr* af); 347 bool parse_breaktable(const std::string& line, FileMgr* af); 348 bool parse_checkcpdtable(const std::string& line, FileMgr* af); 349 bool parse_defcpdtable(const std::string& line, FileMgr* af); 350 bool parse_affix(const std::string& line, const char at, FileMgr* af, char* dupflags); 351 352 void reverse_condition(std::string&); 353 std::string& debugflag(std::string& result, unsigned short flag); 354 int condlen(const char*); 355 int encodeit(AffEntry& entry, const char* cs); 356 int build_pfxtree(PfxEntry* pfxptr); 357 int build_sfxtree(SfxEntry* sfxptr); 358 int process_pfx_order(); 359 int process_sfx_order(); 360 PfxEntry* process_pfx_in_order(PfxEntry* ptr, PfxEntry* nptr); 361 SfxEntry* process_sfx_in_order(SfxEntry* ptr, SfxEntry* nptr); 362 int process_pfx_tree_to_list(); 363 int process_sfx_tree_to_list(); 364 int redundant_condition(char, const char* strip, int stripl, const char* cond, int); 365 void finishFileMgr(FileMgr* afflst); 366 }; 367 368 #endif