csutil.hxx (12333B)
1 /* ***** BEGIN LICENSE BLOCK ***** 2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 * 4 * Copyright (C) 2002-2022 Németh László 5 * 6 * The contents of this file are subject to the Mozilla Public License Version 7 * 1.1 (the "License"); you may not use this file except in compliance with 8 * the License. You may obtain a copy of the License at 9 * http://www.mozilla.org/MPL/ 10 * 11 * Software distributed under the License is distributed on an "AS IS" basis, 12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 13 * for the specific language governing rights and limitations under the 14 * License. 15 * 16 * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. 17 * 18 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, 19 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, 20 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, 21 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, 22 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen 23 * 24 * Alternatively, the contents of this file may be used under the terms of 25 * either the GNU General Public License Version 2 or later (the "GPL"), or 26 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 * in which case the provisions of the GPL or the LGPL are applicable instead 28 * of those above. If you wish to allow use of your version of this file only 29 * under the terms of either the GPL or the LGPL, and not to allow others to 30 * use your version of this file under the terms of the MPL, indicate your 31 * decision by deleting the provisions above and replace them with the notice 32 * and other provisions required by the GPL or the LGPL. If you do not delete 33 * the provisions above, a recipient may use your version of this file under 34 * the terms of any one of the MPL, the GPL or the LGPL. 35 * 36 * ***** END LICENSE BLOCK ***** */ 37 /* 38 * Copyright 2002 Kevin B. 
Hendricks, Stratford, Ontario, Canada 39 * And Contributors. All rights reserved. 40 * 41 * Redistribution and use in source and binary forms, with or without 42 * modification, are permitted provided that the following conditions 43 * are met: 44 * 45 * 1. Redistributions of source code must retain the above copyright 46 * notice, this list of conditions and the following disclaimer. 47 * 48 * 2. Redistributions in binary form must reproduce the above copyright 49 * notice, this list of conditions and the following disclaimer in the 50 * documentation and/or other materials provided with the distribution. 51 * 52 * 3. All modifications to the source code must be clearly marked as 53 * such. Binary redistributions based on modified source code 54 * must be clearly marked as modified versions in the documentation 55 * and/or other materials provided with the distribution. 56 * 57 * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS 58 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 59 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 60 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL 61 * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 62 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 63 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 64 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 65 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 66 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 67 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 68 * SUCH DAMAGE. 
69 */ 70 71 #ifndef CSUTIL_HXX_ 72 #define CSUTIL_HXX_ 73 74 #include "hunvisapi.h" 75 76 // First some base level utility routines 77 78 #include <fstream> 79 #include <string> 80 #include <vector> 81 #include <string.h> 82 #include "w_char.hxx" 83 #include "htypes.hxx" 84 85 // casing 86 #define NOCAP 0 87 #define INITCAP 1 88 #define ALLCAP 2 89 #define HUHCAP 3 90 #define HUHINITCAP 4 91 92 // default encoding and keystring 93 #define SPELL_ENCODING "ISO8859-1" 94 #define SPELL_KEYSTRING "qwertyuiop|asdfghjkl|zxcvbnm" 95 96 // default morphological fields 97 #define MORPH_STEM "st:" 98 #define MORPH_ALLOMORPH "al:" 99 #define MORPH_POS "po:" 100 #define MORPH_DERI_PFX "dp:" 101 #define MORPH_INFL_PFX "ip:" 102 #define MORPH_TERM_PFX "tp:" 103 #define MORPH_DERI_SFX "ds:" 104 #define MORPH_INFL_SFX "is:" 105 #define MORPH_TERM_SFX "ts:" 106 #define MORPH_SURF_PFX "sp:" 107 #define MORPH_FREQ "fr:" 108 #define MORPH_PHON "ph:" 109 #define MORPH_HYPH "hy:" 110 #define MORPH_PART "pa:" 111 #define MORPH_FLAG "fl:" 112 #define MORPH_HENTRY "_H:" 113 #define MORPH_TAG_LEN strlen(MORPH_STEM) 114 115 #define MSEP_FLD ' ' 116 #define MSEP_REC '\n' 117 #define MSEP_ALT '\v' 118 119 // default flags 120 #define DEFAULTFLAGS 65510 121 #define FORBIDDENWORD 65510 122 #define ONLYUPCASEFLAG 65511 123 124 // fix long pathname problem of WIN32 by using w_char std::fstream::open override 125 LIBHUNSPELL_DLL_EXPORTED void myopen(std::ifstream& stream, const char* path, 126 std::ios_base::openmode mode); 127 128 // convert UTF-16 characters to UTF-8 129 LIBHUNSPELL_DLL_EXPORTED std::string& u16_u8(std::string& dest, 130 const std::vector<w_char>& src); 131 132 // convert UTF-8 characters to UTF-16 133 LIBHUNSPELL_DLL_EXPORTED int u8_u16(std::vector<w_char>& dest, 134 const std::string& src); 135 136 // remove end of line char(s) 137 LIBHUNSPELL_DLL_EXPORTED void mychomp(std::string& s); 138 139 // duplicate string 140 LIBHUNSPELL_DLL_EXPORTED char* mystrdup(const char* s); 141 
// parse into tokens with char delimiter
// NOTE(review): the delimiter is not a parameter of this function; `start` is
// taken by non-const reference, so it is presumably advanced as tokens are
// consumed, with the returned iterator marking the token boundary - confirm
// against the implementation.
LIBHUNSPELL_DLL_EXPORTED std::string::const_iterator mystrsep(const std::string &str,
                                                              std::string::const_iterator& start);

// replace `search` by `replace` in `str`; returns a reference to a std::string
// (NOTE(review): presumably `str` itself, modified in place)
LIBHUNSPELL_DLL_EXPORTED std::string& mystrrep(std::string& str,
                                               const std::string& search,
                                               const std::string& replace);

// append `apd` to the end of every line in `str`
LIBHUNSPELL_DLL_EXPORTED std::string& strlinecat(std::string& str,
                                                 const std::string& apd);

// tokenize `text` into lines, splitting at `breakchar`
LIBHUNSPELL_DLL_EXPORTED std::vector<std::string> line_tok(const std::string& text,
                                                           char breakchar);

// tokenize `text` into lines at `breakchar` and uniq in place
LIBHUNSPELL_DLL_EXPORTED void line_uniq(std::string& text, char breakchar);

// NOTE(review): undocumented in the original header; a variant of line_uniq
// with the same signature - see the implementation for how the result differs.
LIBHUNSPELL_DLL_EXPORTED void line_uniq_app(std::string& text, char breakchar);

// reverse word (in place)
// NOTE(review): the meaning of the size_t return value is not visible here.
LIBHUNSPELL_DLL_EXPORTED size_t reverseword(std::string& word);

// reverse word, variant for UTF-8 encoded strings (judging by the name)
// NOTE(review): the meaning of the size_t return value is not visible here.
LIBHUNSPELL_DLL_EXPORTED size_t reverseword_utf(std::string&);

// remove duplicates from `list` in place
LIBHUNSPELL_DLL_EXPORTED void uniqlist(std::vector<std::string>& list);

// character encoding information for one character of an 8-bit encoding
struct cs_info {
  unsigned char ccase;   // NOTE(review): presumably a case flag for the character - confirm
  unsigned char clower;  // presumably the lowercase form of the character
  unsigned char cupper;  // presumably the uppercase form of the character
};

// set up / release the table used by the Unicode helpers below
// (NOTE(review): inferred from the names; lifetime rules live in the
// implementation)
LIBHUNSPELL_DLL_EXPORTED void initialize_utf_tbl();
LIBHUNSPELL_DLL_EXPORTED void free_utf_tbl();
// case conversion of a single 16-bit character; `langnum` is a language
// identifier (NOTE(review): presumably as produced by get_lang_num - confirm)
LIBHUNSPELL_DLL_EXPORTED unsigned short unicodetoupper(unsigned short c,
                                                       int langnum);
LIBHUNSPELL_DLL_EXPORTED w_char upper_utf(w_char u, int langnum);
LIBHUNSPELL_DLL_EXPORTED w_char lower_utf(w_char u, int langnum);
LIBHUNSPELL_DLL_EXPORTED unsigned short unicodetolower(unsigned short c,
                                                       int langnum);
// non-zero/zero alphabetic test for a 16-bit character (judging by the name)
LIBHUNSPELL_DLL_EXPORTED int unicodeisalpha(unsigned short c);

// look up the cs_info table for the encoding named by `es`
// NOTE(review): ownership of the returned pointer is not visible in this header.
LIBHUNSPELL_DLL_EXPORTED struct cs_info* get_current_cs(const std::string& es);

// get language identifiers of language codes
LIBHUNSPELL_DLL_EXPORTED int get_lang_num(const std::string& lang); 194 195 // get characters of the given 8bit encoding with lower- and uppercase forms 196 LIBHUNSPELL_DLL_EXPORTED std::string get_casechars(const char* enc); 197 198 // convert std::string to all caps 199 LIBHUNSPELL_DLL_EXPORTED std::string& mkallcap(std::string& s, 200 const struct cs_info* csconv); 201 202 // convert null terminated string to all little 203 LIBHUNSPELL_DLL_EXPORTED std::string& mkallsmall(std::string& s, 204 const struct cs_info* csconv); 205 206 // convert first letter of string to little 207 LIBHUNSPELL_DLL_EXPORTED std::string& mkinitsmall(std::string& s, 208 const struct cs_info* csconv); 209 210 // convert first letter of string to capital 211 LIBHUNSPELL_DLL_EXPORTED std::string& mkinitcap(std::string& s, 212 const struct cs_info* csconv); 213 214 // convert first letter of UTF-8 string to capital 215 LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>& 216 mkinitcap_utf(std::vector<w_char>& u, int langnum); 217 218 // convert UTF-8 string to little 219 LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>& 220 mkallsmall_utf(std::vector<w_char>& u, int langnum); 221 222 // convert first letter of UTF-8 string to little 223 LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>& 224 mkinitsmall_utf(std::vector<w_char>& u, int langnum); 225 226 // convert UTF-8 string to capital 227 LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>& 228 mkallcap_utf(std::vector<w_char>& u, int langnum); 229 230 // get type of capitalization 231 LIBHUNSPELL_DLL_EXPORTED int get_captype(const std::string& q, cs_info*); 232 233 // get type of capitalization (UTF-8) 234 LIBHUNSPELL_DLL_EXPORTED int get_captype_utf8(const std::vector<w_char>& q, int langnum); 235 236 // strip all ignored characters in the string 237 LIBHUNSPELL_DLL_EXPORTED size_t remove_ignored_chars_utf( 238 std::string& word, 239 const std::vector<w_char>& ignored_chars); 240 241 // strip all ignored characters in the string 242 LIBHUNSPELL_DLL_EXPORTED 
size_t remove_ignored_chars( 243 std::string& word, 244 const std::string& ignored_chars); 245 246 LIBHUNSPELL_DLL_EXPORTED bool parse_string(const std::string& line, 247 std::string& out, 248 int ln); 249 250 LIBHUNSPELL_DLL_EXPORTED bool parse_array(const std::string& line, 251 std::string& out, 252 std::vector<w_char>& out_utf16, 253 int utf8, 254 int ln); 255 256 LIBHUNSPELL_DLL_EXPORTED int fieldlen(const char* r); 257 258 LIBHUNSPELL_DLL_EXPORTED bool copy_field(std::string& dest, 259 const std::string& morph, 260 const std::string& var); 261 262 // conversion function for protected memory 263 LIBHUNSPELL_DLL_EXPORTED void store_pointer(char* dest, char* source); 264 265 // conversion function for protected memory 266 LIBHUNSPELL_DLL_EXPORTED char* get_stored_pointer(const char* s); 267 268 269 // to avoid unnecessary string copies and Unicode conversions 270 // we simply check the ignored_chars characters in the word 271 // (in the case of UTF-8 encoded strings, "false" means 272 // "likely false", if ignored_chars characters are not ASCII) 273 inline bool has_no_ignored_chars(const std::string& word, 274 const std::string& ignored_chars) { 275 for (std::string::const_iterator it = ignored_chars.begin(), end = ignored_chars.end(); it != end; ++it) 276 if (word.find(*it) != std::string::npos) 277 return false; 278 return true; 279 } 280 281 // hash entry macros 282 inline char* HENTRY_DATA(struct hentry* h) { 283 char* ret; 284 if (!(h->var & H_OPT)) 285 ret = NULL; 286 else if (h->var & H_OPT_ALIASM) 287 ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1); 288 else 289 ret = HENTRY_WORD(h) + h->blen + 1; 290 return ret; 291 } 292 293 inline const char* HENTRY_DATA( 294 const struct hentry* h) { 295 const char* ret; 296 if (!(h->var & H_OPT)) 297 ret = NULL; 298 else if (h->var & H_OPT_ALIASM) 299 ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1); 300 else 301 ret = HENTRY_WORD(h) + h->blen + 1; 302 return ret; 303 } 304 305 // NULL-free version for 
warning-free OOo build 306 inline const char* HENTRY_DATA2( 307 const struct hentry* h) { 308 const char* ret; 309 if (!(h->var & H_OPT)) 310 ret = ""; 311 else if (h->var & H_OPT_ALIASM) 312 ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1); 313 else 314 ret = HENTRY_WORD(h) + h->blen + 1; 315 return ret; 316 } 317 318 inline char* HENTRY_FIND(struct hentry* h, 319 const char* p) { 320 return (HENTRY_DATA(h) ? strstr(HENTRY_DATA(h), p) : NULL); 321 } 322 323 #endif