tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

csutil.hxx (12333B)


      1 /* ***** BEGIN LICENSE BLOCK *****
      2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
      3 *
      4 * Copyright (C) 2002-2022 Németh László
      5 *
      6 * The contents of this file are subject to the Mozilla Public License Version
      7 * 1.1 (the "License"); you may not use this file except in compliance with
      8 * the License. You may obtain a copy of the License at
      9 * http://www.mozilla.org/MPL/
     10 *
     11 * Software distributed under the License is distributed on an "AS IS" basis,
     12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
     13 * for the specific language governing rights and limitations under the
     14 * License.
     15 *
     16 * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
     17 *
     18 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
     19 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
     20 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
     21 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
     22 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
     23 *
     24 * Alternatively, the contents of this file may be used under the terms of
     25 * either the GNU General Public License Version 2 or later (the "GPL"), or
     26 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
     27 * in which case the provisions of the GPL or the LGPL are applicable instead
     28 * of those above. If you wish to allow use of your version of this file only
     29 * under the terms of either the GPL or the LGPL, and not to allow others to
     30 * use your version of this file under the terms of the MPL, indicate your
     31 * decision by deleting the provisions above and replace them with the notice
     32 * and other provisions required by the GPL or the LGPL. If you do not delete
     33 * the provisions above, a recipient may use your version of this file under
     34 * the terms of any one of the MPL, the GPL or the LGPL.
     35 *
     36 * ***** END LICENSE BLOCK ***** */
     37 /*
     38 * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
     39 * And Contributors.  All rights reserved.
     40 *
     41 * Redistribution and use in source and binary forms, with or without
     42 * modification, are permitted provided that the following conditions
     43 * are met:
     44 *
     45 * 1. Redistributions of source code must retain the above copyright
     46 *    notice, this list of conditions and the following disclaimer.
     47 *
     48 * 2. Redistributions in binary form must reproduce the above copyright
     49 *    notice, this list of conditions and the following disclaimer in the
     50 *    documentation and/or other materials provided with the distribution.
     51 *
     52 * 3. All modifications to the source code must be clearly marked as
     53 *    such.  Binary redistributions based on modified source code
     54 *    must be clearly marked as modified versions in the documentation
     55 *    and/or other materials provided with the distribution.
     56 *
     57 * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
     58 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     59 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
     60 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
     61 * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
     62 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
     63 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     64 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     65 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     66 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     67 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     68 * SUCH DAMAGE.
     69 */
     70 
     71 #ifndef CSUTIL_HXX_
     72 #define CSUTIL_HXX_
     73 
     74 #include "hunvisapi.h"
     75 
     76 // First some base level utility routines
     77 
     78 #include <fstream>
     79 #include <string>
     80 #include <vector>
     81 #include <string.h>
     82 #include "w_char.hxx"
     83 #include "htypes.hxx"
     84 
     85 // casing
     86 #define NOCAP 0
     87 #define INITCAP 1
     88 #define ALLCAP 2
     89 #define HUHCAP 3
     90 #define HUHINITCAP 4
     91 
     92 // default encoding and keystring
     93 #define SPELL_ENCODING "ISO8859-1"
     94 #define SPELL_KEYSTRING "qwertyuiop|asdfghjkl|zxcvbnm"
     95 
     96 // default morphological fields
     97 #define MORPH_STEM "st:"
     98 #define MORPH_ALLOMORPH "al:"
     99 #define MORPH_POS "po:"
    100 #define MORPH_DERI_PFX "dp:"
    101 #define MORPH_INFL_PFX "ip:"
    102 #define MORPH_TERM_PFX "tp:"
    103 #define MORPH_DERI_SFX "ds:"
    104 #define MORPH_INFL_SFX "is:"
    105 #define MORPH_TERM_SFX "ts:"
    106 #define MORPH_SURF_PFX "sp:"
    107 #define MORPH_FREQ "fr:"
    108 #define MORPH_PHON "ph:"
    109 #define MORPH_HYPH "hy:"
    110 #define MORPH_PART "pa:"
    111 #define MORPH_FLAG "fl:"
    112 #define MORPH_HENTRY "_H:"
    113 #define MORPH_TAG_LEN strlen(MORPH_STEM)
    114 
    115 #define MSEP_FLD ' '
    116 #define MSEP_REC '\n'
    117 #define MSEP_ALT '\v'
    118 
    119 // default flags
    120 #define DEFAULTFLAGS 65510
    121 #define FORBIDDENWORD 65510
    122 #define ONLYUPCASEFLAG 65511
    123 
    124 // fix long pathname problem of WIN32 by using w_char std::fstream::open override
    125 LIBHUNSPELL_DLL_EXPORTED void myopen(std::ifstream& stream, const char* path,
    126                                     std::ios_base::openmode mode);
    127 
    128 // convert UTF-16 characters to UTF-8
    129 LIBHUNSPELL_DLL_EXPORTED std::string& u16_u8(std::string& dest,
    130                                             const std::vector<w_char>& src);
    131 
    132 // convert UTF-8 characters to UTF-16
    133 LIBHUNSPELL_DLL_EXPORTED int u8_u16(std::vector<w_char>& dest,
    134                                    const std::string& src);
    135 
    136 // remove end of line char(s)
    137 LIBHUNSPELL_DLL_EXPORTED void mychomp(std::string& s);
    138 
    139 // duplicate string
    140 LIBHUNSPELL_DLL_EXPORTED char* mystrdup(const char* s);
    141 
    142 // parse into tokens with char delimiter
    143 LIBHUNSPELL_DLL_EXPORTED std::string::const_iterator mystrsep(const std::string &str,
    144                                                              std::string::const_iterator& start);
    145 
    146 // replace search by replace in str and return str
    147 LIBHUNSPELL_DLL_EXPORTED std::string& mystrrep(std::string& str,
    148                                               const std::string& search,
    149                                               const std::string& replace);
    150 
    151 // append apd to the end of every line in str
    152 LIBHUNSPELL_DLL_EXPORTED std::string& strlinecat(std::string& str,
    153                                                 const std::string& apd);
    154 
    155 // tokenize into lines with new line
    156 LIBHUNSPELL_DLL_EXPORTED std::vector<std::string> line_tok(const std::string& text,
    157                                                           char breakchar);
    158 
    159 // tokenize into lines with new line and uniq in place
    160 LIBHUNSPELL_DLL_EXPORTED void line_uniq(std::string& text, char breakchar);
    161 
    162 LIBHUNSPELL_DLL_EXPORTED void line_uniq_app(std::string& text, char breakchar);
    163 
    164 // reverse word
    165 LIBHUNSPELL_DLL_EXPORTED size_t reverseword(std::string& word);
    166 
    167 // reverse word (UTF-8 encoded string)
    168 LIBHUNSPELL_DLL_EXPORTED size_t reverseword_utf(std::string&);
    169 
    170 // remove duplicates
    171 LIBHUNSPELL_DLL_EXPORTED void uniqlist(std::vector<std::string>& list);
    172 
    173 // character encoding information
    174 struct cs_info {  // NOTE(review): looks like one record per 8-bit character code (see get_current_cs) — confirm
    175  unsigned char ccase;   // NOTE(review): presumably a flag telling whether the character is uppercase — confirm against the encoding tables
    176  unsigned char clower;  // presumably the lowercase counterpart of the character — confirm
    177  unsigned char cupper;  // presumably the uppercase counterpart of the character — confirm
    178 };
    179 
    180 LIBHUNSPELL_DLL_EXPORTED void initialize_utf_tbl();
    181 LIBHUNSPELL_DLL_EXPORTED void free_utf_tbl();
    182 LIBHUNSPELL_DLL_EXPORTED unsigned short unicodetoupper(unsigned short c,
    183                                                       int langnum);
    184 LIBHUNSPELL_DLL_EXPORTED w_char upper_utf(w_char u, int langnum);
    185 LIBHUNSPELL_DLL_EXPORTED w_char lower_utf(w_char u, int langnum);
    186 LIBHUNSPELL_DLL_EXPORTED unsigned short unicodetolower(unsigned short c,
    187                                                       int langnum);
    188 LIBHUNSPELL_DLL_EXPORTED int unicodeisalpha(unsigned short c);
    189 
    190 LIBHUNSPELL_DLL_EXPORTED struct cs_info* get_current_cs(const std::string& es);
    191 
    192 // get language identifiers of language codes
    193 LIBHUNSPELL_DLL_EXPORTED int get_lang_num(const std::string& lang);
    194 
    195 // get characters of the given 8bit encoding with lower- and uppercase forms
    196 LIBHUNSPELL_DLL_EXPORTED std::string get_casechars(const char* enc);
    197 
    198 // convert std::string to all caps
    199 LIBHUNSPELL_DLL_EXPORTED std::string& mkallcap(std::string& s,
    200                                               const struct cs_info* csconv);
    201 
    202 // convert std::string to all lowercase
    203 LIBHUNSPELL_DLL_EXPORTED std::string& mkallsmall(std::string& s,
    204                                                 const struct cs_info* csconv);
    205 
    206 // convert first letter of string to lowercase
    207 LIBHUNSPELL_DLL_EXPORTED std::string& mkinitsmall(std::string& s,
    208                                                 const struct cs_info* csconv);
    209 
    210 // convert first letter of string to capital
    211 LIBHUNSPELL_DLL_EXPORTED std::string& mkinitcap(std::string& s,
    212                                                const struct cs_info* csconv);
    213 
    214 // convert first letter of UTF-16 string to capital
    215 LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
    216 mkinitcap_utf(std::vector<w_char>& u, int langnum);
    217 
    218 // convert UTF-16 string to all lowercase
    219 LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
    220 mkallsmall_utf(std::vector<w_char>& u, int langnum);
    221 
    222 // convert first letter of UTF-16 string to lowercase
    223 LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
    224 mkinitsmall_utf(std::vector<w_char>& u, int langnum);
    225 
    226 // convert UTF-16 string to all caps
    227 LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
    228 mkallcap_utf(std::vector<w_char>& u, int langnum);
    229 
    230 // get type of capitalization
    231 LIBHUNSPELL_DLL_EXPORTED int get_captype(const std::string& q, cs_info*);
    232 
    233 // get type of capitalization (UTF-8)
    234 LIBHUNSPELL_DLL_EXPORTED int get_captype_utf8(const std::vector<w_char>& q, int langnum);
    235 
    236 // strip all ignored characters in the string
    237 LIBHUNSPELL_DLL_EXPORTED size_t remove_ignored_chars_utf(
    238    std::string& word,
    239    const std::vector<w_char>& ignored_chars);
    240 
    241 // strip all ignored characters in the string
    242 LIBHUNSPELL_DLL_EXPORTED size_t remove_ignored_chars(
    243    std::string& word,
    244    const std::string& ignored_chars);
    245 
    246 LIBHUNSPELL_DLL_EXPORTED bool parse_string(const std::string& line,
    247                                           std::string& out,
    248                                           int ln);
    249 
    250 LIBHUNSPELL_DLL_EXPORTED bool parse_array(const std::string& line,
    251                                          std::string& out,
    252                                          std::vector<w_char>& out_utf16,
    253                                          int utf8,
    254                                          int ln);
    255 
    256 LIBHUNSPELL_DLL_EXPORTED int fieldlen(const char* r);
    257 
    258 LIBHUNSPELL_DLL_EXPORTED bool copy_field(std::string& dest,
    259                                         const std::string& morph,
    260                                         const std::string& var);
    261 
    262 // conversion function for protected memory
    263 LIBHUNSPELL_DLL_EXPORTED void store_pointer(char* dest, char* source);
    264 
    265 // conversion function for protected memory
    266 LIBHUNSPELL_DLL_EXPORTED char* get_stored_pointer(const char* s);
    267 
    268 
    269 // to avoid unnecessary string copies and Unicode conversions
    270 // we simply check the ignored_chars characters in the word
    271 // (in the case of UTF-8 encoded strings, "false" means
    272 // "likely false", if ignored_chars characters are not ASCII)
    273 inline bool has_no_ignored_chars(const std::string& word,
    274                            const std::string& ignored_chars) {
    275  for (std::string::const_iterator it = ignored_chars.begin(), end = ignored_chars.end(); it != end; ++it)
    276    if (word.find(*it) != std::string::npos)
    277      return false;
    278  return true;
    279 }
    280 
    281 // hash entry macros
    282 inline char* HENTRY_DATA(struct hentry* h) {
    283  char* ret;
    284  if (!(h->var & H_OPT))
    285    ret = NULL;
    286  else if (h->var & H_OPT_ALIASM)
    287    ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1);
    288  else
    289    ret = HENTRY_WORD(h) + h->blen + 1;
    290  return ret;
    291 }
    292 
    293 inline const char* HENTRY_DATA(
    294    const struct hentry* h) {
    295  const char* ret;
    296  if (!(h->var & H_OPT))
    297    ret = NULL;
    298  else if (h->var & H_OPT_ALIASM)
    299    ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1);
    300  else
    301    ret = HENTRY_WORD(h) + h->blen + 1;
    302  return ret;
    303 }
    304 
    305 // NULL-free version for warning-free OOo build
    306 inline const char* HENTRY_DATA2(
    307    const struct hentry* h) {
    308  const char* ret;
    309  if (!(h->var & H_OPT))
    310    ret = "";
    311  else if (h->var & H_OPT_ALIASM)
    312    ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1);
    313  else
    314    ret = HENTRY_WORD(h) + h->blen + 1;
    315  return ret;
    316 }
    317 
    318 inline char* HENTRY_FIND(struct hentry* h,
    319                                                  const char* p) {
    320  return (HENTRY_DATA(h) ? strstr(HENTRY_DATA(h), p) : NULL);
    321 }
    322 
    323 #endif