tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

dictbe.h (13581B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /**
      4 *******************************************************************************
      5 * Copyright (C) 2006-2014, International Business Machines Corporation   *
      6 * and others. All Rights Reserved.                                            *
      7 *******************************************************************************
      8 */
      9 
     10 #ifndef DICTBE_H
     11 #define DICTBE_H
     12 
     13 #include "unicode/utypes.h"
     14 #include "unicode/uniset.h"
     15 #include "unicode/utext.h"
     16 
     17 #include "brkeng.h"
     18 #include "hash.h"
     19 #include "mlbe.h"
     20 #include "uvectr32.h"
     21 
     22 U_NAMESPACE_BEGIN
     23 
     24 class DictionaryMatcher;
     25 class MlBreakEngine;
     26 class Normalizer2;
     27 
     28 /*******************************************************************
     29 * DictionaryBreakEngine
     30 */
     31 
     32 /**
     33 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
     34 * dictionary to determine language-specific breaks.</p>
     35 *
     36 * <p>After it is constructed a DictionaryBreakEngine may be shared between
     37 * threads without synchronization.</p>
     38 */
     39 class DictionaryBreakEngine : public LanguageBreakEngine {
     40 private:
     41    /**
     42     * The set of characters handled by this engine
            * (presumably assigned through setCharacters(); confirm in the
            * implementation).
     43     * @internal
     44     */
     45 
     46  UnicodeSet    fSet;
     47 
     48 public:
     49 
     50  /**
     51   * <p>Constructor.</p>
     52   */
     53  DictionaryBreakEngine();
     54 
     55  /**
     56   * <p>Virtual destructor.</p>
     57   */
     58  virtual ~DictionaryBreakEngine();
     59 
     60  /**
     61   * <p>Indicate whether this engine handles a particular character for
     62   * a particular locale.</p>
     63   *
     64   * @param c A character which begins a run that the engine might handle
     65   * @param locale The locale.
     66   * @return true if this engine handles the particular character.
     67   * NOTE(review): how the locale influences the result is not visible here.
     68   */
     69  virtual UBool handles(UChar32 c, const char* locale) const override;
     70 
     71  /**
     72   * <p>Find any breaks within a run in the supplied text.</p>
     73   *
     74   * @param text A UText representing the text. The iterator is left at
     75   * the end of the run of characters which the engine is capable of handling
     76   * that starts from the first character in the range.
     77   * @param startPos The start of the run within the supplied text.
     78   * @param endPos The end of the run within the supplied text.
     79   * @param foundBreaks vector of int32_t to receive the break positions
          * @param isPhraseBreaking Phrase-breaking flag. NOTE(review): its exact
          * effect is not visible in this header; confirm in the implementation.
     80   * @param status Information on any errors encountered.
     81   * @return The number of breaks found.
     82   */
     83  virtual int32_t findBreaks( UText *text,
     84                              int32_t startPos,
     85                              int32_t endPos,
     86                              UVector32 &foundBreaks,
     87                              UBool isPhraseBreaking,
     88                              UErrorCode& status ) const override;
     89 
     90 protected:
     91 
     92 /**
     93  * <p>Set the character set handled by this engine.</p>
     94  *
     95  * @param set A UnicodeSet of the set of characters handled by the engine
     96  */
     97  virtual void setCharacters( const UnicodeSet &set );
     98 
     99 /**
    100  * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
         * <p>Pure virtual: every concrete engine declared in this header overrides it.</p>
    101  *
    102  * @param text A UText representing the text
    103  * @param rangeStart The start of the range of dictionary characters
    104  * @param rangeEnd The end of the range of dictionary characters
    105  * @param foundBreaks A UVector32 that receives the break positions
         * @param isPhraseBreaking Phrase-breaking flag. NOTE(review): its exact
         * effect is not visible in this header; confirm in the implementation.
    106  * @param status Information on any errors encountered.
    107  * @return The number of breaks found
    108  */
    109  virtual int32_t divideUpDictionaryRange( UText *text,
    110                                           int32_t rangeStart,
    111                                           int32_t rangeEnd,
    112                                           UVector32 &foundBreaks,
    113                                           UBool isPhraseBreaking,
    114                                           UErrorCode& status) const = 0;
    115 
    116 };
    117 
    118 /*******************************************************************
    119 * ThaiBreakEngine
    120 */
    121 
    122 /**
    123 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
    124 * dictionary and heuristics to determine Thai-specific breaks.</p>
    125 *
    126 * <p>After it is constructed a ThaiBreakEngine may be shared between
    127 * threads without synchronization.</p>
    128 */
    129 class ThaiBreakEngine : public DictionaryBreakEngine {
    130 private:
    131    /**
    132     * Character sets and the dictionary matcher used by this engine.
            * NOTE(review): the role of each set is suggested only by its name;
            * fDictionary presumably holds the matcher adopted by the constructor.
            * Confirm both in the implementation.
    133     * @internal
    134     */
    135 
    136  UnicodeSet                fEndWordSet;
    137  UnicodeSet                fBeginWordSet;
    138  UnicodeSet                fSuffixSet;
    139  UnicodeSet                fMarkSet;
    140  DictionaryMatcher  *fDictionary;
    141 
    142 public:
    143 
    144  /**
    145   * <p>Constructor.</p>
    146   *
    147   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
    148   * engine is deleted.
          * @param status Error code; presumably reports construction failures
          * (confirm in the implementation).
    149   */
    150  ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
    151 
    152  /**
    153   * <p>Virtual destructor.</p>
    154   */
    155  virtual ~ThaiBreakEngine();
    156 
    157 protected:
    158 /**
    159  * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
         * <p>Overrides DictionaryBreakEngine::divideUpDictionaryRange.</p>
    160  *
    161  * @param text A UText representing the text
    162  * @param rangeStart The start of the range of dictionary characters
    163  * @param rangeEnd The end of the range of dictionary characters
    164  * @param foundBreaks A UVector32 that receives the break positions
         * @param isPhraseBreaking Phrase-breaking flag. NOTE(review): its exact
         * effect is not visible in this header; confirm in the implementation.
    165  * @param status Information on any errors encountered.
    166  * @return The number of breaks found
    167  */
    168  virtual int32_t divideUpDictionaryRange( UText *text,
    169                                           int32_t rangeStart,
    170                                           int32_t rangeEnd,
    171                                           UVector32 &foundBreaks,
    172                                           UBool isPhraseBreaking,
    173                                           UErrorCode& status) const override;
    174 
    175 };
    176 
    177 /*******************************************************************
    178 * LaoBreakEngine
    179 */
    180 
    181 /**
    182 * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
    183 * dictionary and heuristics to determine Lao-specific breaks.</p>
    184 *
    185 * <p>After it is constructed a LaoBreakEngine may be shared between
    186 * threads without synchronization.</p>
    187 */
    188 class LaoBreakEngine : public DictionaryBreakEngine {
    189 private:
    190    /**
    191     * Character sets and the dictionary matcher used by this engine.
            * NOTE(review): the role of each set is suggested only by its name;
            * fDictionary presumably holds the matcher adopted by the constructor.
            * Confirm both in the implementation.
    192     * @internal
    193     */
    194 
    195  UnicodeSet                fEndWordSet;
    196  UnicodeSet                fBeginWordSet;
    197  UnicodeSet                fMarkSet;
    198  DictionaryMatcher  *fDictionary;
    199 
    200 public:
    201 
    202  /**
    203   * <p>Constructor.</p>
    204   *
    205   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
    206   * engine is deleted.
          * @param status Error code; presumably reports construction failures
          * (confirm in the implementation).
    207   */
    208  LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
    209 
    210  /**
    211   * <p>Virtual destructor.</p>
    212   */
    213  virtual ~LaoBreakEngine();
    214 
    215 protected:
    216 /**
    217  * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
         * <p>Overrides DictionaryBreakEngine::divideUpDictionaryRange.</p>
    218  *
    219  * @param text A UText representing the text
    220  * @param rangeStart The start of the range of dictionary characters
    221  * @param rangeEnd The end of the range of dictionary characters
    222  * @param foundBreaks A UVector32 that receives the break positions
         * @param isPhraseBreaking Phrase-breaking flag. NOTE(review): its exact
         * effect is not visible in this header; confirm in the implementation.
    223  * @param status Information on any errors encountered.
    224  * @return The number of breaks found
    225  */
    226  virtual int32_t divideUpDictionaryRange( UText *text,
    227                                           int32_t rangeStart,
    228                                           int32_t rangeEnd,
    229                                           UVector32 &foundBreaks,
    230                                           UBool isPhraseBreaking,
    231                                           UErrorCode& status) const override;
    232 
    233 };
    234 
    235 /*******************************************************************
    236 * BurmeseBreakEngine
    237 */
    238 
    239 /**
    240 * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
    241 * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
    242 *
    243 * <p>After it is constructed a BurmeseBreakEngine may be shared between
    244 * threads without synchronization.</p>
    245 */
    246 class BurmeseBreakEngine : public DictionaryBreakEngine {
    247 private:
    248    /**
    249     * Character sets and the dictionary matcher used by this engine.
            * NOTE(review): the role of each set is suggested only by its name;
            * fDictionary presumably holds the matcher adopted by the constructor.
            * Confirm both in the implementation.
    250     * @internal
    251     */
    252 
    253  UnicodeSet                fEndWordSet;
    254  UnicodeSet                fBeginWordSet;
    255  UnicodeSet                fMarkSet;
    256  DictionaryMatcher  *fDictionary;
    257 
    258 public:
    259 
    260  /**
    261   * <p>Constructor.</p>
    262   *
    263   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
    264   * engine is deleted.
          * @param status Error code; presumably reports construction failures
          * (confirm in the implementation).
    265   */
    266  BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
    267 
    268  /**
    269   * <p>Virtual destructor.</p>
    270   */
    271  virtual ~BurmeseBreakEngine();
    272 
    273 protected:
    274 /**
    275  * <p>Divide up a range of known dictionary characters.</p>
         * <p>Overrides DictionaryBreakEngine::divideUpDictionaryRange.</p>
    276  *
    277  * @param text A UText representing the text
    278  * @param rangeStart The start of the range of dictionary characters
    279  * @param rangeEnd The end of the range of dictionary characters
    280  * @param foundBreaks A UVector32 that receives the break positions
         * @param isPhraseBreaking Phrase-breaking flag. NOTE(review): its exact
         * effect is not visible in this header; confirm in the implementation.
    281  * @param status Information on any errors encountered.
    282  * @return The number of breaks found
    283  */
    284  virtual int32_t divideUpDictionaryRange( UText *text,
    285                                           int32_t rangeStart,
    286                                           int32_t rangeEnd,
    287                                           UVector32 &foundBreaks,
    288                                           UBool isPhraseBreaking,
    289                                           UErrorCode& status) const override;
    290 
    291 };
    292 
    293 /*******************************************************************
    294 * KhmerBreakEngine
    295 */
    296 
    297 /**
    298 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
    299 * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
    300 *
    301 * <p>After it is constructed a KhmerBreakEngine may be shared between
    302 * threads without synchronization.</p>
    303 */
    304 class KhmerBreakEngine : public DictionaryBreakEngine {
    305 private:
    306    /**
    307     * Character sets and the dictionary matcher used by this engine.
            * NOTE(review): the role of each set is suggested only by its name;
            * fDictionary presumably holds the matcher adopted by the constructor.
            * Confirm both in the implementation.
    308     * @internal
    309     */
    310 
    311  UnicodeSet                fEndWordSet;
    312  UnicodeSet                fBeginWordSet;
    313  UnicodeSet                fMarkSet;
    314  DictionaryMatcher  *fDictionary;
    315 
    316 public:
    317 
    318  /**
    319   * <p>Constructor.</p>
    320   *
    321   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
    322   * engine is deleted.
          * @param status Error code; presumably reports construction failures
          * (confirm in the implementation).
    323   */
    324  KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
    325 
    326  /**
    327   * <p>Virtual destructor.</p>
    328   */
    329  virtual ~KhmerBreakEngine();
    330 
    331 protected:
    332 /**
    333  * <p>Divide up a range of known dictionary characters.</p>
         * <p>Overrides DictionaryBreakEngine::divideUpDictionaryRange.</p>
    334  *
    335  * @param text A UText representing the text
    336  * @param rangeStart The start of the range of dictionary characters
    337  * @param rangeEnd The end of the range of dictionary characters
    338  * @param foundBreaks A UVector32 that receives the break positions
         * @param isPhraseBreaking Phrase-breaking flag. NOTE(review): its exact
         * effect is not visible in this header; confirm in the implementation.
    339  * @param status Information on any errors encountered.
    340  * @return The number of breaks found
    341  */
    342  virtual int32_t divideUpDictionaryRange( UText *text,
    343                                           int32_t rangeStart,
    344                                           int32_t rangeEnd,
    345                                           UVector32 &foundBreaks,
    346                                           UBool isPhraseBreaking,
    347                                           UErrorCode& status) const override;
    348 
    349 };
    350 
    351 #if !UCONFIG_NO_NORMALIZATION
    352 
    353 /*******************************************************************
    354 * CjkBreakEngine
    355 */
    356 
    357 // Indicates the language/script that a CjkBreakEngine will handle.
        // A value of this type is passed to the CjkBreakEngine constructor.
    358 enum LanguageType {
    359    kKorean,            // Korean
    360    kChineseJapanese    // Chinese and Japanese
    361 };
    362 
    363 /**
    364 * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
    365 * dictionary with costs associated with each word and
    366 * Viterbi decoding to determine CJK-specific breaks.</p>
    367 */
    368 class CjkBreakEngine : public DictionaryBreakEngine {
    369 protected:
    370    /**
    371     * Character sets, dictionary matcher and helper objects used by this
            * engine.
            * NOTE(review): the role of each member is suggested only by its name
            * and type (e.g. isCj presumably reflects a kChineseJapanese engine,
            * fDictionary presumably holds the matcher adopted by the constructor);
            * confirm in the implementation.
    372     * @internal
    373     */
    374  UnicodeSet                fHangulWordSet;
    375  UnicodeSet                fDigitOrOpenPunctuationOrAlphabetSet;
    376  UnicodeSet                fClosePunctuationSet;
    377 
    378  DictionaryMatcher        *fDictionary;
    379  const Normalizer2        *nfkcNorm2;
    380  MlBreakEngine            *fMlBreakEngine;
    381  bool                      isCj;
    382 
    383 private:
    384  // Load Japanese extensions.
    385  void loadJapaneseExtensions(UErrorCode& error);
    386  // Load Japanese Hiragana.
    387  void loadHiragana(UErrorCode& error);
    388  // Initialize fSkipSet by loading Japanese Hiragana and extensions.
    389  void initJapanesePhraseParameter(UErrorCode& error);
    390 
         // Table initialized by initJapanesePhraseParameter() (see above).
         // NOTE(review): how its entries are consulted is not visible in this header.
    391  Hashtable fSkipSet;
    392 
    393 public:
    394 
    395    /**
    396     * <p>Constructor.</p>
    397     *
    398     * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
    399     * engine is deleted. The DictionaryMatcher must contain costs for each word
    400     * in order for the dictionary to work properly.
            * @param type The language/script this engine will handle.
            * @param status Error code; presumably reports construction failures
            * (confirm in the implementation).
    401     */
    402  CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
    403 
    404    /**
    405     * <p>Virtual destructor.</p>
    406     */
    407  virtual ~CjkBreakEngine();
    408 
    409 protected:
    410    /**
    411     * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
            * <p>Overrides DictionaryBreakEngine::divideUpDictionaryRange.</p>
    412     *
    413     * @param text A UText representing the text
    414     * @param rangeStart The start of the range of dictionary characters
    415     * @param rangeEnd The end of the range of dictionary characters
    416     * @param foundBreaks A UVector32 that receives the break positions
            * @param isPhraseBreaking Phrase-breaking flag. NOTE(review): its exact
            * effect is not visible in this header; confirm in the implementation.
    417     * @param status Information on any errors encountered.
    418     * @return The number of breaks found
    419     */
    420  virtual int32_t divideUpDictionaryRange( UText *text,
    421          int32_t rangeStart,
    422          int32_t rangeEnd,
    423          UVector32 &foundBreaks,
    424          UBool isPhraseBreaking,
    425          UErrorCode& status) const override;
    426 
    427 };
    428 
    429 #endif
    430 
    431 U_NAMESPACE_END
    432 
    433    /* DICTBE_H */
    434 #endif