tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

mlbe.h (4292B)


      1 // © 2022 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 
      4 #ifndef MLBREAKENGINE_H
      5 #define MLBREAKENGINE_H
      6 
      7 #include "hash.h"
      8 #include "unicode/resbund.h"
      9 #include "unicode/uniset.h"
     10 #include "unicode/utext.h"
     11 #include "uvectr32.h"
     12 
     13 U_NAMESPACE_BEGIN
     14 
     15 #if !UCONFIG_NO_BREAK_ITERATION
     16 
     17 /**
     18 * A machine learning break engine for the phrase breaking in Japanese.
     19 */
     20 class MlBreakEngine : public UMemory {
     21   public:
     22    /**
     23     * Constructor.
     24     *
     25     * @param digitOrOpenPunctuationOrAlphabetSet A UnicodeSet with the digit, open punctuation and
     26     * alphabet.
     27     * @param closePunctuationSet A UnicodeSet with close punctuation.
     28     * @param status Information on any errors encountered.
     29     */
     30    MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet,
     31                  const UnicodeSet &closePunctuationSet, UErrorCode &status);
     32 
     33    /**
     34     * Virtual destructor.
     35     */
     36    virtual ~MlBreakEngine();
     37 
     38   public:  // NOTE(review): repeats the access specifier already in effect above.
     39    /**
     40     * Divide up a range of characters handled by this break engine.
     41     *
     42     * @param inText A UText representing the text
     43     * @param rangeStart The start of the range of the characters
     44     * @param rangeEnd The end of the range of the characters
     45     * @param foundBreaks Output UVector32 of int32_t break positions
     46     * @param inString The normalized string of text ranging from rangeStart to rangeEnd
     47     * @param inputMap The vector storing the native index of inText
     48     * @param status Information on any errors encountered.
     49     * @return The number of breaks found
     50     */
     51    int32_t divideUpRange(UText *inText, int32_t rangeStart, int32_t rangeEnd,
     52                          UVector32 &foundBreaks, const UnicodeString &inString,
     53                          const LocalPointer<UVector32> &inputMap, UErrorCode &status) const;
     54 
     55   private:
     56    /**
     57     * Load the machine learning model file.
     58     *
     59     * @param error Information on any errors encountered.
     60     */
     61    void loadMLModel(UErrorCode &error);
     62 
     63    /**
     64     * In the machine learning model file, specify the name of the key and value to load the
     65     * corresponding feature and its score.
     66     *
     67     * @param rb A UResourceBundle corresponding to the model file.
     68     * @param keyName The key name in the model file.
     69     * @param valueName The value name in the model file.
     70     * @param model A hashtable to store the pairs of the feature and its score.
     71     * @param error Information on any errors encountered.
     72     */
     73    void initKeyValue(UResourceBundle *rb, const char *keyName, const char *valueName,
     74                      Hashtable &model, UErrorCode &error);
     75 
     76    /**
     77     * Initialize the index list from the input string.
     78     *
     79     * @param inString An input string to be segmented.
     80     * @param indexList A code unit index list of inString.
     81     * @param status Information on any errors encountered.
     82     * @return The number of code units of the first four characters in inString.
     83     */
     84    int32_t initIndexList(const UnicodeString &inString, int32_t *indexList,
     85                          UErrorCode &status) const;
     86 
     87    /**
     88     * Evaluate whether the index is a potential breakpoint.
     89     *
     90     * @param inString An input string to be segmented.
     91     * @param indexList A code unit index list of the inString.
     92     * @param startIdx The start index of the indexList.
     93     * @param numCodeUnits The current code unit boundary of the indexList.
     94     * @param numBreaks The accumulated number of breakpoints.
     95     * @param boundary A vector including the index of the breakpoint.
     96     * @param status Information on any errors encountered.
     97     * @return The number of breakpoints
     98     */
     99    int32_t evaluateBreakpoint(const UnicodeString &inString, int32_t *indexList, int32_t startIdx,
    100                               int32_t numCodeUnits, int32_t numBreaks, UVector32 &boundary,
    101                               UErrorCode &status) const;
    102 
    103    void printUnicodeString(const UnicodeString &s) const;  // NOTE(review): undocumented; presumably a debugging aid that prints s -- confirm against the implementation.
    104 
    105    UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
    106    UnicodeSet fClosePunctuationSet;
    107    Hashtable fModel[13];  // One table per feature: {UW1, UW2, ... UW6, BW1, ... BW3, TW1, TW2, ... TW4}, 6 + 3 + 4 = 13
    108    int32_t fNegativeSum;
    109 };
    110 
    111 #endif
    112 
    113 U_NAMESPACE_END
    114 
    115 /* MLBREAKENGINE_H */
    116 #endif