tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

lstmbe.h (2620B)


      1 // © 2021 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 
      4 #ifndef LSTMBE_H
      5 #define LSTMBE_H
      6 
      7 #include "unicode/utypes.h"
      8 
      9 #if !UCONFIG_NO_BREAK_ITERATION
     10 
     11 #include "unicode/uniset.h"
     12 #include "unicode/ures.h"
     13 #include "unicode/utext.h"
     14 #include "unicode/utypes.h"
     15 
     16 #include "brkeng.h"
     17 #include "dictbe.h"
     18 #include "uvectr32.h"
     19 
     20 U_NAMESPACE_BEGIN
     21 
     22 class Vectorizer;  // forward declaration; this header refers to it only as const Vectorizer*
     23 struct LSTMData;   // forward declaration; this header refers to it only as const LSTMData*
     24 
     25 /*******************************************************************
     26 * LSTMBreakEngine
     27 */
     28 
     29 /**
     30 * <p>LSTMBreakEngine is a kind of DictionaryBreakEngine that uses an
     31 * LSTM to determine language-specific breaks.</p>
     32 *
     33 * <p>After it is constructed an LSTMBreakEngine may be shared between
     34 * threads without synchronization.</p>
     35 */
     36 class LSTMBreakEngine : public DictionaryBreakEngine {
     37 public:
     38    /**
     39     * <p>Constructor.</p>
            *
            * @param data The LSTM data for this engine. NOTE(review): ownership of
            *             this pointer is not visible in this header; confirm in the
            *             implementation.
            * @param set Presumably the set of characters handled by this engine;
            *            confirm against the implementation.
            * @param status Information on any errors encountered.
     40     */
     41    LSTMBreakEngine(const LSTMData* data, const UnicodeSet& set, UErrorCode &status);
     42 
     43    /**
     44     * <p>Virtual destructor.</p>
     45     */
     46    virtual ~LSTMBreakEngine();
     47 
           /**
            * @return A char16_t string naming this engine. NOTE(review): the exact
            *         contents are determined by the implementation and are not
            *         visible in this header.
            */
     48    virtual const char16_t* name() const;
     49 
     50 protected:
     51    /**
     52     * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
     53     *
     54     * @param text A UText representing the text
     55     * @param rangeStart The start of the range of dictionary characters
     56     * @param rangeEnd The end of the range of dictionary characters
     57     * @param foundBreaks Output vector (UVector32) that receives the break positions
            * @param isPhraseBreaking NOTE(review): not documented in the original comment;
            *        presumably selects phrase-based breaking. Confirm against the
            *        base-class declaration being overridden.
     58     * @param status Information on any errors encountered.
     59     * @return The number of breaks found
     60     */
     61     virtual int32_t divideUpDictionaryRange(UText *text,
     62                                             int32_t rangeStart,
     63                                             int32_t rangeEnd,
     64                                             UVector32 &foundBreaks,
     65                                             UBool isPhraseBreaking,
     66                                             UErrorCode& status) const override;
     67 private:
     68    const LSTMData* fData;  // LSTM data pointer held by this engine (ownership not visible in this header)
     69    const Vectorizer* fVectorizer;  // Vectorizer held by this engine; its role is defined in the implementation
     70 };
     71 
     /**
      * Takes a script code, LSTM data and an error code, and returns a pointer to a
      * const LanguageBreakEngine.
      * NOTE(review): presumably builds an LSTM-based engine for `script` from `data`;
      * ownership of `data` and of the returned engine is not visible in this header.
      * Confirm in the implementation.
      */
     72 U_CAPI const LanguageBreakEngine* U_EXPORT2 CreateLSTMBreakEngine(
     73    UScriptCode script, const LSTMData* data, UErrorCode& status);
     74 
     /**
      * Takes a resource bundle and an error code, and returns a pointer to const LSTMData.
      * NOTE(review): presumably loads the LSTM data from `rb`; who frees the result
      * is not visible in this header. Confirm in the implementation.
      */
     75 U_CAPI const LSTMData* U_EXPORT2 CreateLSTMData(
     76    UResourceBundle* rb, UErrorCode& status);
     77 
     /**
      * Takes a script code and an error code, and returns a pointer to const LSTMData.
      * NOTE(review): presumably locates the LSTM data for `script`; behaviour when no
      * data exists for the script is not visible in this header. Confirm in the
      * implementation.
      */
     78 U_CAPI const LSTMData* U_EXPORT2 CreateLSTMDataForScript(
     79    UScriptCode script, UErrorCode& status);
     80 
     /**
      * DeleteLSTMData takes a pointer to const LSTMData and returns nothing;
      * presumably it releases data obtained from one of the Create functions (confirm).
      * LSTMDataName takes a pointer to const LSTMData and returns a char16_t string;
      * presumably the name of the data (confirm).
      */
     81 U_CAPI void U_EXPORT2 DeleteLSTMData(const LSTMData* data);
     82 U_CAPI const char16_t* U_EXPORT2 LSTMDataName(const LSTMData* data);
     83 
     84 U_NAMESPACE_END
     85 
     86 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
     87 
     88 #endif  /* LSTMBE_H */