tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

tridpars.h (15295B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **************************************************************************
      5 *   Copyright (c) 2002-2010, International Business Machines Corporation *
      6 *   and others.  All Rights Reserved.                                    *
      7 **************************************************************************
      8 *   Date        Name        Description                                  *
      9 *   01/28/2002  aliu        Creation.                                    *
     10 **************************************************************************
     11 */
     12 #ifndef TRIDPARS_H
     13 #define TRIDPARS_H
     14 
     15 #include "unicode/utypes.h"
     16 
     17 #if !UCONFIG_NO_TRANSLITERATION
     18 
     19 #include "unicode/uobject.h"
     20 #include "unicode/unistr.h"
     21 
     22 U_NAMESPACE_BEGIN
     23 
     24 class Transliterator;  // forward declaration; used below only through pointers
     25 class UnicodeSet;      // forward declaration; used below only through pointers
     26 class UVector;         // forward declaration; used below only through references
     27 
     28 /**
     29 * Parsing component for transliterator IDs.  This class contains only
     30 * static members; it cannot be instantiated.  Methods in this class
     31 * parse various ID formats, including the following:
     32 *
     33 * A basic ID, which contains source, target, and variant, but no
     34 * filter and no explicit inverse.  Examples include
     35 * "Latin-Greek/UNGEGN" and "Null".
     36 *
     37 * A single ID, which is a basic ID plus optional filter and optional
     38 * explicit inverse.  Examples include "[a-zA-Z] Latin-Greek" and
     39 * "Lower (Upper)".
     40 *
     41 * A compound ID, which is a sequence of one or more single IDs,
     42 * separated by semicolons, with optional forward and reverse global
     43 * filters.  The global filters are UnicodeSet patterns prepended or
     44 * appended to the IDs, separated by semicolons.  An appended filter
     45 * must be enclosed in parentheses and applies in the reverse
     46 * direction.
     47 *
     48 * @author Alan Liu
     49 */
     50 class TransliteratorIDParser /* not : public UObject because all methods are static */ {
     51 
     52 public:
     53 
     54    /**
     55     * A structure containing the parsed data of a filtered ID, that
     56     * is, a basic ID optionally with a filter.
     57     *
     58     * 'source' and 'target' will always be non-null.  The 'variant'
     59     * will be non-null only if a non-empty variant was parsed.
     60     *
     61     * 'sawSource' is true if there was an explicit source in the
     62     * parsed id.  If there was no explicit source, then an implied
     63     * source of ANY is returned and 'sawSource' is set to false.
     64     *
     65     * 'filter' is the parsed filter pattern, or null if there was no
     66     * filter.
     67     */
     68    class Specs : public UMemory {
     69    public:
     70        UnicodeString source; // always set (implied ANY if the id had no explicit source)
     71        UnicodeString target; // always set
     72        UnicodeString variant; // set only if a non-empty variant was parsed
     73        UnicodeString filter; // parsed filter pattern, if there was one
     74        UBool sawSource; // true if the parsed id had an explicit source
     75        Specs(const UnicodeString& s, const UnicodeString& t,
     76              const UnicodeString& v, UBool sawS,
     77              const UnicodeString& f);
     78 
     79    private:
     80 
     81        Specs(const Specs &other); // forbid copying of this class
     82        Specs &operator=(const Specs &other); // forbid copying of this class
     83    };
     84 
     85    /**
     86     * A structure containing the canonicalized data of a filtered ID,
     87     * that is, a basic ID optionally with a filter.
     88     *
     89     * 'canonID' is always non-null.  It may be the empty string "".
     90     * It is the id that should be assigned to the created
     91     * transliterator.  It _cannot_ be instantiated directly.
     92     *
     93     * 'basicID' is always non-null and non-empty.  It is always of
     94     * the form S-T or S-T/V.  It is designed to be fed to low-level
     95     * instantiation code that only understands these two formats.
     96     *
     97     * 'filter' may be null, if there is none, or non-null and
     98     * non-empty.
     99     */
    100    class SingleID : public UMemory {
    101    public:
    102        UnicodeString canonID; // id to assign to the created transliterator; may be ""
    103        UnicodeString basicID; // always of the form S-T or S-T/V
    104        UnicodeString filter; // filter pattern, if there is one
    105        SingleID(const UnicodeString& c, const UnicodeString& b,
    106                 const UnicodeString& f);
    107        SingleID(const UnicodeString& c, const UnicodeString& b);
    108        Transliterator* createInstance(); // NOTE(review): presumably builds the transliterator for this ID; ownership of the result is not shown here -- confirm in the .cpp
    109 
    110    private:
    111 
    112        SingleID(const SingleID &other); // forbid copying of this class
    113        SingleID &operator=(const SingleID &other); // forbid copying of this class
    114    };
    115 
    116    /**
    117     * Parse a filter ID, that is, an ID of the general form
    118     * "[f1] s1-t1/v1", with the filters optional, and the variants optional.
    119     * @param id the id to be parsed
    120     * @param pos INPUT-OUTPUT parameter.  On input, the position of
    121     * the first character to parse.  On output, the position after
    122     * the last character parsed.
    123     * @return a SingleID object or null if the parse fails
    124     */
    125    static SingleID* parseFilterID(const UnicodeString& id, int32_t& pos);
    126 
    127    /**
    128     * Parse a single ID, that is, an ID of the general form
    129     * "[f1] s1-t1/v1 ([f2] s2-t3/v2)", with the parenthesized element
    130     * optional, the filters optional, and the variants optional.
    131     * @param id the id to be parsed
    132     * @param pos INPUT-OUTPUT parameter.  On input, the position of
    133     * the first character to parse.  On output, the position after
    134     * the last character parsed.
    135     * @param dir the direction; if REVERSE, build the reverse SingleID.
    136     * @param status error code reference supplied by the caller
    137     * @return a SingleID object or null
    138     */
    139    static SingleID* parseSingleID(const UnicodeString& id, int32_t& pos,
    140                                  int32_t dir, UErrorCode& status);
    141 
    142    /**
    143     * Parse a global filter of the form "[f]" or "([f])", depending
    144     * on 'withParens'.
    145     * @param id the pattern to parse
    146     * @param pos INPUT-OUTPUT parameter.  On input, the position of
    147     * the first character to parse.  On output, the position after
    148     * the last character parsed.
    149     * @param dir the direction.
    150     * @param withParens INPUT-OUTPUT parameter.  On entry, if
    151     * withParens is 0, then parens are disallowed.  If it is 1,
    152     * then parens are required.  If it is -1, then parens are
    153     * optional, and withParens will be set to 0 or 1 on output.
    154     * @param canonID OUTPUT parameter.  The pattern for the filter
    155     * added to the canonID, either at the end, if dir is FORWARD, or
    156     * at the start, if dir is REVERSE.  The pattern will be enclosed
    157     * in parentheses if appropriate, and will be suffixed with an
    158     * ID_DELIM character.  May be null.
    159     * @return a UnicodeSet object or null.  A non-null result
    160     * indicates a successful parse, regardless of whether the filter
    161     * applies to the given direction.  The caller should discard it
    162     * if withParens != (dir == REVERSE).
    163     */
    164    static UnicodeSet* parseGlobalFilter(const UnicodeString& id, int32_t& pos,
    165                                         int32_t dir,
    166                                         int32_t& withParens,
    167                                         UnicodeString* canonID);
    168 
    169    /**
    170     * Parse a compound ID, consisting of an optional forward global
    171     * filter, a separator, one or more single IDs delimited by
    172     * separators, and an optional reverse global filter.  The
    173     * separator is a semicolon.  The global filters are UnicodeSet
    174     * patterns.  The reverse global filter must be enclosed in
    175     * parentheses.
    176     * @param id the pattern to parse
    177     * @param dir the direction.
    178     * @param canonID OUTPUT parameter that receives the canonical ID,
    179     * consisting of canonical IDs for all elements, as returned by
    180     * parseSingleID(), separated by semicolons.  Previous contents
    181     * are discarded.
    182     * @param list OUTPUT parameter that receives a list of SingleID
    183     * objects representing the parsed IDs.  Previous contents are
    184     * discarded.
    185     * @param globalFilter OUTPUT parameter that receives a pointer to
    186     * a newly created global filter for this ID in this direction, or
    187     * null if there is none.
    188     * @return true if the parse succeeds, that is, if the entire
    189     * id is consumed without syntax error.
    190     */
    191    static UBool parseCompoundID(const UnicodeString& id, int32_t dir,
    192                                 UnicodeString& canonID,
    193                                 UVector& list,
    194                                 UnicodeSet*& globalFilter);
    195 
    196    /**
    197     * Convert the elements of the 'list' vector, which are SingleID
    198     * objects, into actual Transliterator objects.  In the course of
    199     * this, some (or all) entries may be removed.  If all entries
    200     * are removed, the Null transliterator will be added.
    201     *
    202     * Delete entries with empty basicIDs; these are generated by
    203     * elements like "(A)" in the forward direction, or "A()" in
    204     * the reverse.  THIS MAY RESULT IN AN EMPTY VECTOR.  Convert
    205     * SingleID entries to actual transliterators.
    206     *
    207     * @param list vector of SingleID objects.  On exit, vector
    208     * of one or more Transliterators.
    209     * @param ec Output param to receive a success or an error code.
    210     * No value is returned (the function is declared void); results
    211     * are reported through 'list' and 'ec'.  Empty items such as
    212     * "(Lower)" are removed from 'list' as described above.
    213     */
    214    static void instantiateList(UVector& list,
    215                                UErrorCode& ec);
    216 
    217    /**
    218     * Parse an ID into pieces.  Take IDs of the form T, T/V, S-T,
    219     * S-T/V, or S/V-T.  If the source is missing, return a source of
    220     * ANY.
    221     * @param id the id string, in any of several forms
    222     * @param source          OUTPUT parameter that receives the source.
    223     * @param target          OUTPUT parameter that receives the target.
    224     * @param variant         OUTPUT parameter that receives the variant.
    225     * @param isSourcePresent OUTPUT parameter.  True if the source is
    226     *                        present in id.  If it is not, ANY is given
    227     *                        as the source and isSourcePresent is false.
    228     *
    229     * There is no return value (the function is declared void); the
    230     * pieces are delivered through the reference parameters above.
    231     * The target may be empty if the id is not well-formed.  The
    232     * variant may be empty.
    233     */
    234    static void IDtoSTV(const UnicodeString& id,
    235                        UnicodeString& source,
    236                        UnicodeString& target,
    237                        UnicodeString& variant,
    238                        UBool& isSourcePresent);
    239 
    240    /**
    241     * Given source, target, and variant strings, concatenate them into a
    242     * full ID.  If the source is empty, then "Any" will be used for the
    243     * source, so the ID will always be of the form s-t/v or s-t.
    244     */
    245    static void STVtoID(const UnicodeString& source,
    246                        const UnicodeString& target,
    247                        const UnicodeString& variant,
    248                        UnicodeString& id);
    249 
    250    /**
    251     * Register two targets as being inverses of one another.  For
    252     * example, calling registerSpecialInverse("NFC", "NFD", true, status)
    253     * causes Transliterator to form the following inverse relationships:
    254     *
    255     * <pre>NFC => NFD
    256     * Any-NFC => Any-NFD
    257     * NFD => NFC
    258     * Any-NFD => Any-NFC</pre>
    259     *
    260     * (Without the special inverse registration, the inverse of NFC
    261     * would be NFC-Any.)  Note that NFD is shorthand for Any-NFD, but
    262     * that the presence or absence of "Any-" is preserved.
    263     *
    264     * <p>The relationship is symmetrical; registering (a, b) is
    265     * equivalent to registering (b, a).
    266     *
    267     * <p>The relevant IDs must still be registered separately as
    268     * factories or classes.
    269     *
    270     * <p>Only the targets are specified.  Special inverses always
    271     * have the form Any-Target1 <=> Any-Target2.  The target should
    272     * have canonical casing (the casing desired to be produced when
    273     * an inverse is formed) and should contain no whitespace or other
    274     * extraneous characters.
    275     *
    276     * @param target the target against which to register the inverse
    277     * @param inverseTarget the inverse of target, that is
    278     * Any-target.getInverse() => Any-inverseTarget
    279     * @param bidirectional if true, also register inverseTarget => target
    280     * @param status error code reference supplied by the caller
    281     */
    282    static void registerSpecialInverse(const UnicodeString& target,
    283                                       const UnicodeString& inverseTarget,
    284                                       UBool bidirectional,
    285                                       UErrorCode &status);
    286 
    287    /**
    288     * Free static memory.
    289     */
    290    static void cleanup();
    291 
    292 private:
    293    //----------------------------------------------------------------
    294    // Private implementation
    295    //----------------------------------------------------------------
    296 
    297    // forbid instantiation
    298    TransliteratorIDParser();
    299 
    300    /**
    301     * Parse an ID into component pieces.  Take IDs of the form T,
    302     * T/V, S-T, S-T/V, or S/V-T.  If the source is missing, return a
    303     * source of ANY.
    304     * @param id the id string, in any of several forms
    305     * @param pos INPUT-OUTPUT parameter.  On input, pos is the
    306     * offset of the first character to parse in id.  On output,
    307     * pos is the offset after the last parsed character.  If the
    308     * parse failed, pos will be unchanged.
    309     * @param allowFilter if true, a UnicodeSet pattern is allowed
    310     * at any location between specs or delimiters, and is returned
    311     * in the 'filter' field of the Specs object.
    312     * @return a Specs object, or null if the parse failed.  If
    313     * neither source nor target was seen in the parsed id, then the
    314     * parse fails.  If allowFilter is true, then the parsed filter
    315     * pattern is returned in the Specs object, otherwise the returned
    316     * filter reference is null.  If the parse fails for any reason
    317     * null is returned.
    318     */
    319    static Specs* parseFilterID(const UnicodeString& id, int32_t& pos,
    320                                UBool allowFilter);
    321 
    322    /**
    323     * Given a Specs object, convert it to a SingleID object.  The
    324     * Specs object is a more unprocessed parse result.  The SingleID
    325     * object contains information about canonical and basic IDs.
    326     * @param specs the given Specs object.
    327     * @param dir   either FORWARD or REVERSE.
    328     * @return a SingleID; never returns null.  Returned object always
    329     * has 'filter' field of null.
    330     */
    331    static SingleID* specsToID(const Specs* specs, int32_t dir);
    332 
    333    /**
    334     * Return a SingleID for the special inverse of the given Specs,
    335     * or null if there is no special inverse.
    336     * @param specs the given Specs.
    337     * @param status error code reference supplied by the caller
    338     * @return a SingleID or null.  Returned object always has
    339     * 'filter' field of null.
    340     */
    341    static SingleID* specsToSpecialInverse(const Specs& specs, UErrorCode &status);
    342 
    343    /**
    344     * Glue method to get around access problems in C++.
    345     * @param id the id string for the transliterator, in any of several forms
    346     * @param canonID the given canonical ID
    347     */
    348    static Transliterator* createBasicInstance(const UnicodeString& id,
    349                                               const UnicodeString* canonID);
    350 
    351    /**
    352     * Initialize static memory.
    353     */
    354    static void U_CALLCONV init(UErrorCode &status);
    355 
    356    friend class SingleID; // grants SingleID access to the private members of this class
    357 };
    358 
    359 U_NAMESPACE_END
    360 
    361 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    362 
    363 #endif