[ tor-browser ].git.dasho

csrmbcs.h (6066B)
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 2005-2012, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 */
      9 
     10 #ifndef __CSRMBCS_H
     11 #define __CSRMBCS_H
     12 
     13 #include "unicode/utypes.h"
     14 
     15 #if !UCONFIG_NO_CONVERSION
     16 
     17 #include "csrecog.h"
     18 
     19 U_NAMESPACE_BEGIN
     20 
     21 // "Character"  iterated character class.
     22 //    Recognizers for specific mbcs encodings make their "characters" available
     23 //    by providing a nextChar() function that fills in an instance of IteratedChar
     24 //    with the next char from the input.
     25 //    The returned characters are not converted to Unicode, but remain as the raw
     26 //    bytes (concatenated into an int) from the codepage data.
     27 //
     28 //  For Asian charsets, use the raw input rather than the input that has been
     29 //   stripped of markup.  Detection only considers multi-byte chars, effectively
     30 //   stripping markup anyway, and double byte chars do occur in markup too.
     31 //
     32 class IteratedChar : public UMemory
     33 {
     34 public:
     35    uint32_t charValue;             // 1-4 bytes from the raw input data, packed into one integer (not converted to Unicode)
     36    int32_t  index;                 // NOTE(review): presumably the input offset where this char starts - confirm against the nextChar() implementations
     37    int32_t  nextIndex;             // NOTE(review): presumably the input offset of the next unread byte - confirm
     38    UBool    error;                 // NOTE(review): presumably set when the bytes read do not form a valid char - confirm
     39    UBool    done;                  // NOTE(review): presumably set once the end of the input is reached - confirm
     40
     41 public:
     42    IteratedChar();                 // only declared here; the definition is not in this header
     43    //void reset();
     44    int32_t nextByte(InputText* det);  // NOTE(review): presumably returns the next raw byte of det's input - confirm the value returned at end of input
     45 };
     46 
     47 
     48 class CharsetRecog_mbcs : public CharsetRecognizer {
     49
     50 protected:
     51    /**
     52     * Test the match of this charset with the input text data
     53     *      which is supplied through the InputText argument.
     54     *
     55     * @param det  The InputText object containing the input text to be checked for being in this charset.
     56     *             NOTE(review): commonChars / commonCharsLen are undocumented; presumably a table of frequent chars and its length - confirm.
     57     * @return     Two values packed into one int.  NOTE(review): this wording looks
     58     *             carried over from a Java version - confirm the packing against the definition.
     59     *             bits 0-7:  the match confidence, ranging from 0-100
     60     *             <br/>
     61     *             bits 8-15: The match reason, an enum-like value.
     62     */
     63    int32_t match_mbcs(InputText* det, const uint16_t commonChars[], int32_t commonCharsLen) const;
     64
     65 public:
     66
     67    virtual ~CharsetRecog_mbcs();   // virtual destructor; only declared in this header
     68
     69    /**
     70     * Get the IANA name of this charset.
     71     * @return the charset name.
     72     */
     73
     74    const char *getName() const override = 0;
     75    const char *getLanguage() const override = 0;   // stays pure virtual at this level; NOTE(review): presumably the language tied to the charset - confirm
     76    UBool match(InputText* input, CharsetMatch *results) const override = 0;   // stays pure virtual at this level; the concrete recognizers in this header override it
     77
     78    /**
     79     * Get the next character (however many bytes it is) from the input data
     80     *    Subclasses for specific charset encodings must implement this function
     81     *    to get characters according to the rules of their encoding scheme.
     82     *
     83     *  This function is not a method of class IteratedChar only because
     84     *   that would require a lot of extra derived classes, which is awkward.
     85     * @param it      The IteratedChar "struct" into which the returned char is placed.
     86     * @param textIn  The InputText, which is needed to get at the input byte data
     87     *                being iterated over.
     88     * @return    True if a character was returned, false at end of input.
     89     */
     90    virtual UBool nextChar(IteratedChar *it, InputText *textIn) const = 0;
     91
     92 };
     93 
     94 
     95 /**
     96 *   Shift-JIS charset recognizer.
     97 *
     98 */
     99 class CharsetRecog_sjis : public CharsetRecog_mbcs {
    100 public:
    101    virtual ~CharsetRecog_sjis();   // virtual destructor; only declared in this header
    102
    103    UBool nextChar(IteratedChar *it, InputText *det) const override;   // implements the pure virtual CharsetRecog_mbcs::nextChar for this encoding
    104
    105    UBool match(InputText* input, CharsetMatch *results) const override;   // implements match(); NOTE(review): presumably delegates to match_mbcs() - confirm in the definition
    106
    107    const char *getName() const override;       // implements getName(); the returned string is defined outside this header
    108    const char *getLanguage() const override;   // implements getLanguage(); the returned string is defined outside this header
    109
    110 };
    111 
    112 
    113 /**
    114 *   EUC charset recognizers.  One abstract class that provides the common function
    115 *             for getting the next character according to the EUC encoding scheme,
    116 *             and derived classes for EUC_JP and EUC_KR (not nested; no EUC_CN class is declared in this header).
    117 *
    118 */
    119 class CharsetRecog_euc : public CharsetRecog_mbcs
    120 {
    121 public:
    122    virtual ~CharsetRecog_euc();   // virtual destructor; only declared in this header
    123
    124    const char *getName() const override = 0;       // still pure virtual: left to CharsetRecog_euc_jp / CharsetRecog_euc_kr
    125    const char *getLanguage() const override = 0;   // still pure virtual: left to CharsetRecog_euc_jp / CharsetRecog_euc_kr
    126
    127    UBool match(InputText* input, CharsetMatch *results) const override = 0;   // still pure virtual: left to the derived EUC recognizers
    128    /*
    129     *  (non-Javadoc)
    130     *  Get the next character value for EUC based encodings; this is the one
    131     *  member this class implements.  Character "value" is simply the raw bytes
    132     *     that make up the character, packed into an int.
    133     */
    134    UBool nextChar(IteratedChar *it, InputText *det) const override;
    135 };
    136 
    137 /**
    138 * The charset recognizer for EUC-JP.  A singleton instance of this class
    139 *    is created and kept by the public CharsetDetector class
    140 */
    141 class CharsetRecog_euc_jp : public CharsetRecog_euc
    142 {
    143 public:
    144    virtual ~CharsetRecog_euc_jp();   // virtual destructor; only declared in this header
    145
    146    const char *getName() const override;       // implements the pure virtual getName(); the returned string is defined outside this header
    147    const char *getLanguage() const override;   // implements the pure virtual getLanguage(); the returned string is defined outside this header
    148
    149    UBool match(InputText* input, CharsetMatch *results) const override;   // implements match(); nextChar() is not overridden here, so CharsetRecog_euc::nextChar is used
    150 };
    151 
    152 /**
    153 * The charset recognizer for EUC-KR.  A singleton instance of this class
    154 *    is created and kept by the public CharsetDetector class
    155 */
    156 class CharsetRecog_euc_kr : public CharsetRecog_euc
    157 {
    158 public:
    159    virtual ~CharsetRecog_euc_kr();   // virtual destructor; only declared in this header
    160
    161    const char *getName() const override;       // implements the pure virtual getName(); the returned string is defined outside this header
    162    const char *getLanguage() const override;   // implements the pure virtual getLanguage(); the returned string is defined outside this header
    163
    164    UBool match(InputText* input, CharsetMatch *results) const override;   // implements match(); nextChar() is not overridden here, so CharsetRecog_euc::nextChar is used
    165 };
    166 
    167 /**
    168 *
    169 *   Big5 charset recognizer.
    170 *
    171 */
    172 class CharsetRecog_big5 : public CharsetRecog_mbcs
    173 {
    174 public:
    175    virtual ~CharsetRecog_big5();   // virtual destructor; only declared in this header
    176
    177    UBool nextChar(IteratedChar* it, InputText* det) const override;   // implements the pure virtual CharsetRecog_mbcs::nextChar for this encoding
    178
    179    const char *getName() const override;       // implements getName(); the returned string is defined outside this header
    180    const char *getLanguage() const override;   // implements getLanguage(); the returned string is defined outside this header
    181
    182    UBool match(InputText* input, CharsetMatch *results) const override;   // implements match(); NOTE(review): presumably delegates to match_mbcs() - confirm in the definition
    183 };
    184 
    185 
    186 /**
    187 *
    188 *   GB-18030 recognizer. Uses simplified Chinese statistics.
    189 *
    190 */
    191 class CharsetRecog_gb_18030 : public CharsetRecog_mbcs
    192 {
    193 public:
    194    virtual ~CharsetRecog_gb_18030();   // virtual destructor; only declared in this header
    195
    196    UBool nextChar(IteratedChar* it, InputText* det) const override;   // implements the pure virtual CharsetRecog_mbcs::nextChar for this encoding
    197
    198    const char *getName() const override;       // implements getName(); the returned string is defined outside this header
    199    const char *getLanguage() const override;   // implements getLanguage(); the returned string is defined outside this header
    200
    201    UBool match(InputText* input, CharsetMatch *results) const override;   // implements match(); NOTE(review): presumably delegates to match_mbcs() - confirm in the definition
    202 };
    203 
    204 U_NAMESPACE_END
    205 
    206 #endif
    207 #endif /* __CSRMBCS_H */
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE