tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

ucsdet.h (15043B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 2005-2013, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 *   file name:  ucsdet.h
      9 *   encoding:   UTF-8
     10 *   indentation:4
     11 *
     12 *   created on: 2005Aug04
     13 *   created by: Andy Heninger
     14 *
     15 *   ICU Character Set Detection, API for C
     16 *
     17 *   Draft version 18 Oct 2005
     18 *
     19 */
     20 
     21 #ifndef __UCSDET_H
     22 #define __UCSDET_H
     23 
     24 #include "unicode/utypes.h"
     25 
     26 #if !UCONFIG_NO_CONVERSION
     27 
     28 #include "unicode/uenum.h"
     29 
     30 #if U_SHOW_CPLUSPLUS_API
     31 #include "unicode/localpointer.h"
     32 #endif   // U_SHOW_CPLUSPLUS_API
     33 
     34 /**
     35 * \file 
     36 * \brief C API: Charset Detection API
     37 *
     38 * This API provides a facility for detecting the
     39 * charset or encoding of character data in an unknown text format.
     40 * The input data is supplied as an array of bytes.
     41 * <p>
     42 * Character set detection is at best an imprecise operation.  The detection
     43 * process will attempt to identify the charset that best matches the characteristics
     44 * of the byte data, but the process is partly statistical in nature, and
     45 * the results cannot be guaranteed to always be correct.
     46 * <p>
     47 * For best accuracy in charset detection, the input data should be primarily
     48 * in a single language, and a minimum of a few hundred bytes worth of plain text
     49 * in the language are needed.  The detection process will attempt to
     50 * ignore HTML- or XML-style markup that could otherwise obscure the content.
     51 * <p>
     52 * An alternative to the ICU Charset Detector is the
     53 * Compact Encoding Detector, https://github.com/google/compact_enc_det.
     54 * It often gives more accurate results, especially with short input samples.
     55 */
     56 
     57 
     58 struct UCharsetDetector;
     59 /**
     60  * Opaque structure representing a charset detector.
     61  * @stable ICU 3.6
     62  */
     63 typedef struct UCharsetDetector UCharsetDetector;
     64 
     65 struct UCharsetMatch;
     66 /**
     67  *  Opaque structure representing a match that was identified
     68  *  from a charset detection operation.
     69  *  @stable ICU 3.6
     70  */
     71 typedef struct UCharsetMatch UCharsetMatch;
     72 
     73 /**
     74  *  Open a charset detector.
     75  *
     76  *  @param status Any error conditions occurring during the open
     77  *                operation are reported back in this variable.
     78  *  @return the newly opened charset detector.
     79  *  @stable ICU 3.6
     80  */
     81 U_CAPI UCharsetDetector * U_EXPORT2
     82 ucsdet_open(UErrorCode   *status);
     83 
     84 /**
     85  * Close a charset detector.  All storage and any other resources
     86  *   owned by this charset detector will be released.  Failure to
     87  *   close a charset detector when finished with it can result in
     88  *   memory leaks in the application.
     89  *
     90  *  @param ucsd  The charset detector to be closed.
     91  *  @stable ICU 3.6
     92  */
     93 U_CAPI void U_EXPORT2
     94 ucsdet_close(UCharsetDetector *ucsd);
     95 
     96 #if U_SHOW_CPLUSPLUS_API
     97 
     98 U_NAMESPACE_BEGIN
     99 
    100 /**
    101 * \class LocalUCharsetDetectorPointer
    102 * "Smart pointer" class, closes a UCharsetDetector via ucsdet_close().
    103 * For most methods see the LocalPointerBase base class.
    104 *
    105 * @see LocalPointerBase
    106 * @see LocalPointer
    107 * @stable ICU 4.4
    108 */
    109 U_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsdet_close);
    110 
    111 U_NAMESPACE_END
    112 
    113 #endif   // U_SHOW_CPLUSPLUS_API
    114 
    115 /**
    116  * Set the input byte data whose charset is to be detected.
    117  *
    118  * Ownership of the input text byte array remains with the caller.
    119  * The input text must not be altered or deleted until the charset
    120  * detector is either closed or reset to refer to different input text.
    121  *
    122  * @param ucsd   the charset detector to be used.
    123  * @param textIn the input text of unknown encoding.
    124  * @param len    the length of the input text, or -1 if the text
    125  *               is NUL terminated.
    126  * @param status any error conditions are reported back in this variable.
    127  *
    128  * @stable ICU 3.6
    129  */
    130 U_CAPI void U_EXPORT2
    131 ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status);
    132 
    133 
    134 /** Set the declared encoding for charset detection.
    135 *  The declared encoding of an input text is an encoding obtained
    136 *  by the user from an http header or xml declaration or similar source that
    137 *  can be provided as an additional hint to the charset detector.
    138 *
    139 *  How and whether the declared encoding will be used during the
    140 *  detection process is TBD.
    141 *
    142 * @param ucsd      the charset detector to be used.
    143 * @param encoding  an encoding for the current data obtained from
    144 *                  a header or declaration or other source outside
    145 *                  of the byte data itself.
    146 * @param length    the length of the encoding name, or -1 if the name string
    147 *                  is NUL terminated.
    148 * @param status    any error conditions are reported back in this variable.
    149 *
    150 * @stable ICU 3.6
    151 */
    152 U_CAPI void U_EXPORT2
    153 ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status);
    154 
    155 
    156 /**
    157 * Return the charset that best matches the supplied input data.
    158 * 
    159 * Note though, that because the detection 
    160 * only looks at the start of the input data,
    161 * there is a possibility that the returned charset will fail to handle
    162 * the full set of input data.
    163 * <p>
    164 * The returned UCharsetMatch object is owned by the UCharsetDetector.
    165 * It will remain valid until the detector input is reset, or until
    166 * the detector is closed.
    167 * <p>
    168 * The function will fail if
    169 *  <ul>
    170 *    <li>no charset appears to match the data.</li>
    171 *    <li>no input text has been provided</li>
    172 *  </ul>
    173 *
    174 * @param ucsd      the charset detector to be used.
    175 * @param status    any error conditions are reported back in this variable.
    176 * @return          a UCharsetMatch  representing the best matching charset,
    177 *                  or NULL if no charset matches the byte data.
    178 *
    179 * @stable ICU 3.6
    180 */
    181 U_CAPI const UCharsetMatch * U_EXPORT2
    182 ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status);
    183    
    184 
    185 /**
    186 *  Find all charset matches that appear to be consistent with the input,
    187 *  returning an array of results.  The results are ordered with the
    188 *  best quality match first.
    189 *
    190 *  Because the detection only looks at a limited amount of the
    191 *  input byte data, some of the returned charsets may fail to handle
    192 *  all of the input data.
    193 *  <p>
    194 *  The returned UCharsetMatch objects are owned by the UCharsetDetector.
    195 *  They will remain valid until the detector is closed or modified.
    196 *
    197 * <p>
    198 * An error is reported if
    199 *  <ul>
    200 *    <li>no charsets appear to match the input data.</li>
    201 *    <li>no input text has been provided</li>
    202 *  </ul>
    203 *
    204 * @param ucsd          the charset detector to be used.
    205 * @param matchesFound  pointer to a variable that will be set to the
    206 *                      number of charsets identified that are consistent with
    207 *                      the input data.  Output only.
    208 * @param status        any error conditions are reported back in this variable.
    209 * @return              A pointer to an array of pointers to UCharsetMatch objects.
    210 *                      This array, and the UCharsetMatch instances to which it refers,
    211 *                      are owned by the UCharsetDetector, and will remain valid until
    212 *                      the detector is closed or modified.
    213 * @stable ICU 3.6
    214 */
    215 U_CAPI const UCharsetMatch ** U_EXPORT2
    216 ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *status);
    217 
    218 
    219 
    220 /**
    221 *  Get the name of the charset represented by a UCharsetMatch.
    222 *
    223 *  The storage for the returned name string is owned by the
    224 *  UCharsetMatch, and will remain valid while the UCharsetMatch
    225 *  is valid.
    226 *
    227 *  The name returned is suitable for use with the ICU conversion APIs.
    228 *
    229 *  @param ucsm    The charset match object.
    230 *  @param status  Any error conditions are reported back in this variable.
    231 *  @return        The name of the matching charset.
    232 *
    233 *  @stable ICU 3.6
    234 */
    235 U_CAPI const char * U_EXPORT2
    236 ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status);
    237 
    238 /**
    239 *  Get a confidence number for the quality of the match of the byte
    240 *  data with the charset.  Confidence numbers range from zero to 100,
    241 *  with 100 representing complete confidence and zero representing
    242 *  no confidence.
    243 *
    244 *  The confidence values are somewhat arbitrary.  They define
    245 *  an ordering within the results for any single detection operation
    246 *  but are not generally comparable between the results for different input.
    247 *
    248 *  A confidence value of ten does have a general meaning - it is used
    249 *  for charsets that can represent the input data, but for which there
    250 *  is no other indication that suggests that the charset is the correct one.
    251 *  Pure 7 bit ASCII data, for example, is compatible with a
    252 *  great many charsets, most of which will appear as possible matches
    253 *  with a confidence of 10.
    254 *
    255 *  @param ucsm    The charset match object.
    256 *  @param status  Any error conditions are reported back in this variable.
    257 *  @return        A confidence number for the charset match.
    258 *
    259 *  @stable ICU 3.6
    260 */
    261 U_CAPI int32_t U_EXPORT2
    262 ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status);
    263 
    264 /**
    265 *  Get the RFC 3066 code for the language of the input data.
    266 *
    267 *  The Charset Detection service is intended primarily for detecting
    268 *  charsets, not language.  For some, but not all, charsets, a language is
    269 *  identified as a byproduct of the detection process, and that is what
    270 *  is returned by this function.
    271 *
    272 *  CAUTION:
    273 *    1.  Language information is not available for input data encoded in
    274 *        all charsets. In particular, no language is identified
    275 *        for UTF-8 input data.
    276 *
    277 *    2.  Closely related languages may sometimes be confused.
    278 *
    279 *  If more accurate language detection is required, a linguistic
    280 *  analysis package should be used.
    281 *
    282 *  The storage for the returned name string is owned by the
    283 *  UCharsetMatch, and will remain valid while the UCharsetMatch
    284 *  is valid.
    285 *
    286 *  @param ucsm    The charset match object.
    287 *  @param status  Any error conditions are reported back in this variable.
    288 *  @return        The RFC 3066 code for the language of the input data, or
    289 *                 an empty string if the language could not be determined.
    290 *
    291 *  @stable ICU 3.6
    292 */
    293 U_CAPI const char * U_EXPORT2
    294 ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status);
    295 
    296 
    297 /**
    298  *  Get the entire input text as a UChar string, placing it into
    299  *  a caller-supplied buffer.  A terminating
    300  *  NUL character will be appended to the buffer if space is available.
    301  *
    302  *  The number of UChars in the output string, not including the terminating
    303  *  NUL, is returned. 
    304  *
    305  *  If the supplied buffer is smaller than required to hold the output,
    306  *  the contents of the buffer are undefined.  The full output string length
    307  *  (in UChars) is returned as always, and can be used to allocate a buffer
    308  *  of the correct size.
    309  *
    310  *
    311  * @param ucsm    The charset match object.
    312  * @param buf     A UChar buffer to be filled with the converted text data.
    313  * @param cap     The capacity of the buffer in UChars.
    314  * @param status  Any error conditions are reported back in this variable.
    315  * @return        The number of UChars in the output string.
    316  *
    317  * @stable ICU 3.6
    318  */
    319 U_CAPI  int32_t U_EXPORT2
    320 ucsdet_getUChars(const UCharsetMatch *ucsm,
    321                 UChar *buf, int32_t cap, UErrorCode *status);
    322 
    323 
    324 
    325 /**
    326  *  Get an iterator over the set of all detectable charsets -
    327  *  over the charsets that are known to the charset detection
    328  *  service.
    329  *
    330  *  The returned UEnumeration provides access to the names of
    331  *  the charsets.
    332  *
    333  *  <p>
    334  *  The state of the Charset detector that is passed in does not
    335  *  affect the result of this function, but requiring a valid, open
    336  *  charset detector as a parameter ensures that the charset detection
    337  *  service has been safely initialized and that the required detection
    338  *  data is available.
    339  *
    340  *  <p>
    341  *  <b>Note:</b> Multiple different charset encodings in the same family may use
    342  *  a single shared name in this implementation. For example, this function returns
    343  *  an enumeration including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252"
    344  *  (Windows Latin 1). However, the actual detection result could be "windows-1252"
    345  *  when the input data matches Latin 1 code points with any points only available
    346  *  in "windows-1252".
    347  *
    348  *  @param ucsd a Charset detector.
    349  *  @param status  Any error conditions are reported back in this variable.
    350  *  @return an iterator providing access to the detectable charset names.
    351  *  @stable ICU 3.6
    352  */
    353 U_CAPI  UEnumeration * U_EXPORT2
    354 ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd,  UErrorCode *status);
    355 
    356 /**
    357  *  Test whether input filtering is enabled for this charset detector.
    358  *  Input filtering removes text that appears to be HTML or xml
    359  *  markup from the input before applying the code page detection
    360  *  heuristics.
    361  *
    362  *  @param ucsd  The charset detector to check.
    363  *  @return true if filtering is enabled.
    364  *  @stable ICU 3.6
    365  */
    366 
    367 U_CAPI  UBool U_EXPORT2
    368 ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd);
    369 
    370 
    371 /**
    372 * Enable filtering of input text. If filtering is enabled,
    373 * text within angle brackets ("<" and ">") will be removed
    374 * before detection, which will remove most HTML or xml markup.
    375 *
    376 * @param ucsd   the charset detector to be modified.
    377 * @param filter <code>true</code> to enable input text filtering.
    378 * @return The previous setting.
    379 *
    380 * @stable ICU 3.6
    381 */
    382 U_CAPI  UBool U_EXPORT2
    383 ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter);
    384 
    385 #ifndef U_HIDE_INTERNAL_API
    386 /**
    387  *  Get an iterator over the set of detectable charsets -
    388  *  over the charsets that are enabled by the specified charset detector.
    389  *
    390  *  The returned UEnumeration provides access to the names of
    391  *  the charsets.
    392  *
    393  *  @param ucsd a Charset detector.
    394  *  @param status  Any error conditions are reported back in this variable.
    395  *  @return an iterator providing access to the names of the charsets
    396  *  enabled by the specified charset detector.
    397  *  @internal
    398  */
    399 U_CAPI UEnumeration * U_EXPORT2
    400 ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd,  UErrorCode *status);
    401 
    402 /**
    403  * Enable or disable an individual charset encoding.
    404  * The name of the charset encoding must be one of the names returned by
    405  * ucsdet_getAllDetectableCharsets().
    406  *
    407  * @param ucsd a Charset detector.
    408  * @param encoding the name of the charset encoding.
    409  * @param enabled <code>true</code> to enable, or <code>false</code> to disable the
    410  *   charset encoding.
    411  * @param status receives the return status. When the name of the charset encoding
    412  *   is not supported, U_ILLEGAL_ARGUMENT_ERROR is set.
    413  * @internal
    414  */
    415 U_CAPI void U_EXPORT2
    416 ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status);
    417 #endif  /* U_HIDE_INTERNAL_API */
    418 
    419 #endif   /* !UCONFIG_NO_CONVERSION */
    420 #endif   /* __UCSDET_H */