tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

utf.h (8860B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 1999-2011, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  utf.h
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 1999sep09
     16 *   created by: Markus W. Scherer
     17 */
     18 
     19 /**
     20 * \file
     21 * \brief C API: Code point macros
     22 *
     23 * This file defines macros for checking whether a code point is
     24 * a surrogate or a non-character etc.
     25 *
     26 * If U_NO_DEFAULT_INCLUDE_UTF_HEADERS is 0 then utf.h is included by utypes.h
     27 * and itself includes utf8.h and utf16.h after some
     28 * common definitions.
     29 * If U_NO_DEFAULT_INCLUDE_UTF_HEADERS is 1 then each of these headers must be
     30 * included explicitly if their definitions are used.
     31 *
     32 * utf8.h and utf16.h define macros for efficiently getting code points
     33 * in and out of UTF-8/16 strings.
     34 * utf16.h macros have "U16_" prefixes.
     35 * utf8.h defines similar macros with "U8_" prefixes for UTF-8 string handling.
     36 *
     37 * ICU mostly processes 16-bit Unicode strings.
     38 * Most of the time, such strings are well-formed UTF-16.
     39 * Single, unpaired surrogates must be handled as well, and are treated in ICU
     40 * like regular code points where possible.
     41 * (Pairs of surrogate code points are indistinguishable from supplementary
     42 * code points encoded as pairs of surrogate code units.)
     43 *
     44 * In fact, almost all Unicode code points in normal text (>99%)
     45 * are on the BMP (<=U+ffff) and even <=U+d7ff.
     46 * ICU functions handle supplementary code points (U+10000..U+10ffff)
     47 * but are optimized for the much more frequently occurring BMP code points.
     48 *
     49 * umachine.h defines UChar to be an unsigned 16-bit integer.
     50 * Since ICU 59, ICU uses char16_t in C++, UChar only in C,
     51 * and defines UChar=char16_t by default. See the UChar API docs for details.
     52 *
     53 * UChar32 is defined to be a signed 32-bit integer (int32_t), large enough for a 21-bit
     54 * Unicode code point (Unicode scalar value, 0..0x10ffff) and U_SENTINEL (-1).
     55 * Before ICU 2.4, the definition of UChar32 was platform-dependent, like
     56 * the definition of UChar. For details see the documentation for UChar32 itself.
     57 *
     58 * utf.h defines a small number of C macros for single Unicode code points.
     59 * These are simple checks for surrogates and non-characters.
     60 * For actual Unicode character properties see uchar.h.
     61 *
     62 * By default, string operations must be done with error checking in case
     63 * a string is not well-formed UTF-16 or UTF-8.
     64 *
     65 * The U16_ macros detect if a surrogate code unit is unpaired
     66 * (lead unit without trail unit or vice versa) and just return the unit itself
     67 * as the code point.
     68 *
     69 * The U8_ macros detect illegal byte sequences and return a negative value.
     70 * Starting with ICU 60, the observable length of a single illegal byte sequence
     71 * skipped by one of these macros follows the Unicode 6+ recommendation
     72 * which is consistent with the W3C Encoding Standard.
     73 *
     74 * There are ..._OR_FFFD versions of both U16_ and U8_ macros
     75 * that return U+FFFD for illegal code unit sequences.
     76 *
     77 * The regular "safe" macros require that the initial, passed-in string index
     78 * is within bounds. They only check the index when they read more than one
     79 * code unit. This is usually done with code similar to the following loop:
     80 * <pre>while(i<length) {
     81 *   U16_NEXT(s, i, length, c);
     82 *   // use c
     83 * }</pre>
     84 *
     85 * When it is safe to assume that text is well-formed UTF-16
     86 * (does not contain single, unpaired surrogates), then one can use
     87 * U16_..._UNSAFE macros.
     88 * These do not check for proper code unit sequences or truncated text and may
     89 * yield wrong results or even cause a crash if they are used with "malformed"
     90 * text.
     91 * In practice, U16_..._UNSAFE macros will produce slightly less code but
     92 * should not be faster because the processing is only different when a
     93 * surrogate code unit is detected, which will be rare.
     94 *
     95 * Similarly for UTF-8, there are "safe" macros without a suffix,
     96 * and U8_..._UNSAFE versions.
     97 * The performance differences are much larger here because UTF-8 provides so
     98 * many opportunities for malformed sequences.
     99 * The unsafe UTF-8 macros are entirely implemented inside the macro definitions
    100 * and are fast, while the safe UTF-8 macros call functions for some complicated cases.
    101 *
    102 * Unlike with UTF-16, malformed sequences cannot be expressed with distinct
    103 * code point values (0..U+10ffff). They are indicated with negative values instead.
    104 *
    105 * For more information see the ICU User Guide Strings chapter
    106 * (https://unicode-org.github.io/icu/userguide/strings).
    107 *
    108 * <em>Usage:</em>
    109 * ICU coding guidelines for if() statements should be followed when using these macros.
    110 * Compound statements (curly braces {}) must be used for if-else-while...
    111 * bodies, and all macro statements should be terminated with a semicolon.
    112 *
    113 * @stable ICU 2.4
    114 */
    115 
    116 #ifndef __UTF_H__
    117 #define __UTF_H__
    118 
    119 #include "unicode/umachine.h"
    120 /* include the utfXX.h after the following definitions */
    121 
    122 /* single-code point definitions -------------------------------------------- */
    123 
    124 #ifndef U_HIDE_DRAFT_API
    125 
    126 /**
    127 * Is c a Unicode code point U+0000..U+10FFFF?
    128 * https://www.unicode.org/glossary/#code_point
    129 *
    130 * @param c 32-bit code point
    131 * @return true or false
    132 * @draft ICU 78
    133 * @see AllCodePoints
    134 * @see U_IS_SCALAR_VALUE
    135 */
    136 #define U_IS_CODE_POINT(c) ((uint32_t)(c)<=0x10ffff)
    137 
    138 /**
    139 * Is c a Unicode scalar value, that is, a non-surrogate code point?
    140 * Only scalar values can be represented in well-formed UTF-8/16/32.
    141 * https://www.unicode.org/glossary/#unicode_scalar_value
    142 *
    143 * @param c 32-bit code point
    144 * @return true or false
    145 * @draft ICU 78
    146 * @see AllScalarValues
    147 * @see U_IS_CODE_POINT
    148 */
    149 #define U_IS_SCALAR_VALUE(c) ((uint32_t)(c)<0xd800 || (0xe000<=(c) && (c)<=0x10ffff))
    150 
    151 #endif  // U_HIDE_DRAFT_API
    152 
    153 /**
    154 * Is this code point a Unicode noncharacter?
    155 * https://www.unicode.org/glossary/#noncharacter
    156 *
    157 * @param c 32-bit code point
    158 * @return true or false
    159 * @stable ICU 2.4
    160 */
    161 #define U_IS_UNICODE_NONCHAR(c) \
    162    ((c)>=0xfdd0 && \
    163     ((c)<=0xfdef || ((c)&0xfffe)==0xfffe) && (c)<=0x10ffff)
    164 
    165 /**
    166 * Is c a Unicode code point value (0..U+10ffff)
    167 * that can be assigned a character?
    168 *
    169 * Code points that are not characters include:
    170 * - single surrogate code points (U+d800..U+dfff, 2048 code points)
    171 * - the last two code points on each plane (U+__fffe and U+__ffff, 34 code points)
    172 * - U+fdd0..U+fdef (new with Unicode 3.1, 32 code points)
    173 * - the highest Unicode code point value is U+10ffff
    174 *
    175 * This means that all code points below U+d800 are character code points,
    176 * and that boundary is tested first for performance.
    177 *
    178 * @param c 32-bit code point
    179 * @return true or false
    180 * @stable ICU 2.4
    181 */
    182 #define U_IS_UNICODE_CHAR(c) \
    183    ((uint32_t)(c)<0xd800 || \
    184        (0xe000<=(c) && (c)<=0x10ffff && !U_IS_UNICODE_NONCHAR(c)))
    185 
    186 /**
    187 * Is this code point a BMP code point (U+0000..U+ffff)?
    188 * @param c 32-bit code point
    189 * @return true or false
    190 * @stable ICU 2.8
    191 */
    192 #define U_IS_BMP(c) ((uint32_t)(c)<=0xffff)
    193 
    194 /**
    195 * Is this code point a supplementary code point (U+10000..U+10ffff)?
    196 * @param c 32-bit code point
    197 * @return true or false
    198 * @stable ICU 2.8
    199 */
    200 #define U_IS_SUPPLEMENTARY(c) ((uint32_t)((c)-0x10000)<=0xfffff)
    201 
    202 /**
    203 * Is this code point a lead surrogate (U+d800..U+dbff)?
    204 * @param c 32-bit code point
    205 * @return true or false
    206 * @stable ICU 2.4
    207 */
    208 #define U_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)
    209 
    210 /**
    211 * Is this code point a trail surrogate (U+dc00..U+dfff)?
    212 * @param c 32-bit code point
    213 * @return true or false
    214 * @stable ICU 2.4
    215 */
    216 #define U_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)
    217 
    218 /**
    219 * Is this code point a surrogate (U+d800..U+dfff)?
    220 * @param c 32-bit code point
    221 * @return true or false
    222 * @stable ICU 2.4
    223 */
    224 #define U_IS_SURROGATE(c) (((c)&0xfffff800)==0xd800)
    225 
    226 /**
    227 * Assuming c is a surrogate code point (U_IS_SURROGATE(c)),
    228 * is it a lead surrogate?
    229 * @param c 32-bit code point
    230 * @return true or false
    231 * @stable ICU 2.4
    232 */
    233 #define U_IS_SURROGATE_LEAD(c) (((c)&0x400)==0)
    234 
    235 /**
    236 * Assuming c is a surrogate code point (U_IS_SURROGATE(c)),
    237 * is it a trail surrogate?
    238 * @param c 32-bit code point
    239 * @return true or false
    240 * @stable ICU 4.2
    241 */
    242 #define U_IS_SURROGATE_TRAIL(c) (((c)&0x400)!=0)
    243 
    244 /* include the utfXX.h ------------------------------------------------------ */
    245 
    246 #if !U_NO_DEFAULT_INCLUDE_UTF_HEADERS
    247 
    248 #include "unicode/utf8.h"
    249 #include "unicode/utf16.h"
    250 
    251 /* utf_old.h contains deprecated, pre-ICU 2.4 definitions */
    252 #include "unicode/utf_old.h"
    253 
    254 #endif  /* !U_NO_DEFAULT_INCLUDE_UTF_HEADERS */
    255 
    256 #endif  /* __UTF_H__ */