tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

nsBidiUtils.h (9970B)


      1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* This Source Code Form is subject to the terms of the Mozilla Public
      3 * License, v. 2.0. If a copy of the MPL was not distributed with this
      4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      5 
      6 #ifndef nsBidiUtils_h__
      7 #define nsBidiUtils_h__
      8 
      9 #include "mozilla/intl/BidiClass.h"
     10 
     11 #include "nsString.h"
     12 #include "encoding_rs_mem.h"
     13 
     14 /**
     15 * definitions of bidirection character types by category
     16 */
     17 
     18 #define BIDICLASS_IS_RTL(val)                          \
     19  (((val) == mozilla::intl::BidiClass::RightToLeft) || \
     20   ((val) == mozilla::intl::BidiClass::RightToLeftArabic))
     21 
     22 #define BIDICLASS_IS_WEAK(val)                                      \
     23  (((val) == mozilla::intl::BidiClass::EuropeanNumberSeparator) ||  \
     24   ((val) == mozilla::intl::BidiClass::EuropeanNumberTerminator) || \
     25   (((val) > mozilla::intl::BidiClass::ArabicNumber) &&             \
     26    ((val) != mozilla::intl::BidiClass::RightToLeftArabic)))
     27 
/**
 * Inspects a single character and returns it, with numbers converted to
 * Arabic or Hindi forms as selected by aNumFlag.
 *
 * @param aChar is the character to inspect
 * @param aPrevCharArabic is true if the previous character in the string is
 *        an Arabic char
 * @param aNumFlag specifies the conversion to perform:
 *        IBMBIDI_NUMERAL_NOMINAL:      don't do any conversion
 *        IBMBIDI_NUMERAL_HINDI:        convert to Hindi forms
 *                                        (Unicode 0660-0669)
 *        IBMBIDI_NUMERAL_ARABIC:       convert to Arabic forms
 *                                        (Unicode 0030-0039)
 *        IBMBIDI_NUMERAL_HINDICONTEXT: convert numbers in Arabic text to
 *                                      Hindi, otherwise to Arabic
 *        NOTE(review): this header also defines IBMBIDI_NUMERAL_REGULAR,
 *        IBMBIDI_NUMERAL_PERSIANCONTEXT and IBMBIDI_NUMERAL_PERSIAN; how
 *        they are handled is not visible from this declaration -- confirm
 *        against the definition.
 * @return the converted character
 */
char16_t HandleNumberInChar(char16_t aChar, bool aPrevCharArabic,
                           uint32_t aNumFlag);
     46 
/**
 * Scans a character string, converting numbers to Arabic or Hindi forms in
 * place (aBuffer is modified).
 *
 * @param aBuffer is the string to scan and rewrite
 * @param aSize is the size of aBuffer
 *        (presumably counted in char16_t elements -- TODO confirm)
 * @param aNumFlag specifies the conversion to perform:
 *        IBMBIDI_NUMERAL_NOMINAL:      don't do any conversion
 *        IBMBIDI_NUMERAL_HINDI:        convert to Hindi forms
 *                                        (Unicode 0660-0669)
 *        IBMBIDI_NUMERAL_ARABIC:       convert to Arabic forms
 *                                        (Unicode 0030-0039)
 *        IBMBIDI_NUMERAL_HINDICONTEXT: convert numbers in Arabic text to
 *                                      Hindi, otherwise to Arabic
 *        NOTE(review): this header also defines IBMBIDI_NUMERAL_REGULAR,
 *        IBMBIDI_NUMERAL_PERSIANCONTEXT and IBMBIDI_NUMERAL_PERSIAN; how
 *        they are handled is not visible from this declaration -- confirm
 *        against the definition.
 * @return an nsresult status code
 *        (NOTE(review): failure conditions are not visible here -- confirm)
 */
nsresult HandleNumbers(char16_t* aBuffer, uint32_t aSize, uint32_t aNumFlag);
     62 
     63 /**
     64 * Give a UTF-32 codepoint
     65 * return true if the codepoint is a Bidi control character (LRM, RLM, ALM;
     66 * LRE, RLE, PDF, LRO, RLO; LRI, RLI, FSI, PDI).
     67 * Return false, otherwise
     68 */
     69 #define LRM_CHAR 0x200e
     70 #define RLM_CHAR 0x200f
     71 
     72 #define LRE_CHAR 0x202a
     73 #define RLE_CHAR 0x202b
     74 #define PDF_CHAR 0x202c
     75 #define LRO_CHAR 0x202d
     76 #define RLO_CHAR 0x202e
     77 
     78 #define LRI_CHAR 0x2066
     79 #define RLI_CHAR 0x2067
     80 #define FSI_CHAR 0x2068
     81 #define PDI_CHAR 0x2069
     82 
     83 #define ALM_CHAR 0x061C
     84 inline bool IsBidiControl(uint32_t aChar) {
     85  return ((LRE_CHAR <= aChar && aChar <= RLO_CHAR) ||
     86          (LRI_CHAR <= aChar && aChar <= PDI_CHAR) || (aChar == ALM_CHAR) ||
     87          (aChar & 0xfffffe) == LRM_CHAR);
     88 }
     89 
     90 /**
     91 * Give a UTF-32 codepoint
     92 * Return true if the codepoint is a Bidi control character that may result
     93 * in RTL directionality and therefore needs to trigger bidi resolution;
     94 * return false otherwise.
     95 */
     96 inline bool IsBidiControlRTL(uint32_t aChar) {
     97  return aChar == RLM_CHAR || aChar == RLE_CHAR || aChar == RLO_CHAR ||
     98         aChar == RLI_CHAR || aChar == ALM_CHAR;
     99 }
    100 
    101 /**
    102 * Give a 16-bit (UTF-16) text buffer
    103 * @return true if the string contains right-to-left characters
    104 */
    105 inline bool HasRTLChars(mozilla::Span<const char16_t> aBuffer) {
    106  // Span ensures we never pass a nullptr to Rust--even if the
    107  // length of the buffer is zero.
    108  return encoding_mem_is_utf16_bidi(aBuffer.Elements(), aBuffer.Length());
    109 }
    110 
// These values are shared with the Preferences dialog.
//  ------------------
//  If the pref values are changed in the XUL file of the Preferences
//  dialog, they must be changed here too.
//  ------------------
//
// Preference names.
#define IBMBIDI_TEXTDIRECTION_STR "bidi.direction"
#define IBMBIDI_TEXTTYPE_STR "bidi.texttype"
#define IBMBIDI_NUMERAL_STR "bidi.numeral"

// In the value lists below, a trailing '*' marks the value that
// IBMBIDI_DEFAULT_BIDI_OPTIONS uses for that setting.

//  ------------------
//  Text Direction
//  ------------------
//  bidi.direction
#define IBMBIDI_TEXTDIRECTION_LTR 1  //  1 = directionLTRBidi *
#define IBMBIDI_TEXTDIRECTION_RTL 2  //  2 = directionRTLBidi
//  ------------------
//  Text Type
//  ------------------
//  bidi.texttype
#define IBMBIDI_TEXTTYPE_CHARSET 1  //  1 = charsettexttypeBidi *
#define IBMBIDI_TEXTTYPE_LOGICAL 2  //  2 = logicaltexttypeBidi
#define IBMBIDI_TEXTTYPE_VISUAL 3   //  3 = visualtexttypeBidi
//  ------------------
//  Numeral Style
//  ------------------
//  bidi.numeral
#define IBMBIDI_NUMERAL_NOMINAL 0         //  0 = nominalnumeralBidi *
#define IBMBIDI_NUMERAL_REGULAR 1         //  1 = regularcontextnumeralBidi
#define IBMBIDI_NUMERAL_HINDICONTEXT 2    //  2 = hindicontextnumeralBidi
#define IBMBIDI_NUMERAL_ARABIC 3          //  3 = arabicnumeralBidi
#define IBMBIDI_NUMERAL_HINDI 4           //  4 = hindinumeralBidi
#define IBMBIDI_NUMERAL_PERSIANCONTEXT 5  //  5 = persiancontextnumeralBidi
#define IBMBIDI_NUMERAL_PERSIAN 6         //  6 = persiannumeralBidi

// Default packed bidi options word: text direction LTR in bits 0-3, text
// type CHARSET in bits 4-7 and numeral style NOMINAL in bits 8-11.
#define IBMBIDI_DEFAULT_BIDI_OPTIONS                                    \
 ((IBMBIDI_TEXTDIRECTION_LTR << 0) | (IBMBIDI_TEXTTYPE_CHARSET << 4) | \
  (IBMBIDI_NUMERAL_NOMINAL << 8))
    150 
    151 #define GET_BIDI_OPTION_DIRECTION(bo) \
    152  (((bo) >> 0) & 0x0000000F) /* 4 bits for DIRECTION */
    153 #define GET_BIDI_OPTION_TEXTTYPE(bo) \
    154  (((bo) >> 4) & 0x0000000F) /* 4 bits for TEXTTYPE */
    155 #define GET_BIDI_OPTION_NUMERAL(bo) \
    156  (((bo) >> 8) & 0x0000000F) /* 4 bits for NUMERAL */
    157 
    158 #define SET_BIDI_OPTION_DIRECTION(bo, dir)                    \
    159  {                                                           \
    160    (bo) = ((bo) & 0xFFFFFFF0) | (((dir) & 0x0000000F) << 0); \
    161  }
    162 #define SET_BIDI_OPTION_TEXTTYPE(bo, tt)                     \
    163  {                                                          \
    164    (bo) = ((bo) & 0xFFFFFF0F) | (((tt) & 0x0000000F) << 4); \
    165  }
    166 #define SET_BIDI_OPTION_NUMERAL(bo, num)                      \
    167  {                                                           \
    168    (bo) = ((bo) & 0xFFFFF0FF) | (((num) & 0x0000000F) << 8); \
    169  }
    170 
    171 /* Constants related to the position of numerics in the codepage */
    172 #define START_HINDI_DIGITS 0x0660
    173 #define END_HINDI_DIGITS 0x0669
    174 #define START_ARABIC_DIGITS 0x0030
    175 #define END_ARABIC_DIGITS 0x0039
    176 #define START_FARSI_DIGITS 0x06f0
    177 #define END_FARSI_DIGITS 0x06f9
    178 #define IS_HINDI_DIGIT(u) \
    179  (((u) >= START_HINDI_DIGITS) && ((u) <= END_HINDI_DIGITS))
    180 #define IS_ARABIC_DIGIT(u) \
    181  (((u) >= START_ARABIC_DIGITS) && ((u) <= END_ARABIC_DIGITS))
    182 #define IS_FARSI_DIGIT(u) \
    183  (((u) >= START_FARSI_DIGITS) && ((u) <= END_FARSI_DIGITS))
    184 /**
    185 * Arabic numeric separator and numeric formatting characters:
    186 *  U+0600;ARABIC NUMBER SIGN
    187 *  U+0601;ARABIC SIGN SANAH
    188 *  U+0602;ARABIC FOOTNOTE MARKER
    189 *  U+0603;ARABIC SIGN SAFHA
    190 *  U+066A;ARABIC PERCENT SIGN
    191 *  U+066B;ARABIC DECIMAL SEPARATOR
    192 *  U+066C;ARABIC THOUSANDS SEPARATOR
    193 *  U+06DD;ARABIC END OF AYAH
    194 */
    195 #define IS_ARABIC_SEPARATOR(u)                                                 \
    196  ((/*(u) >= 0x0600 &&*/ (u) <= 0x0603) || ((u) >= 0x066A && (u) <= 0x066C) || \
    197   ((u) == 0x06DD))
    198 
    199 #define IS_BIDI_DIACRITIC(u)                                                 \
    200  (((u) >= 0x0591 && (u) <= 0x05A1) || ((u) >= 0x05A3 && (u) <= 0x05B9) ||   \
    201   ((u) >= 0x05BB && (u) <= 0x05BD) || ((u) == 0x05BF) || ((u) == 0x05C1) || \
    202   ((u) == 0x05C2) || ((u) == 0x05C4) || ((u) >= 0x064B && (u) <= 0x0652) || \
    203   ((u) == 0x0670) || ((u) >= 0x06D7 && (u) <= 0x06E4) || ((u) == 0x06E7) || \
    204   ((u) == 0x06E8) || ((u) >= 0x06EA && (u) <= 0x06ED))
    205 
    206 #define IS_HEBREW_CHAR(c) \
    207  (((0x0590 <= (c)) && ((c) <= 0x05FF)) || (((c) >= 0xfb1d) && ((c) <= 0xfb4f)))
    208 #define IS_ARABIC_CHAR(c)              \
    209  ((0x0600 <= (c) && (c) <= 0x08FF) && \
    210   ((c) <= 0x06ff || ((c) >= 0x0750 && (c) <= 0x077f) || (c) >= 0x08a0))
    211 #define IS_ARABIC_ALPHABETIC(c) \
    212  (IS_ARABIC_CHAR(c) &&         \
    213   !(IS_HINDI_DIGIT(c) || IS_FARSI_DIGIT(c) || IS_ARABIC_SEPARATOR(c)))
    214 
    215 /**
    216 * The codepoint ranges in the following macros are based on the blocks
    217 *  allocated, or planned to be allocated, to right-to-left characters in the
    218 *  BMP (Basic Multilingual Plane) and SMP (Supplementary Multilingual Plane)
    219 *  according to
    220 *  http://unicode.org/Public/UNIDATA/extracted/DerivedBidiClass.txt and
    221 *  http://www.unicode.org/roadmaps/
    222 */
    223 
    224 #define IS_IN_BMP_RTL_BLOCK(c) ((0x590 <= (c)) && ((c) <= 0x8ff))
    225 #define IS_RTL_PRESENTATION_FORM(c) \
    226  (((0xfb1d <= (c)) && ((c) <= 0xfdff)) || ((0xfe70 <= (c)) && ((c) <= 0xfefe)))
    227 #define IS_IN_SMP_RTL_BLOCK(c)               \
    228  (((0x10800 <= (c)) && ((c) <= 0x10fff)) || \
    229   ((0x1e800 <= (c)) && ((c) <= 0x1eFFF)))
    230 // Due to the supplementary-plane RTL blocks being identifiable from the
    231 // high surrogate without examining the low surrogate, it is correct to
    232 // use this by-code-unit check on potentially astral text without doing
    233 // the math to decode surrogate pairs into code points. However, unpaired
    234 // high surrogates that are RTL high surrogates then count as RTL even
    235 // though, if replaced by the REPLACEMENT CHARACTER, it would not be
    236 // RTL.
    237 #define UTF16_CODE_UNIT_IS_BIDI(c)                              \
    238  ((IS_IN_BMP_RTL_BLOCK(c)) || (IS_RTL_PRESENTATION_FORM(c)) || \
    239   (c) == 0xD802 || (c) == 0xD803 || (c) == 0xD83A || (c) == 0xD83B)
    240 #define UTF32_CHAR_IS_BIDI(c)                                   \
    241  ((IS_IN_BMP_RTL_BLOCK(c)) || (IS_RTL_PRESENTATION_FORM(c)) || \
    242   (IS_IN_SMP_RTL_BLOCK(c)))
    243 #endif /* nsBidiUtils_h__ */