tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

ushape.h (18430B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ******************************************************************************
      5 *
      6 *   Copyright (C) 2000-2012, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 ******************************************************************************
     10 *   file name:  ushape.h
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2000jun29
     16 *   created by: Markus W. Scherer
     17 */
     18 
     19 #ifndef __USHAPE_H__
     20 #define __USHAPE_H__
     21 
     22 #include "unicode/utypes.h"
     23 
     24 /**
     25 * \file
     26 * \brief C API:  Arabic shaping
     27 * 
     28 */
     29 
     30 /**
     31 * Shape Arabic text on a character basis.
     32 *
     33 * <p>This function performs basic operations for "shaping" Arabic text. It is most
     34 * useful for use with legacy data formats and legacy display technology
     35 * (simple terminals). All operations are performed on Unicode characters.</p>
     36 *
     37 * <p>Text-based shaping means that some character code points in the text are
     38 * replaced by others depending on the context. It transforms one kind of text
     39 * into another. In comparison, modern displays for Arabic text select
     40 * appropriate, context-dependent font glyphs for each text element, which means
     41 * that they transform text into a glyph vector.</p>
     42 *
     43 * <p>Text transformations are necessary when modern display technology is not
     44 * available or when text needs to be transformed to or from legacy formats that
     45 * use "shaped" characters. Since the Arabic script is cursive, connecting
     46 * adjacent letters to each other, computers select images for each letter based
     47 * on the surrounding letters. This usually results in four images per Arabic
     48 * letter: initial, middle, final, and isolated forms. In Unicode, on the other
     49 * hand, letters are normally stored abstract, and a display system is expected
     50 * to select the necessary glyphs. (This makes searching and other text
     51 * processing easier because the same letter has only one code.) It is possible
     52 * to mimic this with text transformations because there are characters in
     53 * Unicode that are rendered as letters with a specific shape
     54 * (or cursive connectivity). They were included for interoperability with
     55 * legacy systems and codepages, and for unsophisticated display systems.</p>
     56 *
     57 * <p>A second kind of text transformation is supported for Arabic digits:
     58 * For compatibility with legacy codepages that only include European digits,
     59 * it is possible to replace one set of digits by another, changing the
     60 * character code points. These operations can be performed for either
     61 * Arabic-Indic Digits (U+0660...U+0669) or Eastern (Extended) Arabic-Indic
     62 * digits (U+06f0...U+06f9).</p>
     63 *
     64 * <p>Some replacements may result in more or fewer characters (code points).
     65 * By default, this means that the destination buffer may receive text with a
     66 * length different from the source length. Some legacy systems rely on the
     67 * length of the text to be constant. They expect extra spaces to be added
     68 * or consumed either next to the affected character or at the end of the
     69 * text.</p>
     70 *
     71 * <p>For details about the available operations, see the description of the
     72 * <code>U_SHAPE_...</code> options.</p>
     73 *
     74 * @param source The input text.
     75 *
     76 * @param sourceLength The number of UChars in <code>source</code>.
     77 *
     78 * @param dest The destination buffer that will receive the results of the
     79 *             requested operations. It may be <code>NULL</code> only if
     80 *             <code>destSize</code> is 0. The source and destination must not
     81 *             overlap.
     82 *
     83 * @param destSize The size (capacity) of the destination buffer in UChars.
     84 *                 If <code>destSize</code> is 0, then no output is produced,
     85 *                 but the necessary buffer size is returned ("preflighting").
     86 *
     87 * @param options This is a 32-bit set of flags that specify the operations
     88 *                that are performed on the input text. If no error occurs,
     89 *                then the result will always be written to the destination
     90 *                buffer.
     91 *
     92 * @param pErrorCode must be a valid pointer to an error code value,
     93 *        which must not indicate a failure before the function call.
     94 *
     95 * @return The number of UChars written to the destination buffer.
     96 *         If an error occurred, then no output was written, or it may be
     97 *         incomplete. If <code>U_BUFFER_OVERFLOW_ERROR</code> is set, then
     98 *         the return value indicates the necessary destination buffer size.
     99 * @stable ICU 2.0
    100 */
    101 U_CAPI int32_t U_EXPORT2
    102 u_shapeArabic(const UChar *source, int32_t sourceLength, /* input text and its length in UChars */
    103              UChar *dest, int32_t destSize,               /* output buffer (must not overlap source) and its capacity in UChars; destSize 0 only preflights */
    104              uint32_t options,                            /* 32-bit set of U_SHAPE_... option flags */
    105              UErrorCode *pErrorCode);                     /* in/out error code; must not indicate a failure on entry */
    106 
    107 /**
    108 * Memory option: allow the result to have a different length than the source.
    109 * Affects: LamAlef options
    110 * @stable ICU 2.0
    111 */
    112 #define U_SHAPE_LENGTH_GROW_SHRINK              0
    113 
    114 /**
    115 * Memory option: allow the result to have a different length than the source.
    116 * Affects: LamAlef options
    117 * This option is an alias to U_SHAPE_LENGTH_GROW_SHRINK
    118 * @stable ICU 4.2
    119 */
    120 #define U_SHAPE_LAMALEF_RESIZE                  0 
    121 
    122 /**
    123 * Memory option: the result must have the same length as the source.
    124 * If more room is necessary, then try to consume spaces next to modified characters.
    125 * @stable ICU 2.0
    126 */
    127 #define U_SHAPE_LENGTH_FIXED_SPACES_NEAR        1
    128 
    129 /**
    130 * Memory option: the result must have the same length as the source.
    131 * If more room is necessary, then try to consume spaces next to modified characters.
    132 * Affects: LamAlef options
    133 * This option is an alias to U_SHAPE_LENGTH_FIXED_SPACES_NEAR
    134 * @stable ICU 4.2
    135 */
    136 #define U_SHAPE_LAMALEF_NEAR                    1 
    137 
    138 /**
    139 * Memory option: the result must have the same length as the source.
    140 * If more room is necessary, then try to consume spaces at the end of the text.
    141 * @stable ICU 2.0
    142 */
    143 #define U_SHAPE_LENGTH_FIXED_SPACES_AT_END      2
    144 
    145 /**
    146 * Memory option: the result must have the same length as the source.
    147 * If more room is necessary, then try to consume spaces at the end of the text.
    148 * Affects: LamAlef options
    149 * This option is an alias to U_SHAPE_LENGTH_FIXED_SPACES_AT_END
    150 * @stable ICU 4.2
    151 */
    152 #define U_SHAPE_LAMALEF_END                     2 
    153 
    154 /**
    155 * Memory option: the result must have the same length as the source.
    156 * If more room is necessary, then try to consume spaces at the beginning of the text.
    157 * @stable ICU 2.0
    158 */
    159 #define U_SHAPE_LENGTH_FIXED_SPACES_AT_BEGINNING 3
    160 
    161 /**
    162 * Memory option: the result must have the same length as the source.
    163 * If more room is necessary, then try to consume spaces at the beginning of the text.
    164 * Affects: LamAlef options
    165 * This option is an alias to U_SHAPE_LENGTH_FIXED_SPACES_AT_BEGINNING
    166 * @stable ICU 4.2
    167 */
    168 #define U_SHAPE_LAMALEF_BEGIN                    3 
    169 
    170 
    171 /**
    172 * Memory option: the result must have the same length as the source.
    173 * Shaping Mode: For each LAMALEF character found, expand LAMALEF using space at end.
    174 *               If there is no space at end, use spaces at beginning of the buffer. If there
    175 *               is no space at beginning of the buffer, use the space near the LAMALEF
    176 *               character (i.e. the space after it).
    177 *               If there are no spaces found, an error U_NO_SPACE_AVAILABLE (as defined in utypes.h) 
    178 *               will be set in pErrorCode
    179 *
    180 * Deshaping Mode: Behaves the same as U_SHAPE_LAMALEF_END.
    181 * Affects: LamAlef options
    182 * @stable ICU 4.2
    183 */
    184 #define U_SHAPE_LAMALEF_AUTO                     0x10000 
    185 
    186 /** Bit mask for memory options. @stable ICU 2.0 */
    187 #define U_SHAPE_LENGTH_MASK                      0x10003 /* was 3; now also includes the U_SHAPE_LAMALEF_AUTO bit (0x10000) */
    188 
    189 
    190 /**
    191 * Bit mask for LamAlef memory options.
    192 * @stable ICU 4.2
    193 */
    194 #define U_SHAPE_LAMALEF_MASK                     0x10003 /* low two bits (values 0-3) plus the U_SHAPE_LAMALEF_AUTO bit (0x10000) */
    195 
    196 /** Direction indicator: the source is in logical (keyboard) order. @stable ICU 2.0 */
    197 #define U_SHAPE_TEXT_DIRECTION_LOGICAL          0
    198 
    199 /**
    200 * Direction indicator:
    201 * the source is in visual RTL order,
    202 * the rightmost displayed character stored first.
    203 * This option is an alias to U_SHAPE_TEXT_DIRECTION_LOGICAL
    204 * @stable ICU 4.2
    205 */
    206 #define U_SHAPE_TEXT_DIRECTION_VISUAL_RTL       0
    207 
    208 /**
    209 * Direction indicator:
    210 * the source is in visual LTR order,
    211 * the leftmost displayed character stored first.
    212 * @stable ICU 2.0
    213 */
    214 #define U_SHAPE_TEXT_DIRECTION_VISUAL_LTR       4
    215 
    216 /** Bit mask for direction indicators. @stable ICU 2.0 */
    217 #define U_SHAPE_TEXT_DIRECTION_MASK             4
    218 
    219 
    220 /** Letter shaping option: do not perform letter shaping. @stable ICU 2.0 */
    221 #define U_SHAPE_LETTERS_NOOP                    0
    222 
    223 /** Letter shaping option: replace abstract letter characters by "shaped" ones. @stable ICU 2.0 */
    224 #define U_SHAPE_LETTERS_SHAPE                   8
    225 
    226 /** Letter shaping option: replace "shaped" letter characters by abstract ones. @stable ICU 2.0 */
    227 #define U_SHAPE_LETTERS_UNSHAPE                 0x10
    228 
    229 /**
    230 * Letter shaping option: replace abstract letter characters by "shaped" ones.
    231 * The only difference with U_SHAPE_LETTERS_SHAPE is that Tashkeel letters
    232 * are always "shaped" into the isolated form instead of the medial form
    233 * (selecting code points from the Arabic Presentation Forms-B block).
    234 * @stable ICU 2.0
    235 */
    236 #define U_SHAPE_LETTERS_SHAPE_TASHKEEL_ISOLATED 0x18
    237 
    238 
    239 /** Bit mask for letter shaping options. @stable ICU 2.0 */
    240 #define U_SHAPE_LETTERS_MASK                        0x18
    241 
    242 
    243 /** Digit shaping option: do not perform digit shaping. @stable ICU 2.0 */
    244 #define U_SHAPE_DIGITS_NOOP                     0
    245 
    246 /**
    247 * Digit shaping option:
    248 * Replace European digits (U+0030...) by Arabic-Indic digits.
    249 * @stable ICU 2.0
    250 */
    251 #define U_SHAPE_DIGITS_EN2AN                    0x20
    252 
    253 /**
    254 * Digit shaping option:
    255 * Replace Arabic-Indic digits by European digits (U+0030...).
    256 * @stable ICU 2.0
    257 */
    258 #define U_SHAPE_DIGITS_AN2EN                    0x40
    259 
    260 /**
    261 * Digit shaping option:
    262 * Replace European digits (U+0030...) by Arabic-Indic digits if the most recent
    263 * strongly directional character is an Arabic letter
    264 * (<code>u_charDirection()</code> result <code>U_RIGHT_TO_LEFT_ARABIC</code> [AL]).<br>
    265 * The direction of "preceding" depends on the direction indicator option.
    266 * For the first characters, the preceding strongly directional character
    267 * (initial state) is assumed to be not an Arabic letter
    268 * (it is <code>U_LEFT_TO_RIGHT</code> [L] or <code>U_RIGHT_TO_LEFT</code> [R]).
    269 * @stable ICU 2.0
    270 */
    271 #define U_SHAPE_DIGITS_ALEN2AN_INIT_LR          0x60
    272 
    273 /**
    274 * Digit shaping option:
    275 * Replace European digits (U+0030...) by Arabic-Indic digits if the most recent
    276 * strongly directional character is an Arabic letter
    277 * (<code>u_charDirection()</code> result <code>U_RIGHT_TO_LEFT_ARABIC</code> [AL]).<br>
    278 * The direction of "preceding" depends on the direction indicator option.
    279 * For the first characters, the preceding strongly directional character
    280 * (initial state) is assumed to be an Arabic letter.
    281 * @stable ICU 2.0
    282 */
    283 #define U_SHAPE_DIGITS_ALEN2AN_INIT_AL          0x80
    284 
    285 /** Not a valid option value. May be replaced by a new option. @stable ICU 2.0 */
    286 #define U_SHAPE_DIGITS_RESERVED                 0xa0
    287 
    288 /** Bit mask for digit shaping options. @stable ICU 2.0 */
    289 #define U_SHAPE_DIGITS_MASK                     0xe0
    290 
    291 
    292 /** Digit type option: Use Arabic-Indic digits (U+0660...U+0669). @stable ICU 2.0 */
    293 #define U_SHAPE_DIGIT_TYPE_AN                   0
    294 
    295 /** Digit type option: Use Eastern (Extended) Arabic-Indic digits (U+06f0...U+06f9). @stable ICU 2.0 */
    296 #define U_SHAPE_DIGIT_TYPE_AN_EXTENDED          0x100
    297 
    298 /** Not a valid option value. May be replaced by a new option. @stable ICU 2.0 */
    299 #define U_SHAPE_DIGIT_TYPE_RESERVED             0x200
    300 
    301 /** Bit mask for digit type options. @stable ICU 2.0 */
    302 #define U_SHAPE_DIGIT_TYPE_MASK                 0x300 /* changed from 0x3f00 to 0x300 */
    303 
    304 /** 
    305 * Tashkeel aggregation option:
    306 * Replaces any combination of U+0651 with one of
    307 * U+064C, U+064D, U+064E, U+064F, U+0650 with
    308 * U+FC5E, U+FC5F, U+FC60, U+FC61, U+FC62 respectively.
    309 * @stable ICU 3.6
    310 */
    311 #define U_SHAPE_AGGREGATE_TASHKEEL              0x4000
    312 /** Tashkeel aggregation option: do not aggregate tashkeels. @stable ICU 3.6 */
    313 #define U_SHAPE_AGGREGATE_TASHKEEL_NOOP         0
    314 /** Bit mask for tashkeel aggregation. @stable ICU 3.6 */
    315 #define U_SHAPE_AGGREGATE_TASHKEEL_MASK         0x4000
    316 
    317 /** 
    318 * Presentation form option:
    319 * Don't replace Arabic Presentation Forms-A and Arabic Presentation Forms-B
    320 * characters with U+06xx characters, before shaping.
    321 * @stable ICU 3.6
    322 */
    323 #define U_SHAPE_PRESERVE_PRESENTATION           0x8000
    324 /** Presentation form option: 
    325 * Replace Arabic Presentation Forms-A and Arabic Presentation Forms-B with 
    326 * their unshaped correspondents in range U+06xx, before shaping.
    327 * @stable ICU 3.6 
    328 */
    329 #define U_SHAPE_PRESERVE_PRESENTATION_NOOP      0
    330 /** Bit mask for preserve presentation form. @stable ICU 3.6 */
    331 #define U_SHAPE_PRESERVE_PRESENTATION_MASK      0x8000
    332 
    333 /* Seen Tail option */ 
    334 /**
    335 * Memory option: the result must have the same length as the source.
    336 * Shaping mode: The SEEN family character will expand into two characters using space near 
    337 *               the SEEN family character(i.e. the space after the character).
    338 *               If there are no spaces found, an error U_NO_SPACE_AVAILABLE (as defined in utypes.h) 
    339 *               will be set in pErrorCode
    340 *
    341 * De-shaping mode: Any Seen character followed by Tail character will be
    342 *                  replaced by one cell Seen and a space will replace the Tail.
    343 * Affects: Seen options
    344 * @stable ICU 4.2
    345 */
    346 #define U_SHAPE_SEEN_TWOCELL_NEAR     0x200000
    347 
    348 /**
    349 * Bit mask for Seen memory options. 
    350 * @stable ICU 4.2
    351 */
    352 #define U_SHAPE_SEEN_MASK             0x700000
    353 
    354 /* YehHamza option */ 
    355 /**
    356 * Memory option: the result must have the same length as the source.
    357 * Shaping mode: The YEHHAMZA character will expand into two characters using space near it 
    358 *              (i.e. the space after the character).
    359 *               If there are no spaces found, an error U_NO_SPACE_AVAILABLE (as defined in utypes.h) 
    360 *               will be set in pErrorCode
    361 *
    362 * De-shaping mode: Any Yeh (final or isolated) character followed by Hamza character will be
    363 *                  replaced by one cell YehHamza and space will replace the Hamza.
    364 * Affects: YehHamza options
    365 * @stable ICU 4.2
    366 */
    367 #define U_SHAPE_YEHHAMZA_TWOCELL_NEAR      0x1000000
    368 
    369 
    370 /**
    371 * Bit mask for YehHamza memory options. 
    372 * @stable ICU 4.2
    373 */
    374 #define U_SHAPE_YEHHAMZA_MASK              0x3800000
    375 
    376 /* New Tashkeel options */ 
    377 /**
    378 * Memory option: the result must have the same length as the source.
    379 * Shaping mode: Tashkeel characters will be replaced by spaces. 
    380 *               Spaces will be placed at beginning of the buffer
    381 *
    382 * De-shaping mode: N/A
    383 * Affects: Tashkeel options
    384 * @stable ICU 4.2
    385 */
    386 #define U_SHAPE_TASHKEEL_BEGIN                      0x40000
    387 
    388 /**
    389 * Memory option: the result must have the same length as the source.
    390 * Shaping mode: Tashkeel characters will be replaced by spaces. 
    391 *               Spaces will be placed at end of the buffer
    392 *
    393 * De-shaping mode: N/A
    394 * Affects: Tashkeel options
    395 * @stable ICU 4.2
    396 */
    397 #define U_SHAPE_TASHKEEL_END                        0x60000
    398 
    399 /**
    400 * Memory option: allow the result to have a different length than the source.
    401 * Shaping mode: Tashkeel characters will be removed, buffer length will shrink. 
    402 * De-shaping mode: N/A 
    403 *
    404 * Affects: Tashkeel options
    405 * @stable ICU 4.2
    406 */
    407 #define U_SHAPE_TASHKEEL_RESIZE                     0x80000
    408 
    409 /**
    410 * Memory option: the result must have the same length as the source.
    411 * Shaping mode: Tashkeel characters will be replaced by Tatweel if it is connected to adjacent
    412 *               characters (i.e. shaped on Tatweel) or replaced by space if it is not connected.
    413 *
    414 * De-shaping mode: N/A
    415 * Affects: Tashkeel options
    416 * @stable ICU 4.2
    417 */
    418 #define U_SHAPE_TASHKEEL_REPLACE_BY_TATWEEL         0xC0000
    419 
    420 /** 
    421 * Bit mask for Tashkeel replacement with Space or Tatweel memory options. 
    422 * @stable ICU 4.2
    423 */
    424 #define U_SHAPE_TASHKEEL_MASK                       0xE0000
    425 
    426 
    427 /* Space location Control options */ 
    428 /**
    429 * This option affects the meaning of BEGIN and END options. If this option is not used, the default
    430 * for BEGIN and END will be as follows: 
    431 * The Default (for Visual LTR, Visual RTL and Logical Text)
    432 *           1. BEGIN always refers to the start address of physical memory.
    433 *           2. END always refers to the end address of physical memory.
    434 *
    435 * If this option is used it will swap the meaning of BEGIN and END only for Visual LTR text. 
    436 *
    437 * The effect on BEGIN and END Memory Options will be as follows:
    438 *    A. BEGIN For Visual LTR text: This will be the beginning (right side) of the visual text
    439 *       (corresponding to the physical memory address end for Visual LTR text, same as END in 
    440 *       default behavior).
    441 *    B. BEGIN For Logical text: Same as BEGIN in default behavior. 
    442 *    C. END For Visual LTR text: This will be the end (left side) of the visual text (corresponding
    443 *       to the physical memory address beginning for Visual LTR text, same as BEGIN in default behavior).
    444 *    D. END For Logical text: Same as END in default behavior. 
    445 * Affects: All LamAlef BEGIN, END and AUTO options.
    446 * @stable ICU 4.2
    447 */
    448 #define U_SHAPE_SPACES_RELATIVE_TO_TEXT_BEGIN_END 0x4000000
    449 
    450 /**
    451 * Bit mask for swapping BEGIN and END for Visual LTR text 
    452 * @stable ICU 4.2
    453 */
    454 #define U_SHAPE_SPACES_RELATIVE_TO_TEXT_MASK      0x4000000
    455 
    456 /**
    457 * If this option is used, shaping will use the new Unicode code point for TAIL (i.e. 0xFE73). 
    458 * If this option is not specified (Default), old unofficial Unicode TAIL code point is used (i.e. 0x200B)
    459 * De-shaping will not use this option as it will always search for either the new Unicode code point for the 
    460 * TAIL (i.e. 0xFE73) or the old unofficial Unicode TAIL code point (i.e. 0x200B) and de-shape the
    461 * Seen-Family letter accordingly.
    462 *
    463 * Shaping Mode: Only shaping.
    464 * De-shaping Mode: N/A.
    465 * Affects: All Seen options
    466 * @stable ICU 4.8
    467 */
    468 #define U_SHAPE_TAIL_NEW_UNICODE        0x8000000
    469 
    470 /**
    471 * Bit mask for new Unicode Tail option 
    472 * @stable ICU 4.8
    473 */
    474 #define U_SHAPE_TAIL_TYPE_MASK          0x8000000
    475 
    476 #endif