tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

genmbcs.h (5417B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 2000-2008, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  genmbcs.h
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2000jul10
     16 *   created by: Markus W. Scherer
     17 */
     18 
     19 #ifndef __GENMBCS_H__
     20 #define __GENMBCS_H__
     21 
     22 #include "makeconv.h"
     23 
     24 enum {
     25    /*
     26     * TODO: Consider using ucnvmbcs.h constants.
     27     * However, not all values need to be exactly the same, for example
     28     * the xxx_UTF8_MAX values may be different. (Especially SBCS_UTF8_MAX
     29     * may be higher in makeconv than in the runtime code because that
     30     * affects only a small number of .cnv files [if any] but all
     31     * runtime UConverterSharedData objects.
     32     */
     33    MBCS_STAGE_2_SHIFT=4,
     34    MBCS_STAGE_2_BLOCK_SIZE=0x40,       /* =64=1<<6 for 6 bits in stage 2 */
     35    MBCS_STAGE_2_BLOCK_SIZE_SHIFT=6,    /* log2(MBCS_STAGE_2_BLOCK_SIZE) */
     36    MBCS_STAGE_2_BLOCK_MASK=0x3f,       /* for after shifting by MBCS_STAGE_2_SHIFT */
     37    MBCS_STAGE_1_SHIFT=10,
     38    MBCS_STAGE_1_BMP_SIZE=0x40, /* 0x10000>>MBCS_STAGE_1_SHIFT, or 16 for one entry per 1k code points on the BMP */
     39    MBCS_STAGE_1_SIZE=0x440,    /* 0x110000>>MBCS_STAGE_1_SHIFT, or 17*64 for one entry per 1k code points */
     40    MBCS_STAGE_2_SIZE=0xfbc0,   /* 0x10000-MBCS_STAGE_1_SIZE: stages 1 & 2 share a 16-bit-indexed array */
     41    MBCS_MAX_STAGE_2_TOP=MBCS_STAGE_2_SIZE,
     42    MBCS_STAGE_2_MAX_BLOCKS=MBCS_STAGE_2_SIZE>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT,
     43 
     44    MBCS_STAGE_2_ALL_UNASSIGNED_INDEX=0, /* stage 1 entry for the all-unassigned stage 2 block */
     45    MBCS_STAGE_2_FIRST_ASSIGNED=MBCS_STAGE_2_BLOCK_SIZE, /* start of the first stage 2 block after the all-unassigned one */
     46 
     47    MBCS_STAGE_3_BLOCK_SIZE=16,         /* =16=1<<4 for 4 bits in stage 3 */
     48    MBCS_STAGE_3_BLOCK_MASK=0xf,
     49    MBCS_STAGE_3_FIRST_ASSIGNED=MBCS_STAGE_3_BLOCK_SIZE, /* start of the first stage 3 block after the all-unassigned one */
     50 
     51    MBCS_STAGE_3_GRANULARITY=16,        /* =1<<4: MBCS stage 2 indexes are shifted left 4 */
     52    MBCS_STAGE_3_SBCS_SIZE=0x10000,     /* max 64k mappings for SBCS */
     53    MBCS_STAGE_3_MBCS_SIZE=0x10000*MBCS_STAGE_3_GRANULARITY, /* max mappings for MBCS */
     54 
     55    /*
     56     * SBCS_UTF8_MAX: Maximum code point with UTF-8-friendly SBCS data structures.
     57     * Possible values are 0x01ff..0xffff, in steps of 0x100.
     58     *
     59     * Unlike for MBCS, this constant only affects the stage 3 block allocation size;
     60     * there is no additional stage 1/2 table stored in the .cnv file.
     61     * The max value should be at least 0x7ff to cover 2-byte UTF-8.
     62     * 0xfff also covers a number other small scripts which have legacy charsets
     63     * (like Thai).
     64     * Higher values up to 0x1fff are harmless and potentially useful because
     65     * that covers small-script blocks which usually have either dense mappings
     66     * or no mappings at all.
     67     * Starting at U+2000, there are mostly symbols and format characters
     68     * with a low density of SBCS mappings, which would result in more wasted
     69     * stage 3 entries with the larger block size.
     70     */
     71    SBCS_UTF8_MAX=0x1fff,
     72 
     73    /*
     74     * MBCS_UTF8_MAX: Maximum code point with UTF-8-friendly MBCS data structures.
     75     * Possible values are 0x01ff..0xffff, in steps of 0x100.
     76     *
     77     * Note that with 0xffff, MBCSAddFromUnicode() may overflow the additional UTF-8 stage table
     78     * with extreme input data. The function checks for this overflow.
     79     *
     80     * 0xd7ff is chosen for the majority of common characters including Unihan and Hangul.
     81     * At U+d800 there are mostly surrogates, private use codes, compatibility characters, etc.
     82     * Larger values cause slightly larger MBCS .cnv files.
     83     */
     84    MBCS_UTF8_MAX=0xd7ff,
     85    MBCS_UTF8_LIMIT=MBCS_UTF8_MAX+1,    /* =0xd800 */
     86 
     87    MBCS_UTF8_STAGE_SHIFT=6,
     88    MBCS_UTF8_STAGE_3_BLOCK_SIZE=0x40,  /* =64=1<<6 for 6 bits from last trail byte */
     89    MBCS_UTF8_STAGE_3_BLOCK_MASK=0x3f,
     90 
     91    /* size of the single-stage table for up to U+d7ff (used instead of stage1/2) */
     92    MBCS_UTF8_STAGE_SIZE=MBCS_UTF8_LIMIT>>MBCS_UTF8_STAGE_SHIFT, /* =0x360 */
     93 
     94    MBCS_FROM_U_EXT_FLAG=0x10,          /* UCMapping.f bit for base table mappings that fit into the base toU table */
     95    MBCS_FROM_U_EXT_MASK=0x0f,          /* but need to go into the extension fromU table */
     96 
     97    /* =4 number of regular stage 3 blocks for final UTF-8 trail byte */
     98    MBCS_UTF8_STAGE_3_BLOCKS=MBCS_UTF8_STAGE_3_BLOCK_SIZE/MBCS_STAGE_3_BLOCK_SIZE,
     99 
    100    MBCS_MAX_FALLBACK_COUNT=8192
    101 };
    102 
    103 U_CFUNC NewConverter *
    104 MBCSOpen(UCMFile *ucm);
    105 
    106 struct MBCSData;
    107 typedef struct MBCSData MBCSData;
    108 
    109 /*
    110 * Get a dummy MBCSData for use with MBCSOkForBaseFromUnicode()
    111 * for creating an extension-only file.
    112 * Assume maxCharLength>1.
    113 */
    114 U_CFUNC const MBCSData *
    115 MBCSGetDummy(void);
    116 
    117 /* Test if a 1:1 mapping fits into the MBCS base table's fromUnicode structure. */
    118 U_CFUNC UBool
    119 MBCSOkForBaseFromUnicode(const MBCSData *mbcsData,
    120                         const uint8_t *bytes, int32_t length,
    121                         UChar32 c, int8_t flag);
    122 
    123 U_CFUNC NewConverter *
    124 CnvExtOpen(UCMFile *ucm);
    125 
    126 #endif /* __GENMBCS_H__ */