tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

ucm.h (9188B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *   Copyright (C) 2003-2013, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 *   file name:  ucm.h
      9 *   encoding:   UTF-8
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2003jun20
     14 *   created by: Markus W. Scherer
     15 *
     16 *   Definitions for the .ucm file parser and handler module ucm.c.
     17 */
     18 
     19 #ifndef __UCM_H__
     20 #define __UCM_H__
     21 
     22 #include "unicode/utypes.h"
     23 #include "ucnvmbcs.h"
     24 #include "ucnv_ext.h"
     25 #include "filestrm.h"
     26 #include <stdio.h>
     27 
     28 #if !UCONFIG_NO_CONVERSION
     29 
     30 U_CDECL_BEGIN
     31 
     32 /* constants for UCMapping.moveFlag; ucm_moveMappings() acts on mappings that have one of these set */
     33 enum {
     34    UCM_MOVE_TO_EXT=1,   /* remove from the base table and move to the extension table */
     35    UCM_REMOVE_MAPPING=2 /* remove from the base table (NOTE(review): presumably without moving it -- confirm in ucm.c) */
     36 };
     37 
     38 /*
     39 * Per-mapping data structure
     40 *
     41 * u if uLen==1: Unicode code point
     42 *   else index to uLen code points in UCMTable.codePoints (see UCM_GET_CODE_POINTS)
     43 * b if bLen<=4: up to 4 bytes, stored inline in b.bytes
     44 *   else b.idx is an index to bLen bytes in UCMTable.bytes (see UCM_GET_BYTES)
     45 * uLen number of code points
     46 * bLen number of bytes (NOTE(review): this entry used to read "number of words containing left-justified bytes"; the bLen<=4 test against bytes[4] implies a byte count -- confirm)
     47 * bIsMultipleChars indicates that the bytes contain more than one sequence
     48 *                  according to the state table (NOTE(review): the struct below has no field of this name; this entry looks stale)
     49 * f flag for roundtrip (0), fallback (1), sub mapping (2), reverse fallback (3)
     50 *   or "good one-way" mapping (4).
     51 *   Same values as in the source file after |
     52 */
     53 typedef struct UCMapping {
     54    UChar32 u;            /* code point itself, or an index, depending on uLen */
     55    union {
     56        uint32_t idx;     /* used when bLen>4 */
     57        uint8_t bytes[4]; /* used when bLen<=4 */
     58    } b;
     59    int8_t uLen, bLen, f, moveFlag; /* moveFlag: see UCM_MOVE_TO_EXT / UCM_REMOVE_MAPPING */
     60 } UCMapping;
     61 
     62 /* constants for UCMTable.flagsType; the enumerators take the implicit values 0..3 in declaration order */
     63 enum {
     64    UCM_FLAGS_INITIAL,  /* (0) no mappings parsed yet */
     65    UCM_FLAGS_EXPLICIT, /* (1) .ucm file has mappings with | fallback indicators */
     66    UCM_FLAGS_IMPLICIT, /* (2) .ucm file has mappings without | fallback indicators, later wins */
     67    UCM_FLAGS_MIXED     /* (3) both implicit and explicit */
     68 };
     69 
     70 typedef struct UCMTable { /* a set of mappings plus the side arrays that longer mappings index into */
     71    UCMapping *mappings;
     72    int32_t mappingsCapacity, mappingsLength; /* NOTE(review): presumably allocated vs. used element counts (same for the pairs below) -- confirm in ucm.c */
     73 
     74    UChar32 *codePoints; /* code point sequences for mappings with uLen!=1, indexed by UCMapping.u (see UCM_GET_CODE_POINTS) */
     75    int32_t codePointsCapacity, codePointsLength;
     76 
     77    uint8_t *bytes; /* byte sequences for mappings with bLen>4, indexed by UCMapping.b.idx (see UCM_GET_BYTES) */
     78    int32_t bytesCapacity, bytesLength;
     79 
     80    /* index map for mapping by bytes first */
     81    int32_t *reverseMap;
     82 
     83    uint8_t unicodeMask; /* NOTE(review): meaning is not visible in this header -- see ucm.c */
     84    int8_t flagsType; /* UCM_FLAGS_INITIAL etc. */
     85    UBool isSorted; /* NOTE(review): presumably maintained by ucm_sortTable() -- confirm */
     86 } UCMTable;
     87 
     88 enum { /* NOTE(review): presumably flag values for UCMStates.stateFlags[] entries -- confirm in ucmstate.c */
     89    MBCS_STATE_FLAG_DIRECT=1,
     90    MBCS_STATE_FLAG_SURROGATES, /* implicit value 2 */
     91 
     92    MBCS_STATE_FLAG_READY=16 /* 0x10 */
     93 };
     94 
     95 typedef struct UCMStates { /* state table data of a converter, embedded by value in UCMFile */
     96    int32_t stateTable[MBCS_MAX_STATE_COUNT][256]; /* one row of 256 entries per state (NOTE(review): presumably one entry per input byte value) */
     97    uint32_t stateFlags[MBCS_MAX_STATE_COUNT], /* one element per state, for both arrays */
     98             stateOffsetSum[MBCS_MAX_STATE_COUNT];
     99 
    100    int32_t countStates, minCharLength, maxCharLength, countToUCodeUnits; /* NOTE(review): countStates is presumably the number of stateTable rows in use -- confirm */
    101    int8_t conversionType, outputType; /* NOTE(review): value sets are not visible in this header */
    102 } UCMStates;
    103 
    104 typedef struct UCMFile { /* top-level object handled by ucm_open()/ucm_close() */
    105    UCMTable *base, *ext; /* base and extension mapping tables */
    106    UCMStates states; /* embedded by value, not a pointer */
    107 
    108    char baseName[UCNV_MAX_CONVERTER_NAME_LENGTH]; /* fixed-size buffer; NOTE(review): presumably the name of the base converter -- confirm */
    109 } UCMFile;
    110 
    111 /* simple accesses ---------------------------------------------------------- */
    112 
    113 #define UCM_GET_CODE_POINTS(t, m) \
    114    (((m)->uLen==1) ? &(m)->u : (t)->codePoints+(m)->u) /* yields a UChar32 pointer: the inline code point when uLen==1, else into the table's codePoints array; m is evaluated more than once, so pass an expression without side effects */
    115 
    116 #define UCM_GET_BYTES(t, m) \
    117    (((m)->bLen<=4) ? (m)->b.bytes : (t)->bytes+(m)->b.idx) /* yields a uint8_t pointer: the inline bytes when bLen<=4, else into the table's bytes array; m is evaluated more than once, so pass an expression without side effects */
    118 
    119 /* APIs --------------------------------------------------------------------- */
    120 
    121 U_CAPI UCMFile * U_EXPORT2 /* NOTE(review): presumably returns a newly allocated UCMFile that is released with ucm_close() -- confirm in ucm.c */
    122 ucm_open(void);
    123 
    124 U_CAPI void U_EXPORT2 /* NOTE(review): presumably the counterpart of ucm_open(); NULL handling is not visible here */
    125 ucm_close(UCMFile *ucm);
    126 
    127 U_CAPI UBool U_EXPORT2 /* line is non-const; NOTE(review): presumably it is modified in place and *pKey / *pValue point into it -- confirm in ucm.c */
    128 ucm_parseHeaderLine(UCMFile *ucm,
    129                    char *line, char **pKey, char **pValue);
    130 
    131 /* @return -1 illegal bytes  0 suitable for base table  1 needs to go into extension table */
    132 U_CAPI int32_t U_EXPORT2
    133 ucm_mappingType(UCMStates *baseStates,
    134                UCMapping *m,
    135                UChar32 codePoints[UCNV_EXT_MAX_UCHARS], /* the array bounds here are documentation only; C passes plain pointers */
    136                uint8_t bytes[UCNV_EXT_MAX_BYTES]);
    137 
    138 /* add a mapping to the base or extension table as appropriate */
    139 U_CAPI UBool U_EXPORT2 /* NOTE(review): the meaning of the UBool result is not visible here; presumably false on error */
    140 ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates,
    141                   UCMapping *m,
    142                   UChar32 codePoints[UCNV_EXT_MAX_UCHARS], /* the array bounds here are documentation only; C passes plain pointers */
    143                   uint8_t bytes[UCNV_EXT_MAX_BYTES]);
    144 
    145 U_CAPI UBool U_EXPORT2 /* NOTE(review): presumably parses one mapping line and then adds it like ucm_addMappingAuto() -- confirm in ucm.c */
    146 ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates);
    147 
    148 
    149 U_CAPI UCMTable * U_EXPORT2 /* NOTE(review): presumably returns a newly allocated, empty table that is released with ucm_closeTable() -- confirm */
    150 ucm_openTable(void);
    151 
    152 U_CAPI void U_EXPORT2 /* NOTE(review): presumably the counterpart of ucm_openTable() */
    153 ucm_closeTable(UCMTable *table);
    154 
    155 U_CAPI void U_EXPORT2 /* NOTE(review): presumably empties the table for reuse; whether storage is kept is not visible here */
    156 ucm_resetTable(UCMTable *table);
    157 
    158 U_CAPI void U_EXPORT2 /* NOTE(review): sort order and effect on UCMTable.isSorted / reverseMap are defined in ucm.c */
    159 ucm_sortTable(UCMTable *t);
    160 
    161 /*
    162 * Remove the mappings whose moveFlag is set from the base table;
    163 * those flagged with UCM_MOVE_TO_EXT are moved to the extension table.
    164 */
    165 U_CAPI void U_EXPORT2
    166 ucm_moveMappings(UCMTable *base, UCMTable *ext);
    167 
    168 /**
    169 * Read a table from a .ucm file, starting after the CHARMAP line
    170 * and continuing through the END CHARMAP line (inclusive).
    171 */
    172 U_CAPI void U_EXPORT2 /* no return value; NOTE(review): failures are presumably reported through *pErrorCode -- confirm */
    173 ucm_readTable(UCMFile *ucm, FileStream* convFile,
    174              UBool forBase, UCMStates *baseStates,
    175              UErrorCode *pErrorCode);
    176 
    177 /**
    178 * Check the validity of mappings against a base table's states;
    179 * necessary for extension-only tables that were read before their base tables.
    180 */
    181 U_CAPI UBool U_EXPORT2 /* NOTE(review): presumably true when every mapping in ext is valid -- confirm in ucm.c */
    182 ucm_checkValidity(UCMTable *ext, UCMStates *baseStates);
    183 
    184 /**
    185 * Check a base table against an extension table.
    186 * Set moveTarget!=NULL if it is possible to move mappings from the base.
    187 * This is the case where base and extension tables are parsed from a single file
    188 * (moveTarget==ext)
    189 * or when delta file mappings are subtracted from a base table.
    190 *
    191 * When a base table cannot be modified because a delta file is parsed in makeconv,
    192 * then set moveTarget=NULL.
    193 *
    194 * if(intersectBase) then mappings that exist in the base table but not in
    195 * the extension table are moved to moveTarget instead of showing an error.
    196 *
    197 * Special mode:
    198 * If intersectBase==2 for a DBCS extension table, then SBCS mappings are
    199 * not moved out of the base unless their Unicode input requires it.
    200 * This helps ucmkbase generate base tables for DBCS-only extension .cnv files.
    201 *
    202 * For both tables in the same file, the extension table is automatically
    203 * built.
    204 * For separate files, the extension file can use a complete mapping table (.ucm file),
    205 * so that common mappings need not be stripped out manually.
    206 *
    207 *
    208 * Sort both tables, and then for each mapping direction:
    209 *
    210 * If intersectBase is true and the base table contains a mapping
    211 * that does not exist in the extension table, then this mapping is moved
    212 * to moveTarget.
    213 *
    214 * - otherwise -
    215 *
    216 * If the base table contains a mapping for which the input sequence is
    217 * the same as the extension input, then
    218 * - if the output is the same: remove the extension mapping
    219 * - else: error
    220 *
    221 * If the base table contains a mapping for which the input sequence is
    222 * a prefix of the extension input, then
    223 * - if moveTarget!=NULL: move the base mapping to the moveTarget table
    224 * - else: error
    225 *
    226 * @return false in case of an irreparable error
    227 */
    228 U_CAPI UBool U_EXPORT2 /* false in case of an irreparable error, per the description above */
    229 ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
    230                 UCMTable *moveTarget, int8_t intersectBase); /* moveTarget may be NULL; intersectBase is an int8_t rather than a UBool because the value 2 selects the special DBCS mode */
    231 
    232 U_CAPI void U_EXPORT2 /* writes to the stdio stream f; NOTE(review): byUnicode presumably selects the output order -- confirm in ucm.c */
    233 ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode);
    234 
    235 U_CAPI void U_EXPORT2 /* writes to the stdio stream f; NOTE(review): table is presumably needed to resolve m's indexes as in UCM_GET_CODE_POINTS / UCM_GET_BYTES */
    236 ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f);
    237 
    238 
    239 U_CAPI void U_EXPORT2 /* NOTE(review): s is presumably the text of one state line from the .ucm header -- confirm in ucmstate.c */
    240 ucm_addState(UCMStates *states, const char *s);
    241 
    242 U_CAPI void U_EXPORT2 /* NOTE(review): presumably run once after all states were added; ignoreSISOCheck semantics are defined in ucmstate.c */
    243 ucm_processStates(UCMStates *states, UBool ignoreSISOCheck);
    244 
    245 U_CAPI int32_t U_EXPORT2 /* NOTE(review): presumably the number of characters in bytes[0..length) according to the state table; error return value not visible here */
    246 ucm_countChars(UCMStates *states,
    247               const uint8_t *bytes, int32_t length);
    248 
    249 
    250 U_CAPI int8_t U_EXPORT2 /* NOTE(review): presumably returns the number of bytes parsed and advances *ps -- confirm in ucm.c */
    251 ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps);
    252 
    253 U_CAPI UBool U_EXPORT2 /* m, codePoints and bytes are non-const; NOTE(review): presumably all three are outputs filled from line */
    254 ucm_parseMappingLine(UCMapping *m,
    255                     UChar32 codePoints[UCNV_EXT_MAX_UCHARS], /* the array bounds here are documentation only; C passes plain pointers */
    256                     uint8_t bytes[UCNV_EXT_MAX_BYTES],
    257                     const char *line);
    258 
    259 U_CAPI void U_EXPORT2 /* no return value; NOTE(review): how allocation failures are handled is not visible here */
    260 ucm_addMapping(UCMTable *table,
    261               UCMapping *m,
    262               UChar32 codePoints[UCNV_EXT_MAX_UCHARS], /* the array bounds here are documentation only; C passes plain pointers */
    263               uint8_t bytes[UCNV_EXT_MAX_BYTES]);
    264 
    265 /* very makeconv-specific functions ----------------------------------------- */
    266 
    267 /* finalize and optimize states after the toUnicode mappings are processed */
    268 U_CAPI void U_EXPORT2
    269 ucm_optimizeStates(UCMStates *states,
    270                   uint16_t **pUnicodeCodeUnits, /* pointer to pointer; NOTE(review): presumably the array may be replaced or reordered -- confirm in ucmstate.c */
    271                   _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
    272                   UBool verbose);
    273 
    274 /* moved here because it is used inside ucmstate.c */
    275 U_CAPI int32_t U_EXPORT2 /* NOTE(review): presumably an index into toUFallbacks for offset, with a negative value when not found -- confirm */
    276 ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
    277                 uint32_t offset);
    278 
    279 /* very rptp2ucm-specific functions ----------------------------------------- */
    280 
    281 /*
    282 * Input: Separate tables with mappings from/to Unicode,
    283 * subchar and subchar1 (0 if none).
    284 * All mappings must have flag 0 (UCMapping.f==0, i.e. roundtrip).
    285 *
    286 * Output: fromUTable will contain the union of mappings with the correct
    287 * precision flags (UCMapping.f), and be sorted.
    288 */
    289 U_CAPI void U_EXPORT2
    290 ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable,
    291                const uint8_t *subchar, int32_t subcharLength,
    292                uint8_t subchar1);
    293 
    294 U_CAPI UBool U_EXPORT2 /* NOTE(review): presumably splits ucm's mappings between its base and ext tables; result and isSISO semantics are defined in ucm.c */
    295 ucm_separateMappings(UCMFile *ucm, UBool isSISO);
    296 
    297 U_CDECL_END
    298 
    299 #endif
    300 
    301 #endif