ucnvisci.cpp (72789B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 2000-2016, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 * file name: ucnvisci.c 9 * encoding: UTF-8 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2001JUN26 14 * created by: Ram Viswanadha 15 * 16 * Date Name Description 17 * 24/7/2001 Ram Added support for EXT character handling 18 */ 19 20 #include "unicode/utypes.h" 21 22 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION 23 24 #include "unicode/ucnv.h" 25 #include "unicode/ucnv_cb.h" 26 #include "unicode/utf16.h" 27 #include "cmemory.h" 28 #include "ucnv_bld.h" 29 #include "ucnv_cnv.h" 30 #include "cstring.h" 31 #include "uassert.h" 32 33 #define UCNV_OPTIONS_VERSION_MASK 0xf 34 #define NUKTA 0x093c 35 #define HALANT 0x094d 36 #define ZWNJ 0x200c /* Zero Width Non Joiner */ 37 #define ZWJ 0x200d /* Zero width Joiner */ 38 #define INVALID_CHAR 0xffff 39 #define ATR 0xEF /* Attribute code */ 40 #define EXT 0xF0 /* Extension code */ 41 #define DANDA 0x0964 42 #define DOUBLE_DANDA 0x0965 43 #define ISCII_NUKTA 0xE9 44 #define ISCII_HALANT 0xE8 45 #define ISCII_DANDA 0xEA 46 #define ISCII_INV 0xD9 47 #define ISCII_VOWEL_SIGN_E 0xE0 48 #define INDIC_BLOCK_BEGIN 0x0900 49 #define INDIC_BLOCK_END 0x0D7F 50 #define INDIC_RANGE (INDIC_BLOCK_END - INDIC_BLOCK_BEGIN) 51 #define VOCALLIC_RR 0x0931 52 #define LF 0x0A 53 #define ASCII_END 0xA0 54 #define NO_CHAR_MARKER 0xFFFE 55 #define TELUGU_DELTA DELTA * TELUGU 56 #define DEV_ABBR_SIGN 0x0970 57 #define DEV_ANUDATTA 0x0952 58 #define EXT_RANGE_BEGIN 0xA1 59 #define EXT_RANGE_END 0xEE 60 61 #define PNJ_DELTA 0x0100 62 #define PNJ_BINDI 0x0A02 63 #define PNJ_TIPPI 0x0A70 64 #define 
PNJ_SIGN_VIRAMA 0x0A4D 65 #define PNJ_ADHAK 0x0A71 66 #define PNJ_HA 0x0A39 67 #define PNJ_RRA 0x0A5C 68 69 typedef enum { 70 DEVANAGARI =0, 71 BENGALI, 72 GURMUKHI, 73 GUJARATI, 74 ORIYA, 75 TAMIL, 76 TELUGU, 77 KANNADA, 78 MALAYALAM, 79 DELTA=0x80 80 }UniLang; 81 82 /** 83 * Enumeration for switching code pages if <ATR>+<one of below values> 84 * is encountered 85 */ 86 typedef enum { 87 DEF = 0x40, 88 RMN = 0x41, 89 DEV = 0x42, 90 BNG = 0x43, 91 TML = 0x44, 92 TLG = 0x45, 93 ASM = 0x46, 94 ORI = 0x47, 95 KND = 0x48, 96 MLM = 0x49, 97 GJR = 0x4A, 98 PNJ = 0x4B, 99 ARB = 0x71, 100 PES = 0x72, 101 URD = 0x73, 102 SND = 0x74, 103 KSM = 0x75, 104 PST = 0x76 105 }ISCIILang; 106 107 typedef enum { 108 DEV_MASK =0x80, 109 PNJ_MASK =0x40, 110 GJR_MASK =0x20, 111 ORI_MASK =0x10, 112 BNG_MASK =0x08, 113 KND_MASK =0x04, 114 MLM_MASK =0x02, 115 TML_MASK =0x01, 116 ZERO =0x00 117 }MaskEnum; 118 119 #define ISCII_CNV_PREFIX "ISCII,version=" 120 121 typedef struct { 122 char16_t contextCharToUnicode; /* previous Unicode codepoint for contextual analysis */ 123 char16_t contextCharFromUnicode; /* previous Unicode codepoint for contextual analysis */ 124 uint16_t defDeltaToUnicode; /* delta for switching to default state when DEF is encountered */ 125 uint16_t currentDeltaFromUnicode; /* current delta in Indic block */ 126 uint16_t currentDeltaToUnicode; /* current delta in Indic block */ 127 MaskEnum currentMaskFromUnicode; /* mask for current state in toUnicode */ 128 MaskEnum currentMaskToUnicode; /* mask for current state in toUnicode */ 129 MaskEnum defMaskToUnicode; /* mask for default state in toUnicode */ 130 UBool isFirstBuffer; /* boolean for fromUnicode to see if we need to announce the first script */ 131 UBool resetToDefaultToUnicode; /* boolean for resetting to default delta and mask when a newline is encountered*/ 132 char name[sizeof(ISCII_CNV_PREFIX) + 1]; 133 UChar32 prevToUnicodeStatus; /* Hold the previous toUnicodeStatus. 
This is necessary because we may need to know the last two code points. */ 134 } UConverterDataISCII; 135 136 typedef struct LookupDataStruct { 137 UniLang uniLang; 138 MaskEnum maskEnum; 139 ISCIILang isciiLang; 140 } LookupDataStruct; 141 142 static const LookupDataStruct lookupInitialData[]={ 143 { DEVANAGARI, DEV_MASK, DEV }, 144 { BENGALI, BNG_MASK, BNG }, 145 { GURMUKHI, PNJ_MASK, PNJ }, 146 { GUJARATI, GJR_MASK, GJR }, 147 { ORIYA, ORI_MASK, ORI }, 148 { TAMIL, TML_MASK, TML }, 149 { TELUGU, KND_MASK, TLG }, 150 { KANNADA, KND_MASK, KND }, 151 { MALAYALAM, MLM_MASK, MLM } 152 }; 153 154 /* 155 * For special handling of certain Gurmukhi characters. 156 * Bit 0 (value 1): PNJ consonant 157 * Bit 1 (value 2): PNJ Bindi Tippi 158 */ 159 static const uint8_t pnjMap[80] = { 160 /* 0A00..0A0F */ 161 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 162 /* 0A10..0A1F */ 163 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 164 /* 0A20..0A2F */ 165 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 166 /* 0A30..0A3F */ 167 3, 0, 0, 0, 0, 3, 3, 0, 3, 3, 0, 0, 0, 0, 0, 2, 168 /* 0A40..0A4F */ 169 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 170 }; 171 172 static UBool 173 isPNJConsonant(UChar32 c) { 174 if (c < 0xa00 || 0xa50 <= c) { 175 return false; 176 } else { 177 return pnjMap[c - 0xa00] & 1; 178 } 179 } 180 181 static UBool 182 isPNJBindiTippi(UChar32 c) { 183 if (c < 0xa00 || 0xa50 <= c) { 184 return false; 185 } else { 186 return pnjMap[c - 0xa00] >> 1; 187 } 188 } 189 U_CDECL_BEGIN 190 static void U_CALLCONV 191 _ISCIIOpen(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode) { 192 if(pArgs->onlyTestIsLoadable) { 193 return; 194 } 195 196 cnv->extraInfo = uprv_malloc(sizeof(UConverterDataISCII)); 197 198 if (cnv->extraInfo != nullptr) { 199 int32_t len=0; 200 UConverterDataISCII *converterData= 201 (UConverterDataISCII *) cnv->extraInfo; 202 converterData->contextCharToUnicode=NO_CHAR_MARKER; 203 cnv->toUnicodeStatus = missingCharMarker; 204 
converterData->contextCharFromUnicode=0x0000; 205 converterData->resetToDefaultToUnicode=false; 206 /* check if the version requested is supported */ 207 if ((pArgs->options & UCNV_OPTIONS_VERSION_MASK) < 9) { 208 /* initialize state variables */ 209 converterData->currentDeltaFromUnicode 210 = converterData->currentDeltaToUnicode 211 = converterData->defDeltaToUnicode = (uint16_t)(lookupInitialData[pArgs->options & UCNV_OPTIONS_VERSION_MASK].uniLang * DELTA); 212 213 converterData->currentMaskFromUnicode 214 = converterData->currentMaskToUnicode 215 = converterData->defMaskToUnicode = lookupInitialData[pArgs->options & UCNV_OPTIONS_VERSION_MASK].maskEnum; 216 217 converterData->isFirstBuffer=true; 218 (void)uprv_strcpy(converterData->name, ISCII_CNV_PREFIX); 219 len = (int32_t)uprv_strlen(converterData->name); 220 converterData->name[len]= (char)((pArgs->options & UCNV_OPTIONS_VERSION_MASK) + '0'); 221 converterData->name[len+1]=0; 222 223 converterData->prevToUnicodeStatus = 0x0000; 224 } else { 225 uprv_free(cnv->extraInfo); 226 cnv->extraInfo = nullptr; 227 *errorCode = U_ILLEGAL_ARGUMENT_ERROR; 228 } 229 230 } else { 231 *errorCode =U_MEMORY_ALLOCATION_ERROR; 232 } 233 } 234 235 static void U_CALLCONV 236 _ISCIIClose(UConverter *cnv) { 237 if (cnv->extraInfo!=nullptr) { 238 if (!cnv->isExtraLocal) { 239 uprv_free(cnv->extraInfo); 240 } 241 cnv->extraInfo=nullptr; 242 } 243 } 244 245 static const char* U_CALLCONV 246 _ISCIIgetName(const UConverter* cnv) { 247 if (cnv->extraInfo) { 248 UConverterDataISCII* myData= (UConverterDataISCII*)cnv->extraInfo; 249 return myData->name; 250 } 251 return nullptr; 252 } 253 254 static void U_CALLCONV 255 _ISCIIReset(UConverter *cnv, UConverterResetChoice choice) { 256 UConverterDataISCII* data =(UConverterDataISCII *) (cnv->extraInfo); 257 if (choice<=UCNV_RESET_TO_UNICODE) { 258 cnv->toUnicodeStatus = missingCharMarker; 259 cnv->mode=0; 260 data->currentDeltaToUnicode=data->defDeltaToUnicode; 261 data->currentMaskToUnicode 
= data->defMaskToUnicode; 262 data->contextCharToUnicode=NO_CHAR_MARKER; 263 data->prevToUnicodeStatus = 0x0000; 264 } 265 if (choice!=UCNV_RESET_TO_UNICODE) { 266 cnv->fromUChar32=0x0000; 267 data->contextCharFromUnicode=0x00; 268 data->currentMaskFromUnicode=data->defMaskToUnicode; 269 data->currentDeltaFromUnicode=data->defDeltaToUnicode; 270 data->isFirstBuffer=true; 271 data->resetToDefaultToUnicode=false; 272 } 273 } 274 275 /** 276 * The values in validity table are indexed by the lower bits of Unicode 277 * range 0x0900 - 0x09ff. The values have a structure like: 278 * --------------------------------------------------------------- 279 * | DEV | PNJ | GJR | ORI | BNG | TLG | MLM | TML | 280 * | | | | | ASM | KND | | | 281 * --------------------------------------------------------------- 282 * If a code point is valid in a particular script 283 * then that bit is turned on 284 * 285 * Unicode does not distinguish between Bengali and Assamese so we use 1 bit for 286 * to represent these languages 287 * 288 * Telugu and Kannada have same codepoints except for Vocallic_RR which we special case 289 * and combine and use 1 bit to represent these languages. 290 * 291 * TODO: It is probably easier to understand and maintain to change this 292 * to use uint16_t and give each of the 9 Unicode/script blocks its own bit. 
293 */ 294 295 static const uint8_t validityTable[128] = { 296 /* This state table is tool generated please do not edit unless you know exactly what you are doing */ 297 /* Note: This table was edited to mirror the Windows XP implementation */ 298 /*ISCII:Valid:Unicode */ 299 /*0xa0 : 0x00: 0x900 */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , 300 /*0xa1 : 0xb8: 0x901 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO , 301 /*0xa2 : 0xfe: 0x902 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 302 /*0xa3 : 0xbf: 0x903 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 303 /*0x00 : 0x00: 0x904 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , 304 /*0xa4 : 0xff: 0x905 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 305 /*0xa5 : 0xff: 0x906 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 306 /*0xa6 : 0xff: 0x907 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 307 /*0xa7 : 0xff: 0x908 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 308 /*0xa8 : 0xff: 0x909 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 309 /*0xa9 : 0xff: 0x90a */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 310 /*0xaa : 0xfe: 0x90b */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , 311 /*0x00 : 0x00: 0x90c */ DEV_MASK + ZERO + ZERO + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , 312 /*0xae : 0x80: 0x90d */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO , 313 /*0xab : 0x87: 0x90e */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + KND_MASK + MLM_MASK + TML_MASK , 314 /*0xac : 0xff: 0x90f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 
315 /*0xad : 0xff: 0x910 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 316 /*0xb2 : 0x80: 0x911 */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO , 317 /*0xaf : 0x87: 0x912 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + KND_MASK + MLM_MASK + TML_MASK , 318 /*0xb0 : 0xff: 0x913 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 319 /*0xb1 : 0xff: 0x914 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 320 /*0xb3 : 0xff: 0x915 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 321 /*0xb4 : 0xfe: 0x916 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , 322 /*0xb5 : 0xfe: 0x917 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , 323 /*0xb6 : 0xfe: 0x918 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , 324 /*0xb7 : 0xff: 0x919 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 325 /*0xb8 : 0xff: 0x91a */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 326 /*0xb9 : 0xfe: 0x91b */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , 327 /*0xba : 0xff: 0x91c */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 328 /*0xbb : 0xfe: 0x91d */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , 329 /*0xbc : 0xff: 0x91e */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 330 /*0xbd : 0xff: 0x91f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 331 /*0xbe : 0xfe: 0x920 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , 332 /*0xbf : 0xfe: 0x921 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK 
+ MLM_MASK + ZERO , 333 /*0xc0 : 0xfe: 0x922 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , 334 /*0xc1 : 0xff: 0x923 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 335 /*0xc2 : 0xff: 0x924 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 336 /*0xc3 : 0xfe: 0x925 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , 337 /*0xc4 : 0xfe: 0x926 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , 338 /*0xc5 : 0xfe: 0x927 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , 339 /*0xc6 : 0xff: 0x928 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 340 /*0xc7 : 0x81: 0x929 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + TML_MASK , 341 /*0xc8 : 0xff: 0x92a */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 342 /*0xc9 : 0xfe: 0x92b */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , 343 /*0xca : 0xfe: 0x92c */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , 344 /*0xcb : 0xfe: 0x92d */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , 345 /*0xcc : 0xfe: 0x92e */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 346 /*0xcd : 0xff: 0x92f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 347 /*0xcf : 0xff: 0x930 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 348 /*0xd0 : 0x87: 0x931 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + MLM_MASK + TML_MASK , 349 /*0xd1 : 0xff: 0x932 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 350 /*0xd2 : 0xb7: 0x933 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + ZERO + 
KND_MASK + MLM_MASK + TML_MASK , 351 /*0xd3 : 0x83: 0x934 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + MLM_MASK + TML_MASK , 352 /*0xd4 : 0xff: 0x935 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + ZERO + KND_MASK + MLM_MASK + TML_MASK , 353 /*0xd5 : 0xfe: 0x936 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , 354 /*0xd6 : 0xbf: 0x937 */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 355 /*0xd7 : 0xff: 0x938 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 356 /*0xd8 : 0xff: 0x939 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 357 /*0x00 : 0x00: 0x93A */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , 358 /*0x00 : 0x00: 0x93B */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , 359 /*0xe9 : 0xda: 0x93c */ DEV_MASK + PNJ_MASK + ZERO + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO , 360 /*0x00 : 0x00: 0x93d */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , 361 /*0xda : 0xff: 0x93e */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 362 /*0xdb : 0xff: 0x93f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 363 /*0xdc : 0xff: 0x940 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 364 /*0xdd : 0xff: 0x941 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 365 /*0xde : 0xff: 0x942 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 366 /*0xdf : 0xbe: 0x943 */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , 367 /*0x00 : 0x00: 0x944 */ DEV_MASK + ZERO + GJR_MASK + ZERO + BNG_MASK + KND_MASK + ZERO + ZERO , 368 /*0xe3 : 0x80: 0x945 */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO , 369 /*0xe0 : 0x87: 0x946 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + 
KND_MASK + MLM_MASK + TML_MASK , 370 /*0xe1 : 0xff: 0x947 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 371 /*0xe2 : 0xff: 0x948 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 372 /*0xe7 : 0x80: 0x949 */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO , 373 /*0xe4 : 0x87: 0x94a */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + KND_MASK + MLM_MASK + TML_MASK , 374 /*0xe5 : 0xff: 0x94b */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 375 /*0xe6 : 0xff: 0x94c */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 376 /*0xe8 : 0xff: 0x94d */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 377 /*0xec : 0x00: 0x94e */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , 378 /*0xed : 0x00: 0x94f */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , 379 /*0x00 : 0x00: 0x950 */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO , 380 /*0x00 : 0x00: 0x951 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , 381 /*0x00 : 0x00: 0x952 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , 382 /*0x00 : 0x00: 0x953 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , 383 /*0x00 : 0x00: 0x954 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , 384 /*0x00 : 0x00: 0x955 */ ZERO + ZERO + ZERO + ZERO + ZERO + KND_MASK + ZERO + ZERO , 385 /*0x00 : 0x00: 0x956 */ ZERO + ZERO + ZERO + ORI_MASK + ZERO + KND_MASK + ZERO + ZERO , 386 /*0x00 : 0x00: 0x957 */ ZERO + ZERO + ZERO + ORI_MASK + BNG_MASK + ZERO + MLM_MASK + ZERO , 387 /*0x00 : 0x00: 0x958 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , 388 /*0x00 : 0x00: 0x959 */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , 389 /*0x00 : 0x00: 0x95a */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , 390 /*0x00 : 0x00: 0x95b */ 
DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , 391 /*0x00 : 0x00: 0x95c */ DEV_MASK + PNJ_MASK + ZERO + ZERO + BNG_MASK + ZERO + ZERO + ZERO , 392 /*0x00 : 0x00: 0x95d */ DEV_MASK + ZERO + ZERO + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO , 393 /*0x00 : 0x00: 0x95e */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , 394 /*0xce : 0x98: 0x95f */ DEV_MASK + ZERO + ZERO + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO , 395 /*0x00 : 0x00: 0x960 */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , 396 /*0x00 : 0x00: 0x961 */ DEV_MASK + ZERO + ZERO + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , 397 /*0x00 : 0x00: 0x962 */ DEV_MASK + ZERO + ZERO + ZERO + BNG_MASK + ZERO + ZERO + ZERO , 398 /*0x00 : 0x00: 0x963 */ DEV_MASK + ZERO + ZERO + ZERO + BNG_MASK + ZERO + ZERO + ZERO , 399 /*0xea : 0xf8: 0x964 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , 400 /*0xeaea : 0x00: 0x965*/ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , 401 /*0xf1 : 0xff: 0x966 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 402 /*0xf2 : 0xff: 0x967 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 403 /*0xf3 : 0xff: 0x968 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 404 /*0xf4 : 0xff: 0x969 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 405 /*0xf5 : 0xff: 0x96a */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 406 /*0xf6 : 0xff: 0x96b */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 407 /*0xf7 : 0xff: 0x96c */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 408 /*0xf8 : 0xff: 0x96d */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 409 /*0xf9 : 0xff: 0x96e */ DEV_MASK + PNJ_MASK + 
GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 410 /*0xfa : 0xff: 0x96f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , 411 /*0x00 : 0x80: 0x970 */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , 412 /* 413 * The length of the array is 128 to provide values for 0x900..0x97f. 414 * The last 15 entries for 0x971..0x97f of the validity table are all zero 415 * because no Indic script uses such Unicode code points. 416 */ 417 /*0x00 : 0x00: 0x9yz */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO 418 }; 419 420 static const uint16_t fromUnicodeTable[128]={ 421 0x00a0 ,/* 0x0900 */ 422 0x00a1 ,/* 0x0901 */ 423 0x00a2 ,/* 0x0902 */ 424 0x00a3 ,/* 0x0903 */ 425 0xa4e0 ,/* 0x0904 */ 426 0x00a4 ,/* 0x0905 */ 427 0x00a5 ,/* 0x0906 */ 428 0x00a6 ,/* 0x0907 */ 429 0x00a7 ,/* 0x0908 */ 430 0x00a8 ,/* 0x0909 */ 431 0x00a9 ,/* 0x090a */ 432 0x00aa ,/* 0x090b */ 433 0xA6E9 ,/* 0x090c */ 434 0x00ae ,/* 0x090d */ 435 0x00ab ,/* 0x090e */ 436 0x00ac ,/* 0x090f */ 437 0x00ad ,/* 0x0910 */ 438 0x00b2 ,/* 0x0911 */ 439 0x00af ,/* 0x0912 */ 440 0x00b0 ,/* 0x0913 */ 441 0x00b1 ,/* 0x0914 */ 442 0x00b3 ,/* 0x0915 */ 443 0x00b4 ,/* 0x0916 */ 444 0x00b5 ,/* 0x0917 */ 445 0x00b6 ,/* 0x0918 */ 446 0x00b7 ,/* 0x0919 */ 447 0x00b8 ,/* 0x091a */ 448 0x00b9 ,/* 0x091b */ 449 0x00ba ,/* 0x091c */ 450 0x00bb ,/* 0x091d */ 451 0x00bc ,/* 0x091e */ 452 0x00bd ,/* 0x091f */ 453 0x00be ,/* 0x0920 */ 454 0x00bf ,/* 0x0921 */ 455 0x00c0 ,/* 0x0922 */ 456 0x00c1 ,/* 0x0923 */ 457 0x00c2 ,/* 0x0924 */ 458 0x00c3 ,/* 0x0925 */ 459 0x00c4 ,/* 0x0926 */ 460 0x00c5 ,/* 0x0927 */ 461 0x00c6 ,/* 0x0928 */ 462 0x00c7 ,/* 0x0929 */ 463 0x00c8 ,/* 0x092a */ 464 0x00c9 ,/* 0x092b */ 465 0x00ca ,/* 0x092c */ 466 0x00cb ,/* 0x092d */ 467 0x00cc ,/* 0x092e */ 468 0x00cd ,/* 0x092f */ 469 0x00cf ,/* 0x0930 */ 470 0x00d0 ,/* 0x0931 */ 471 0x00d1 ,/* 0x0932 */ 472 0x00d2 ,/* 0x0933 */ 473 0x00d3 ,/* 0x0934 */ 474 0x00d4 ,/* 0x0935 */ 475 
0x00d5 ,/* 0x0936 */ 476 0x00d6 ,/* 0x0937 */ 477 0x00d7 ,/* 0x0938 */ 478 0x00d8 ,/* 0x0939 */ 479 0xFFFF ,/* 0x093A */ 480 0xFFFF ,/* 0x093B */ 481 0x00e9 ,/* 0x093c */ 482 0xEAE9 ,/* 0x093d */ 483 0x00da ,/* 0x093e */ 484 0x00db ,/* 0x093f */ 485 0x00dc ,/* 0x0940 */ 486 0x00dd ,/* 0x0941 */ 487 0x00de ,/* 0x0942 */ 488 0x00df ,/* 0x0943 */ 489 0xDFE9 ,/* 0x0944 */ 490 0x00e3 ,/* 0x0945 */ 491 0x00e0 ,/* 0x0946 */ 492 0x00e1 ,/* 0x0947 */ 493 0x00e2 ,/* 0x0948 */ 494 0x00e7 ,/* 0x0949 */ 495 0x00e4 ,/* 0x094a */ 496 0x00e5 ,/* 0x094b */ 497 0x00e6 ,/* 0x094c */ 498 0x00e8 ,/* 0x094d */ 499 0x00ec ,/* 0x094e */ 500 0x00ed ,/* 0x094f */ 501 0xA1E9 ,/* 0x0950 */ /* OM Symbol */ 502 0xFFFF ,/* 0x0951 */ 503 0xF0B8 ,/* 0x0952 */ 504 0xFFFF ,/* 0x0953 */ 505 0xFFFF ,/* 0x0954 */ 506 0xFFFF ,/* 0x0955 */ 507 0xFFFF ,/* 0x0956 */ 508 0xFFFF ,/* 0x0957 */ 509 0xb3e9 ,/* 0x0958 */ 510 0xb4e9 ,/* 0x0959 */ 511 0xb5e9 ,/* 0x095a */ 512 0xbae9 ,/* 0x095b */ 513 0xbfe9 ,/* 0x095c */ 514 0xC0E9 ,/* 0x095d */ 515 0xc9e9 ,/* 0x095e */ 516 0x00ce ,/* 0x095f */ 517 0xAAe9 ,/* 0x0960 */ 518 0xA7E9 ,/* 0x0961 */ 519 0xDBE9 ,/* 0x0962 */ 520 0xDCE9 ,/* 0x0963 */ 521 0x00ea ,/* 0x0964 */ 522 0xeaea ,/* 0x0965 */ 523 0x00f1 ,/* 0x0966 */ 524 0x00f2 ,/* 0x0967 */ 525 0x00f3 ,/* 0x0968 */ 526 0x00f4 ,/* 0x0969 */ 527 0x00f5 ,/* 0x096a */ 528 0x00f6 ,/* 0x096b */ 529 0x00f7 ,/* 0x096c */ 530 0x00f8 ,/* 0x096d */ 531 0x00f9 ,/* 0x096e */ 532 0x00fa ,/* 0x096f */ 533 0xF0BF ,/* 0x0970 */ 534 0xFFFF ,/* 0x0971 */ 535 0xFFFF ,/* 0x0972 */ 536 0xFFFF ,/* 0x0973 */ 537 0xFFFF ,/* 0x0974 */ 538 0xFFFF ,/* 0x0975 */ 539 0xFFFF ,/* 0x0976 */ 540 0xFFFF ,/* 0x0977 */ 541 0xFFFF ,/* 0x0978 */ 542 0xFFFF ,/* 0x0979 */ 543 0xFFFF ,/* 0x097a */ 544 0xFFFF ,/* 0x097b */ 545 0xFFFF ,/* 0x097c */ 546 0xFFFF ,/* 0x097d */ 547 0xFFFF ,/* 0x097e */ 548 0xFFFF ,/* 0x097f */ 549 }; 550 static const uint16_t toUnicodeTable[256]={ 551 0x0000,/* 0x00 */ 552 0x0001,/* 0x01 */ 553 0x0002,/* 0x02 */ 554 0x0003,/* 
0x03 */ 555 0x0004,/* 0x04 */ 556 0x0005,/* 0x05 */ 557 0x0006,/* 0x06 */ 558 0x0007,/* 0x07 */ 559 0x0008,/* 0x08 */ 560 0x0009,/* 0x09 */ 561 0x000a,/* 0x0a */ 562 0x000b,/* 0x0b */ 563 0x000c,/* 0x0c */ 564 0x000d,/* 0x0d */ 565 0x000e,/* 0x0e */ 566 0x000f,/* 0x0f */ 567 0x0010,/* 0x10 */ 568 0x0011,/* 0x11 */ 569 0x0012,/* 0x12 */ 570 0x0013,/* 0x13 */ 571 0x0014,/* 0x14 */ 572 0x0015,/* 0x15 */ 573 0x0016,/* 0x16 */ 574 0x0017,/* 0x17 */ 575 0x0018,/* 0x18 */ 576 0x0019,/* 0x19 */ 577 0x001a,/* 0x1a */ 578 0x001b,/* 0x1b */ 579 0x001c,/* 0x1c */ 580 0x001d,/* 0x1d */ 581 0x001e,/* 0x1e */ 582 0x001f,/* 0x1f */ 583 0x0020,/* 0x20 */ 584 0x0021,/* 0x21 */ 585 0x0022,/* 0x22 */ 586 0x0023,/* 0x23 */ 587 0x0024,/* 0x24 */ 588 0x0025,/* 0x25 */ 589 0x0026,/* 0x26 */ 590 0x0027,/* 0x27 */ 591 0x0028,/* 0x28 */ 592 0x0029,/* 0x29 */ 593 0x002a,/* 0x2a */ 594 0x002b,/* 0x2b */ 595 0x002c,/* 0x2c */ 596 0x002d,/* 0x2d */ 597 0x002e,/* 0x2e */ 598 0x002f,/* 0x2f */ 599 0x0030,/* 0x30 */ 600 0x0031,/* 0x31 */ 601 0x0032,/* 0x32 */ 602 0x0033,/* 0x33 */ 603 0x0034,/* 0x34 */ 604 0x0035,/* 0x35 */ 605 0x0036,/* 0x36 */ 606 0x0037,/* 0x37 */ 607 0x0038,/* 0x38 */ 608 0x0039,/* 0x39 */ 609 0x003A,/* 0x3A */ 610 0x003B,/* 0x3B */ 611 0x003c,/* 0x3c */ 612 0x003d,/* 0x3d */ 613 0x003e,/* 0x3e */ 614 0x003f,/* 0x3f */ 615 0x0040,/* 0x40 */ 616 0x0041,/* 0x41 */ 617 0x0042,/* 0x42 */ 618 0x0043,/* 0x43 */ 619 0x0044,/* 0x44 */ 620 0x0045,/* 0x45 */ 621 0x0046,/* 0x46 */ 622 0x0047,/* 0x47 */ 623 0x0048,/* 0x48 */ 624 0x0049,/* 0x49 */ 625 0x004a,/* 0x4a */ 626 0x004b,/* 0x4b */ 627 0x004c,/* 0x4c */ 628 0x004d,/* 0x4d */ 629 0x004e,/* 0x4e */ 630 0x004f,/* 0x4f */ 631 0x0050,/* 0x50 */ 632 0x0051,/* 0x51 */ 633 0x0052,/* 0x52 */ 634 0x0053,/* 0x53 */ 635 0x0054,/* 0x54 */ 636 0x0055,/* 0x55 */ 637 0x0056,/* 0x56 */ 638 0x0057,/* 0x57 */ 639 0x0058,/* 0x58 */ 640 0x0059,/* 0x59 */ 641 0x005a,/* 0x5a */ 642 0x005b,/* 0x5b */ 643 0x005c,/* 0x5c */ 644 0x005d,/* 0x5d */ 645 
0x005e,/* 0x5e */ 646 0x005f,/* 0x5f */ 647 0x0060,/* 0x60 */ 648 0x0061,/* 0x61 */ 649 0x0062,/* 0x62 */ 650 0x0063,/* 0x63 */ 651 0x0064,/* 0x64 */ 652 0x0065,/* 0x65 */ 653 0x0066,/* 0x66 */ 654 0x0067,/* 0x67 */ 655 0x0068,/* 0x68 */ 656 0x0069,/* 0x69 */ 657 0x006a,/* 0x6a */ 658 0x006b,/* 0x6b */ 659 0x006c,/* 0x6c */ 660 0x006d,/* 0x6d */ 661 0x006e,/* 0x6e */ 662 0x006f,/* 0x6f */ 663 0x0070,/* 0x70 */ 664 0x0071,/* 0x71 */ 665 0x0072,/* 0x72 */ 666 0x0073,/* 0x73 */ 667 0x0074,/* 0x74 */ 668 0x0075,/* 0x75 */ 669 0x0076,/* 0x76 */ 670 0x0077,/* 0x77 */ 671 0x0078,/* 0x78 */ 672 0x0079,/* 0x79 */ 673 0x007a,/* 0x7a */ 674 0x007b,/* 0x7b */ 675 0x007c,/* 0x7c */ 676 0x007d,/* 0x7d */ 677 0x007e,/* 0x7e */ 678 0x007f,/* 0x7f */ 679 0x0080,/* 0x80 */ 680 0x0081,/* 0x81 */ 681 0x0082,/* 0x82 */ 682 0x0083,/* 0x83 */ 683 0x0084,/* 0x84 */ 684 0x0085,/* 0x85 */ 685 0x0086,/* 0x86 */ 686 0x0087,/* 0x87 */ 687 0x0088,/* 0x88 */ 688 0x0089,/* 0x89 */ 689 0x008a,/* 0x8a */ 690 0x008b,/* 0x8b */ 691 0x008c,/* 0x8c */ 692 0x008d,/* 0x8d */ 693 0x008e,/* 0x8e */ 694 0x008f,/* 0x8f */ 695 0x0090,/* 0x90 */ 696 0x0091,/* 0x91 */ 697 0x0092,/* 0x92 */ 698 0x0093,/* 0x93 */ 699 0x0094,/* 0x94 */ 700 0x0095,/* 0x95 */ 701 0x0096,/* 0x96 */ 702 0x0097,/* 0x97 */ 703 0x0098,/* 0x98 */ 704 0x0099,/* 0x99 */ 705 0x009a,/* 0x9a */ 706 0x009b,/* 0x9b */ 707 0x009c,/* 0x9c */ 708 0x009d,/* 0x9d */ 709 0x009e,/* 0x9e */ 710 0x009f,/* 0x9f */ 711 0x00A0,/* 0xa0 */ 712 0x0901,/* 0xa1 */ 713 0x0902,/* 0xa2 */ 714 0x0903,/* 0xa3 */ 715 0x0905,/* 0xa4 */ 716 0x0906,/* 0xa5 */ 717 0x0907,/* 0xa6 */ 718 0x0908,/* 0xa7 */ 719 0x0909,/* 0xa8 */ 720 0x090a,/* 0xa9 */ 721 0x090b,/* 0xaa */ 722 0x090e,/* 0xab */ 723 0x090f,/* 0xac */ 724 0x0910,/* 0xad */ 725 0x090d,/* 0xae */ 726 0x0912,/* 0xaf */ 727 0x0913,/* 0xb0 */ 728 0x0914,/* 0xb1 */ 729 0x0911,/* 0xb2 */ 730 0x0915,/* 0xb3 */ 731 0x0916,/* 0xb4 */ 732 0x0917,/* 0xb5 */ 733 0x0918,/* 0xb6 */ 734 0x0919,/* 0xb7 */ 735 0x091a,/* 0xb8 */ 
0x091b,/* 0xb9 */
    0x091c,/* 0xba */
    0x091d,/* 0xbb */
    0x091e,/* 0xbc */
    0x091f,/* 0xbd */
    0x0920,/* 0xbe */
    0x0921,/* 0xbf */
    0x0922,/* 0xc0 */
    0x0923,/* 0xc1 */
    0x0924,/* 0xc2 */
    0x0925,/* 0xc3 */
    0x0926,/* 0xc4 */
    0x0927,/* 0xc5 */
    0x0928,/* 0xc6 */
    0x0929,/* 0xc7 */
    0x092a,/* 0xc8 */
    0x092b,/* 0xc9 */
    0x092c,/* 0xca */
    0x092d,/* 0xcb */
    0x092e,/* 0xcc */
    0x092f,/* 0xcd */
    0x095f,/* 0xce */
    0x0930,/* 0xcf */
    0x0931,/* 0xd0 */
    0x0932,/* 0xd1 */
    0x0933,/* 0xd2 */
    0x0934,/* 0xd3 */
    0x0935,/* 0xd4 */
    0x0936,/* 0xd5 */
    0x0937,/* 0xd6 */
    0x0938,/* 0xd7 */
    0x0939,/* 0xd8 */
    0x200D,/* 0xd9 */
    0x093e,/* 0xda */
    0x093f,/* 0xdb */
    0x0940,/* 0xdc */
    0x0941,/* 0xdd */
    0x0942,/* 0xde */
    0x0943,/* 0xdf */
    0x0946,/* 0xe0 */
    0x0947,/* 0xe1 */
    0x0948,/* 0xe2 */
    0x0945,/* 0xe3 */
    0x094a,/* 0xe4 */
    0x094b,/* 0xe5 */
    0x094c,/* 0xe6 */
    0x0949,/* 0xe7 */
    0x094d,/* 0xe8 */
    0x093c,/* 0xe9 */
    0x0964,/* 0xea */
    0xFFFF,/* 0xeb */
    0xFFFF,/* 0xec */
    0xFFFF,/* 0xed */
    0xFFFF,/* 0xee */
    0xFFFF,/* 0xef */
    0xFFFF,/* 0xf0 */
    0x0966,/* 0xf1 */
    0x0967,/* 0xf2 */
    0x0968,/* 0xf3 */
    0x0969,/* 0xf4 */
    0x096a,/* 0xf5 */
    0x096b,/* 0xf6 */
    0x096c,/* 0xf7 */
    0x096d,/* 0xf8 */
    0x096e,/* 0xf9 */
    0x096f,/* 0xfa */
    0xFFFF,/* 0xfb */
    0xFFFF,/* 0xfc */
    0xFFFF,/* 0xfd */
    0xFFFF,/* 0xfe */
    0xFFFF /* 0xff */
};

/*
 * Special mappings for ISCII_VOWEL_SIGN_E, used by the ISCII->Unicode
 * converter below.
 * Row 0 is a header whose first element is the number of rows (header
 * included); it is used as the loop bound by the reader.
 * Every other row is { ISCII byte that preceded VOWEL SIGN E, Unicode result }.
 */
static const uint16_t vowelSignESpecialCases[][2]={
    { 2 /*length of array*/    , 0      },
    { 0xA4 , 0x0904 },
};

/*
 * Special mappings for <ISCII byte> + ISCII_NUKTA, used by the ISCII->Unicode
 * converter below.
 * Same layout as vowelSignESpecialCases: row 0 carries the row count, the
 * remaining rows are { ISCII byte that preceded NUKTA, Unicode result }.
 */
static const uint16_t nuktaSpecialCases[][2]={
    { 16 /*length of array*/   , 0      },
    { 0xA6 , 0x090c },
    { 0xEA , 0x093D },
    { 0xDF , 0x0944 },
    { 0xA1 , 0x0950 },
    { 0xb3 , 0x0958 },
    { 0xb4 , 0x0959 },
    { 0xb5 , 0x095a },
    { 0xba , 0x095b },
    { 0xbf , 0x095c },
    { 0xC0 , 0x095d },
    { 0xc9 , 0x095e },
    { 0xAA , 0x0960 },
    { 0xA7 , 0x0961 },
    { 0xDB , 0x0962 },
    { 0xDC , 0x0963 },
};


/*
 * WRITE_TO_TARGET_FROM_U: emit one ISCII unit of one to three bytes.
 *
 * targetByteUnit is written most-significant byte first:
 *   - <= 0xFF   : one byte
 *   - <= 0xFFFF : two bytes
 *   - otherwise : three bytes (bits 16..23, 8..15, 0..7)
 * Bytes that do not fit in [target, targetLimit) are appended to
 * args->converter->charErrorBuffer and *err is set to U_BUFFER_OVERFLOW_ERROR.
 * When the buffer is already full on entry, the two upper bytes are queued
 * only if they are non-zero under the masks 0xFF0000 / 0xFF00.
 *
 * If offsets is non-null, one entry is stored per byte written to target.
 * The value is (source - args->source - 1), decremented once before the first
 * byte of a three-byte unit.
 *
 * The macro declares a local named `offset`, evaluates its arguments several
 * times, and advances target/offsets in place.
 */
#define WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,targetByteUnit,err) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t offset = (int32_t)(source - args->source-1);                                        \
      /* write the targetUniChar  to target */                                                  \
    if(target < targetLimit){                                                                   \
        if(targetByteUnit <= 0xFF){                                                             \
            *(target)++ = (uint8_t)(targetByteUnit);                                            \
            if(offsets){                                                                        \
                *(offsets++) = offset;                                                          \
            }                                                                                   \
        }else{                                                                                  \
            if (targetByteUnit > 0xFFFF) {                                                      \
                *(target)++ = (uint8_t)(targetByteUnit>>16);                                    \
                if (offsets) {                                                                  \
                    --offset;                                                                   \
                    *(offsets++) = offset;                                                      \
                }                                                                               \
            }                                                                                   \
            if (!(target < targetLimit)) {                                                      \
                args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =    \
                                (uint8_t)(targetByteUnit >> 8);                                 \
                args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =    \
                                (uint8_t)targetByteUnit;                                        \
                *err = U_BUFFER_OVERFLOW_ERROR;                                                 \
            } else {                                                                            \
                *(target)++ = (uint8_t)(targetByteUnit>>8);                                     \
                if(offsets){                                                                    \
                    *(offsets++) = offset;                                                      \
                }                                                                               \
                if(target < targetLimit){                                                       \
                    *(target)++ = (uint8_t)  targetByteUnit;                                    \
                    if(offsets){                                                                \
                        *(offsets++) = offset                            ;                      \
                    }                                                                           \
                }else{                                                                          \
                    args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =\
                                (uint8_t) (targetByteUnit);                                     \
                    *err = U_BUFFER_OVERFLOW_ERROR;                                             \
                }                                                                               \
            }                                                                                   \
        }                                                                                       \
    }else{                                                                                      \
        if (targetByteUnit & 0xFF0000) {                                                        \
            args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =        \
                        (uint8_t) (targetByteUnit >>16);                                        \
        }                                                                                       \
        if(targetByteUnit & 0xFF00){                                                            \
            args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =        \
                        (uint8_t) (targetByteUnit >>8);                                         \
        }                                                                                       \
        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =            \
                        (uint8_t) (targetByteUnit);                                             \
        *err = U_BUFFER_OVERFLOW_ERROR;                                                         \
    }                                                                                           \
} UPRV_BLOCK_MACRO_END

/*
 * Unicode (UTF-16) -> ISCII conversion, with optional offsets.
 *
 * Rules:
 *    Explicit Halant :
 *                      <HALANT> + <ZWNJ>
 *    Soft Halant :
 *                      <HALANT> + <ZWJ>
 *
 * Reads code units from [args->source, args->sourceLimit) and writes ISCII
 * bytes to [args->target, args->targetLimit) through WRITE_TO_TARGET_FROM_U.
 * On return args->source and args->target are advanced past what was consumed
 * and produced.
 *
 * State carried across calls:
 *   - args->converter->fromUChar32: the code unit (or supplementary code
 *     point) saved when conversion stopped in the "unassigned" branch; when
 *     it is non-zero on entry, processing resumes at the getTrail label.
 *   - args->converter->fromUnicodeStatus: the last code unit <= ASCII_END that
 *     was written. If it equals LF, the next loop iteration first emits ATR
 *     followed by the language code of the current script.
 *   - converterData->contextCharFromUnicode: ISCII_HALANT after a halant byte
 *     was written, PNJ_ADHAK after a Gurmukhi Adhak; used for ZWNJ/ZWJ and for
 *     Gurmukhi consonant doubling.
 *   - converterData->currentDeltaFromUnicode / currentMaskFromUnicode /
 *     isFirstBuffer: the script block of the most recent Indic character.
 *
 * *err is set to U_ILLEGAL_ARGUMENT_ERROR for a null converter or inverted
 * ranges, U_BUFFER_OVERFLOW_ERROR by the write macro, U_INVALID_CHAR_FOUND
 * for unassigned input and U_ILLEGAL_CHAR_FOUND for unpaired surrogates.
 */
static void U_CALLCONV
UConverter_fromUnicode_ISCII_OFFSETS_LOGIC(
        UConverterFromUnicodeArgs * args, UErrorCode * err) {
    const char16_t *source = args->source;
    const char16_t *sourceLimit = args->sourceLimit;
    unsigned char *target = (unsigned char *) args->target;
    unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    int32_t* offsets = args->offsets;
    uint32_t targetByteUnit = 0x0000;
    UChar32 sourceChar = 0x0000;
    UChar32 tempContextFromUnicode = 0x0000;    /* For special handling of the Gurmukhi script. */
    UConverterDataISCII *converterData;
    uint16_t newDelta=0;
    uint16_t range = 0;
    UBool deltaChanged = false;

    if ((args->converter == nullptr) || (args->targetLimit < args->target) || (args->sourceLimit < args->source)) {
        *err = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    /* initialize data */
    converterData=(UConverterDataISCII*)args->converter->extraInfo;
    newDelta=converterData->currentDeltaFromUnicode;
    range = (uint16_t)(newDelta/DELTA);

    /* a character was left pending by a previous call: resume inside the
     * surrogate handling at the bottom of the loop */
    if ((sourceChar = args->converter->fromUChar32)!=0) {
        goto getTrail;
    }

    /* write the characters to the output stream */
    while (source < sourceLimit) {
        /* Write the language code following LF only if LF is not the last character. */
        if (args->converter->fromUnicodeStatus == LF) {
            targetByteUnit = ATR<<8;
            targetByteUnit += (uint8_t) lookupInitialData[range].isciiLang;
            args->converter->fromUnicodeStatus = 0x0000;
            /* now append ATR and language code */
            WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,targetByteUnit,err);
            if (U_FAILURE(*err)) {
                break;
            }
        }

        sourceChar = *source++;
        /* remember the context as it was before this character changes it */
        tempContextFromUnicode = converterData->contextCharFromUnicode;

        targetByteUnit = missingCharMarker;

        /* check if input is in the ASCII and C0 control codes range; such code
         * units are written unchanged and remembered in fromUnicodeStatus */
        if (sourceChar <= ASCII_END) {
            args->converter->fromUnicodeStatus = sourceChar;
            WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,sourceChar,err);
            if (U_FAILURE(*err)) {
                break;
            }
            continue;
        }
        switch (sourceChar) {
        case ZWNJ:
            /* contextChar has HALANT: HALANT + ZWNJ becomes HALANT HALANT */
            if (converterData->contextCharFromUnicode) {
                converterData->contextCharFromUnicode = 0x00;
                targetByteUnit = ISCII_HALANT;
            } else {
                /* consume ZWNJ and continue */
                converterData->contextCharFromUnicode = 0x00;
                continue;
            }
            break;
        case ZWJ:
            /* contextChar has HALANT: HALANT + ZWJ becomes HALANT NUKTA,
             * otherwise ZWJ alone is written as ISCII_INV */
            if (converterData->contextCharFromUnicode) {
                targetByteUnit = ISCII_NUKTA;
            } else {
                targetByteUnit =ISCII_INV;
            }
            converterData->contextCharFromUnicode = 0x00;
            break;
        default:
            /* is the sourceChar in the INDIC_RANGE?
             * (single unsigned comparison: true only for
             * INDIC_BLOCK_BEGIN..INDIC_BLOCK_END) */
            if ((uint16_t)(INDIC_BLOCK_END-sourceChar) <= INDIC_RANGE) {
                /* Danda and Double Danda are valid in Northern scripts. Since Unicode
                 * does not include these code points in all Northern scripts we need to
                 * filter them out, i.e. they do not switch the script block.
                 */
                if (sourceChar!= DANDA && sourceChar != DOUBLE_DANDA) {
                    /* find out to which block the sourceChar belongs */
                    range =(uint16_t)((sourceChar-INDIC_BLOCK_BEGIN)/DELTA);
                    newDelta =(uint16_t)(range*DELTA);

                    /* Now are we in the same block as the previous? */
                    if (newDelta!= converterData->currentDeltaFromUnicode || converterData->isFirstBuffer) {
                        converterData->currentDeltaFromUnicode = newDelta;
                        converterData->currentMaskFromUnicode = lookupInitialData[range].maskEnum;
                        deltaChanged =true;
                        converterData->isFirstBuffer=false;
                    }

                    if (converterData->currentDeltaFromUnicode == PNJ_DELTA) {
                        if (sourceChar == PNJ_TIPPI) {
                            /* Make sure Tippi is converted to Bindi. */
                            sourceChar = PNJ_BINDI;
                        } else if (sourceChar == PNJ_ADHAK) {
                            /* This is for consonant cluster handling. */
                            converterData->contextCharFromUnicode = PNJ_ADHAK;
                        }

                    }
                    /* Normalize all Indic codepoints to Devanagari and map them to ISCII */
                    /* now subtract the new delta from sourceChar*/
                    sourceChar -= converterData->currentDeltaFromUnicode;
                }

                /* get the target byte unit (tables are indexed by the low byte) */
                targetByteUnit=fromUnicodeTable[(uint8_t)sourceChar];

                /* is the code point valid in current script? */
                if ((validityTable[(uint8_t)sourceChar] & converterData->currentMaskFromUnicode)==0) {
                    /* Vocallic RR is assigned in ISCII Telugu and Unicode */
                    if (converterData->currentDeltaFromUnicode!=(TELUGU_DELTA) || sourceChar!=VOCALLIC_RR) {
                        targetByteUnit=missingCharMarker;
                    }
                }

                if (deltaChanged) {
                    /* we are in a script block which is different from the
                     * previous sourceChar's script block: write ATR and language codes
                     */
                    uint32_t temp=0;
                    temp =(uint16_t)(ATR<<8);
                    temp += (uint16_t)((uint8_t) lookupInitialData[range].isciiLang);
                    /* reset */
                    deltaChanged=false;
                    /* now append ATR and language code */
                    WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,temp,err);
                    if (U_FAILURE(*err)) {
                        /* NOTE(review): this break leaves only the enclosing
                         * switch, not the while loop; the code after the switch
                         * still runs with *err already set. */
                        break;
                    }
                }

                /* Gurmukhi Adhak produces no byte of its own; it was recorded
                 * in contextCharFromUnicode above */
                if (converterData->currentDeltaFromUnicode == PNJ_DELTA && (sourceChar + PNJ_DELTA) == PNJ_ADHAK) {
                    continue;
                }
            }
            /* reset context char */
            converterData->contextCharFromUnicode = 0x00;
            break;
        }
        if (converterData->currentDeltaFromUnicode == PNJ_DELTA && tempContextFromUnicode == PNJ_ADHAK && isPNJConsonant((sourceChar + PNJ_DELTA))) {
            /* If the previous codepoint is Adhak and the current codepoint is a consonant, the targetByteUnit should be C + Halant + C. */
            /* reset context char */
            converterData->contextCharFromUnicode = 0x0000;
            targetByteUnit = targetByteUnit << 16 | ISCII_HALANT << 8 | targetByteUnit;
            /* write targetByteUnit to target */
            WRITE_TO_TARGET_FROM_U(args, offsets, source, target, targetLimit, targetByteUnit,err);
            if (U_FAILURE(*err)) {
                break;
            }
        } else if (targetByteUnit != missingCharMarker) {
            /* remember a halant so that a following ZWNJ/ZWJ can be mapped */
            if (targetByteUnit==ISCII_HALANT) {
                converterData->contextCharFromUnicode = (char16_t)targetByteUnit;
            }
            /* write targetByteUnit to target*/
            WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,targetByteUnit,err);
            if (U_FAILURE(*err)) {
                break;
            }
        } else {
            /* oops.. the code point is unassigned */
            /* check if the char is a first (lead) surrogate */
            if (U16_IS_SURROGATE(sourceChar)) {
                if (U16_IS_SURROGATE_LEAD(sourceChar)) {
getTrail:
                    /*look ahead to find the trail surrogate*/
                    if (source < sourceLimit) {
                        /* test the following code unit */
                        char16_t trail= (*source);
                        if (U16_IS_TRAIL(trail)) {
                            source++;
                            sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
                            *err =U_INVALID_CHAR_FOUND;
                            /* convert this surrogate code point */
                            /* exit this condition tree */
                        } else {
                            /* this is an unmatched lead code unit (1st surrogate) */
                            /* callback(illegal) */
                            *err=U_ILLEGAL_CHAR_FOUND;
                        }
                    } else {
                        /* no more input: keep the lead surrogate pending in
                         * fromUChar32 (stored below) without raising an error */
                        *err = U_ZERO_ERROR;
                    }
                } else {
                    /* this is an unmatched trail code unit (2nd surrogate) */
                    /* callback(illegal) */
                    *err=U_ILLEGAL_CHAR_FOUND;
                }
            } else {
                /* callback(unassigned) for a BMP code point */
                *err = U_INVALID_CHAR_FOUND;
            }

            /* hand the offending (or pending) character back through the converter */
            args->converter->fromUChar32=sourceChar;
            break;
        }
    }/* end while(mySourceIndex<mySourceLength) */

    /*save the state and return */
    args->source = source;
    args->target = (char*)target;
}

/*
 * Script table used by the ISCII->Unicode converter after an ATR byte: it is
 * indexed with (byte following ATR) & 0x0F.
 * Column 0 is the script index (multiplied by DELTA to obtain the Unicode
 * block delta), column 1 the validity mask for that script.
 * NOTE(review): BENGALI appears twice and TELUGU uses KND_MASK; presumably
 * both are intentional. Confirm against the ISCII script codes.
 */
static const uint16_t lookupTable[][2]={
    { ZERO,       ZERO     }, /*DEFAULT*/
    { ZERO,       ZERO     }, /*ROMAN*/
    { DEVANAGARI, DEV_MASK },
    { BENGALI,    BNG_MASK },
    { TAMIL,      TML_MASK },
    { TELUGU,     KND_MASK },
    { BENGALI,    BNG_MASK },
    { ORIYA,      ORI_MASK },
    { KANNADA,    KND_MASK },
    { MALAYALAM,  MLM_MASK },
    { GUJARATI,   GJR_MASK },
    { GURMUKHI,   PNJ_MASK }
};

/*
 * WRITE_TO_TARGET_TO_U: emit one UTF-16 code unit.
 *
 * delta is first added to targetUniChar unless targetUniChar is <= ASCII_END
 * or is one of ZWJ, ZWNJ, DANDA, DOUBLE_DANDA. The addition modifies the
 * targetUniChar argument itself, so callers pass an lvalue.
 * The unit is then stored at *target (and offset at *offsets when offsets is
 * non-null). If target has reached args->targetLimit it is instead appended
 * to args->converter->UCharErrorBuffer and *err is set to
 * U_BUFFER_OVERFLOW_ERROR.
 * The source parameter is not used by the expansion.
 */
#define WRITE_TO_TARGET_TO_U(args,source,target,offsets,offset,targetUniChar,delta, err) UPRV_BLOCK_MACRO_BEGIN { \
    /* add offset to current Indic Block */                                              \
    if(targetUniChar>ASCII_END &&                                                        \
           targetUniChar != ZWJ &&                                                       \
           targetUniChar != ZWNJ &&                                                      \
           targetUniChar != DANDA &&                                                     \
           targetUniChar != DOUBLE_DANDA){                                               \
                                                                                         \
           targetUniChar+=(uint16_t)(delta);                                             \
    }                                                                                    \
    /* now write the targetUniChar */                                                    \
    if(target<args->targetLimit){                                                        \
        *(target)++ = (char16_t)targetUniChar;                                           \
        if(offsets){                                                                     \
            *(offsets)++ = (int32_t)(offset);                                            \
        }                                                                                \
    }else{                                                                               \
        args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++] =   \
            (char16_t)targetUniChar;                                                     \
        *err = U_BUFFER_OVERFLOW_ERROR;                                                  \
    }                                                                                    \
} UPRV_BLOCK_MACRO_END

/*
 * GET_MAPPING: look up the (Devanagari-normalized) Unicode value for an ISCII
 * byte in toUnicodeTable. For bytes above ASCII_END the result is replaced by
 * missingCharMarker when validityTable says it is not assigned in the script
 * selected by data->currentMaskToUnicode, except for VOCALLIC_RR while the
 * current delta is TELUGU_DELTA.
 */
#define GET_MAPPING(sourceChar,targetUniChar,data) UPRV_BLOCK_MACRO_BEGIN {              \
    targetUniChar = toUnicodeTable[(sourceChar)] ;                                       \
    /* is the code point valid in current script? */                                     \
    if(sourceChar> ASCII_END &&                                                          \
            (validityTable[(targetUniChar & 0x7F)] & data->currentMaskToUnicode)==0){    \
        /* Vocallic RR is assigned in ISCII Telugu and Unicode */                        \
        if(data->currentDeltaToUnicode!=(TELUGU_DELTA) ||                                \
                    targetUniChar!=VOCALLIC_RR){                                         \
            targetUniChar=missingCharMarker;                                             \
        }                                                                                \
    }                                                                                    \
} UPRV_BLOCK_MACRO_END

/***********
 *  Rules for ISCII to Unicode converter
 *  ISCII is a stateful encoding. To convert ISCII bytes to Unicode,
 *  which has both precomposed and decomposed forms of characters,
 *  pre-context and post-context need to be considered.
 *
 *  Post context
 *  i)  ATR : Attribute code is used to declare the font and script switching.
1166 * Currently we only switch scripts and font codes consumed without generating an error 1167 * ii) EXT : Extension code is used to declare switching to Sanskrit and for obscure, 1168 * obsolete characters 1169 * Pre context 1170 * i) Halant: if preceded by a halant then it is a explicit halant 1171 * ii) Nukta : 1172 * a) if preceded by a halant then it is a soft halant 1173 * b) if preceded by specific consonants and the ligatures have pre-composed 1174 * characters in Unicode then convert to pre-composed characters 1175 * iii) Danda: If Danda is preceded by a Danda then convert to Double Danda 1176 * 1177 */ 1178 1179 static void U_CALLCONV 1180 UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, UErrorCode* err) { 1181 const char *source = ( char *) args->source; 1182 char16_t *target = args->target; 1183 const char *sourceLimit = args->sourceLimit; 1184 const char16_t* targetLimit = args->targetLimit; 1185 uint32_t targetUniChar = 0x0000; 1186 uint8_t sourceChar = 0x0000; 1187 UConverterDataISCII* data; 1188 UChar32* toUnicodeStatus=nullptr; 1189 UChar32 tempTargetUniChar = 0x0000; 1190 char16_t* contextCharToUnicode= nullptr; 1191 UBool found; 1192 int i; 1193 int offset = 0; 1194 1195 if ((args->converter == nullptr) || (target < args->target) || (source < args->source)) { 1196 *err = U_ILLEGAL_ARGUMENT_ERROR; 1197 return; 1198 } 1199 1200 data = (UConverterDataISCII*)(args->converter->extraInfo); 1201 contextCharToUnicode = &data->contextCharToUnicode; /* contains previous ISCII codepoint visited */ 1202 toUnicodeStatus = (UChar32*)&args->converter->toUnicodeStatus;/* contains the mapping to Unicode of the above codepoint*/ 1203 1204 while (U_SUCCESS(*err) && source<sourceLimit) { 1205 1206 targetUniChar = missingCharMarker; 1207 1208 if (target < targetLimit) { 1209 sourceChar = (unsigned char)*(source)++; 1210 1211 /* look at the post-context perform special processing */ 1212 if (*contextCharToUnicode==ATR) { 1213 1214 /* If we 
have ATR in *contextCharToUnicode then we need to change our 1215 * state to the Indic Script specified by sourceChar 1216 */ 1217 1218 /* check if the sourceChar is supported script range*/ 1219 if ((uint8_t)(PNJ-sourceChar)<=PNJ-DEV) { 1220 data->currentDeltaToUnicode = (uint16_t)(lookupTable[sourceChar & 0x0F][0] * DELTA); 1221 data->currentMaskToUnicode = (MaskEnum)lookupTable[sourceChar & 0x0F][1]; 1222 } else if (sourceChar==DEF) { 1223 /* switch back to default */ 1224 data->currentDeltaToUnicode = data->defDeltaToUnicode; 1225 data->currentMaskToUnicode = data->defMaskToUnicode; 1226 } else { 1227 if ((sourceChar >= 0x21 && sourceChar <= 0x3F)) { 1228 /* these are display codes consume and continue */ 1229 } else { 1230 *err =U_ILLEGAL_CHAR_FOUND; 1231 /* reset */ 1232 *contextCharToUnicode=NO_CHAR_MARKER; 1233 goto CALLBACK; 1234 } 1235 } 1236 1237 /* reset */ 1238 *contextCharToUnicode=NO_CHAR_MARKER; 1239 1240 continue; 1241 1242 } else if (*contextCharToUnicode==EXT) { 1243 /* check if sourceChar is in 0xA1-0xEE range */ 1244 if ((uint8_t) (EXT_RANGE_END - sourceChar) <= (EXT_RANGE_END - EXT_RANGE_BEGIN)) { 1245 /* We currently support only Anudatta and Devanagari abbreviation sign */ 1246 if (sourceChar==0xBF || sourceChar == 0xB8) { 1247 targetUniChar = (sourceChar==0xBF) ? DEV_ABBR_SIGN : DEV_ANUDATTA; 1248 1249 /* find out if the mapping is valid in this state */ 1250 if (validityTable[(uint8_t)targetUniChar] & data->currentMaskToUnicode) { 1251 *contextCharToUnicode= NO_CHAR_MARKER; 1252 1253 /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. 
*/ 1254 if (data->prevToUnicodeStatus) { 1255 WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err); 1256 data->prevToUnicodeStatus = 0x0000; 1257 } 1258 /* write to target */ 1259 WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),targetUniChar,data->currentDeltaToUnicode,err); 1260 1261 continue; 1262 } 1263 } 1264 /* byte unit is unassigned */ 1265 targetUniChar = missingCharMarker; 1266 *err= U_INVALID_CHAR_FOUND; 1267 } else { 1268 /* only 0xA1 - 0xEE are legal after EXT char */ 1269 *contextCharToUnicode= NO_CHAR_MARKER; 1270 *err = U_ILLEGAL_CHAR_FOUND; 1271 } 1272 goto CALLBACK; 1273 } else if (*contextCharToUnicode==ISCII_INV) { 1274 if (sourceChar==ISCII_HALANT) { 1275 targetUniChar = 0x0020; /* replace with space according to Indic FAQ */ 1276 } else { 1277 targetUniChar = ZWJ; 1278 } 1279 1280 /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */ 1281 if (data->prevToUnicodeStatus) { 1282 WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err); 1283 data->prevToUnicodeStatus = 0x0000; 1284 } 1285 /* write to target */ 1286 WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),targetUniChar,data->currentDeltaToUnicode,err); 1287 /* reset */ 1288 *contextCharToUnicode=NO_CHAR_MARKER; 1289 } 1290 1291 /* look at the pre-context and perform special processing */ 1292 switch (sourceChar) { 1293 case ISCII_INV: 1294 case EXT: 1295 case ATR: 1296 *contextCharToUnicode = (char16_t)sourceChar; 1297 1298 if (*toUnicodeStatus != missingCharMarker) { 1299 /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. 
*/ 1300 if (data->prevToUnicodeStatus) { 1301 WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err); 1302 data->prevToUnicodeStatus = 0x0000; 1303 } 1304 WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),*toUnicodeStatus,data->currentDeltaToUnicode,err); 1305 *toUnicodeStatus = missingCharMarker; 1306 } 1307 continue; 1308 case ISCII_DANDA: 1309 /* handle double danda*/ 1310 if (*contextCharToUnicode== ISCII_DANDA) { 1311 targetUniChar = DOUBLE_DANDA; 1312 /* clear the context */ 1313 *contextCharToUnicode = NO_CHAR_MARKER; 1314 *toUnicodeStatus = missingCharMarker; 1315 } else { 1316 GET_MAPPING(sourceChar,targetUniChar,data); 1317 *contextCharToUnicode = sourceChar; 1318 } 1319 break; 1320 case ISCII_HALANT: 1321 /* handle explicit halant */ 1322 if (*contextCharToUnicode == ISCII_HALANT) { 1323 targetUniChar = ZWNJ; 1324 /* clear the context */ 1325 *contextCharToUnicode = NO_CHAR_MARKER; 1326 } else { 1327 GET_MAPPING(sourceChar,targetUniChar,data); 1328 *contextCharToUnicode = sourceChar; 1329 } 1330 break; 1331 case 0x0A: 1332 case 0x0D: 1333 data->resetToDefaultToUnicode = true; 1334 GET_MAPPING(sourceChar,targetUniChar,data) 1335 ; 1336 *contextCharToUnicode = sourceChar; 1337 break; 1338 1339 case ISCII_VOWEL_SIGN_E: 1340 i=1; 1341 found=false; 1342 for (; i<vowelSignESpecialCases[0][0]; i++) { 1343 U_ASSERT(i<UPRV_LENGTHOF(vowelSignESpecialCases)); 1344 if (vowelSignESpecialCases[i][0]==(uint8_t)*contextCharToUnicode) { 1345 targetUniChar=vowelSignESpecialCases[i][1]; 1346 found=true; 1347 break; 1348 } 1349 } 1350 if (found) { 1351 /* find out if the mapping is valid in this state */ 1352 if (validityTable[(uint8_t)targetUniChar] & data->currentMaskToUnicode) { 1353 /*targetUniChar += data->currentDeltaToUnicode ;*/ 1354 *contextCharToUnicode= NO_CHAR_MARKER; 1355 *toUnicodeStatus = missingCharMarker; 1356 break; 1357 } 1358 } 1359 
GET_MAPPING(sourceChar,targetUniChar,data); 1360 *contextCharToUnicode = sourceChar; 1361 break; 1362 1363 case ISCII_NUKTA: 1364 /* handle soft halant */ 1365 if (*contextCharToUnicode == ISCII_HALANT) { 1366 targetUniChar = ZWJ; 1367 /* clear the context */ 1368 *contextCharToUnicode = NO_CHAR_MARKER; 1369 break; 1370 } else if (data->currentDeltaToUnicode == PNJ_DELTA && data->contextCharToUnicode == 0xc0) { 1371 /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */ 1372 if (data->prevToUnicodeStatus) { 1373 WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err); 1374 data->prevToUnicodeStatus = 0x0000; 1375 } 1376 /* We got here because ISCII_NUKTA was preceded by 0xc0 and we are converting Gurmukhi. 1377 * In that case we must convert (0xc0 0xe9) to (\u0a5c\u0a4d\u0a39). 1378 */ 1379 targetUniChar = PNJ_RRA; 1380 WRITE_TO_TARGET_TO_U(args, source, target, args->offsets, (source-args->source)-2, targetUniChar, 0, err); 1381 if (U_SUCCESS(*err)) { 1382 targetUniChar = PNJ_SIGN_VIRAMA; 1383 WRITE_TO_TARGET_TO_U(args, source, target, args->offsets, (source-args->source)-2, targetUniChar, 0, err); 1384 if (U_SUCCESS(*err)) { 1385 targetUniChar = PNJ_HA; 1386 WRITE_TO_TARGET_TO_U(args, source, target, args->offsets, (source-args->source)-2, targetUniChar, 0, err); 1387 } else { 1388 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= PNJ_HA; 1389 } 1390 } else { 1391 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= PNJ_SIGN_VIRAMA; 1392 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= PNJ_HA; 1393 } 1394 *toUnicodeStatus = missingCharMarker; 1395 data->contextCharToUnicode = NO_CHAR_MARKER; 1396 continue; 1397 } else { 1398 /* try to handle <CHAR> + ISCII_NUKTA special mappings */ 1399 i=1; 1400 found =false; 1401 for (; i<nuktaSpecialCases[0][0]; i++) { 1402 if 
(nuktaSpecialCases[i][0]==(uint8_t) 1403 *contextCharToUnicode) { 1404 targetUniChar=nuktaSpecialCases[i][1]; 1405 found =true; 1406 break; 1407 } 1408 } 1409 if (found) { 1410 /* find out if the mapping is valid in this state */ 1411 if (validityTable[(uint8_t)targetUniChar] & data->currentMaskToUnicode) { 1412 /*targetUniChar += data->currentDeltaToUnicode ;*/ 1413 *contextCharToUnicode= NO_CHAR_MARKER; 1414 *toUnicodeStatus = missingCharMarker; 1415 if (data->currentDeltaToUnicode == PNJ_DELTA) { 1416 /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */ 1417 if (data->prevToUnicodeStatus) { 1418 WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err); 1419 data->prevToUnicodeStatus = 0x0000; 1420 } 1421 WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),targetUniChar,data->currentDeltaToUnicode,err); 1422 continue; 1423 } 1424 break; 1425 } 1426 /* else fall through to default */ 1427 } 1428 /* else fall through to default */ 1429 U_FALLTHROUGH; 1430 } 1431 default:GET_MAPPING(sourceChar,targetUniChar,data) 1432 ; 1433 *contextCharToUnicode = sourceChar; 1434 break; 1435 } 1436 1437 if (*toUnicodeStatus != missingCharMarker) { 1438 /* Check to make sure that consonant clusters are handled correct for Gurmukhi script. */ 1439 if (data->currentDeltaToUnicode == PNJ_DELTA && data->prevToUnicodeStatus != 0 && isPNJConsonant(data->prevToUnicodeStatus) && 1440 (*toUnicodeStatus + PNJ_DELTA) == PNJ_SIGN_VIRAMA && ((UChar32)(targetUniChar + PNJ_DELTA) == data->prevToUnicodeStatus)) { 1441 /* Consonant clusters C + HALANT + C should be encoded as ADHAK + C */ 1442 offset = (int)(source-args->source - 3); 1443 tempTargetUniChar = PNJ_ADHAK; /* This is necessary to avoid some compiler warnings. 
*/ 1444 WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,offset,tempTargetUniChar,0,err); 1445 WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,offset,data->prevToUnicodeStatus,0,err); 1446 data->prevToUnicodeStatus = 0x0000; /* reset the previous unicode code point */ 1447 *toUnicodeStatus = missingCharMarker; 1448 continue; 1449 } else { 1450 /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */ 1451 if (data->prevToUnicodeStatus) { 1452 WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err); 1453 data->prevToUnicodeStatus = 0x0000; 1454 } 1455 /* Check to make sure that Bindi and Tippi are handled correctly for Gurmukhi script. 1456 * If 0xA2 is preceded by a codepoint in the PNJ_BINDI_TIPPI_SET then the target codepoint should be Tippi instead of Bindi. 1457 */ 1458 if (data->currentDeltaToUnicode == PNJ_DELTA && (targetUniChar + PNJ_DELTA) == PNJ_BINDI && isPNJBindiTippi((*toUnicodeStatus + PNJ_DELTA))) { 1459 targetUniChar = PNJ_TIPPI - PNJ_DELTA; 1460 WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),*toUnicodeStatus,PNJ_DELTA,err); 1461 } else if (data->currentDeltaToUnicode == PNJ_DELTA && (targetUniChar + PNJ_DELTA) == PNJ_SIGN_VIRAMA && isPNJConsonant((*toUnicodeStatus + PNJ_DELTA))) { 1462 /* Store the current toUnicodeStatus code point for later handling of consonant cluster in Gurmukhi. 
*/ 1463 data->prevToUnicodeStatus = *toUnicodeStatus + PNJ_DELTA; 1464 } else { 1465 /* write the previously mapped codepoint */ 1466 WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),*toUnicodeStatus,data->currentDeltaToUnicode,err); 1467 } 1468 } 1469 *toUnicodeStatus = missingCharMarker; 1470 } 1471 1472 if (targetUniChar != missingCharMarker) { 1473 /* now save the targetUniChar for delayed write */ 1474 *toUnicodeStatus = (char16_t) targetUniChar; 1475 if (data->resetToDefaultToUnicode) { 1476 data->currentDeltaToUnicode = data->defDeltaToUnicode; 1477 data->currentMaskToUnicode = data->defMaskToUnicode; 1478 data->resetToDefaultToUnicode=false; 1479 } 1480 } else { 1481 1482 /* we reach here only if targetUniChar == missingCharMarker 1483 * so assign codes to reason and err 1484 */ 1485 *err = U_INVALID_CHAR_FOUND; 1486 CALLBACK: 1487 args->converter->toUBytes[0] = sourceChar; 1488 args->converter->toULength = 1; 1489 break; 1490 } 1491 1492 } else { 1493 *err =U_BUFFER_OVERFLOW_ERROR; 1494 break; 1495 } 1496 } 1497 1498 if (U_SUCCESS(*err) && args->flush && source == sourceLimit) { 1499 /* end of the input stream */ 1500 UConverter *cnv = args->converter; 1501 1502 if (*contextCharToUnicode==ATR || *contextCharToUnicode==EXT || *contextCharToUnicode==ISCII_INV) { 1503 /* set toUBytes[] */ 1504 cnv->toUBytes[0] = (uint8_t)*contextCharToUnicode; 1505 cnv->toULength = 1; 1506 1507 /* avoid looping on truncated sequences */ 1508 *contextCharToUnicode = NO_CHAR_MARKER; 1509 } else { 1510 cnv->toULength = 0; 1511 } 1512 1513 if (*toUnicodeStatus != missingCharMarker) { 1514 /* output a remaining target character */ 1515 WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source - args->source -1),*toUnicodeStatus,data->currentDeltaToUnicode,err); 1516 *toUnicodeStatus = missingCharMarker; 1517 } 1518 } 1519 1520 args->target = target; 1521 args->source = source; 1522 } 1523 1524 /* structure for SafeClone calculations */ 1525 struct 
cloneISCIIStruct { 1526 UConverter cnv; 1527 UConverterDataISCII mydata; 1528 }; 1529 1530 static UConverter * U_CALLCONV 1531 _ISCII_SafeClone(const UConverter *cnv, 1532 void *stackBuffer, 1533 int32_t *pBufferSize, 1534 UErrorCode *status) 1535 { 1536 struct cloneISCIIStruct * localClone; 1537 int32_t bufferSizeNeeded = sizeof(struct cloneISCIIStruct); 1538 1539 if (U_FAILURE(*status)) { 1540 return nullptr; 1541 } 1542 1543 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */ 1544 *pBufferSize = bufferSizeNeeded; 1545 return nullptr; 1546 } 1547 1548 localClone = (struct cloneISCIIStruct *)stackBuffer; 1549 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ 1550 1551 uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(UConverterDataISCII)); 1552 localClone->cnv.extraInfo = &localClone->mydata; 1553 localClone->cnv.isExtraLocal = true; 1554 1555 return &localClone->cnv; 1556 } 1557 1558 static void U_CALLCONV 1559 _ISCIIGetUnicodeSet(const UConverter *cnv, 1560 const USetAdder *sa, 1561 UConverterUnicodeSet which, 1562 UErrorCode *pErrorCode) 1563 { 1564 (void)cnv; 1565 (void)which; 1566 (void)pErrorCode; 1567 int32_t idx, script; 1568 uint8_t mask; 1569 1570 /* Since all ISCII versions allow switching to other ISCII 1571 scripts, we add all roundtrippable characters to this set. 
*/ 1572 sa->addRange(sa->set, 0, ASCII_END); 1573 for (script = DEVANAGARI; script <= MALAYALAM; script++) { 1574 mask = (uint8_t)(lookupInitialData[script].maskEnum); 1575 for (idx = 0; idx < DELTA; idx++) { 1576 /* added check for TELUGU character */ 1577 if ((validityTable[idx] & mask) || (script==TELUGU && idx==0x31)) { 1578 sa->add(sa->set, idx + (script * DELTA) + INDIC_BLOCK_BEGIN); 1579 } 1580 } 1581 } 1582 sa->add(sa->set, DANDA); 1583 sa->add(sa->set, DOUBLE_DANDA); 1584 sa->add(sa->set, ZWNJ); 1585 sa->add(sa->set, ZWJ); 1586 } 1587 U_CDECL_END 1588 static const UConverterImpl _ISCIIImpl={ 1589 1590 UCNV_ISCII, 1591 1592 nullptr, 1593 nullptr, 1594 1595 _ISCIIOpen, 1596 _ISCIIClose, 1597 _ISCIIReset, 1598 1599 UConverter_toUnicode_ISCII_OFFSETS_LOGIC, 1600 UConverter_toUnicode_ISCII_OFFSETS_LOGIC, 1601 UConverter_fromUnicode_ISCII_OFFSETS_LOGIC, 1602 UConverter_fromUnicode_ISCII_OFFSETS_LOGIC, 1603 nullptr, 1604 1605 nullptr, 1606 _ISCIIgetName, 1607 nullptr, 1608 _ISCII_SafeClone, 1609 _ISCIIGetUnicodeSet, 1610 nullptr, 1611 nullptr 1612 }; 1613 1614 static const UConverterStaticData _ISCIIStaticData={ 1615 sizeof(UConverterStaticData), 1616 "ISCII", 1617 0, 1618 UCNV_IBM, 1619 UCNV_ISCII, 1620 1, 1621 4, 1622 { 0x1a, 0, 0, 0 }, 1623 0x1, 1624 false, 1625 false, 1626 0x0, 1627 0x0, 1628 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }, /* reserved */ 1629 1630 }; 1631 1632 const UConverterSharedData _ISCIIData= 1633 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISCIIStaticData, &_ISCIIImpl); 1634 1635 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */