utf8.c (13228B)
1 /* This Source Code Form is subject to the terms of the Mozilla Public 2 * License, v. 2.0. If a copy of the MPL was not distributed with this 3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 4 5 #include "seccomon.h" 6 #include "secport.h" 7 8 /* 9 * From RFC 2044: 10 * 11 * UCS-4 range (hex.) UTF-8 octet sequence (binary) 12 * 0000 0000-0000 007F 0xxxxxxx 13 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx 14 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx 15 * 0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 16 * 0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 17 * 0400 0000-7FFF FFFF 1111110x 10xxxxxx ... 10xxxxxx 18 */ 19 20 /* 21 * From http://www.imc.org/draft-hoffman-utf16 22 * 23 * For U on [0x00010000,0x0010FFFF]: Let U' = U - 0x00010000 24 * 25 * U' = yyyyyyyyyyxxxxxxxxxx 26 * W1 = 110110yyyyyyyyyy 27 * W2 = 110111xxxxxxxxxx 28 */ 29 30 /* 31 * This code is assuming NETWORK BYTE ORDER for the 16- and 32-bit 32 * character values. If you wish to use this code for working with 33 * host byte order values, define the following: 34 * 35 * #if IS_BIG_ENDIAN 36 * #define L_0 0 37 * #define L_1 1 38 * #define L_2 2 39 * #define L_3 3 40 * #define H_0 0 41 * #define H_1 1 42 * #else / * not everyone has elif * / 43 * #if IS_LITTLE_ENDIAN 44 * #define L_0 3 45 * #define L_1 2 46 * #define L_2 1 47 * #define L_3 0 48 * #define H_0 1 49 * #define H_1 0 50 * #else 51 * #error "PDP and NUXI support deferred" 52 * #endif / * IS_LITTLE_ENDIAN * / 53 * #endif / * IS_BIG_ENDIAN * / 54 */ 55 56 #define L_0 0 57 #define L_1 1 58 #define L_2 2 59 #define L_3 3 60 #define H_0 0 61 #define H_1 1 62 63 #define BAD_UTF8 ((PRUint32)-1) 64 65 /* 66 * Parse a single UTF-8 character per the spec. in section 3.9 (D36) 67 * of Unicode 4.0.0. 68 * 69 * Parameters: 70 * index - Points to the byte offset in inBuf of character to read. On success, 71 * updated to the offset of the following character. 72 * inBuf - Input buffer, UTF-8 encoded 73 * inbufLen - Length of input buffer, in bytes. 74 * 75 * Returns: 76 * Success - The UCS4 encoded character 77 * Failure - BAD_UTF8 78 */ 79 static PRUint32 80 sec_port_read_utf8(unsigned int *index, unsigned char *inBuf, unsigned int inBufLen) 81 { 82 PRUint32 result; 83 unsigned int i = *index; 84 int bytes_left; 85 PRUint32 min_value; 86 87 PORT_Assert(i < inBufLen); 88 89 if ((inBuf[i] & 0x80) == 0x00) { 90 result = inBuf[i++]; 91 bytes_left = 0; 92 min_value = 0; 93 } else if ((inBuf[i] & 0xE0) == 0xC0) { 94 result = inBuf[i++] & 0x1F; 95 bytes_left = 1; 96 min_value = 0x80; 97 } else if ((inBuf[i] & 0xF0) == 0xE0) { 98 result = inBuf[i++] & 0x0F; 99 bytes_left = 2; 100 min_value = 0x800; 101 } else if ((inBuf[i] & 0xF8) == 0xF0) { 102 result = inBuf[i++] & 0x07; 103 bytes_left = 3; 104 min_value = 0x10000; 105 } else { 106 return BAD_UTF8; 107 } 108 109 while (bytes_left--) { 110 if (i >= inBufLen || (inBuf[i] & 0xC0) != 0x80) 111 return BAD_UTF8; 112 result = (result << 6) | (inBuf[i++] & 0x3F); 113 } 114 115 /* Check for overlong sequences, surrogates, and outside unicode range */ 116 if (result < min_value || (result & 0xFFFFF800) == 0xD800 || result > 0x10FFFF) { 117 return BAD_UTF8; 118 } 119 120 *index = i; 121 return result; 122 } 123 124 PRBool 125 sec_port_ucs4_utf8_conversion_function( 126 PRBool toUnicode, 127 unsigned char *inBuf, 128 unsigned int inBufLen, 129 unsigned char *outBuf, 130 unsigned int maxOutBufLen, 131 unsigned int *outBufLen) 132 { 133 PORT_Assert((unsigned int *)NULL != outBufLen); 134 135 if (toUnicode) { 136 unsigned int i, len = 0; 137 138 for (i = 0; i < inBufLen;) { 139 if ((inBuf[i] & 0x80) == 0x00) 140 i += 1; 141 else if ((inBuf[i] & 0xE0) == 0xC0) 142 i += 2; 143 else if ((inBuf[i] & 0xF0) == 0xE0) 144 i += 3; 145 else if ((inBuf[i] & 0xF8) == 0xF0) 146 i += 4; 147 else 148 return PR_FALSE; 149 150 len += 4; 151 } 152 153 if (len > maxOutBufLen) { 154 *outBufLen = len; 155 return PR_FALSE; 156 } 157 158 len = 0; 159 160 for (i = 0; i < inBufLen;) { 161 PRUint32 ucs4 = sec_port_read_utf8(&i, inBuf, inBufLen); 162 163 if (ucs4 == BAD_UTF8) 164 return PR_FALSE; 165 166 outBuf[len + L_0] = 0x00; 167 outBuf[len + L_1] = (unsigned char)(ucs4 >> 16); 168 outBuf[len + L_2] = (unsigned char)(ucs4 >> 8); 169 outBuf[len + L_3] = (unsigned char)ucs4; 170 171 len += 4; 172 } 173 174 *outBufLen = len; 175 return PR_TRUE; 176 } else { 177 unsigned int i, len = 0; 178 PORT_Assert((inBufLen % 4) == 0); 179 if ((inBufLen % 4) != 0) { 180 *outBufLen = 0; 181 return PR_FALSE; 182 } 183 184 for (i = 0; i < inBufLen; i += 4) { 185 if ((inBuf[i + L_0] > 0x00) || (inBuf[i + L_1] > 0x10)) { 186 *outBufLen = 0; 187 return PR_FALSE; 188 } else if (inBuf[i + L_1] >= 0x01) 189 len += 4; 190 else if (inBuf[i + L_2] >= 0x08) 191 len += 3; 192 else if ((inBuf[i + L_2] > 0x00) || (inBuf[i + L_3] >= 0x80)) 193 len += 2; 194 else 195 len += 1; 196 } 197 198 if (len > maxOutBufLen) { 199 *outBufLen = len; 200 return PR_FALSE; 201 } 202 203 len = 0; 204 205 for (i = 0; i < inBufLen; i += 4) { 206 if (inBuf[i + L_1] >= 0x01) { 207 /* 0001 0000-001F FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ 208 /* 00000000 000abcde fghijklm nopqrstu -> 209 11110abc 10defghi 10jklmno 10pqrstu */ 210 211 outBuf[len + 0] = 0xF0 | ((inBuf[i + L_1] & 0x1C) >> 2); 212 outBuf[len + 1] = 0x80 | ((inBuf[i + L_1] & 0x03) << 4) | ((inBuf[i + L_2] & 0xF0) >> 4); 213 outBuf[len + 2] = 0x80 | ((inBuf[i + L_2] & 0x0F) << 2) | ((inBuf[i + L_3] & 0xC0) >> 6); 214 outBuf[len + 3] = 0x80 | ((inBuf[i + L_3] & 0x3F) >> 0); 215 216 len += 4; 217 } else if (inBuf[i + L_2] >= 0x08) { 218 /* 0000 0800-0000 FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */ 219 /* 00000000 00000000 abcdefgh ijklmnop -> 220 1110abcd 10efghij 10klmnop */ 221 222 outBuf[len + 0] = 0xE0 | ((inBuf[i + L_2] & 0xF0) >> 4); 223 outBuf[len + 1] = 0x80 | ((inBuf[i + L_2] & 0x0F) << 2) | ((inBuf[i + L_3] & 0xC0) >> 6); 224 outBuf[len + 2] = 0x80 | ((inBuf[i + L_3] & 0x3F) >> 0); 225 226 len += 3; 227 } else if ((inBuf[i + L_2] > 0x00) || (inBuf[i + L_3] >= 0x80)) { 228 /* 0000 0080-0000 07FF -> 110xxxxx 10xxxxxx */ 229 /* 00000000 00000000 00000abc defghijk -> 230 110abcde 10fghijk */ 231 232 outBuf[len + 0] = 0xC0 | ((inBuf[i + L_2] & 0x07) << 2) | ((inBuf[i + L_3] & 0xC0) >> 6); 233 outBuf[len + 1] = 0x80 | ((inBuf[i + L_3] & 0x3F) >> 0); 234 235 len += 2; 236 } else { 237 /* 0000 0000-0000 007F -> 0xxxxxx */ 238 /* 00000000 00000000 00000000 0abcdefg -> 239 0abcdefg */ 240 241 outBuf[len + 0] = (inBuf[i + L_3] & 0x7F); 242 243 len += 1; 244 } 245 } 246 247 *outBufLen = len; 248 return PR_TRUE; 249 } 250 } 251 252 PRBool 253 sec_port_ucs2_utf8_conversion_function( 254 PRBool toUnicode, 255 unsigned char *inBuf, 256 unsigned int inBufLen, 257 unsigned char *outBuf, 258 unsigned int maxOutBufLen, 259 unsigned int *outBufLen) 260 { 261 PORT_Assert((unsigned int *)NULL != outBufLen); 262 263 if (toUnicode) { 264 unsigned int i, len = 0; 265 266 for (i = 0; i < inBufLen;) { 267 if ((inBuf[i] & 0x80) == 0x00) { 268 i += 1; 269 len += 2; 270 } else if ((inBuf[i] & 0xE0) == 0xC0) { 271 i += 2; 272 len += 2; 273 } else if ((inBuf[i] & 0xF0) == 0xE0) { 274 i += 3; 275 len += 2; 276 } else if ((inBuf[i] & 0xF8) == 0xF0) { 277 i += 4; 278 len += 4; 279 } else 280 return PR_FALSE; 281 } 282 283 if (len > maxOutBufLen) { 284 *outBufLen = len; 285 return PR_FALSE; 286 } 287 288 len = 0; 289 290 for (i = 0; i < inBufLen;) { 291 PRUint32 ucs4 = sec_port_read_utf8(&i, inBuf, inBufLen); 292 293 if (ucs4 == BAD_UTF8) 294 return PR_FALSE; 295 296 if (ucs4 < 0x10000) { 297 outBuf[len + H_0] = (unsigned char)(ucs4 >> 8); 298 outBuf[len + H_1] = (unsigned char)ucs4; 299 len += 2; 300 } else { 301 ucs4 -= 0x10000; 302 outBuf[len + 0 + H_0] = (unsigned char)(0xD8 | ((ucs4 >> 18) & 0x3)); 303 outBuf[len + 0 + H_1] = (unsigned char)(ucs4 >> 10); 304 outBuf[len + 2 + H_0] = (unsigned char)(0xDC | ((ucs4 >> 8) & 0x3)); 305 outBuf[len + 2 + H_1] = (unsigned char)ucs4; 306 len += 4; 307 } 308 } 309 310 *outBufLen = len; 311 return PR_TRUE; 312 } else { 313 unsigned int i, len = 0; 314 PORT_Assert((inBufLen % 2) == 0); 315 if ((inBufLen % 2) != 0) { 316 *outBufLen = 0; 317 return PR_FALSE; 318 } 319 320 for (i = 0; i < inBufLen; i += 2) { 321 if ((inBuf[i + H_0] == 0x00) && ((inBuf[i + H_1] & 0x80) == 0x00)) 322 len += 1; 323 else if (inBuf[i + H_0] < 0x08) 324 len += 2; 325 else if (((inBuf[i + H_0] & 0xFC) == 0xD8)) { 326 if (((inBufLen - i) > 2) && ((inBuf[i + 2 + H_0] & 0xFC) == 0xDC)) { 327 i += 2; 328 len += 4; 329 } else { 330 return PR_FALSE; 331 } 332 } else if ((inBuf[i + H_0] & 0xFC) == 0xDC) { 333 return PR_FALSE; 334 } else { 335 len += 3; 336 } 337 } 338 339 if (len > maxOutBufLen) { 340 *outBufLen = len; 341 return PR_FALSE; 342 } 343 344 len = 0; 345 346 for (i = 0; i < inBufLen; i += 2) { 347 if ((inBuf[i + H_0] == 0x00) && ((inBuf[i + H_1] & 0x80) == 0x00)) { 348 /* 0000-007F -> 0xxxxxx */ 349 /* 00000000 0abcdefg -> 0abcdefg */ 350 351 outBuf[len] = inBuf[i + H_1] & 0x7F; 352 353 len += 1; 354 } else if (inBuf[i + H_0] < 0x08) { 355 /* 0080-07FF -> 110xxxxx 10xxxxxx */ 356 /* 00000abc defghijk -> 110abcde 10fghijk */ 357 358 outBuf[len + 0] = 0xC0 | ((inBuf[i + H_0] & 0x07) << 2) | ((inBuf[i + H_1] & 0xC0) >> 6); 359 outBuf[len + 1] = 0x80 | ((inBuf[i + H_1] & 0x3F) >> 0); 360 361 len += 2; 362 } else if ((inBuf[i + H_0] & 0xFC) == 0xD8) { 363 int abcde, BCDE; 364 365 PORT_Assert(((inBufLen - i) > 2) && ((inBuf[i + 2 + H_0] & 0xFC) == 0xDC)); 366 367 /* D800-DBFF DC00-DFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ 368 /* 110110BC DEfghijk 110111lm nopqrstu -> 369 { Let abcde = BCDE + 1 } 370 11110abc 10defghi 10jklmno 10pqrstu */ 371 372 BCDE = ((inBuf[i + H_0] & 0x03) << 2) | ((inBuf[i + H_1] & 0xC0) >> 6); 373 abcde = BCDE + 1; 374 375 outBuf[len + 0] = 0xF0 | ((abcde & 0x1C) >> 2); 376 outBuf[len + 1] = 0x80 | ((abcde & 0x03) << 4) | ((inBuf[i + 0 + H_1] & 0x3C) >> 2); 377 outBuf[len + 2] = 0x80 | ((inBuf[i + 0 + H_1] & 0x03) << 4) | ((inBuf[i + 2 + H_0] & 0x03) << 2) | ((inBuf[i + 2 + H_1] & 0xC0) >> 6); 378 outBuf[len + 3] = 0x80 | ((inBuf[i + 2 + H_1] & 0x3F) >> 0); 379 380 i += 2; 381 len += 4; 382 } else { 383 /* 0800-FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */ 384 /* abcdefgh ijklmnop -> 1110abcd 10efghij 10klmnop */ 385 386 outBuf[len + 0] = 0xE0 | ((inBuf[i + H_0] & 0xF0) >> 4); 387 outBuf[len + 1] = 0x80 | ((inBuf[i + H_0] & 0x0F) << 2) | ((inBuf[i + H_1] & 0xC0) >> 6); 388 outBuf[len + 2] = 0x80 | ((inBuf[i + H_1] & 0x3F) >> 0); 389 390 len += 3; 391 } 392 } 393 394 *outBufLen = len; 395 return PR_TRUE; 396 } 397 } 398 399 PRBool 400 sec_port_iso88591_utf8_conversion_function( 401 const unsigned char *inBuf, 402 unsigned int inBufLen, 403 unsigned char *outBuf, 404 unsigned int maxOutBufLen, 405 unsigned int *outBufLen) 406 { 407 unsigned int i, len = 0; 408 409 PORT_Assert((unsigned int *)NULL != outBufLen); 410 411 for (i = 0; i < inBufLen; i++) { 412 if ((inBuf[i] & 0x80) == 0x00) 413 len += 1; 414 else 415 len += 2; 416 } 417 418 if (len > maxOutBufLen) { 419 *outBufLen = len; 420 return PR_FALSE; 421 } 422 423 len = 0; 424 425 for (i = 0; i < inBufLen; i++) { 426 if ((inBuf[i] & 0x80) == 0x00) { 427 /* 00-7F -> 0xxxxxxx */ 428 /* 0abcdefg -> 0abcdefg */ 429 430 outBuf[len] = inBuf[i]; 431 len += 1; 432 } else { 433 /* 80-FF <- 110xxxxx 10xxxxxx */ 434 /* 00000000 abcdefgh -> 110000ab 10cdefgh */ 435 436 outBuf[len + 0] = 0xC0 | ((inBuf[i] & 0xC0) >> 6); 437 outBuf[len + 1] = 0x80 | ((inBuf[i] & 0x3F) >> 0); 438 439 len += 2; 440 } 441 } 442 443 *outBufLen = len; 444 return PR_TRUE; 445 }