hb-utf.hh (11163B)
1 /* 2 * Copyright © 2011,2012,2014 Google, Inc. 3 * 4 * This is part of HarfBuzz, a text shaping library. 5 * 6 * Permission is hereby granted, without written agreement and without 7 * license or royalty fees, to use, copy, modify, and distribute this 8 * software and its documentation for any purpose, provided that the 9 * above copyright notice and the following two paragraphs appear in 10 * all copies of this software. 11 * 12 * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR 13 * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES 14 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN 15 * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH 16 * DAMAGE. 17 * 18 * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, 19 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 20 * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS 21 * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO 22 * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 23 * 24 * Google Author(s): Behdad Esfahbod 25 */ 26 27 #ifndef HB_UTF_HH 28 #define HB_UTF_HH 29 30 #include "hb.hh" 31 32 #include "hb-open-type.hh" 33 34 35 struct hb_utf8_t 36 { 37 typedef uint8_t codepoint_t; 38 static constexpr unsigned max_len = 4; 39 40 static inline const codepoint_t * 41 next (const codepoint_t *text, 42 const codepoint_t *end, 43 hb_codepoint_t *unicode, 44 hb_codepoint_t replacement) 45 { 46 /* Written to only accept well-formed sequences. 47 * Based on ideas from ICU's U8_NEXT. 48 * Generates one "replacement" for each ill-formed byte. */ 49 50 hb_codepoint_t c = *text++; 51 52 if (c > 0x7Fu) 53 { 54 if (hb_in_range<hb_codepoint_t> (c, 0xC2u, 0xDFu)) /* Two-byte */ 55 { 56 unsigned int t1; 57 if (likely (text < end && 58 (t1 = text[0] - 0x80u) <= 0x3Fu)) 59 { 60 c = ((c&0x1Fu)<<6) | t1; 61 text++; 62 } 63 else 64 goto error; 65 } 66 else if (hb_in_range<hb_codepoint_t> (c, 0xE0u, 0xEFu)) /* Three-byte */ 67 { 68 unsigned int t1, t2; 69 if (likely (1 < end - text && 70 (t1 = text[0] - 0x80u) <= 0x3Fu && 71 (t2 = text[1] - 0x80u) <= 0x3Fu)) 72 { 73 c = ((c&0xFu)<<12) | (t1<<6) | t2; 74 if (unlikely (c < 0x0800u || hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu))) 75 goto error; 76 text += 2; 77 } 78 else 79 goto error; 80 } 81 else if (hb_in_range<hb_codepoint_t> (c, 0xF0u, 0xF4u)) /* Four-byte */ 82 { 83 unsigned int t1, t2, t3; 84 if (likely (2 < end - text && 85 (t1 = text[0] - 0x80u) <= 0x3Fu && 86 (t2 = text[1] - 0x80u) <= 0x3Fu && 87 (t3 = text[2] - 0x80u) <= 0x3Fu)) 88 { 89 c = ((c&0x7u)<<18) | (t1<<12) | (t2<<6) | t3; 90 if (unlikely (!hb_in_range<hb_codepoint_t> (c, 0x10000u, 0x10FFFFu))) 91 goto error; 92 text += 3; 93 } 94 else 95 goto error; 96 } 97 else 98 goto error; 99 } 100 101 *unicode = c; 102 return text; 103 104 error: 105 *unicode = replacement; 106 return text; 107 } 108 109 static inline const codepoint_t * 110 prev (const codepoint_t *text, 111 const codepoint_t *start, 112 hb_codepoint_t *unicode, 113 hb_codepoint_t replacement) 114 { 115 const codepoint_t *end = text--; 116 while (start < text && (*text & 0xc0) == 0x80 && end - text < 4) 117 text--; 118 119 if (likely (next (text, end, unicode, replacement) == end)) 120 return text; 121 122 *unicode = replacement; 123 return end - 1; 124 } 125 126 static unsigned int 127 strlen (const codepoint_t *text) 128 { return ::strlen ((const char *) text); } 129 130 static unsigned int 131 encode_len (hb_codepoint_t unicode) 132 { 133 if (unicode < 0x0080u) return 1; 134 if (unicode < 0x0800u) return 2; 135 if (unicode < 0x10000u) return 3; 136 if (unicode < 0x110000u) return 4; 137 return 3; 138 } 139 140 static codepoint_t * 141 encode (codepoint_t *text, 142 const codepoint_t *end, 143 hb_codepoint_t unicode) 144 { 145 if (unlikely (unicode >= 0xD800u && (unicode <= 0xDFFFu || unicode > 0x10FFFFu))) 146 unicode = 0xFFFDu; 147 if (unicode < 0x0080u) 148 *text++ = unicode; 149 else if (unicode < 0x0800u) 150 { 151 if (end - text >= 2) 152 { 153 *text++ = 0xC0u + (0x1Fu & (unicode >> 6)); 154 *text++ = 0x80u + (0x3Fu & (unicode )); 155 } 156 } 157 else if (unicode < 0x10000u) 158 { 159 if (end - text >= 3) 160 { 161 *text++ = 0xE0u + (0x0Fu & (unicode >> 12)); 162 *text++ = 0x80u + (0x3Fu & (unicode >> 6)); 163 *text++ = 0x80u + (0x3Fu & (unicode )); 164 } 165 } 166 else 167 { 168 if (end - text >= 4) 169 { 170 *text++ = 0xF0u + (0x07u & (unicode >> 18)); 171 *text++ = 0x80u + (0x3Fu & (unicode >> 12)); 172 *text++ = 0x80u + (0x3Fu & (unicode >> 6)); 173 *text++ = 0x80u + (0x3Fu & (unicode )); 174 } 175 } 176 return text; 177 } 178 }; 179 180 181 template <typename TCodepoint> 182 struct hb_utf16_xe_t 183 { 184 static_assert (sizeof (TCodepoint) == 2, ""); 185 typedef TCodepoint codepoint_t; 186 static constexpr unsigned max_len = 2; 187 188 static inline const codepoint_t * 189 next (const codepoint_t *text, 190 const codepoint_t *end, 191 hb_codepoint_t *unicode, 192 hb_codepoint_t replacement) 193 { 194 hb_codepoint_t c = *text++; 195 196 if (likely (!hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu))) 197 { 198 *unicode = c; 199 return text; 200 } 201 202 if (likely (c <= 0xDBFFu && text < end)) 203 { 204 /* High-surrogate in c */ 205 hb_codepoint_t l = *text; 206 if (likely (hb_in_range<hb_codepoint_t> (l, 0xDC00u, 0xDFFFu))) 207 { 208 /* Low-surrogate in l */ 209 *unicode = (c << 10) + l - ((0xD800u << 10) - 0x10000u + 0xDC00u); 210 text++; 211 return text; 212 } 213 } 214 215 /* Lonely / out-of-order surrogate. */ 216 *unicode = replacement; 217 return text; 218 } 219 220 static inline const codepoint_t * 221 prev (const codepoint_t *text, 222 const codepoint_t *start, 223 hb_codepoint_t *unicode, 224 hb_codepoint_t replacement) 225 { 226 hb_codepoint_t c = *--text; 227 228 if (likely (!hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu))) 229 { 230 *unicode = c; 231 return text; 232 } 233 234 if (likely (c >= 0xDC00u && start < text)) 235 { 236 /* Low-surrogate in c */ 237 hb_codepoint_t h = text[-1]; 238 if (likely (hb_in_range<hb_codepoint_t> (h, 0xD800u, 0xDBFFu))) 239 { 240 /* High-surrogate in h */ 241 *unicode = (h << 10) + c - ((0xD800u << 10) - 0x10000u + 0xDC00u); 242 text--; 243 return text; 244 } 245 } 246 247 /* Lonely / out-of-order surrogate. */ 248 *unicode = replacement; 249 return text; 250 } 251 252 253 static unsigned int 254 strlen (const codepoint_t *text) 255 { 256 unsigned int l = 0; 257 while (*text++) l++; 258 return l; 259 } 260 261 static unsigned int 262 encode_len (hb_codepoint_t unicode) 263 { 264 return unicode < 0x10000 ? 1 : 2; 265 } 266 267 static codepoint_t * 268 encode (codepoint_t *text, 269 const codepoint_t *end, 270 hb_codepoint_t unicode) 271 { 272 if (unlikely (unicode >= 0xD800u && (unicode <= 0xDFFFu || unicode > 0x10FFFFu))) 273 unicode = 0xFFFDu; 274 if (unicode < 0x10000u) 275 *text++ = unicode; 276 else if (end - text >= 2) 277 { 278 unicode -= 0x10000u; 279 *text++ = 0xD800u + (unicode >> 10); 280 *text++ = 0xDC00u + (unicode & 0x03FFu); 281 } 282 return text; 283 } 284 }; 285 286 typedef hb_utf16_xe_t<uint16_t> hb_utf16_t; 287 typedef hb_utf16_xe_t<OT::HBUINT16> hb_utf16_be_t; 288 289 290 template <typename TCodepoint, bool validate=true> 291 struct hb_utf32_xe_t 292 { 293 static_assert (sizeof (TCodepoint) == 4, ""); 294 typedef TCodepoint codepoint_t; 295 static constexpr unsigned max_len = 1; 296 297 static inline const TCodepoint * 298 next (const TCodepoint *text, 299 const TCodepoint *end HB_UNUSED, 300 hb_codepoint_t *unicode, 301 hb_codepoint_t replacement) 302 { 303 hb_codepoint_t c = *unicode = *text++; 304 if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu))) 305 *unicode = replacement; 306 return text; 307 } 308 309 static inline const TCodepoint * 310 prev (const TCodepoint *text, 311 const TCodepoint *start HB_UNUSED, 312 hb_codepoint_t *unicode, 313 hb_codepoint_t replacement) 314 { 315 hb_codepoint_t c = *unicode = *--text; 316 if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu))) 317 *unicode = replacement; 318 return text; 319 } 320 321 static unsigned int 322 strlen (const TCodepoint *text) 323 { 324 unsigned int l = 0; 325 while (*text++) l++; 326 return l; 327 } 328 329 static unsigned int 330 encode_len (hb_codepoint_t unicode HB_UNUSED) 331 { 332 return 1; 333 } 334 335 static codepoint_t * 336 encode (codepoint_t *text, 337 const codepoint_t *end HB_UNUSED, 338 hb_codepoint_t unicode) 339 { 340 if (validate && unlikely (unicode >= 0xD800u && (unicode <= 0xDFFFu || unicode > 0x10FFFFu))) 341 unicode = 0xFFFDu; 342 *text++ = unicode; 343 return text; 344 } 345 }; 346 347 typedef hb_utf32_xe_t<uint32_t> hb_utf32_t; 348 typedef hb_utf32_xe_t<uint32_t, false> hb_utf32_novalidate_t; 349 350 351 struct hb_latin1_t 352 { 353 typedef uint8_t codepoint_t; 354 static constexpr unsigned max_len = 1; 355 356 static inline const codepoint_t * 357 next (const codepoint_t *text, 358 const codepoint_t *end HB_UNUSED, 359 hb_codepoint_t *unicode, 360 hb_codepoint_t replacement HB_UNUSED) 361 { 362 *unicode = *text++; 363 return text; 364 } 365 366 static inline const codepoint_t * 367 prev (const codepoint_t *text, 368 const codepoint_t *start HB_UNUSED, 369 hb_codepoint_t *unicode, 370 hb_codepoint_t replacement HB_UNUSED) 371 { 372 *unicode = *--text; 373 return text; 374 } 375 376 static unsigned int 377 strlen (const codepoint_t *text) 378 { 379 unsigned int l = 0; 380 while (*text++) l++; 381 return l; 382 } 383 384 static unsigned int 385 encode_len (hb_codepoint_t unicode HB_UNUSED) 386 { 387 return 1; 388 } 389 390 static codepoint_t * 391 encode (codepoint_t *text, 392 const codepoint_t *end HB_UNUSED, 393 hb_codepoint_t unicode) 394 { 395 if (unlikely (unicode >= 0x0100u)) 396 unicode = '?'; 397 *text++ = unicode; 398 return text; 399 } 400 }; 401 402 403 struct hb_ascii_t 404 { 405 typedef uint8_t codepoint_t; 406 static constexpr unsigned max_len = 1; 407 408 static inline const codepoint_t * 409 next (const codepoint_t *text, 410 const codepoint_t *end HB_UNUSED, 411 hb_codepoint_t *unicode, 412 hb_codepoint_t replacement) 413 { 414 *unicode = *text++; 415 if (*unicode >= 0x0080u) 416 *unicode = replacement; 417 return text; 418 } 419 420 static inline const codepoint_t * 421 prev (const codepoint_t *text, 422 const codepoint_t *start HB_UNUSED, 423 hb_codepoint_t *unicode, 424 hb_codepoint_t replacement) 425 { 426 *unicode = *--text; 427 if (*unicode >= 0x0080u) 428 *unicode = replacement; 429 return text; 430 } 431 432 static unsigned int 433 strlen (const codepoint_t *text) 434 { 435 unsigned int l = 0; 436 while (*text++) l++; 437 return l; 438 } 439 440 static unsigned int 441 encode_len (hb_codepoint_t unicode HB_UNUSED) 442 { 443 return 1; 444 } 445 446 static codepoint_t * 447 encode (codepoint_t *text, 448 const codepoint_t *end HB_UNUSED, 449 hb_codepoint_t unicode) 450 { 451 if (unlikely (unicode >= 0x0080u)) 452 unicode = '?'; 453 *text++ = unicode; 454 return text; 455 } 456 }; 457 458 template <typename utf_t> 459 static inline const typename utf_t::codepoint_t * 460 hb_utf_offset_to_pointer (const typename utf_t::codepoint_t *start, 461 const typename utf_t::codepoint_t *text, 462 unsigned text_len, 463 signed offset) 464 { 465 hb_codepoint_t unicode; 466 467 while (offset-- > 0) 468 start = utf_t::next (start, 469 text + text_len, 470 &unicode, 471 HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT); 472 473 while (offset++ < 0) 474 start = utf_t::prev (start, 475 text, 476 &unicode, 477 HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT); 478 479 return start; 480 } 481 482 483 #endif /* HB_UTF_HH */