hb-icu.cc (9560B)
1 /* 2 * Copyright © 2009 Red Hat, Inc. 3 * Copyright © 2009 Keith Stribley 4 * Copyright © 2011 Google, Inc. 5 * 6 * This is part of HarfBuzz, a text shaping library. 7 * 8 * Permission is hereby granted, without written agreement and without 9 * license or royalty fees, to use, copy, modify, and distribute this 10 * software and its documentation for any purpose, provided that the 11 * above copyright notice and the following two paragraphs appear in 12 * all copies of this software. 13 * 14 * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR 15 * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES 16 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN 17 * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH 18 * DAMAGE. 19 * 20 * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, 21 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 22 * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS 23 * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO 24 * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 25 * 26 * Red Hat Author(s): Behdad Esfahbod 27 * Google Author(s): Behdad Esfahbod 28 */ 29 30 #include "hb.hh" 31 32 #ifdef HAVE_ICU 33 34 #pragma GCC diagnostic push 35 36 // https://github.com/harfbuzz/harfbuzz/issues/4915 37 #pragma GCC diagnostic ignored "-Wredundant-decls" 38 39 #include "hb-icu.h" 40 41 #include "hb-machinery.hh" 42 43 #include <unicode/uchar.h> 44 #include <unicode/unorm2.h> 45 #include <unicode/ustring.h> 46 #include <unicode/utf16.h> 47 #include <unicode/uversion.h> 48 49 /* ICU extra semicolon, fixed since 65, https://github.com/unicode-org/icu/commit/480bec3 */ 50 #if U_ICU_VERSION_MAJOR_NUM < 65 && (defined(__GNUC__) || defined(__clang__)) 51 #define HB_ICU_EXTRA_SEMI_IGNORED 52 #pragma GCC diagnostic ignored "-Wextra-semi-stmt" 53 #endif 54 55 /** 56 * SECTION:hb-icu 57 * @title: hb-icu 58 * @short_description: ICU integration 59 * @include: hb-icu.h 60 * 61 * Functions for using HarfBuzz with the International Components for Unicode 62 * (ICU) library. HarfBuzz supports using ICU to provide Unicode data, by attaching 63 * ICU functions to the virtual methods in a #hb_unicode_funcs_t function 64 * structure. 65 **/ 66 67 /** 68 * hb_icu_script_to_script: 69 * @script: The UScriptCode identifier to query 70 * 71 * Fetches the #hb_script_t script that corresponds to the 72 * specified UScriptCode identifier. 73 * 74 * Return value: the #hb_script_t script found 75 * 76 **/ 77 78 hb_script_t 79 hb_icu_script_to_script (UScriptCode script) 80 { 81 if (unlikely (script == USCRIPT_INVALID_CODE)) 82 return HB_SCRIPT_INVALID; 83 84 return hb_script_from_string (uscript_getShortName (script), -1); 85 } 86 87 /** 88 * hb_icu_script_from_script: 89 * @script: The #hb_script_t script to query 90 * 91 * Fetches the UScriptCode identifier that corresponds to the 92 * specified #hb_script_t script. 93 * 94 * Return value: the UScriptCode identifier found 95 * 96 **/ 97 UScriptCode 98 hb_icu_script_from_script (hb_script_t script) 99 { 100 UScriptCode out = USCRIPT_INVALID_CODE; 101 102 if (unlikely (script == HB_SCRIPT_INVALID)) 103 return out; 104 105 UErrorCode icu_err = U_ZERO_ERROR; 106 const unsigned char buf[5] = {HB_UNTAG (script), 0}; 107 uscript_getCode ((const char *) buf, &out, 1, &icu_err); 108 109 return out; 110 } 111 112 113 static hb_unicode_combining_class_t 114 hb_icu_unicode_combining_class (hb_unicode_funcs_t *ufuncs HB_UNUSED, 115 hb_codepoint_t unicode, 116 void *user_data HB_UNUSED) 117 118 { 119 return (hb_unicode_combining_class_t) u_getCombiningClass (unicode); 120 } 121 122 static hb_unicode_general_category_t 123 hb_icu_unicode_general_category (hb_unicode_funcs_t *ufuncs HB_UNUSED, 124 hb_codepoint_t unicode, 125 void *user_data HB_UNUSED) 126 { 127 switch (u_getIntPropertyValue(unicode, UCHAR_GENERAL_CATEGORY)) 128 { 129 case U_UNASSIGNED: return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED; 130 131 case U_UPPERCASE_LETTER: return HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER; 132 case U_LOWERCASE_LETTER: return HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER; 133 case U_TITLECASE_LETTER: return HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER; 134 case U_MODIFIER_LETTER: return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER; 135 case U_OTHER_LETTER: return HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER; 136 137 case U_NON_SPACING_MARK: return HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK; 138 case U_ENCLOSING_MARK: return HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK; 139 case U_COMBINING_SPACING_MARK: return HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK; 140 141 case U_DECIMAL_DIGIT_NUMBER: return HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER; 142 case U_LETTER_NUMBER: return HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER; 143 case U_OTHER_NUMBER: return HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER; 144 145 case U_SPACE_SEPARATOR: return HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR; 146 case U_LINE_SEPARATOR: return HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR; 147 case U_PARAGRAPH_SEPARATOR: return HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR; 148 149 case U_CONTROL_CHAR: return HB_UNICODE_GENERAL_CATEGORY_CONTROL; 150 case U_FORMAT_CHAR: return HB_UNICODE_GENERAL_CATEGORY_FORMAT; 151 case U_PRIVATE_USE_CHAR: return HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE; 152 case U_SURROGATE: return HB_UNICODE_GENERAL_CATEGORY_SURROGATE; 153 154 155 case U_DASH_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION; 156 case U_START_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION; 157 case U_END_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION; 158 case U_CONNECTOR_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION; 159 case U_OTHER_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION; 160 161 case U_MATH_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL; 162 case U_CURRENCY_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL; 163 case U_MODIFIER_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL; 164 case U_OTHER_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL; 165 166 case U_INITIAL_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION; 167 case U_FINAL_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION; 168 } 169 170 return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED; 171 } 172 173 static hb_codepoint_t 174 hb_icu_unicode_mirroring (hb_unicode_funcs_t *ufuncs HB_UNUSED, 175 hb_codepoint_t unicode, 176 void *user_data HB_UNUSED) 177 { 178 return u_charMirror(unicode); 179 } 180 181 static hb_script_t 182 hb_icu_unicode_script (hb_unicode_funcs_t *ufuncs HB_UNUSED, 183 hb_codepoint_t unicode, 184 void *user_data HB_UNUSED) 185 { 186 UErrorCode status = U_ZERO_ERROR; 187 UScriptCode scriptCode = uscript_getScript(unicode, &status); 188 189 if (unlikely (U_FAILURE (status))) 190 return HB_SCRIPT_UNKNOWN; 191 192 return hb_icu_script_to_script (scriptCode); 193 } 194 195 static hb_bool_t 196 hb_icu_unicode_compose (hb_unicode_funcs_t *ufuncs HB_UNUSED, 197 hb_codepoint_t a, 198 hb_codepoint_t b, 199 hb_codepoint_t *ab, 200 void *user_data) 201 { 202 const UNormalizer2 *normalizer = (const UNormalizer2 *) user_data; 203 UChar32 ret = unorm2_composePair (normalizer, a, b); 204 if (ret < 0) return false; 205 *ab = ret; 206 return true; 207 } 208 209 static hb_bool_t 210 hb_icu_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED, 211 hb_codepoint_t ab, 212 hb_codepoint_t *a, 213 hb_codepoint_t *b, 214 void *user_data) 215 { 216 const UNormalizer2 *normalizer = (const UNormalizer2 *) user_data; 217 UChar decomposed[4]; 218 int len; 219 UErrorCode icu_err = U_ZERO_ERROR; 220 len = unorm2_getRawDecomposition (normalizer, ab, decomposed, 221 ARRAY_LENGTH (decomposed), &icu_err); 222 if (U_FAILURE (icu_err) || len < 0) return false; 223 224 len = u_countChar32 (decomposed, len); 225 if (len == 1) 226 { 227 U16_GET_UNSAFE (decomposed, 0, *a); 228 *b = 0; 229 return *a != ab; 230 } 231 else if (len == 2) 232 { 233 len = 0; 234 U16_NEXT_UNSAFE (decomposed, len, *a); 235 U16_NEXT_UNSAFE (decomposed, len, *b); 236 } 237 return true; 238 } 239 240 241 static inline void free_static_icu_funcs (); 242 243 static struct hb_icu_unicode_funcs_lazy_loader_t : hb_unicode_funcs_lazy_loader_t<hb_icu_unicode_funcs_lazy_loader_t> 244 { 245 static hb_unicode_funcs_t *create () 246 { 247 void *user_data = nullptr; 248 UErrorCode icu_err = U_ZERO_ERROR; 249 user_data = (void *) unorm2_getNFCInstance (&icu_err); 250 assert (user_data); 251 252 hb_unicode_funcs_t *funcs = hb_unicode_funcs_create (nullptr); 253 254 hb_unicode_funcs_set_combining_class_func (funcs, hb_icu_unicode_combining_class, nullptr, nullptr); 255 hb_unicode_funcs_set_general_category_func (funcs, hb_icu_unicode_general_category, nullptr, nullptr); 256 hb_unicode_funcs_set_mirroring_func (funcs, hb_icu_unicode_mirroring, nullptr, nullptr); 257 hb_unicode_funcs_set_script_func (funcs, hb_icu_unicode_script, nullptr, nullptr); 258 hb_unicode_funcs_set_compose_func (funcs, hb_icu_unicode_compose, user_data, nullptr); 259 hb_unicode_funcs_set_decompose_func (funcs, hb_icu_unicode_decompose, user_data, nullptr); 260 261 hb_unicode_funcs_make_immutable (funcs); 262 263 hb_atexit (free_static_icu_funcs); 264 265 return funcs; 266 } 267 } static_icu_funcs; 268 269 static inline 270 void free_static_icu_funcs () 271 { 272 static_icu_funcs.free_instance (); 273 } 274 275 /** 276 * hb_icu_get_unicode_funcs: 277 * 278 * Fetches a Unicode-functions structure that is populated 279 * with the appropriate ICU function for each method. 280 * 281 * Return value: (transfer none): a pointer to the #hb_unicode_funcs_t Unicode-functions structure 282 * 283 * Since: 0.9.38 284 **/ 285 hb_unicode_funcs_t * 286 hb_icu_get_unicode_funcs () 287 { 288 return static_icu_funcs.get_unconst (); 289 } 290 291 #pragma GCC diagnostic pop 292 293 #endif