arabic.c (9156B)
/// @file arabic.c
///
/// Functions for Arabic language.
///
/// Author: Nadim Shaikli & Isam Bayazidi
/// Farsi support and restructuring to make adding new letters easier by Ali
/// Gholami Rudi.  Further work by Ameretat Reith.

/// Sorted list of unicode Arabic characters.  Each entry holds the
/// presentation forms of a letter.
///
/// Arabic characters are categorized into following types:
///
/// Isolated    - iso-8859-6 form         char denoted with  a_*
/// Initial     - unicode form-B start    char denoted with  a_i_*
/// Medial      - unicode form-B middle   char denoted with  a_m_*
/// Final       - unicode form-B final    char denoted with  a_f_*
/// Stand-Alone - unicode form-B isolated char denoted with  a_s_* (NOT USED)

#include <stdbool.h>
#include <stddef.h>

#include "nvim/arabic.h"
#include "nvim/ascii_defs.h"
#include "nvim/macros_defs.h"
#include "nvim/option_vars.h"

// Unicode values for Arabic characters.
// The a_s_LAM_ALEF* / a_f_LAM_ALEF* constants at the end are the Lam-Alef
// ligature forms returned by chg_c_laa2i() and chg_c_laa2f().
enum {
  a_HAMZA = 0x0621,
  a_ALEF_MADDA = 0x0622,
  a_ALEF_HAMZA_ABOVE = 0x0623,
  a_WAW_HAMZA = 0x0624,
  a_ALEF_HAMZA_BELOW = 0x0625,
  a_YEH_HAMZA = 0x0626,
  a_ALEF = 0x0627,
  a_BEH = 0x0628,
  a_TEH_MARBUTA = 0x0629,
  a_TEH = 0x062a,
  a_THEH = 0x062b,
  a_JEEM = 0x062c,
  a_HAH = 0x062d,
  a_KHAH = 0x062e,
  a_DAL = 0x062f,
  a_THAL = 0x0630,
  a_REH = 0x0631,
  a_ZAIN = 0x0632,
  a_SEEN = 0x0633,
  a_SHEEN = 0x0634,
  a_SAD = 0x0635,
  a_DAD = 0x0636,
  a_TAH = 0x0637,
  a_ZAH = 0x0638,
  a_AIN = 0x0639,
  a_GHAIN = 0x063a,
  a_TATWEEL = 0x0640,
  a_FEH = 0x0641,
  a_QAF = 0x0642,
  a_KAF = 0x0643,
  a_LAM = 0x0644,
  a_MEEM = 0x0645,
  a_NOON = 0x0646,
  a_HEH = 0x0647,
  a_WAW = 0x0648,
  a_ALEF_MAKSURA = 0x0649,
  a_YEH = 0x064a,
  a_FATHATAN = 0x064b,
  a_DAMMATAN = 0x064c,
  a_KASRATAN = 0x064d,
  a_FATHA = 0x064e,
  a_DAMMA = 0x064f,
  a_KASRA = 0x0650,
  a_SHADDA = 0x0651,
  a_SUKUN = 0x0652,
  a_MADDA_ABOVE = 0x0653,
  a_HAMZA_ABOVE = 0x0654,
  a_HAMZA_BELOW = 0x0655,

  a_PEH = 0x067e,
  a_TCHEH = 0x0686,
  a_JEH = 0x0698,
  a_FKAF = 0x06a9,
  a_GAF = 0x06af,
  a_FYEH = 0x06cc,

  a_s_LAM_ALEF_MADDA_ABOVE = 0xfef5,
  a_f_LAM_ALEF_MADDA_ABOVE = 0xfef6,
  a_s_LAM_ALEF_HAMZA_ABOVE = 0xfef7,
  a_f_LAM_ALEF_HAMZA_ABOVE = 0xfef8,
  a_s_LAM_ALEF_HAMZA_BELOW = 0xfef9,
  a_f_LAM_ALEF_HAMZA_BELOW = 0xfefa,
  a_s_LAM_ALEF = 0xfefb,
  a_f_LAM_ALEF = 0xfefc,
};

// Shaping table: one row per character, keyed by its unshaped code point.
// A form of 0 means "no such form"; arabic_shape() then falls back to the
// original character, and can_join() treats the character as not joining in
// that direction.
//
// Rows must stay sorted by ascending "c": find_achar() does a binary search
// over this array.
static struct achar {
  unsigned c;         // unshaped code point (one of the a_* values above)
  unsigned isolated;  // form used when joined to neither neighbour
  unsigned initial;   // form used when joined to the next character only
  unsigned medial;    // form used when joined on both sides
  unsigned final;     // form used when joined to the previous character only
} achars[] = {
  { a_HAMZA, 0xfe80, 0, 0, 0 },
  { a_ALEF_MADDA, 0xfe81, 0, 0, 0xfe82 },
  { a_ALEF_HAMZA_ABOVE, 0xfe83, 0, 0, 0xfe84 },
  { a_WAW_HAMZA, 0xfe85, 0, 0, 0xfe86 },
  { a_ALEF_HAMZA_BELOW, 0xfe87, 0, 0, 0xfe88 },
  { a_YEH_HAMZA, 0xfe89, 0xfe8b, 0xfe8c, 0xfe8a },
  { a_ALEF, 0xfe8d, 0, 0, 0xfe8e },
  { a_BEH, 0xfe8f, 0xfe91, 0xfe92, 0xfe90 },
  { a_TEH_MARBUTA, 0xfe93, 0, 0, 0xfe94 },
  { a_TEH, 0xfe95, 0xfe97, 0xfe98, 0xfe96 },
  { a_THEH, 0xfe99, 0xfe9b, 0xfe9c, 0xfe9a },
  { a_JEEM, 0xfe9d, 0xfe9f, 0xfea0, 0xfe9e },
  { a_HAH, 0xfea1, 0xfea3, 0xfea4, 0xfea2 },
  { a_KHAH, 0xfea5, 0xfea7, 0xfea8, 0xfea6 },
  { a_DAL, 0xfea9, 0, 0, 0xfeaa },
  { a_THAL, 0xfeab, 0, 0, 0xfeac },
  { a_REH, 0xfead, 0, 0, 0xfeae },
  { a_ZAIN, 0xfeaf, 0, 0, 0xfeb0 },
  { a_SEEN, 0xfeb1, 0xfeb3, 0xfeb4, 0xfeb2 },
  { a_SHEEN, 0xfeb5, 0xfeb7, 0xfeb8, 0xfeb6 },
  { a_SAD, 0xfeb9, 0xfebb, 0xfebc, 0xfeba },
  { a_DAD, 0xfebd, 0xfebf, 0xfec0, 0xfebe },
  { a_TAH, 0xfec1, 0xfec3, 0xfec4, 0xfec2 },
  { a_ZAH, 0xfec5, 0xfec7, 0xfec8, 0xfec6 },
  { a_AIN, 0xfec9, 0xfecb, 0xfecc, 0xfeca },
  { a_GHAIN, 0xfecd, 0xfecf, 0xfed0, 0xfece },
  // Tatweel keeps its own code point in every joined form and has no
  // isolated form.
  { a_TATWEEL, 0, 0x0640, 0x0640, 0x0640 },
  { a_FEH, 0xfed1, 0xfed3, 0xfed4, 0xfed2 },
  { a_QAF, 0xfed5, 0xfed7, 0xfed8, 0xfed6 },
  { a_KAF, 0xfed9, 0xfedb, 0xfedc, 0xfeda },
  { a_LAM, 0xfedd, 0xfedf, 0xfee0, 0xfede },
  { a_MEEM, 0xfee1, 0xfee3, 0xfee4, 0xfee2 },
  { a_NOON, 0xfee5, 0xfee7, 0xfee8, 0xfee6 },
  { a_HEH, 0xfee9, 0xfeeb, 0xfeec, 0xfeea },
  { a_WAW, 0xfeed, 0, 0, 0xfeee },
  { a_ALEF_MAKSURA, 0xfeef, 0, 0, 0xfef0 },
  { a_YEH, 0xfef1, 0xfef3, 0xfef4, 0xfef2 },
  { a_FATHATAN, 0xfe70, 0, 0, 0 },
  { a_DAMMATAN, 0xfe72, 0, 0, 0 },
  { a_KASRATAN, 0xfe74, 0, 0, 0 },
  { a_FATHA, 0xfe76, 0, 0xfe77, 0 },
  { a_DAMMA, 0xfe78, 0, 0xfe79, 0 },
  { a_KASRA, 0xfe7a, 0, 0xfe7b, 0 },
  // NOTE(review): medial equals isolated here (0xfe7c), unlike the
  // neighbouring rows where medial is isolated + 1.  Unicode lists U+FE7D as
  // ARABIC SHADDA MEDIAL FORM -- confirm whether 0xfe7c is intentional.
  { a_SHADDA, 0xfe7c, 0, 0xfe7c, 0 },
  { a_SUKUN, 0xfe7e, 0, 0xfe7f, 0 },
  { a_MADDA_ABOVE, 0, 0, 0, 0 },
  { a_HAMZA_ABOVE, 0, 0, 0, 0 },
  { a_HAMZA_BELOW, 0, 0, 0, 0 },
  { a_PEH, 0xfb56, 0xfb58, 0xfb59, 0xfb57 },
  { a_TCHEH, 0xfb7a, 0xfb7c, 0xfb7d, 0xfb7b },
  { a_JEH, 0xfb8a, 0, 0, 0xfb8b },
  { a_FKAF, 0xfb8e, 0xfb90, 0xfb91, 0xfb8f },
  { a_GAF, 0xfb92, 0xfb94, 0xfb95, 0xfb93 },
  { a_FYEH, 0xfbfc, 0xfbfe, 0xfbff, 0xfbfd },
};

// Accepted by A_is_ok() even though it has no row in achars[].
#define a_BYTE_ORDER_MARK 0xfeff

#include "arabic.c.generated.h"

/// Find the struct achar pointer to the given Arabic char.
/// Returns NULL if not found.
165 static struct achar *find_achar(int c) 166 { 167 // using binary search to find c 168 int h = ARRAY_SIZE(achars); 169 int l = 0; 170 while (l < h) { 171 int m = (h + l) / 2; 172 if (achars[m].c == (unsigned)c) { 173 return &achars[m]; 174 } 175 if ((unsigned)c < achars[m].c) { 176 h = m; 177 } else { 178 l = m + 1; 179 } 180 } 181 return NULL; 182 } 183 184 /// Change shape - from Combination (2 char) to an Isolated 185 static int chg_c_laa2i(int hid_c) 186 { 187 int tempc; 188 189 switch (hid_c) { 190 case a_ALEF_MADDA: 191 tempc = a_s_LAM_ALEF_MADDA_ABOVE; 192 break; 193 case a_ALEF_HAMZA_ABOVE: 194 tempc = a_s_LAM_ALEF_HAMZA_ABOVE; 195 break; 196 case a_ALEF_HAMZA_BELOW: 197 tempc = a_s_LAM_ALEF_HAMZA_BELOW; 198 break; 199 case a_ALEF: 200 tempc = a_s_LAM_ALEF; 201 break; 202 default: 203 tempc = 0; 204 } 205 206 return tempc; 207 } 208 209 /// Change shape - from Combination-Isolated to Final 210 static int chg_c_laa2f(int hid_c) 211 { 212 int tempc; 213 214 switch (hid_c) { 215 case a_ALEF_MADDA: 216 tempc = a_f_LAM_ALEF_MADDA_ABOVE; 217 break; 218 case a_ALEF_HAMZA_ABOVE: 219 tempc = a_f_LAM_ALEF_HAMZA_ABOVE; 220 break; 221 case a_ALEF_HAMZA_BELOW: 222 tempc = a_f_LAM_ALEF_HAMZA_BELOW; 223 break; 224 case a_ALEF: 225 tempc = a_f_LAM_ALEF; 226 break; 227 default: 228 tempc = 0; 229 } 230 231 return tempc; 232 } 233 234 /// Returns whether it is possible to join the given letters 235 static int can_join(int c1, int c2) 236 { 237 struct achar *a1 = find_achar(c1); 238 struct achar *a2 = find_achar(c2); 239 240 return a1 && a2 && (a1->initial || a1->medial) && (a2->final || a2->medial); 241 } 242 243 /// Check whether we are dealing with a character that could be regarded as an 244 /// Arabic combining character, need to check the character before this. 
245 bool arabic_maycombine(int two) 246 FUNC_ATTR_PURE 247 { 248 if (p_arshape && !p_tbidi) { 249 return two == a_ALEF_MADDA 250 || two == a_ALEF_HAMZA_ABOVE 251 || two == a_ALEF_HAMZA_BELOW 252 || two == a_ALEF; 253 } 254 return false; 255 } 256 257 /// Check whether we are dealing with Arabic combining characters. 258 /// Returns false for negative values. 259 /// Note: these are NOT really composing characters! 260 /// 261 /// @param one First character. 262 /// @param two Character just after "one". 263 bool arabic_combine(int one, int two) 264 FUNC_ATTR_PURE 265 { 266 if (one == a_LAM) { 267 return arabic_maycombine(two); 268 } 269 return false; 270 } 271 272 /// @return true if 'c' is an Arabic ISO-8859-6 character 273 /// (alphabet/number/punctuation) 274 static bool A_is_iso(int c) 275 { 276 return find_achar(c) != NULL; 277 } 278 279 /// @return true if 'c' is an Arabic 10646 (8859-6 or Form-B) 280 static bool A_is_ok(int c) 281 { 282 return (A_is_iso(c) || c == a_BYTE_ORDER_MARK); 283 } 284 285 /// @return true if 'c' is an Arabic 10646 (8859-6 or Form-B) 286 /// with some exceptions/exclusions 287 static bool A_is_valid(int c) 288 { 289 return (A_is_ok(c) && c != a_HAMZA); 290 } 291 292 // Do Arabic shaping on character "c". Returns the shaped character. 293 // in/out: "c1p" points to the first composing char for "c". 294 // in: "prev_c" is the previous character (not shaped) 295 // in: "prev_c1" is the first composing char for the previous char 296 // (not shaped) 297 // in: "next_c" is the next character (not shaped). 
298 int arabic_shape(int c, int *c1p, int prev_c, int prev_c1, int next_c) 299 { 300 // Deal only with Arabic character, pass back all others 301 if (!A_is_ok(c)) { 302 return c; 303 } 304 305 int curr_c; 306 bool curr_laa = arabic_combine(c, *c1p); 307 bool prev_laa = arabic_combine(prev_c, prev_c1); 308 309 if (curr_laa) { 310 if (A_is_valid(prev_c) && can_join(prev_c, a_LAM) && !prev_laa) { 311 curr_c = chg_c_laa2f(*c1p); 312 } else { 313 curr_c = chg_c_laa2i(*c1p); 314 } 315 // Remove the composing character 316 *c1p = 0; 317 } else { 318 struct achar *curr_a = find_achar(c); 319 int backward_combine = !prev_laa && can_join(prev_c, c); 320 int forward_combine = can_join(c, next_c); 321 322 if (backward_combine) { 323 if (forward_combine) { 324 curr_c = (int)curr_a->medial; 325 } else { 326 curr_c = (int)curr_a->final; 327 } 328 } else { 329 if (forward_combine) { 330 curr_c = (int)curr_a->initial; 331 } else { 332 curr_c = (int)curr_a->isolated; 333 } 334 } 335 } 336 337 // Character missing from the table means using original character. 338 if (curr_c == NUL) { 339 curr_c = c; 340 } 341 342 // Return the shaped character 343 return curr_c; 344 }