neovim

Neovim text editor
git clone https://git.dasho.dev/neovim.git
Log | Files | Refs | README

arabic.c (9156B)


      1 /// @file arabic.c
      2 ///
      3 /// Functions for Arabic language.
      4 ///
      5 /// Author: Nadim Shaikli & Isam Bayazidi
      6 /// Farsi support and restructuring to make adding new letters easier by Ali
      7 /// Gholami Rudi.  Further work by Ameretat Reith.
      8 
      9 /// Sorted list of unicode Arabic characters.  Each entry holds the
     10 /// presentation forms of a letter.
     11 ///
     12 /// Arabic characters are categorized into the following types:
     13 ///
     14 /// Isolated    - iso-8859-6 form         char denoted with  a_*
     15 /// Initial     - unicode form-B start    char denoted with  a_i_*
     16 /// Medial      - unicode form-B middle   char denoted with  a_m_*
     17 /// Final       - unicode form-B final    char denoted with  a_f_*
     18 /// Stand-Alone - unicode form-B isolated char denoted with  a_s_* (NOT USED)
     19 
     20 #include <stdbool.h>
     21 #include <stddef.h>
     22 
     23 #include "nvim/arabic.h"
     24 #include "nvim/ascii_defs.h"
     25 #include "nvim/macros_defs.h"
     26 #include "nvim/option_vars.h"
     27 
     28 // Unicode values for Arabic characters.
     29 enum {
     30  a_HAMZA = 0x0621,
     31  a_ALEF_MADDA = 0x0622,
     32  a_ALEF_HAMZA_ABOVE = 0x0623,
     33  a_WAW_HAMZA = 0x0624,
     34  a_ALEF_HAMZA_BELOW = 0x0625,
     35  a_YEH_HAMZA = 0x0626,
     36  a_ALEF = 0x0627,
     37  a_BEH = 0x0628,
     38  a_TEH_MARBUTA = 0x0629,
     39  a_TEH = 0x062a,
     40  a_THEH = 0x062b,
     41  a_JEEM = 0x062c,
     42  a_HAH = 0x062d,
     43  a_KHAH = 0x062e,
     44  a_DAL = 0x062f,
     45  a_THAL = 0x0630,
     46  a_REH = 0x0631,
     47  a_ZAIN = 0x0632,
     48  a_SEEN = 0x0633,
     49  a_SHEEN = 0x0634,
     50  a_SAD = 0x0635,
     51  a_DAD = 0x0636,
     52  a_TAH = 0x0637,
     53  a_ZAH = 0x0638,
     54  a_AIN = 0x0639,
     55  a_GHAIN = 0x063a,
     56  a_TATWEEL = 0x0640,
     57  a_FEH = 0x0641,
     58  a_QAF = 0x0642,
     59  a_KAF = 0x0643,
     60  a_LAM = 0x0644,
     61  a_MEEM = 0x0645,
     62  a_NOON = 0x0646,
     63  a_HEH = 0x0647,
     64  a_WAW = 0x0648,
     65  a_ALEF_MAKSURA = 0x0649,
     66  a_YEH = 0x064a,
     67  a_FATHATAN = 0x064b,
     68  a_DAMMATAN = 0x064c,
     69  a_KASRATAN = 0x064d,
     70  a_FATHA = 0x064e,
     71  a_DAMMA = 0x064f,
     72  a_KASRA = 0x0650,
     73  a_SHADDA = 0x0651,
     74  a_SUKUN = 0x0652,
     75  a_MADDA_ABOVE = 0x0653,
     76  a_HAMZA_ABOVE = 0x0654,
     77  a_HAMZA_BELOW = 0x0655,
     78 
     79  a_PEH = 0x067e,
     80  a_TCHEH = 0x0686,
     81  a_JEH = 0x0698,
     82  a_FKAF = 0x06a9,
     83  a_GAF = 0x06af,
     84  a_FYEH = 0x06cc,
     85 
     86  a_s_LAM_ALEF_MADDA_ABOVE = 0xfef5,
     87  a_f_LAM_ALEF_MADDA_ABOVE = 0xfef6,
     88  a_s_LAM_ALEF_HAMZA_ABOVE = 0xfef7,
     89  a_f_LAM_ALEF_HAMZA_ABOVE = 0xfef8,
     90  a_s_LAM_ALEF_HAMZA_BELOW = 0xfef9,
     91  a_f_LAM_ALEF_HAMZA_BELOW = 0xfefa,
     92  a_s_LAM_ALEF = 0xfefb,
     93  a_f_LAM_ALEF = 0xfefc,
     94 };
     95 
     96 static struct achar {
     97  unsigned c;
     98  unsigned isolated;
     99  unsigned initial;
    100  unsigned medial;
    101  unsigned final;
    102 } achars[] = {
    103  { a_HAMZA, 0xfe80, 0, 0, 0 },
    104  { a_ALEF_MADDA, 0xfe81, 0, 0, 0xfe82 },
    105  { a_ALEF_HAMZA_ABOVE, 0xfe83, 0, 0, 0xfe84 },
    106  { a_WAW_HAMZA, 0xfe85, 0, 0, 0xfe86 },
    107  { a_ALEF_HAMZA_BELOW, 0xfe87, 0, 0, 0xfe88 },
    108  { a_YEH_HAMZA, 0xfe89, 0xfe8b, 0xfe8c, 0xfe8a },
    109  { a_ALEF, 0xfe8d, 0, 0, 0xfe8e },
    110  { a_BEH, 0xfe8f, 0xfe91, 0xfe92, 0xfe90 },
    111  { a_TEH_MARBUTA, 0xfe93, 0, 0, 0xfe94 },
    112  { a_TEH, 0xfe95, 0xfe97, 0xfe98, 0xfe96 },
    113  { a_THEH, 0xfe99, 0xfe9b, 0xfe9c, 0xfe9a },
    114  { a_JEEM, 0xfe9d, 0xfe9f, 0xfea0, 0xfe9e },
    115  { a_HAH, 0xfea1, 0xfea3, 0xfea4, 0xfea2 },
    116  { a_KHAH, 0xfea5, 0xfea7, 0xfea8, 0xfea6 },
    117  { a_DAL, 0xfea9, 0, 0, 0xfeaa },
    118  { a_THAL, 0xfeab, 0, 0, 0xfeac },
    119  { a_REH, 0xfead, 0, 0, 0xfeae },
    120  { a_ZAIN, 0xfeaf, 0, 0, 0xfeb0 },
    121  { a_SEEN, 0xfeb1, 0xfeb3, 0xfeb4, 0xfeb2 },
    122  { a_SHEEN, 0xfeb5, 0xfeb7, 0xfeb8, 0xfeb6 },
    123  { a_SAD, 0xfeb9, 0xfebb, 0xfebc, 0xfeba },
    124  { a_DAD, 0xfebd, 0xfebf, 0xfec0, 0xfebe },
    125  { a_TAH, 0xfec1, 0xfec3, 0xfec4, 0xfec2 },
    126  { a_ZAH, 0xfec5, 0xfec7, 0xfec8, 0xfec6 },
    127  { a_AIN, 0xfec9, 0xfecb, 0xfecc, 0xfeca },
    128  { a_GHAIN, 0xfecd, 0xfecf, 0xfed0, 0xfece },
    129  { a_TATWEEL, 0, 0x0640, 0x0640, 0x0640 },
    130  { a_FEH, 0xfed1, 0xfed3, 0xfed4, 0xfed2 },
    131  { a_QAF, 0xfed5, 0xfed7, 0xfed8, 0xfed6 },
    132  { a_KAF, 0xfed9, 0xfedb, 0xfedc, 0xfeda },
    133  { a_LAM, 0xfedd, 0xfedf, 0xfee0, 0xfede },
    134  { a_MEEM, 0xfee1, 0xfee3, 0xfee4, 0xfee2 },
    135  { a_NOON, 0xfee5, 0xfee7, 0xfee8, 0xfee6 },
    136  { a_HEH, 0xfee9, 0xfeeb, 0xfeec, 0xfeea },
    137  { a_WAW, 0xfeed, 0, 0, 0xfeee },
    138  { a_ALEF_MAKSURA, 0xfeef, 0, 0, 0xfef0 },
    139  { a_YEH, 0xfef1, 0xfef3, 0xfef4, 0xfef2 },
    140  { a_FATHATAN, 0xfe70, 0, 0, 0 },
    141  { a_DAMMATAN, 0xfe72, 0, 0, 0 },
    142  { a_KASRATAN, 0xfe74, 0, 0, 0 },
    143  { a_FATHA, 0xfe76, 0, 0xfe77, 0 },
    144  { a_DAMMA, 0xfe78, 0, 0xfe79, 0 },
    145  { a_KASRA, 0xfe7a, 0, 0xfe7b, 0 },
    146  { a_SHADDA, 0xfe7c, 0, 0xfe7c, 0 },
    147  { a_SUKUN, 0xfe7e, 0, 0xfe7f, 0 },
    148  { a_MADDA_ABOVE, 0, 0, 0, 0 },
    149  { a_HAMZA_ABOVE, 0, 0, 0, 0 },
    150  { a_HAMZA_BELOW, 0, 0, 0, 0 },
    151  { a_PEH, 0xfb56, 0xfb58, 0xfb59, 0xfb57 },
    152  { a_TCHEH, 0xfb7a, 0xfb7c, 0xfb7d, 0xfb7b },
    153  { a_JEH, 0xfb8a, 0, 0, 0xfb8b },
    154  { a_FKAF, 0xfb8e, 0xfb90, 0xfb91, 0xfb8f },
    155  { a_GAF, 0xfb92, 0xfb94, 0xfb95, 0xfb93 },
    156  { a_FYEH, 0xfbfc, 0xfbfe, 0xfbff, 0xfbfd },
    157 };
    158 
    159 #define a_BYTE_ORDER_MARK               0xfeff
    160 
    161 #include "arabic.c.generated.h"
    162 
    163 /// Find the struct achar pointer to the given Arabic char.
    164 /// Returns NULL if not found.
    165 static struct achar *find_achar(int c)
    166 {
    167  // using binary search to find c
    168  int h = ARRAY_SIZE(achars);
    169  int l = 0;
    170  while (l < h) {
    171    int m = (h + l) / 2;
    172    if (achars[m].c == (unsigned)c) {
    173      return &achars[m];
    174    }
    175    if ((unsigned)c < achars[m].c) {
    176      h = m;
    177    } else {
    178      l = m + 1;
    179    }
    180  }
    181  return NULL;
    182 }
    183 
    184 /// Change shape - from Combination (2 char) to an Isolated
    185 static int chg_c_laa2i(int hid_c)
    186 {
    187  int tempc;
    188 
    189  switch (hid_c) {
    190  case a_ALEF_MADDA:
    191    tempc = a_s_LAM_ALEF_MADDA_ABOVE;
    192    break;
    193  case a_ALEF_HAMZA_ABOVE:
    194    tempc = a_s_LAM_ALEF_HAMZA_ABOVE;
    195    break;
    196  case a_ALEF_HAMZA_BELOW:
    197    tempc = a_s_LAM_ALEF_HAMZA_BELOW;
    198    break;
    199  case a_ALEF:
    200    tempc = a_s_LAM_ALEF;
    201    break;
    202  default:
    203    tempc = 0;
    204  }
    205 
    206  return tempc;
    207 }
    208 
    209 /// Change shape - from Combination-Isolated to Final
    210 static int chg_c_laa2f(int hid_c)
    211 {
    212  int tempc;
    213 
    214  switch (hid_c) {
    215  case a_ALEF_MADDA:
    216    tempc = a_f_LAM_ALEF_MADDA_ABOVE;
    217    break;
    218  case a_ALEF_HAMZA_ABOVE:
    219    tempc = a_f_LAM_ALEF_HAMZA_ABOVE;
    220    break;
    221  case a_ALEF_HAMZA_BELOW:
    222    tempc = a_f_LAM_ALEF_HAMZA_BELOW;
    223    break;
    224  case a_ALEF:
    225    tempc = a_f_LAM_ALEF;
    226    break;
    227  default:
    228    tempc = 0;
    229  }
    230 
    231  return tempc;
    232 }
    233 
    234 /// Returns whether it is possible to join the given letters
    235 static int can_join(int c1, int c2)
    236 {
    237  struct achar *a1 = find_achar(c1);
    238  struct achar *a2 = find_achar(c2);
    239 
    240  return a1 && a2 && (a1->initial || a1->medial) && (a2->final || a2->medial);
    241 }
    242 
    243 /// Check whether we are dealing with a character that could be regarded as an
    244 /// Arabic combining character, need to check the character before this.
    245 bool arabic_maycombine(int two)
    246  FUNC_ATTR_PURE
    247 {
    248  if (p_arshape && !p_tbidi) {
    249    return two == a_ALEF_MADDA
    250           || two == a_ALEF_HAMZA_ABOVE
    251           || two == a_ALEF_HAMZA_BELOW
    252           || two == a_ALEF;
    253  }
    254  return false;
    255 }
    256 
    257 /// Check whether we are dealing with Arabic combining characters.
    258 /// Returns false for negative values.
    259 /// Note: these are NOT really composing characters!
    260 ///
    261 /// @param one First character.
    262 /// @param two Character just after "one".
    263 bool arabic_combine(int one, int two)
    264  FUNC_ATTR_PURE
    265 {
    266  if (one == a_LAM) {
    267    return arabic_maycombine(two);
    268  }
    269  return false;
    270 }
    271 
    272 /// @return  true if 'c' is an Arabic ISO-8859-6 character
    273 ///          (alphabet/number/punctuation)
    274 static bool A_is_iso(int c)
    275 {
    276  return find_achar(c) != NULL;
    277 }
    278 
    279 /// @return  true if 'c' is an Arabic 10646 (8859-6 or Form-B)
    280 static bool A_is_ok(int c)
    281 {
    282  return (A_is_iso(c) || c == a_BYTE_ORDER_MARK);
    283 }
    284 
    285 /// @return  true if 'c' is an Arabic 10646 (8859-6 or Form-B)
    286 ///          with some exceptions/exclusions
    287 static bool A_is_valid(int c)
    288 {
    289  return (A_is_ok(c) && c != a_HAMZA);
    290 }
    291 
    292 // Do Arabic shaping on character "c".  Returns the shaped character.
    293 // in/out: "c1p" points to the first composing char for "c".
    294 // in:     "prev_c"  is the previous character (not shaped)
    295 // in:     "prev_c1" is the first composing char for the previous char
    296 //          (not shaped)
    297 // in:     "next_c"  is the next character (not shaped).
    298 int arabic_shape(int c, int *c1p, int prev_c, int prev_c1, int next_c)
    299 {
    300  // Deal only with Arabic character, pass back all others
    301  if (!A_is_ok(c)) {
    302    return c;
    303  }
    304 
    305  int curr_c;
    306  bool curr_laa = arabic_combine(c, *c1p);
    307  bool prev_laa = arabic_combine(prev_c, prev_c1);
    308 
    309  if (curr_laa) {
    310    if (A_is_valid(prev_c) && can_join(prev_c, a_LAM) && !prev_laa) {
    311      curr_c = chg_c_laa2f(*c1p);
    312    } else {
    313      curr_c = chg_c_laa2i(*c1p);
    314    }
    315    // Remove the composing character
    316    *c1p = 0;
    317  } else {
    318    struct achar *curr_a = find_achar(c);
    319    int backward_combine = !prev_laa && can_join(prev_c, c);
    320    int forward_combine = can_join(c, next_c);
    321 
    322    if (backward_combine) {
    323      if (forward_combine) {
    324        curr_c = (int)curr_a->medial;
    325      } else {
    326        curr_c = (int)curr_a->final;
    327      }
    328    } else {
    329      if (forward_combine) {
    330        curr_c = (int)curr_a->initial;
    331      } else {
    332        curr_c = (int)curr_a->isolated;
    333      }
    334    }
    335  }
    336 
    337  // Character missing from the table means using original character.
    338  if (curr_c == NUL) {
    339    curr_c = c;
    340  }
    341 
    342  // Return the shaped character
    343  return curr_c;
    344 }