tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

rulebrk.c (9134B)


      1 /* This Source Code Form is subject to the terms of the Mozilla Public
      2 * License, v. 2.0. If a copy of the MPL was not distributed with this
      3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      4 #define TH_UNICODE
      5 
      6 #include <assert.h>
      7 #include "th_char.h"
/*
 * Minimal character tests used by TrbFollowing.
 *   th_isalpha(c) - non-zero only for the ASCII letters 'a'..'z' and 'A'..'Z'.
 *   th_isspace(c) - non-zero only for space and horizontal tab.
 * Both are macros that evaluate their argument more than once, so the
 * argument must be free of side effects.
 */
#define th_isalpha(c) (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
#define th_isspace(c) ((c) == ' ' || (c) == '\t')
     10 
     11 /*
     12 /////////////////////////////////////////////////
     13 // Thai character type array
     14 */
     15 
/* Character-class mask: an OR of the flag bits defined below (the flags use
 * values up to 0x8000). */
typedef unsigned short twb_t;
/* Forward declaration of the class table; it has 0x100 - 0xa0 (96) entries
 * and is defined at the end of this file. It is read through twbtype(). */
extern const twb_t _TwbType[0x100 - 0xa0];
     18 
     19 /*
     20 // bit definition
     21 */
     22 
/*
 * Elementary class bits stored in _TwbType. The short glosses are inferred
 * from which table entries carry each bit and from how TrbWordBreakPos tests
 * it. NOTE(review): confirm the glosses against the original rule
 * specification.
 */

/* VRS, VRE, VRX: three sub-classes that together form VR (see below). */
#define VRS 0x0001
#define VRE 0x0002
#define VRX 0x0004

/* VRA: extra flag carried by some VR entries in the table. */
#define VRA 0x0008

/* VLA, VLO, VLI: three sub-classes that together form VL (see below). */
#define VLA 0x0010
#define VLO 0x0020
#define VLI 0x0040

/* VC: combined with CC and CS into C below. */
#define VC 0x0080

/* CC, CS: two sub-classes that together form CX (see below). */
#define CC 0x0100
#define CS 0x0200

/* C2, CHB, CHE: modifier flags that the table only sets together with
 * CC, CS or VC. */
#define C2 0x0400
#define CHB 0x0800
#define CHE 0x1000

/* MT: in the table it is only ever set together with M. */
#define MT 0x2000
/*
//_#define me 0x2000
*/
#define M 0x4000

/* T: set on table entries that carry no other flag; T is not part of A. */
#define T 0x8000

/* Derived masks. */
#define VL (VLA | VLO | VLI)
#define VR (VRS | VRE | VRX)
/* NE: TrbWordBreakPos never breaks right after a character with these bits. */
#define NE (VL | VRS)
/* NB: TrbWordBreakPos never breaks right before a character with these bits. */
#define NB (VR | M)
#define V (VL | VR)
#define CX (CC | CS)
#define C (CX | VC)
/* A: every class the break rules operate on; TrbWordBreakPos treats a
 * character without any A bit as outside the rules. */
#define A (C | V | M)

/* Class mask of character c. th_zcode() comes from th_char.h and must map c
 * to an index into _TwbType (presumably code - 0xa0; verify in th_char.h). */
#define twbtype(c) (_TwbType[th_zcode(c)])

/* Fallback boolean constants when no earlier header provides TRUE. */
#ifndef TRUE
#  define TRUE 1
#  define FALSE 0
#endif
/* Thin wrapper around return; not used by the code visible in this file. */
#define RETURN(b) return (b)
     66 
     67 /*
     68 /////////////////////////////////////////////////
     69 */
     70 
     71 int TrbWordBreakPos(const th_char* pstr, int left, const th_char* rstr,
     72                    int right)
     73 /*                 const ThBreakIterator *it, const th_char **p)*/
     74 {
     75  /*
     76  //int left, right;
     77  //const th_char *s = *p;
     78  */
     79  const th_char* lstr = pstr + left;
     80  th_char _c[6];
     81  twb_t _t[6];
     82 #define c(i) (_c[(i) + 3])
     83 #define t(i) (_t[(i) + 3])
     84  int i, j;
     85 
     86  /*
     87  //left = s - it->begin;
     88  */
     89  if (left < 0) return -1;
     90  /*
     91  //right = (it->end == NULL) ? 4 : it->begin - s;
     92  */
     93  if (right < 1) return -1;
     94 
     95  /*
     96  // get c(0), t(0)
     97  */
     98  c(0) = rstr[0]; /* may be '\0' */
     99  if (!th_isthai(c(0))) return -1;
    100  t(0) = twbtype(c(0));
    101  if (!(t(0) & A)) return -1;
    102 
    103  /*
    104  // get c(-1), t(-1)
    105  */
    106  if (left >= 1) {
    107    c(-1) = lstr[-1];
    108    if (!th_isthai(c(-1))) return 0;
    109    t(-1) = twbtype(c(-1));
    110    if (!(t(-1) & A)) return 0; /* handle punctuation marks here */
    111  } else {
    112    c(-1) = 0;
    113    t(-1) = 0;
    114  }
    115 
    116  /*
    117  // get c(1..2), t(1..2)
    118  */
    119  for (i = 1; i <= 2; i++) {
    120    if (i >= right) {
    121      c(i) = 0;
    122      t(i) = 0;
    123    } else {
    124      c(i) = rstr[i]; /* may be '\0'; */
    125      if (!th_isthai(c(i)))
    126        right = i--;
    127      else {
    128        t(i) = twbtype(c(i));
    129        if (!(t(i) & A)) right = i--;
    130      }
    131    }
    132  }
    133  /*
    134  // get c(-2..-3), t(-2..-3)
    135  */
    136  for (i = -2, j = -2; i >= -3; j--) {
    137    if (j < -left) {
    138      c(i) = 0;
    139      t(i) = 0;
    140      i--;
    141    } else {
    142      c(i) = lstr[j];
    143      if (!th_isthai(c(i)))
    144        left = 0;
    145      else {
    146        t(i) = (twb_t)(th_isthai(c(i)) ? twbtype(c(i)) : 0);
    147        if (!(t(i) & A))
    148          left = 0;
    149        else {
    150          if ((t(i + 1) & MT) && ((t(i) & VR) || (t(i + 2) & VR))) {
    151            c(i + 1) = c(i);
    152            t(i + 1) = t(i);
    153          } else
    154            i--;
    155        }
    156      }
    157    }
    158  }
    159 
    160  /*
    161  // prohibit the unlikely
    162  */
    163  if ((t(-1) & C) && (t(0) & C)) {
    164    if ((t(-1) & CHE) || (t(0) & CHB)) return -1;
    165  }
    166  /*
    167  // special case : vlao, C/ sara_a|aa, !sara_a
    168  */
    169  if ((t(-3) & (VLA | VLO)) && (t(-2) & C) && (c(0) != TH_SARA_A) &&
    170      (c(-1) == TH_SARA_A || c(-0) == TH_SARA_AA))
    171    return 0;
    172 
    173  /*
    174  // prohibit break
    175  */
    176  if (t(0) & NB) return -1;
    177  if (t(-1) & NE) return -1;
    178 
    179  /*
    180        // apply 100% rules
    181  */
    182  if (t(-1) & VRE) {
    183    if (c(-2) == TH_SARA_AA && c(-1) == TH_SARA_A) return 0;
    184    return -1; /* usually too short syllable, part of word */
    185  }
    186 
    187  if (t(-2) & VRE) return -1;
    188 
    189  if ((t(0) & C) && (t(1) & (VR | MT)) &&
    190      (c(2) != TH_THANTHAKHAT)) {                              /*?C, NB */
    191    if ((t(-1) & (VRS | VRX)) && c(1) == TH_SARA_I) return -1; /* exception */
    192    if (t(-1) & (V | M)) return 0;                             /* !C/ C, NB */
    193    if (t(-2) & VRS) return 0;               /* VRS, C / C, NB */
    194    if (!(t(0) & C2) && c(1) == TH_SARA_I) { /*	/ !C2 or /c, sara_i */
    195      if (t(-2) & VRX) return 0;             /* VRX, C / C, NB ? 100%? */
    196      if (t(-2) & VC) return 0;              /* VC, C / C, NB ? 100% */
    197    }
    198  }
    199  if ((t(-1) & VRX) && (t(0) & CC)) return 0; /* VRX/ CC */
    200  if ((t(-2) & VRS) && (t(-1) & C) && (t(0) & (V | M)))
    201    return 0; /* VRS, C/ !C */
    202 
    203  if ((t(0) & CX) && (t(1) & C2) && (c(2) != TH_THANTHAKHAT)) {
    204    if ((t(-2) & A) && (t(-1) & CX)) return 0;  /* A, CX / CX, C2 */
    205    if ((t(-2) & CX) && (t(-1) & MT)) return 0; /* CX, MT / CX, C2 */
    206  }
    207  /*
    208  // apply 90% rules
    209  */
    210  if (t(0) & VL) return 0;
    211  if (t(1) & VL) return -1;
    212  if (c(-1) == TH_THANTHAKHAT && c(-2) != TH_RORUA && c(-2) != TH_LOLING)
    213    return 0;
    214 
    215  /*
    216  //return -1;
    217  // apply 80% rules
    218  */
    219  if (t(0) & CHE) {
    220    if ((t(-2) & VRS) && (t(-1) & C)) return 0; /* VRS, C/ CHE */
    221    /*if(t(-1) & VRX) return 0;					// VRX/ CHE */
    222    if (t(-1) & VC) return 0; /* VC/ CHE */
    223  }
    224  if (t(-1) & CHB) {
    225    if ((t(0) & C) && (t(1) & VR)) return 0; /* CHB/ CC, VR */
    226    if (t(0) & VC) return 0;                 /* CHB/ VC */
    227  }
    228 
    229  if ((t(-2) & VL) && (t(1) & VR)) { /* VL, C? C, VR */
    230    if (t(-2) & VLI)
    231      return 0;                        /* VLI,C/C,VR .*/
    232    else {                             /* vlao, C ? C , VR */
    233      if (c(1) == TH_SARA_A) return 2; /* vlao, C, C, sara_a/ */
    234      if (t(-2) & VLO) return 0;       /* VLO, C/ C, !sara_a */
    235      if (!(t(1) & VRA)) return 0;     /* VLA, C/ C, !vca */
    236    }
    237  }
    238  /* C,MT,C */
    239  if ((t(-2) & C) && (t(-1) & MT) && (t(0) & CX)) return 1;
    240 
    241  return -1;
    242 }
    243 
    244 int TrbFollowing(const th_char* begin, int length, int offset)
    245 /*
    246 //(ThBreakIterator *this, int offset)
    247 */
    248 {
    249  const th_char* w = begin + offset;
    250  const th_char* end = begin + length;
    251  while (w < end && *w && !th_isthai(*w) && th_isspace(*w)) w++;
    252 
    253  if (w < end && *w && !th_isthai(*w)) {
    254    int english = FALSE;
    255    while (w < end && *w && !th_isthai(*w) && !th_isspace(*w)) {
    256      if (th_isalpha(*w)) english = TRUE;
    257      w++;
    258    }
    259    if (english || w == end || (!th_isthai(*w) && th_isspace(*w)))
    260      return w - begin;
    261  }
    262  if (w == end || *w == 0 || !th_isthai(*w)) return w - begin;
    263  w++;
    264  if (w < end && *w && th_isthai(*w)) {
    265    int brk = TrbWordBreakPos(begin, w - begin, w, end - w);
    266    while (brk < 0) {
    267      w++;
    268      if (w == end || *w == 0 || !th_isthai(*w)) break;
    269      brk = TrbWordBreakPos(begin, w - begin, w, end - w);
    270    }
    271    if (brk > 0) w += brk;
    272  }
    273  if (w < end && *w && !th_isthai(*w)) {
    274    while (w < end && *w && !th_isthai(*w) && !th_isalpha(*w) &&
    275           !th_isspace(*w))
    276      w++;
    277  }
    278  return w - begin;
    279 }
    280 
    281 /*
    282 /////////////////////////////////////////////////
    283 */
/*
 * Class mask for each table index, read through twbtype(). Each entry is
 * labelled with a hexadecimal code from a0 to ff followed by a character
 * name.
 * NOTE(review): the names assume the codes are TIS-620 code points (the
 * original comments held mis-encoded glyphs); confirm against th_zcode() in
 * th_char.h. The block under "#if 0" is compiled out.
 */
const twb_t _TwbType[0x100 - 0xa0] = {
#if 0
/* 80 */	T,
/* 81-8f */	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
/* 90 */	T,
/* 91-9f */	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
#endif
    /* a0 (unassigned) */ 0,
    /* a1 KO KAI */ CS,
    /* a2 KHO KHAI */ CS | CHE,
    /* a3 KHO KHUAT */ CC | CHE,
    /* a4 KHO KHWAI */ CS | CHE,
    /* a5 KHO KHON */ CC | CHE,
    /* a6 KHO RAKHANG */ CS,
    /* a7 NGO NGU */ CS | CHB,
    /* a8 CHO CHAN */ CS,
    /* a9 CHO CHING */ CC | CHE,
    /* aa CHO CHANG */ CS,
    /* ab SO SO */ CC | CHE,
    /* ac CHO CHOE */ CC | CHB | CHE,
    /* ad YO YING */ CS | CHB,
    /* ae DO CHADA */ CS | CHB,
    /* af TO PATAK */ CS | CHB,
    /* b0 THO THAN */ CS,
    /* b1 THO NANGMONTHO */ CS | CHB | CHE,
    /* b2 THO PHUTHAO */ CS | CHB | CHE,
    /* b3 NO NEN */ CS | CHB,
    /* b4 DO DEK */ CS,
    /* b5 TO TAO */ CS,
    /* b6 THO THUNG */ CS,
    /* b7 THO THAHAN */ CS,
    /* b8 THO THONG */ CS,
    /* b9 NO NU */ CS,
    /* ba BO BAIMAI */ CS,
    /* bb PO PLA */ CS,
    /* bc PHO PHUNG */ CC | CHE,
    /* bd FO FA */ CC | CHE,
    /* be PHO PHAN */ CS,
    /* bf FO FAN */ CS,
    /* c0 PHO SAMPHAO */ CS | CHE,
    /* c1 MO MA */ CS,
    /* c2 YO YAK */ CS,
    /* c3 RO RUA */ CS | C2 | CHE, /* ? add CHE  */
    /* c4 RU */ VC | CHE,
    /* c5 LO LING */ CS | C2,
    /* c6 LU */ VC | CHE,
    /* c7 WO WAEN */ VC | C2,
    /* c8 SO SALA */ CS,
    /* c9 SO RUSI */ CS | CHB,
    /* ca SO SUA */ CS | CHE,
    /* cb HO HIP */ CC | CHE,
    /* cc LO CHULA */ CS | CHB | CHE,
    /* cd O ANG */ VC,
    /* ce HO NOKHUK */ CC | CHE,
    /* cf PAIYANNOI */ T,
    /* d0 SARA A */ VRE | VRA,
    /* d1 MAI HAN-AKAT */ VRS,
    /* d2 SARA AA */ VRX | VRA,
    /* d3 SARA AM */ VRE,
    /* d4 SARA I */ VRX | VRA,
    /* d5 SARA II */ VRX | VRA,
    /* d6 SARA UE */ VRS,
    /* d7 SARA UEE */ VRS | VRA,
    /* d8 SARA U */ VRX,
    /* d9 SARA UU */ VRX,
    /* da PHINTHU */ T,
    /* db (unassigned) */ 0,
    /* dc (unassigned) */ 0,
    /* dd (unassigned) */ 0,
    /* de (unassigned) */ 0,
    /* df BAHT SIGN */ T,
    /* e0 SARA E */ VLA,
    /* e1 SARA AE */ VLO,
    /* e2 SARA O */ VLO,
    /* e3 SARA AI MAIMUAN */ VLI,
    /* e4 SARA AI MAIMALAI */ VLI,
    /* e5 LAKKHANGYAO */ VRE,
    /* e6 MAIYAMOK */ M,
    /* e7 MAITAIKHU */ M,
    /* e8 MAI EK */ M | MT,
    /* e9 MAI THO */ M | MT,
    /* ea MAI TRI */ M | MT,
    /* eb MAI CHATTAWA */ M | MT,
    /* ec THANTHAKHAT */ M,
    /* ed NIKHAHIT */ T,
    /* ee YAMAKKAN */ T,
    /* ef FONGMAN */ T,
    /* f0 DIGIT ZERO */ T,
    /* f1 DIGIT ONE */ T,
    /* f2 DIGIT TWO */ T,
    /* f3 DIGIT THREE */ T,
    /* f4 DIGIT FOUR */ T,
    /* f5 DIGIT FIVE */ T,
    /* f6 DIGIT SIX */ T,
    /* f7 DIGIT SEVEN */ T,
    /* f8 DIGIT EIGHT */ T,
    /* f9 DIGIT NINE */ T,
    /* fa ANGKHANKHU */ T,
    /* fb KHOMUT */ T,
    /* fc (unassigned) */ 0,
    /* fd (unassigned) */ 0,
    /* fe (unassigned) */ 0,
    /* ff (unassigned) */ 0};