rulebrk.c (9134B)
1 /* This Source Code Form is subject to the terms of the Mozilla Public 2 * License, v. 2.0. If a copy of the MPL was not distributed with this 3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 4 #define TH_UNICODE 5 6 #include <assert.h> 7 #include "th_char.h" 8 #define th_isalpha(c) (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z')) 9 #define th_isspace(c) ((c) == ' ' || (c) == '\t') 10 11 /* 12 ///////////////////////////////////////////////// 13 // Thai character type array 14 */ 15 16 typedef unsigned short twb_t; 17 extern const twb_t _TwbType[0x100 - 0xa0]; 18 19 /* 20 // bit definition 21 */ 22 23 #define VRS 0x0001 24 #define VRE 0x0002 25 #define VRX 0x0004 26 27 #define VRA 0x0008 28 29 #define VLA 0x0010 30 #define VLO 0x0020 31 #define VLI 0x0040 32 33 #define VC 0x0080 34 35 #define CC 0x0100 36 #define CS 0x0200 37 38 #define C2 0x0400 39 #define CHB 0x0800 40 #define CHE 0x1000 41 42 #define MT 0x2000 43 /* 44 //_#define me 0x2000 45 */ 46 #define M 0x4000 47 48 #define T 0x8000 49 50 #define VL (VLA | VLO | VLI) 51 #define VR (VRS | VRE | VRX) 52 #define NE (VL | VRS) 53 #define NB (VR | M) 54 #define V (VL | VR) 55 #define CX (CC | CS) 56 #define C (CX | VC) 57 #define A (C | V | M) 58 59 #define twbtype(c) (_TwbType[th_zcode(c)]) 60 61 #ifndef TRUE 62 # define TRUE 1 63 # define FALSE 0 64 #endif 65 #define RETURN(b) return (b) 66 67 /* 68 ///////////////////////////////////////////////// 69 */ 70 71 int TrbWordBreakPos(const th_char* pstr, int left, const th_char* rstr, 72 int right) 73 /* const ThBreakIterator *it, const th_char **p)*/ 74 { 75 /* 76 //int left, right; 77 //const th_char *s = *p; 78 */ 79 const th_char* lstr = pstr + left; 80 th_char _c[6]; 81 twb_t _t[6]; 82 #define c(i) (_c[(i) + 3]) 83 #define t(i) (_t[(i) + 3]) 84 int i, j; 85 86 /* 87 //left = s - it->begin; 88 */ 89 if (left < 0) return -1; 90 /* 91 //right = (it->end == NULL) ? 4 : it->begin - s; 92 */ 93 if (right < 1) return -1; 94 95 /* 96 // get c(0), t(0) 97 */ 98 c(0) = rstr[0]; /* may be '\0' */ 99 if (!th_isthai(c(0))) return -1; 100 t(0) = twbtype(c(0)); 101 if (!(t(0) & A)) return -1; 102 103 /* 104 // get c(-1), t(-1) 105 */ 106 if (left >= 1) { 107 c(-1) = lstr[-1]; 108 if (!th_isthai(c(-1))) return 0; 109 t(-1) = twbtype(c(-1)); 110 if (!(t(-1) & A)) return 0; /* handle punctuation marks here */ 111 } else { 112 c(-1) = 0; 113 t(-1) = 0; 114 } 115 116 /* 117 // get c(1..2), t(1..2) 118 */ 119 for (i = 1; i <= 2; i++) { 120 if (i >= right) { 121 c(i) = 0; 122 t(i) = 0; 123 } else { 124 c(i) = rstr[i]; /* may be '\0'; */ 125 if (!th_isthai(c(i))) 126 right = i--; 127 else { 128 t(i) = twbtype(c(i)); 129 if (!(t(i) & A)) right = i--; 130 } 131 } 132 } 133 /* 134 // get c(-2..-3), t(-2..-3) 135 */ 136 for (i = -2, j = -2; i >= -3; j--) { 137 if (j < -left) { 138 c(i) = 0; 139 t(i) = 0; 140 i--; 141 } else { 142 c(i) = lstr[j]; 143 if (!th_isthai(c(i))) 144 left = 0; 145 else { 146 t(i) = (twb_t)(th_isthai(c(i)) ? twbtype(c(i)) : 0); 147 if (!(t(i) & A)) 148 left = 0; 149 else { 150 if ((t(i + 1) & MT) && ((t(i) & VR) || (t(i + 2) & VR))) { 151 c(i + 1) = c(i); 152 t(i + 1) = t(i); 153 } else 154 i--; 155 } 156 } 157 } 158 } 159 160 /* 161 // prohibit the unlikely 162 */ 163 if ((t(-1) & C) && (t(0) & C)) { 164 if ((t(-1) & CHE) || (t(0) & CHB)) return -1; 165 } 166 /* 167 // special case : vlao, C/ sara_a|aa, !sara_a 168 */ 169 if ((t(-3) & (VLA | VLO)) && (t(-2) & C) && (c(0) != TH_SARA_A) && 170 (c(-1) == TH_SARA_A || c(-0) == TH_SARA_AA)) 171 return 0; 172 173 /* 174 // prohibit break 175 */ 176 if (t(0) & NB) return -1; 177 if (t(-1) & NE) return -1; 178 179 /* 180 // apply 100% rules 181 */ 182 if (t(-1) & VRE) { 183 if (c(-2) == TH_SARA_AA && c(-1) == TH_SARA_A) return 0; 184 return -1; /* usually too short syllable, part of word */ 185 } 186 187 if (t(-2) & VRE) return -1; 188 189 if ((t(0) & C) && (t(1) & (VR | MT)) && 190 (c(2) != TH_THANTHAKHAT)) { /*?C, NB */ 191 if ((t(-1) & (VRS | VRX)) && c(1) == TH_SARA_I) return -1; /* exception */ 192 if (t(-1) & (V | M)) return 0; /* !C/ C, NB */ 193 if (t(-2) & VRS) return 0; /* VRS, C / C, NB */ 194 if (!(t(0) & C2) && c(1) == TH_SARA_I) { /* / !C2 or /c, sara_i */ 195 if (t(-2) & VRX) return 0; /* VRX, C / C, NB ? 100%? */ 196 if (t(-2) & VC) return 0; /* VC, C / C, NB ? 100% */ 197 } 198 } 199 if ((t(-1) & VRX) && (t(0) & CC)) return 0; /* VRX/ CC */ 200 if ((t(-2) & VRS) && (t(-1) & C) && (t(0) & (V | M))) 201 return 0; /* VRS, C/ !C */ 202 203 if ((t(0) & CX) && (t(1) & C2) && (c(2) != TH_THANTHAKHAT)) { 204 if ((t(-2) & A) && (t(-1) & CX)) return 0; /* A, CX / CX, C2 */ 205 if ((t(-2) & CX) && (t(-1) & MT)) return 0; /* CX, MT / CX, C2 */ 206 } 207 /* 208 // apply 90% rules 209 */ 210 if (t(0) & VL) return 0; 211 if (t(1) & VL) return -1; 212 if (c(-1) == TH_THANTHAKHAT && c(-2) != TH_RORUA && c(-2) != TH_LOLING) 213 return 0; 214 215 /* 216 //return -1; 217 // apply 80% rules 218 */ 219 if (t(0) & CHE) { 220 if ((t(-2) & VRS) && (t(-1) & C)) return 0; /* VRS, C/ CHE */ 221 /*if(t(-1) & VRX) return 0; // VRX/ CHE */ 222 if (t(-1) & VC) return 0; /* VC/ CHE */ 223 } 224 if (t(-1) & CHB) { 225 if ((t(0) & C) && (t(1) & VR)) return 0; /* CHB/ CC, VR */ 226 if (t(0) & VC) return 0; /* CHB/ VC */ 227 } 228 229 if ((t(-2) & VL) && (t(1) & VR)) { /* VL, C? C, VR */ 230 if (t(-2) & VLI) 231 return 0; /* VLI,C/C,VR .*/ 232 else { /* vlao, C ? C , VR */ 233 if (c(1) == TH_SARA_A) return 2; /* vlao, C, C, sara_a/ */ 234 if (t(-2) & VLO) return 0; /* VLO, C/ C, !sara_a */ 235 if (!(t(1) & VRA)) return 0; /* VLA, C/ C, !vca */ 236 } 237 } 238 /* C,MT,C */ 239 if ((t(-2) & C) && (t(-1) & MT) && (t(0) & CX)) return 1; 240 241 return -1; 242 } 243 244 int TrbFollowing(const th_char* begin, int length, int offset) 245 /* 246 //(ThBreakIterator *this, int offset) 247 */ 248 { 249 const th_char* w = begin + offset; 250 const th_char* end = begin + length; 251 while (w < end && *w && !th_isthai(*w) && th_isspace(*w)) w++; 252 253 if (w < end && *w && !th_isthai(*w)) { 254 int english = FALSE; 255 while (w < end && *w && !th_isthai(*w) && !th_isspace(*w)) { 256 if (th_isalpha(*w)) english = TRUE; 257 w++; 258 } 259 if (english || w == end || (!th_isthai(*w) && th_isspace(*w))) 260 return w - begin; 261 } 262 if (w == end || *w == 0 || !th_isthai(*w)) return w - begin; 263 w++; 264 if (w < end && *w && th_isthai(*w)) { 265 int brk = TrbWordBreakPos(begin, w - begin, w, end - w); 266 while (brk < 0) { 267 w++; 268 if (w == end || *w == 0 || !th_isthai(*w)) break; 269 brk = TrbWordBreakPos(begin, w - begin, w, end - w); 270 } 271 if (brk > 0) w += brk; 272 } 273 if (w < end && *w && !th_isthai(*w)) { 274 while (w < end && *w && !th_isthai(*w) && !th_isalpha(*w) && 275 !th_isspace(*w)) 276 w++; 277 } 278 return w - begin; 279 } 280 281 /* 282 ///////////////////////////////////////////////// 283 */ 284 const twb_t _TwbType[0x100 - 0xa0] = { 285 #if 0 286 /* 80 */ T, 287 /* 81-8f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 288 /* 90 */ T, 289 /* 91-9f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 290 #endif 291 /* a0 */ 0, 292 /* a1 ¡ */ CS, 293 /* a2 ¢ */ CS | CHE, 294 /* a3 £ */ CC | CHE, 295 /* a4 € */ CS | CHE, 296 /* a5 ¥ */ CC | CHE, 297 /* a6 Š */ CS, 298 /* a7 § */ CS | CHB, 299 /* a8 š */ CS, 300 /* a9 © */ CC | CHE, 301 /* aa ª */ CS, 302 /* ab « */ CC | CHE, 303 /* ac ¬ */ CC | CHB | CHE, 304 /* ad */ CS | CHB, 305 /* ae ® */ CS | CHB, 306 /* af ¯ */ CS | CHB, 307 /* b0 ° */ CS, 308 /* b1 ± */ CS | CHB | CHE, 309 /* b2 ² */ CS | CHB | CHE, 310 /* b3 ³ */ CS | CHB, 311 /* b4 Ž */ CS, 312 /* b5 µ */ CS, 313 /* b6 ¶ */ CS, 314 /* b7 · */ CS, 315 /* b8 ž */ CS, 316 /* b9 ¹ */ CS, 317 /* ba º */ CS, 318 /* bb » */ CS, 319 /* bc Œ */ CC | CHE, 320 /* bd œ */ CC | CHE, 321 /* be Ÿ */ CS, 322 /* bf ¿ */ CS, 323 /* c0 À */ CS | CHE, 324 /* c1 Á */ CS, 325 /* c2  */ CS, 326 /* c3 à */ CS | C2 | CHE, /* ? add CHE */ 327 /* c4 Ä */ VC | CHE, 328 /* c5 Å */ CS | C2, 329 /* c6 Æ */ VC | CHE, 330 /* c7 Ç */ VC | C2, 331 /* c8 È */ CS, 332 /* c9 É */ CS | CHB, 333 /* ca Ê */ CS | CHE, 334 /* cb Ë */ CC | CHE, 335 /* CC Ì */ CS | CHB | CHE, 336 /* cd Í */ VC, 337 /* ce Î */ CC | CHE, 338 /* cf Ï */ T, 339 /* d0 Ð */ VRE | VRA, 340 /* d1 Ñ */ VRS, 341 /* d2 Ò */ VRX | VRA, 342 /* d3 Ó */ VRE, 343 /* d4 Ô */ VRX | VRA, 344 /* d5 Õ */ VRX | VRA, 345 /* d6 Ö */ VRS, 346 /* d7 × */ VRS | VRA, 347 /* d8 Ø */ VRX, 348 /* d9 Ù */ VRX, 349 /* da Ú */ T, 350 /* db Û */ 0, 351 /* dc Ü */ 0, 352 /* dd Ý */ 0, 353 /* de Þ */ 0, 354 /* df ß */ T, 355 /* e0 à */ VLA, 356 /* e1 á */ VLO, 357 /* e2 â */ VLO, 358 /* e3 ã */ VLI, 359 /* e4 ä */ VLI, 360 /* e5 å */ VRE, 361 /* e6 æ */ M, 362 /* e7 ç */ M, 363 /* e8 è */ M | MT, 364 /* e9 é */ M | MT, 365 /* ea ê */ M | MT, 366 /* eb ë */ M | MT, 367 /* ec ì */ M, 368 /* ed í */ T, 369 /* ee î */ T, 370 /* ef ï */ T, 371 /* f0 ð */ T, 372 /* f1 ñ */ T, 373 /* f2 ò */ T, 374 /* f3 ó */ T, 375 /* f4 ô */ T, 376 /* f5 õ */ T, 377 /* f6 ö */ T, 378 /* f7 ÷ */ T, 379 /* f8 ø */ T, 380 /* f9 ù */ T, 381 /* fa ú */ T, 382 /* fb û */ T, 383 /* fc ü */ 0, 384 /* fd ý */ 0, 385 /* fe þ */ 0, 386 /* ff */ 0};