mbyte.c (93849B)
1 /// mbyte.c: Code specifically for handling multi-byte characters. 2 /// Multibyte extensions partly by Sung-Hoon Baek 3 /// 4 /// Strings internal to Nvim are always encoded as UTF-8 (thus the legacy 5 /// 'encoding' option is always "utf-8"). 6 /// 7 /// The cell width on the display needs to be determined from the character 8 /// value. Recognizing UTF-8 bytes is easy: 0xxx.xxxx is a single-byte char, 9 /// 10xx.xxxx is a trailing byte, 11xx.xxxx is a leading byte of a multi-byte 10 /// character. To make things complicated, up to six composing characters 11 /// are allowed. These are drawn on top of the first char. For most editing 12 /// the sequence of bytes with composing characters included is considered to 13 /// be one character. 14 /// 15 /// UTF-8 is used everywhere in the core. This is in registers, text 16 /// manipulation, buffers, etc. Nvim core communicates with external plugins 17 /// and GUIs in this encoding. 18 /// 19 /// The encoding of a file is specified with 'fileencoding'. Conversion 20 /// is to be done when it's different from "utf-8". 21 /// 22 /// Vim scripts may contain an ":scriptencoding" command. This has an effect 23 /// for some commands, like ":menutrans". 
#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <iconv.h>
#include <limits.h>
#include <locale.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <utf8proc.h>
#include <uv.h>
#include <wctype.h>

#include "auto/config.h"
#include "nvim/arabic.h"
#include "nvim/ascii_defs.h"
#include "nvim/buffer_defs.h"
#include "nvim/charset.h"
#include "nvim/cmdexpand_defs.h"
#include "nvim/cursor.h"
#include "nvim/drawscreen.h"
#include "nvim/errors.h"
#include "nvim/eval/typval.h"
#include "nvim/eval/typval_defs.h"
#include "nvim/getchar.h"
#include "nvim/gettext_defs.h"
#include "nvim/globals.h"
#include "nvim/grid.h"
#include "nvim/iconv_defs.h"
#include "nvim/keycodes.h"
#include "nvim/macros_defs.h"
#include "nvim/mark.h"
#include "nvim/mbyte.h"
#include "nvim/mbyte_defs.h"
#include "nvim/memline.h"
#include "nvim/memory.h"
#include "nvim/message.h"
#include "nvim/move.h"
#include "nvim/option_vars.h"
#include "nvim/optionstr.h"
#include "nvim/os/os.h"
#include "nvim/pos_defs.h"
#include "nvim/strings.h"
#include "nvim/types_defs.h"
#include "nvim/vim_defs.h"

// Range record with a step and an offset; its consumers are outside this
// section of the file.
typedef struct {
  int rangeStart;
  int rangeEnd;
  int step;
  int offset;
} convertStruct;

// Pair of ints named "first" and "last"; its consumers are outside this
// section of the file.
struct interval {
  int first;
  int last;
};

// uncrustify:off
#include "mbyte.c.generated.h"
// uncrustify:on

// Error message templates; N_() marks them for translation.
static const char e_list_item_nr_is_not_list[]
  = N_("E1109: List item %d is not a List");
static const char e_list_item_nr_does_not_contain_3_numbers[]
  = N_("E1110: List item %d does not contain 3 numbers");
static const char e_list_item_nr_range_invalid[]
  = N_("E1111: List item %d range invalid");
static const char e_list_item_nr_cell_width_invalid[]
  = N_("E1112: List item %d cell width invalid");
static const char e_overlapping_ranges_for_nr[]
  = N_("E1113: Overlapping ranges for 0x%lx");
static const char e_only_values_of_0x80_and_higher_supported[]
  = N_("E1114: Only values of 0x80 and higher supported");

// To speed up BYTELEN(); keep a lookup table to quickly get the length in
// bytes of a UTF-8 character from the first byte of a UTF-8 string.  Bytes
// which are illegal when used as the first byte have a 1.  The NUL byte has
// length 1.
// The table has 256 entries: row = high nibble, column = low nibble.
const uint8_t utf8len_tab[] = {
  // ?0 ?1 ?2 ?3 ?4 ?5 ?6 ?7 ?8 ?9 ?A ?B ?C ?D ?E ?F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 1?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 2?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 3?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 4?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 5?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 6?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 7?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 8?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 9?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // A?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // B?
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C?
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D?
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E?
  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1,  // F?
};

// Like utf8len_tab above, but using a zero for illegal lead bytes.
// (Rows 8?-B?, i.e. trailing bytes, and 0xFE/0xFF are zero here.)
const uint8_t utf8len_tab_zero[] = {
  // ?0 ?1 ?2 ?3 ?4 ?5 ?6 ?7 ?8 ?9 ?A ?B ?C ?D ?E ?F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 1?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 2?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 3?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 4?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 5?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 6?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 7?
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 8?
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 9?
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A?
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B?
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C?
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D?
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E?
  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0,  // F?
};

// Canonical encoding names and their properties.
// "iso-8859-n" is handled by enc_canonize() directly.
//
// Fields:
//   name      canonical encoding name
//   prop      sum of ENC_* flags
//   codepage  a numeric code page (e.g. 1252), a DBCS_* constant for the
//             ENC_DBCS entries, or 0
//
// Each IDX_* macro is the array index of the entry that follows it, so the
// macros and the entries must be kept in the same order.  IDX_COUNT is the
// number of entries.
static struct
{ const char *name; int prop; int codepage; }
enc_canon_table[] = {
#define IDX_LATIN_1     0
  { "latin1",          ENC_8BIT + ENC_LATIN1,  1252 },
#define IDX_ISO_2       1
  { "iso-8859-2",      ENC_8BIT,               0 },
#define IDX_ISO_3       2
  { "iso-8859-3",      ENC_8BIT,               0 },
#define IDX_ISO_4       3
  { "iso-8859-4",      ENC_8BIT,               0 },
#define IDX_ISO_5       4
  { "iso-8859-5",      ENC_8BIT,               0 },
#define IDX_ISO_6       5
  { "iso-8859-6",      ENC_8BIT,               0 },
#define IDX_ISO_7       6
  { "iso-8859-7",      ENC_8BIT,               0 },
#define IDX_ISO_8       7
  { "iso-8859-8",      ENC_8BIT,               0 },
#define IDX_ISO_9       8
  { "iso-8859-9",      ENC_8BIT,               0 },
#define IDX_ISO_10      9
  { "iso-8859-10",     ENC_8BIT,               0 },
#define IDX_ISO_11      10
  { "iso-8859-11",     ENC_8BIT,               0 },
#define IDX_ISO_13      11
  { "iso-8859-13",     ENC_8BIT,               0 },
#define IDX_ISO_14      12
  { "iso-8859-14",     ENC_8BIT,               0 },
#define IDX_ISO_15      13
  { "iso-8859-15",     ENC_8BIT + ENC_LATIN9,  0 },
#define IDX_KOI8_R      14
  { "koi8-r",          ENC_8BIT,               0 },
#define IDX_KOI8_U      15
  { "koi8-u",          ENC_8BIT,               0 },
#define IDX_UTF8        16
  { "utf-8",           ENC_UNICODE,            0 },
#define IDX_UCS2        17
  { "ucs-2",           ENC_UNICODE + ENC_ENDIAN_B + ENC_2BYTE, 0 },
#define IDX_UCS2LE      18
  { "ucs-2le",         ENC_UNICODE + ENC_ENDIAN_L + ENC_2BYTE, 0 },
#define IDX_UTF16       19
  { "utf-16",          ENC_UNICODE + ENC_ENDIAN_B + ENC_2WORD, 0 },
#define IDX_UTF16LE     20
  { "utf-16le",        ENC_UNICODE + ENC_ENDIAN_L + ENC_2WORD, 0 },
#define IDX_UCS4        21
  { "ucs-4",           ENC_UNICODE + ENC_ENDIAN_B + ENC_4BYTE, 0 },
#define IDX_UCS4LE      22
  { "ucs-4le",         ENC_UNICODE + ENC_ENDIAN_L + ENC_4BYTE, 0 },

  // For debugging DBCS encoding on Unix.
#define IDX_DEBUG       23
  { "debug",           ENC_DBCS,               DBCS_DEBUG },
#define IDX_EUC_JP      24
  { "euc-jp",          ENC_DBCS,               DBCS_JPNU },
#define IDX_SJIS        25
  { "sjis",            ENC_DBCS,               DBCS_JPN },
#define IDX_EUC_KR      26
  { "euc-kr",          ENC_DBCS,               DBCS_KORU },
#define IDX_EUC_CN      27
  { "euc-cn",          ENC_DBCS,               DBCS_CHSU },
#define IDX_EUC_TW      28
  { "euc-tw",          ENC_DBCS,               DBCS_CHTU },
#define IDX_BIG5        29
  { "big5",            ENC_DBCS,               DBCS_CHT },

  // MS-DOS and MS-Windows codepages are included here, so that they can be
  // used on Unix too.  Most of them are similar to ISO-8859 encodings, but
  // not exactly the same.
#define IDX_CP437       30
  { "cp437",           ENC_8BIT,               437 },  // like iso-8859-1
#define IDX_CP737       31
  { "cp737",           ENC_8BIT,               737 },  // like iso-8859-7
#define IDX_CP775       32
  { "cp775",           ENC_8BIT,               775 },  // Baltic
#define IDX_CP850       33
  { "cp850",           ENC_8BIT,               850 },  // like iso-8859-4
#define IDX_CP852       34
  { "cp852",           ENC_8BIT,               852 },  // like iso-8859-1
#define IDX_CP855       35
  { "cp855",           ENC_8BIT,               855 },  // like iso-8859-2
#define IDX_CP857       36
  { "cp857",           ENC_8BIT,               857 },  // like iso-8859-5
#define IDX_CP860       37
  { "cp860",           ENC_8BIT,               860 },  // like iso-8859-9
#define IDX_CP861       38
  { "cp861",           ENC_8BIT,               861 },  // like iso-8859-1
#define IDX_CP862       39
  { "cp862",           ENC_8BIT,               862 },  // like iso-8859-1
#define IDX_CP863       40
  { "cp863",           ENC_8BIT,               863 },  // like iso-8859-8
#define IDX_CP865       41
  { "cp865",           ENC_8BIT,               865 },  // like iso-8859-1
#define IDX_CP866       42
  { "cp866",           ENC_8BIT,               866 },  // like iso-8859-5
#define IDX_CP869       43
  { "cp869",           ENC_8BIT,               869 },  // like iso-8859-7
#define IDX_CP874       44
  { "cp874",           ENC_8BIT,               874 },  // Thai
#define IDX_CP932       45
  { "cp932",           ENC_DBCS,               DBCS_JPN },
#define IDX_CP936       46
  { "cp936",           ENC_DBCS,               DBCS_CHS },
#define IDX_CP949       47
  { "cp949",           ENC_DBCS,               DBCS_KOR },
#define IDX_CP950       48
  { "cp950",           ENC_DBCS,               DBCS_CHT },
#define IDX_CP1250      49
  { "cp1250",          ENC_8BIT,               1250 },  // Czech, Polish, etc.
#define IDX_CP1251      50
  { "cp1251",          ENC_8BIT,               1251 },  // Cyrillic
  // cp1252 is considered to be equal to latin1
#define IDX_CP1253      51
  { "cp1253",          ENC_8BIT,               1253 },  // Greek
#define IDX_CP1254      52
  { "cp1254",          ENC_8BIT,               1254 },  // Turkish
#define IDX_CP1255      53
  { "cp1255",          ENC_8BIT,               1255 },  // Hebrew
#define IDX_CP1256      54
  { "cp1256",          ENC_8BIT,               1256 },  // Arabic
#define IDX_CP1257      55
  { "cp1257",          ENC_8BIT,               1257 },  // Baltic
#define IDX_CP1258      56
  { "cp1258",          ENC_8BIT,               1258 },  // Vietnamese

#define IDX_MACROMAN    57
  { "macroman",        ENC_8BIT + ENC_MACROMAN, 0 },  // Mac OS
#define IDX_HPROMAN8    58
  { "hp-roman8",       ENC_8BIT,               0 },  // HP Roman8
#define IDX_COUNT       59
};

// Aliases for encoding names.
// "canon" is an IDX_* index into enc_canon_table.  The list is terminated by
// an entry whose name is NULL.
//
// NOTE(review): "950" is listed twice (IDX_CP950, then IDX_BIG5) and "cp950"
// is both a canonical name and an alias for IDX_BIG5.  The routine that
// searches this table is not visible in this section; if it returns the first
// match, the later entries are never reached -- confirm.
static struct
{ const char *name; int canon; }
enc_alias_table[] = {
  { "ansi",            IDX_LATIN_1 },
  { "iso-8859-1",      IDX_LATIN_1 },
  { "latin2",          IDX_ISO_2 },
  { "latin3",          IDX_ISO_3 },
  { "latin4",          IDX_ISO_4 },
  { "cyrillic",        IDX_ISO_5 },
  { "arabic",          IDX_ISO_6 },
  { "greek",           IDX_ISO_7 },
  { "hebrew",          IDX_ISO_8 },
  { "latin5",          IDX_ISO_9 },
  { "turkish",         IDX_ISO_9 },  // ?
  { "latin6",          IDX_ISO_10 },
  { "nordic",          IDX_ISO_10 },  // ?
  { "thai",            IDX_ISO_11 },  // ?
  { "latin7",          IDX_ISO_13 },
  { "latin8",          IDX_ISO_14 },
  { "latin9",          IDX_ISO_15 },
  { "utf8",            IDX_UTF8 },
  { "unicode",         IDX_UCS2 },
  { "ucs2",            IDX_UCS2 },
  { "ucs2be",          IDX_UCS2 },
  { "ucs-2be",         IDX_UCS2 },
  { "ucs2le",          IDX_UCS2LE },
  { "utf16",           IDX_UTF16 },
  { "utf16be",         IDX_UTF16 },
  { "utf-16be",        IDX_UTF16 },
  { "utf16le",         IDX_UTF16LE },
  { "ucs4",            IDX_UCS4 },
  { "ucs4be",          IDX_UCS4 },
  { "ucs-4be",         IDX_UCS4 },
  { "ucs4le",          IDX_UCS4LE },
  { "utf32",           IDX_UCS4 },
  { "utf-32",          IDX_UCS4 },
  { "utf32be",         IDX_UCS4 },
  { "utf-32be",        IDX_UCS4 },
  { "utf32le",         IDX_UCS4LE },
  { "utf-32le",        IDX_UCS4LE },
  { "932",             IDX_CP932 },
  { "949",             IDX_CP949 },
  { "936",             IDX_CP936 },
  { "gbk",             IDX_CP936 },
  { "950",             IDX_CP950 },
  { "eucjp",           IDX_EUC_JP },
  { "unix-jis",        IDX_EUC_JP },
  { "ujis",            IDX_EUC_JP },
  { "shift-jis",       IDX_SJIS },
  { "pck",             IDX_SJIS },  // Sun: PCK
  { "euckr",           IDX_EUC_KR },
  { "5601",            IDX_EUC_KR },  // Sun: KS C 5601
  { "euccn",           IDX_EUC_CN },
  { "gb2312",          IDX_EUC_CN },
  { "euctw",           IDX_EUC_TW },
  { "japan",           IDX_EUC_JP },
  { "korea",           IDX_EUC_KR },
  { "prc",             IDX_EUC_CN },
  { "zh-cn",           IDX_EUC_CN },
  { "chinese",         IDX_EUC_CN },
  { "zh-tw",           IDX_EUC_TW },
  { "taiwan",          IDX_EUC_TW },
  { "cp950",           IDX_BIG5 },
  { "950",             IDX_BIG5 },
  { "mac",             IDX_MACROMAN },
  { "mac-roman",       IDX_MACROMAN },
  { NULL,              0 }
};

/// Find encoding "name" in the list of canonical encoding names.
///
/// The comparison is an exact, case-sensitive strcmp() against every entry of
/// enc_canon_table.
///
/// @param name  NUL-terminated encoding name to look up.
///
/// @return index into enc_canon_table (one of the IDX_* values), or -1 if not
///         found.
static int enc_canon_search(const char *name)
  FUNC_ATTR_PURE
{
  for (int i = 0; i < IDX_COUNT; i++) {
    if (strcmp(name, enc_canon_table[i].name) == 0) {
      return i;
    }
  }
  return -1;
}

// Find canonical encoding "name" in the list and return its properties.
// Returns 0 if not found.
366 int enc_canon_props(const char *name) 367 FUNC_ATTR_PURE 368 { 369 int i = enc_canon_search(name); 370 if (i >= 0) { 371 return enc_canon_table[i].prop; 372 } else if (strncmp(name, "2byte-", 6) == 0) { 373 return ENC_DBCS; 374 } else if (strncmp(name, "8bit-", 5) == 0 || strncmp(name, "iso-8859-", 9) == 0) { 375 return ENC_8BIT; 376 } 377 return 0; 378 } 379 380 // Return the size of the BOM for the current buffer: 381 // 0 - no BOM 382 // 2 - UCS-2 or UTF-16 BOM 383 // 4 - UCS-4 BOM 384 // 3 - UTF-8 BOM 385 int bomb_size(void) 386 FUNC_ATTR_PURE 387 { 388 int n = 0; 389 390 if (curbuf->b_p_bomb && !curbuf->b_p_bin) { 391 if (*curbuf->b_p_fenc == NUL 392 || strcmp(curbuf->b_p_fenc, "utf-8") == 0) { 393 n = 3; 394 } else if (strncmp(curbuf->b_p_fenc, "ucs-2", 5) == 0 395 || strncmp(curbuf->b_p_fenc, "utf-16", 6) == 0) { 396 n = 2; 397 } else if (strncmp(curbuf->b_p_fenc, "ucs-4", 5) == 0) { 398 n = 4; 399 } 400 } 401 return n; 402 } 403 404 // Remove all BOM from "s" by moving remaining text. 
405 void remove_bom(char *s) 406 { 407 char *p = s; 408 409 while ((p = strchr(p, 0xef)) != NULL) { 410 if ((uint8_t)p[1] == 0xbb && (uint8_t)p[2] == 0xbf) { 411 STRMOVE(p, p + 3); 412 } else { 413 p++; 414 } 415 } 416 } 417 418 /// Get class of pointer: 419 /// 0 for blank or NUL 420 /// 1 for punctuation 421 /// 2 for an alphanumeric word character 422 /// >2 for other word characters, including CJK and emoji 423 int mb_get_class(const char *p) 424 FUNC_ATTR_PURE 425 { 426 return mb_get_class_tab(p, curbuf->b_chartab); 427 } 428 429 int mb_get_class_tab(const char *p, const uint64_t *const chartab) 430 FUNC_ATTR_PURE 431 { 432 if (MB_BYTE2LEN((uint8_t)p[0]) == 1) { 433 if (p[0] == NUL || ascii_iswhite(p[0])) { 434 return 0; 435 } 436 if (vim_iswordc_tab((uint8_t)p[0], chartab)) { 437 return 2; 438 } 439 return 1; 440 } 441 return utf_class_tab(utf_ptr2char(p), chartab); 442 } 443 444 static bool prop_is_emojilike(const utf8proc_property_t *prop) 445 { 446 return prop->boundclass == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC 447 || prop->boundclass == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR; 448 } 449 450 /// For UTF-8 character "c" return 2 for a double-width character, 1 for others. 451 /// Returns 4 or 6 for an unprintable character. 452 /// Is only correct for characters >= 0x80. 453 /// When p_ambw is "double", return 2 for a character with East Asian Width 454 /// class 'A'(mbiguous). 455 int utf_char2cells(int c) 456 { 457 if (c < 0x80) { 458 return 1; 459 } 460 461 if (!vim_isprintc(c)) { 462 assert(c <= 0xFFFF); 463 // unprintable is displayed either as <xx> or <xxxx> 464 return c > 0xFF ? 
6 : 4; 465 } 466 467 int n = cw_value(c); 468 if (n != 0) { 469 return n; 470 } 471 472 const utf8proc_property_t *prop = utf8proc_get_property(c); 473 474 if (prop->charwidth == 2) { 475 return 2; 476 } 477 if (*p_ambw == 'd' && prop->ambiguous_width) { 478 return 2; 479 } 480 481 // Characters below 1F000 may be considered single width traditionally, 482 // making them double width causes problems. 483 if (p_emoji && c >= 0x1f000 && !prop->ambiguous_width && prop_is_emojilike(prop)) { 484 return 2; 485 } 486 487 return 1; 488 } 489 490 /// Return the number of display cells character at "*p" occupies. 491 /// This doesn't take care of unprintable characters, use ptr2cells() for that. 492 int utf_ptr2cells(const char *p_in) 493 { 494 const uint8_t *p = (const uint8_t *)p_in; 495 // Need to convert to a character number. 496 if ((*p) >= 0x80) { 497 int len = utf8len_tab[*p]; 498 int32_t c = utf_ptr2CharInfo_impl(p, (uintptr_t)len); 499 // An illegal byte is displayed as <xx>. 500 if (c <= 0) { 501 return 4; 502 } 503 // If the char is ASCII it must be an overlong sequence. 504 if (c < 0x80) { 505 return char2cells(c); 506 } 507 int cells = utf_char2cells(c); 508 if (cells == 1 && p_emoji 509 && prop_is_emojilike(utf8proc_get_property(c))) { 510 int c2 = utf_ptr2char(p_in + len); 511 if (c2 == 0xFE0F) { 512 return 2; // emoji presentation 513 } 514 } 515 return cells; 516 } 517 return 1; 518 } 519 520 /// Convert a UTF-8 byte sequence to a character number. 521 /// Doesn't handle ascii! only multibyte and illegal sequences. ASCII (including NUL) 522 /// are treated like illegal sequences. 523 /// 524 /// @param[in] p String to convert. 525 /// @param[in] len Length of the character in bytes, 0 or 1 if illegal. 526 /// 527 /// @return Unicode codepoint. A negative value when the sequence is illegal (or 528 /// ASCII, including NUL). 
int32_t utf_ptr2CharInfo_impl(uint8_t const *p, uintptr_t const len)
  FUNC_ATTR_PURE FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT
{
// CHECK returns -1 from the function unless "cur" (the byte read last) is a
// UTF-8 trailing byte, i.e. has the form 10xx.xxxx.
// uint8_t is a reminder for clang to use smaller cmp
#define CHECK \
  do { \
    if (EXPECT((uint8_t)(cur & 0xC0U) != 0x80U, false)) { \
      return -1; \
    } \
  } while (0)

  // Indexed by "len".  The bytes are accumulated below with their UTF-8 marker
  // bits still in place; adding the matching entry removes those marker bits
  // again.  For len 0 and 1 the entry sets bit 31 instead, so that the value
  // returned as int32_t is negative.
  static uint32_t const corrections[] = {
    (1U << 31),  // invalid - set invalid bits (safe to add as first 2 bytes
    (1U << 31),  // won't affect highest bit in normal ret)
    -(0x80U + (0xC0U << 6)),  // multibyte - subtract added UTF8 bits (1..10xxx and 10xxx)
    -(0x80U + (0x80U << 6) + (0xE0U << 12)),
    -(0x80U + (0x80U << 6) + (0x80U << 12) + (0xF0U << 18)),
    -(0x80U + (0x80U << 6) + (0x80U << 12) + (0x80U << 18) + (0xF8U << 24)),
    -(0x80U + (0x80U << 6) + (0x80U << 12) + (0x80U << 18) + (0x80U << 24)),  // + (0xFCU << 30)
  };

  // len is 0-6, but declared uintptr_t to avoid zeroing out upper bits
  uint32_t const corr = corrections[len];
  uint8_t cur;

  // reading second byte unconditionally, safe for invalid
  // as it cannot be the last byte, not safe for ascii
  uint32_t code_point = ((uint32_t)p[0] << 6) + (cur = p[1]);
  CHECK;
  if ((uint32_t)len < 3) {
    goto ret;  // len == 0, 1, 2
  }

  // Each further step shifts in one more trailing byte (6 payload bits) and
  // stops as soon as "len" bytes have been consumed.
  code_point = (code_point << 6) + (cur = p[2]);
  CHECK;
  if ((uint32_t)len == 3) {
    goto ret;
  }

  code_point = (code_point << 6) + (cur = p[3]);
  CHECK;
  if ((uint32_t)len == 4) {
    goto ret;
  }

  code_point = (code_point << 6) + (cur = p[4]);
  CHECK;
  if ((uint32_t)len == 5) {
    goto ret;
  }

  code_point = (code_point << 6) + (cur = p[5]);
  CHECK;
  // len == 6

ret:
  // Unsigned addition wraps around, so adding the negated marker bits
  // subtracts them.
  return (int32_t)(code_point + corr);

#undef CHECK
}

/// Like utf_ptr2cells(), but limit string length to "size".
/// For an empty string or truncated character returns 1.
592 int utf_ptr2cells_len(const char *p, int size) 593 { 594 // Need to convert to a wide character. 595 if (size > 0 && (uint8_t)(*p) >= 0x80) { 596 int len = utf_ptr2len_len(p, size); 597 if (len < utf8len_tab[(uint8_t)(*p)]) { 598 return 1; // truncated 599 } 600 int c = utf_ptr2char(p); 601 // An illegal byte is displayed as <xx>. 602 if (utf_ptr2len(p) == 1 || c == NUL) { 603 return 4; 604 } 605 // If the char is ASCII it must be an overlong sequence. 606 if (c < 0x80) { 607 return char2cells(c); 608 } 609 int cells = utf_char2cells(c); 610 if (cells == 1 && p_emoji && size > len 611 && prop_is_emojilike(utf8proc_get_property(c)) 612 && utf_ptr2len_len(p + len, size - len) == utf8len_tab[(uint8_t)p[len]]) { 613 int c2 = utf_ptr2char(p + len); 614 if (c2 == 0xFE0F) { 615 return 2; // emoji presentation 616 } 617 } 618 return cells; 619 } 620 return 1; 621 } 622 623 /// Calculate the number of cells occupied by string `str`. 624 /// 625 /// @param str The source string, may not be NULL, must be a NUL-terminated 626 /// string. 627 /// @return The number of cells occupied by string `str` 628 size_t mb_string2cells(const char *str) 629 { 630 size_t clen = 0; 631 632 for (const char *p = str; *p != NUL; p += utfc_ptr2len(p)) { 633 clen += (size_t)utf_ptr2cells(p); 634 } 635 636 return clen; 637 } 638 639 /// Get the number of cells occupied by string `str` with maximum length `size` 640 /// 641 /// @param str The source string, may not be NULL, must be a NUL-terminated 642 /// string. 643 /// @param size maximum length of string. It will terminate on earlier NUL. 
644 /// @return The number of cells occupied by string `str` 645 size_t mb_string2cells_len(const char *str, size_t size) 646 FUNC_ATTR_NONNULL_ARG(1) 647 { 648 size_t clen = 0; 649 650 for (const char *p = str; *p != NUL && p < str + size; 651 p += utfc_ptr2len_len(p, (int)size - (int)(p - str))) { 652 clen += (size_t)utf_ptr2cells_len(p, (int)size - (int)(p - str)); 653 } 654 655 return clen; 656 } 657 658 /// Convert a UTF-8 byte sequence to a character number. 659 /// 660 /// If the sequence is illegal or truncated by a NUL then the first byte is 661 /// returned. 662 /// For an overlong sequence this may return zero. 663 /// Does not include composing characters for obvious reasons. 664 /// 665 /// @param[in] p_in String to convert. 666 /// 667 /// @return Unicode codepoint or byte value. 668 int utf_ptr2char(const char *const p_in) 669 FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL 670 { 671 uint8_t *p = (uint8_t *)p_in; 672 673 uint32_t const v0 = p[0]; 674 if (EXPECT(v0 < 0x80U, true)) { // Be quick for ASCII. 
675 return (int)v0; 676 } 677 678 const uint8_t len = utf8len_tab[v0]; 679 if (EXPECT(len < 2, false)) { 680 return (int)v0; 681 } 682 683 #define CHECK(v) \ 684 do { \ 685 if (EXPECT((uint8_t)((v) & 0xC0U) != 0x80U, false)) { \ 686 return (int)v0; \ 687 } \ 688 } while (0) 689 #define LEN_RETURN(len_v, result) \ 690 do { \ 691 if (len == (len_v)) { \ 692 return (int)(result); \ 693 } \ 694 } while (0) 695 #define S(s) ((uint32_t)0x80U << (s)) 696 697 uint32_t const v1 = p[1]; 698 CHECK(v1); 699 LEN_RETURN(2, (v0 << 6) + v1 - ((0xC0U << 6) + S(0))); 700 701 uint32_t const v2 = p[2]; 702 CHECK(v2); 703 LEN_RETURN(3, (v0 << 12) + (v1 << 6) + v2 - ((0xE0U << 12) + S(6) + S(0))); 704 705 uint32_t const v3 = p[3]; 706 CHECK(v3); 707 LEN_RETURN(4, (v0 << 18) + (v1 << 12) + (v2 << 6) + v3 708 - ((0xF0U << 18) + S(12) + S(6) + S(0))); 709 710 uint32_t const v4 = p[4]; 711 CHECK(v4); 712 LEN_RETURN(5, (v0 << 24) + (v1 << 18) + (v2 << 12) + (v3 << 6) + v4 713 - ((0xF8U << 24) + S(18) + S(12) + S(6) + S(0))); 714 715 uint32_t const v5 = p[5]; 716 CHECK(v5); 717 // len == 6 718 return (int)((v0 << 30) + (v1 << 24) + (v2 << 18) + (v3 << 12) + (v4 << 6) + v5 719 // - (0xFCU << 30) 720 - (S(24) + S(18) + S(12) + S(6) + S(0))); 721 722 #undef S 723 #undef CHECK 724 #undef LEN_RETURN 725 } 726 727 // Convert a UTF-8 byte sequence to a wide character. 728 // String is assumed to be terminated by NUL or after "n" bytes, whichever 729 // comes first. 730 // The function is safe in the sense that it never accesses memory beyond the 731 // first "n" bytes of "s". 732 // 733 // On success, returns decoded codepoint, advances "s" to the beginning of 734 // next character and decreases "n" accordingly. 735 // 736 // If end of string was reached, returns 0 and, if "n" > 0, advances "s" past 737 // NUL byte. 738 // 739 // If byte sequence is illegal or incomplete, returns -1 and does not advance 740 // "s". 
741 static int utf_safe_read_char_adv(const char **s, size_t *n) 742 { 743 if (*n == 0) { // end of buffer 744 return 0; 745 } 746 747 uint8_t k = utf8len_tab_zero[(uint8_t)(**s)]; 748 749 if (k == 1) { 750 // ASCII character or NUL 751 (*n)--; 752 return (uint8_t)(*(*s)++); 753 } 754 755 if (k <= *n) { 756 // We have a multibyte sequence and it isn't truncated by buffer 757 // limits so utf_ptr2char() is safe to use. Or the first byte is 758 // illegal (k=0), and it's also safe to use utf_ptr2char(). 759 int c = utf_ptr2char(*s); 760 761 // On failure, utf_ptr2char() returns the first byte, so here we 762 // check equality with the first byte. The only non-ASCII character 763 // which equals the first byte of its own UTF-8 representation is 764 // U+00C3 (UTF-8: 0xC3 0x83), so need to check that special case too. 765 // It's safe even if n=1, else we would have k=2 > n. 766 if (c != (int)((uint8_t)(**s)) || (c == 0xC3 && (uint8_t)(*s)[1] == 0x83)) { 767 // byte sequence was successfully decoded 768 *s += k; 769 *n -= k; 770 return c; 771 } 772 } 773 774 // byte sequence is incomplete or illegal 775 return -1; 776 } 777 778 // Get character at **pp and advance *pp to the next character. 779 // Note: composing characters are skipped! 780 int mb_ptr2char_adv(const char **const pp) 781 { 782 int c = utf_ptr2char(*pp); 783 *pp += utfc_ptr2len(*pp); 784 return c; 785 } 786 787 // Get character at **pp and advance *pp to the next character. 788 // Note: composing characters are returned as separate characters. 789 int mb_cptr2char_adv(const char **pp) 790 { 791 int c = utf_ptr2char(*pp); 792 *pp += utf_ptr2len(*pp); 793 return c; 794 } 795 796 /// When "c" is the first char of a string, determine if it needs to be prefixed 797 /// by a space byte to be drawn correctly, and not merge with the space left of 798 /// the string. 
799 bool utf_iscomposing_first(int c) 800 { 801 return c >= 128 && !utf8proc_grapheme_break(' ', c); 802 } 803 804 /// Check if the character pointed to by "p2" is a composing character when it 805 /// comes after "p1". 806 /// 807 /// We use the definition in UAX#29 as implemented by utf8proc with the following 808 /// exceptions: 809 /// 810 /// - ASCII chars always begin a new cluster. This is a long assumed invariant 811 /// in the code base and very useful for performance (we can exit early for ASCII 812 /// all over the place, branch predictor go brrr in ASCII-only text). 813 /// As of Unicode 15.1 this will only break BOUNDCLASS_UREPEND followed by ASCII, 814 /// which should be exceedingly rare (these PREPEND chars are expected to be 815 /// followed by multibyte chars within the same script family) 816 /// 817 /// - When 'arabicshape' is active, some pairs of arabic letters "ab" is replaced with 818 /// "c" taking one single cell, which behaves like a cluster. 819 /// 820 /// @param "state" should be set to GRAPHEME_STATE_INIT before first call 821 /// it is allowed to be null, but will then not handle some longer 822 /// sequences, like ZWJ based emoji 823 bool utf_composinglike(const char *p1, const char *p2, GraphemeState *state) 824 FUNC_ATTR_NONNULL_ARG(1, 2) 825 { 826 if ((uint8_t)(*p2) < 128) { 827 return false; 828 } 829 830 int first = utf_ptr2char(p1); 831 int second = utf_ptr2char(p2); 832 833 if (!utf8proc_grapheme_break_stateful(first, second, state)) { 834 return true; 835 } 836 837 return arabic_combine(first, second); 838 } 839 840 /// same as utf_composinglike but operating on UCS-4 values 841 bool utf_iscomposing(int c1, int c2, GraphemeState *state) 842 { 843 return (!utf8proc_grapheme_break_stateful(c1, c2, state) 844 || arabic_combine(c1, c2)); 845 } 846 847 /// Get the screen char at the beginning of a string 848 /// 849 /// Caller is expected to check for things like unprintable chars etc 850 /// If first char in string is a 
composing char, prepend a space to display it correctly. 851 /// 852 /// If "p" starts with an invalid sequence, zero is returned. 853 /// 854 /// @param[out] firstc (required) The first codepoint of the screen char, 855 /// or the first byte of an invalid sequence 856 /// 857 /// @return the char 858 schar_T utfc_ptr2schar(const char *p, int *firstc) 859 FUNC_ATTR_NONNULL_ALL 860 { 861 int c = utf_ptr2char(p); 862 *firstc = c; // NOT optional, you are gonna need it 863 bool first_compose = utf_iscomposing_first(c); 864 size_t maxlen = MAX_SCHAR_SIZE - 1 - first_compose; 865 size_t len = (size_t)utfc_ptr2len_len(p, (int)maxlen); 866 867 if (len == 1 && (uint8_t)(*p) >= 0x80) { 868 return 0; // invalid sequence 869 } 870 871 return schar_from_buf_first(p, len, first_compose); 872 } 873 874 /// Get the screen char from a char with a known length 875 /// 876 /// Like utfc_ptr2schar but use no more than p[maxlen]. 877 schar_T utfc_ptrlen2schar(const char *p, int len, int *firstc) 878 FUNC_ATTR_NONNULL_ALL 879 { 880 if ((len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) { 881 // invalid or truncated sequence 882 *firstc = (uint8_t)(*p); 883 return 0; 884 } 885 886 int c = utf_ptr2char(p); 887 *firstc = c; 888 bool first_compose = utf_iscomposing_first(c); 889 int maxlen = MAX_SCHAR_SIZE - 1 - first_compose; 890 if (len > maxlen) { 891 len = utfc_ptr2len_len(p, maxlen); 892 } 893 894 return schar_from_buf_first(p, (size_t)len, first_compose); 895 } 896 897 /// Caller must ensure there is space for `first_compose` 898 static schar_T schar_from_buf_first(const char *buf, size_t len, bool first_compose) 899 { 900 if (first_compose) { 901 char cbuf[MAX_SCHAR_SIZE]; 902 cbuf[0] = ' '; 903 memcpy(cbuf + 1, buf, len); 904 return schar_from_buf(cbuf, len + 1); 905 } else { 906 return schar_from_buf(buf, len); 907 } 908 } 909 910 /// Get the length of a UTF-8 byte sequence representing a single codepoint 911 /// 912 /// @param[in] p UTF-8 string. 
913 /// 914 /// @return Sequence length, 0 for empty string and 1 for non-UTF-8 byte 915 /// sequence. 916 int utf_ptr2len(const char *const p_in) 917 FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL 918 { 919 uint8_t *p = (uint8_t *)p_in; 920 if (*p == NUL) { 921 return 0; 922 } 923 const int len = utf8len_tab[*p]; 924 for (int i = 1; i < len; i++) { 925 if ((p[i] & 0xc0) != 0x80) { 926 return 1; 927 } 928 } 929 return len; 930 } 931 932 // Return length of UTF-8 character, obtained from the first byte. 933 // "b" must be between 0 and 255! 934 // Returns 1 for an invalid first byte value. 935 int utf_byte2len(int b) 936 { 937 return utf8len_tab[b]; 938 } 939 940 // Get the length of UTF-8 byte sequence "p[size]". Does not include any 941 // following composing characters. 942 // Returns 1 for "". 943 // Returns 1 for an illegal byte sequence (also in incomplete byte seq.). 944 // Returns number > "size" for an incomplete byte sequence. 945 // Never returns zero. 946 int utf_ptr2len_len(const char *p, int size) 947 { 948 int m; 949 950 int len = utf8len_tab[(uint8_t)(*p)]; 951 if (len == 1) { 952 return 1; // NUL, ascii or illegal lead byte 953 } 954 if (len > size) { 955 m = size; // incomplete byte sequence. 956 } else { 957 m = len; 958 } 959 for (int i = 1; i < m; i++) { 960 if ((p[i] & 0xc0) != 0x80) { 961 return 1; 962 } 963 } 964 return len; 965 } 966 967 /// Return the number of bytes occupied by a UTF-8 character in a string. 968 /// This includes following composing characters. 969 /// Returns zero for NUL. 970 int utfc_ptr2len(const char *const p) 971 FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL 972 { 973 uint8_t b0 = (uint8_t)(*p); 974 975 if (b0 == NUL) { 976 return 0; 977 } 978 if (b0 < 0x80 && (uint8_t)p[1] < 0x80) { // be quick for ASCII 979 return 1; 980 } 981 982 // Skip over first UTF-8 char, stopping at a NUL byte. 983 int len = utf_ptr2len(p); 984 985 // Check for illegal byte. 
986 if (len == 1 && b0 >= 0x80) { 987 return 1; 988 } 989 990 // Check for composing characters. 991 int prevlen = 0; 992 GraphemeState state = GRAPHEME_STATE_INIT; 993 while (true) { 994 if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len, &state)) { 995 return len; 996 } 997 998 // Skip over composing char. 999 prevlen = len; 1000 len += utf_ptr2len(p + len); 1001 } 1002 } 1003 1004 /// Return the number of bytes the UTF-8 encoding of the character at "p[size]" 1005 /// takes. This includes following composing characters. 1006 /// Returns 0 for an empty string. 1007 /// Returns 1 for an illegal char or an incomplete byte sequence. 1008 int utfc_ptr2len_len(const char *p, int size) 1009 { 1010 if (size < 1 || *p == NUL) { 1011 return 0; 1012 } 1013 if ((uint8_t)p[0] < 0x80 && (size == 1 || (uint8_t)p[1] < 0x80)) { // be quick for ASCII 1014 return 1; 1015 } 1016 1017 // Skip over first UTF-8 char, stopping at a NUL byte. 1018 int len = utf_ptr2len_len(p, size); 1019 1020 // Check for illegal byte and incomplete byte sequence. 1021 if ((len == 1 && (uint8_t)p[0] >= 0x80) || len > size) { 1022 return 1; 1023 } 1024 1025 // Check for composing characters. We can only display a limited amount, but 1026 // skip all of them (otherwise the cursor would get stuck). 1027 int prevlen = 0; 1028 GraphemeState state = GRAPHEME_STATE_INIT; 1029 while (len < size) { 1030 if ((uint8_t)p[len] < 0x80) { 1031 break; 1032 } 1033 1034 // Next character length should not go beyond size to ensure that 1035 // utf_composinglike(...) does not read beyond size. 
1036 int len_next_char = utf_ptr2len_len(p + len, size - len); 1037 if (len_next_char > size - len) { 1038 break; 1039 } 1040 1041 if (!utf_composinglike(p + prevlen, p + len, &state)) { 1042 break; 1043 } 1044 1045 // Skip over composing char 1046 prevlen = len; 1047 len += len_next_char; 1048 } 1049 return len; 1050 } 1051 1052 /// Determine how many bytes certain unicode codepoint will occupy 1053 int utf_char2len(const int c) 1054 { 1055 if (c < 0x80) { 1056 return 1; 1057 } else if (c < 0x800) { 1058 return 2; 1059 } else if (c < 0x10000) { 1060 return 3; 1061 } else if (c < 0x200000) { 1062 return 4; 1063 } else if (c < 0x4000000) { 1064 return 5; 1065 } else { 1066 return 6; 1067 } 1068 } 1069 1070 /// Convert Unicode character to UTF-8 string 1071 /// 1072 /// @param c character to convert to UTF-8 string in \p buf 1073 /// @param[out] buf UTF-8 string generated from \p c, does not add \0 1074 /// must have room for at least 6 bytes 1075 /// @return Number of bytes (1-6). 1076 int utf_char2bytes(const int c, char *const buf) 1077 { 1078 if (c < 0x80) { // 7 bits 1079 buf[0] = (char)c; 1080 return 1; 1081 } else if (c < 0x800) { // 11 bits 1082 buf[0] = (char)(0xc0 + ((unsigned)c >> 6)); 1083 buf[1] = (char)(0x80 + ((unsigned)c & 0x3f)); 1084 return 2; 1085 } else if (c < 0x10000) { // 16 bits 1086 buf[0] = (char)(0xe0 + ((unsigned)c >> 12)); 1087 buf[1] = (char)(0x80 + (((unsigned)c >> 6) & 0x3f)); 1088 buf[2] = (char)(0x80 + ((unsigned)c & 0x3f)); 1089 return 3; 1090 } else if (c < 0x200000) { // 21 bits 1091 buf[0] = (char)(0xf0 + ((unsigned)c >> 18)); 1092 buf[1] = (char)(0x80 + (((unsigned)c >> 12) & 0x3f)); 1093 buf[2] = (char)(0x80 + (((unsigned)c >> 6) & 0x3f)); 1094 buf[3] = (char)(0x80 + ((unsigned)c & 0x3f)); 1095 return 4; 1096 } else if (c < 0x4000000) { // 26 bits 1097 buf[0] = (char)(0xf8 + ((unsigned)c >> 24)); 1098 buf[1] = (char)(0x80 + (((unsigned)c >> 18) & 0x3f)); 1099 buf[2] = (char)(0x80 + (((unsigned)c >> 12) & 0x3f)); 1100 buf[3] = 
(char)(0x80 + (((unsigned)c >> 6) & 0x3f)); 1101 buf[4] = (char)(0x80 + ((unsigned)c & 0x3f)); 1102 return 5; 1103 } else { // 31 bits 1104 buf[0] = (char)(0xfc + ((unsigned)c >> 30)); 1105 buf[1] = (char)(0x80 + (((unsigned)c >> 24) & 0x3f)); 1106 buf[2] = (char)(0x80 + (((unsigned)c >> 18) & 0x3f)); 1107 buf[3] = (char)(0x80 + (((unsigned)c >> 12) & 0x3f)); 1108 buf[4] = (char)(0x80 + (((unsigned)c >> 6) & 0x3f)); 1109 buf[5] = (char)(0x80 + ((unsigned)c & 0x3f)); 1110 return 6; 1111 } 1112 } 1113 1114 /// Return true if "c" is a legacy composing UTF-8 character. 1115 /// 1116 /// This is deprecated in favour of utf_composinglike() which uses the modern 1117 /// stateful algorithm to determine grapheme clusters. Still available 1118 /// to support some legacy code which hasn't been refactored yet. 1119 /// 1120 /// To check if a char would combine with a preceding space, use 1121 /// utf_iscomposing_first() instead. 1122 /// 1123 /// Based on code from Markus Kuhn. 1124 /// Returns false for negative values. 1125 bool utf_iscomposing_legacy(int c) 1126 { 1127 const utf8proc_property_t *prop = utf8proc_get_property(c); 1128 return prop->category == UTF8PROC_CATEGORY_MN || prop->category == UTF8PROC_CATEGORY_ME; 1129 } 1130 1131 #ifdef __SSE2__ 1132 1133 # include <emmintrin.h> 1134 1135 // Return true for characters that can be displayed in a normal way. 1136 // Only for characters of 0x100 and above! 1137 bool utf_printable(int c) 1138 FUNC_ATTR_CONST 1139 { 1140 if (c < 0x180B || c > 0xFFFF) { 1141 return c != 0x70F; 1142 } 1143 1144 # define L(v) ((int16_t)((v) - 1)) // lower bound (exclusive) 1145 # define H(v) ((int16_t)(v)) // upper bound (inclusive) 1146 1147 // Boundaries of unprintable characters. 1148 // Some values are negative when converted to int16_t. 1149 // Ranges must not wrap around when converted to int16_t. 
1150 __m128i const lo = _mm_setr_epi16(L(0x180b), L(0x200b), L(0x202a), L(0x2060), 1151 L(0xd800), L(0xfeff), L(0xfff9), L(0xfffe)); 1152 1153 __m128i const hi = _mm_setr_epi16(H(0x180e), H(0x200f), H(0x202e), H(0x206f), 1154 H(0xdfff), H(0xfeff), H(0xfffb), H(0xffff)); 1155 1156 # undef L 1157 # undef H 1158 1159 __m128i value = _mm_set1_epi16((int16_t)c); 1160 1161 // Using _mm_cmplt_epi16() is less optimal, since it would require 1162 // swapping operands (sse2 only has cmpgt instruction), 1163 // and only the second operand can be a memory location. 1164 1165 // Character is printable when it is above/below both bounds of each range 1166 // (corresponding bits in both masks are equal). 1167 return _mm_movemask_epi8(_mm_cmpgt_epi16(value, lo)) 1168 == _mm_movemask_epi8(_mm_cmpgt_epi16(value, hi)); 1169 } 1170 1171 #else 1172 1173 // Return true if "c" is in "table". 1174 static bool intable(const struct interval *table, size_t n_items, int c) 1175 FUNC_ATTR_CONST 1176 { 1177 assert(n_items > 0); 1178 // first quick check for Latin1 etc. characters 1179 if (c < table[0].first) { 1180 return false; 1181 } 1182 1183 assert(n_items <= SIZE_MAX / 2); 1184 // binary search in table 1185 size_t bot = 0; 1186 size_t top = n_items; 1187 do { 1188 size_t mid = (bot + top) >> 1; 1189 if (table[mid].last < c) { 1190 bot = mid + 1; 1191 } else if (table[mid].first > c) { 1192 top = mid; 1193 } else { 1194 return true; 1195 } 1196 } while (top > bot); 1197 return false; 1198 } 1199 1200 // Return true for characters that can be displayed in a normal way. 1201 // Only for characters of 0x100 and above! 1202 bool utf_printable(int c) 1203 FUNC_ATTR_CONST 1204 { 1205 // Sorted list of non-overlapping intervals. 1206 // 0xd800-0xdfff is reserved for UTF-16, actually illegal. 
1207 static const struct interval nonprint[] = { 1208 { 0x070f, 0x070f }, { 0x180b, 0x180e }, { 0x200b, 0x200f }, { 0x202a, 0x202e }, 1209 { 0x2060, 0x206f }, { 0xd800, 0xdfff }, { 0xfeff, 0xfeff }, { 0xfff9, 0xfffb }, 1210 { 0xfffe, 0xffff } 1211 }; 1212 1213 return !intable(nonprint, ARRAY_SIZE(nonprint), c); 1214 } 1215 1216 #endif 1217 1218 // Get class of a Unicode character. 1219 // 0: white space 1220 // 1: punctuation 1221 // 2 or bigger: some class of word character. 1222 int utf_class(const int c) 1223 { 1224 return utf_class_tab(c, curbuf->b_chartab); 1225 } 1226 1227 int utf_class_tab(const int c, const uint64_t *const chartab) 1228 FUNC_ATTR_PURE 1229 { 1230 // sorted list of non-overlapping intervals 1231 static struct clinterval { 1232 unsigned first; 1233 unsigned last; 1234 unsigned cls; 1235 } classes[] = { 1236 { 0x037e, 0x037e, 1 }, // Greek question mark 1237 { 0x0387, 0x0387, 1 }, // Greek ano teleia 1238 { 0x055a, 0x055f, 1 }, // Armenian punctuation 1239 { 0x0589, 0x0589, 1 }, // Armenian full stop 1240 { 0x05be, 0x05be, 1 }, 1241 { 0x05c0, 0x05c0, 1 }, 1242 { 0x05c3, 0x05c3, 1 }, 1243 { 0x05f3, 0x05f4, 1 }, 1244 { 0x060c, 0x060c, 1 }, 1245 { 0x061b, 0x061b, 1 }, 1246 { 0x061f, 0x061f, 1 }, 1247 { 0x066a, 0x066d, 1 }, 1248 { 0x06d4, 0x06d4, 1 }, 1249 { 0x0700, 0x070d, 1 }, // Syriac punctuation 1250 { 0x0964, 0x0965, 1 }, 1251 { 0x0970, 0x0970, 1 }, 1252 { 0x0df4, 0x0df4, 1 }, 1253 { 0x0e4f, 0x0e4f, 1 }, 1254 { 0x0e5a, 0x0e5b, 1 }, 1255 { 0x0f04, 0x0f12, 1 }, 1256 { 0x0f3a, 0x0f3d, 1 }, 1257 { 0x0f85, 0x0f85, 1 }, 1258 { 0x104a, 0x104f, 1 }, // Myanmar punctuation 1259 { 0x10fb, 0x10fb, 1 }, // Georgian punctuation 1260 { 0x1361, 0x1368, 1 }, // Ethiopic punctuation 1261 { 0x166d, 0x166e, 1 }, // Canadian Syl. 
punctuation 1262 { 0x1680, 0x1680, 0 }, 1263 { 0x169b, 0x169c, 1 }, 1264 { 0x16eb, 0x16ed, 1 }, 1265 { 0x1735, 0x1736, 1 }, 1266 { 0x17d4, 0x17dc, 1 }, // Khmer punctuation 1267 { 0x1800, 0x180a, 1 }, // Mongolian punctuation 1268 { 0x2000, 0x200b, 0 }, // spaces 1269 { 0x200c, 0x2027, 1 }, // punctuation and symbols 1270 { 0x2028, 0x2029, 0 }, 1271 { 0x202a, 0x202e, 1 }, // punctuation and symbols 1272 { 0x202f, 0x202f, 0 }, 1273 { 0x2030, 0x205e, 1 }, // punctuation and symbols 1274 { 0x205f, 0x205f, 0 }, 1275 { 0x2060, 0x206f, 1 }, // punctuation and symbols 1276 { 0x2070, 0x207f, 0x2070 }, // superscript 1277 { 0x2080, 0x2094, 0x2080 }, // subscript 1278 { 0x20a0, 0x27ff, 1 }, // all kinds of symbols 1279 { 0x2800, 0x28ff, 0x2800 }, // braille 1280 { 0x2900, 0x2998, 1 }, // arrows, brackets, etc. 1281 { 0x29d8, 0x29db, 1 }, 1282 { 0x29fc, 0x29fd, 1 }, 1283 { 0x2e00, 0x2e7f, 1 }, // supplemental punctuation 1284 { 0x3000, 0x3000, 0 }, // ideographic space 1285 { 0x3001, 0x3020, 1 }, // ideographic punctuation 1286 { 0x3030, 0x3030, 1 }, 1287 { 0x303d, 0x303d, 1 }, 1288 { 0x3040, 0x309f, 0x3040 }, // Hiragana 1289 { 0x30a0, 0x30ff, 0x30a0 }, // Katakana 1290 { 0x3300, 0x9fff, 0x4e00 }, // CJK Ideographs 1291 { 0xac00, 0xd7a3, 0xac00 }, // Hangul Syllables 1292 { 0xf900, 0xfaff, 0x4e00 }, // CJK Ideographs 1293 { 0xfd3e, 0xfd3f, 1 }, 1294 { 0xfe30, 0xfe6b, 1 }, // punctuation forms 1295 { 0xff00, 0xff0f, 1 }, // half/fullwidth ASCII 1296 { 0xff1a, 0xff20, 1 }, // half/fullwidth ASCII 1297 { 0xff3b, 0xff40, 1 }, // half/fullwidth ASCII 1298 { 0xff5b, 0xff65, 1 }, // half/fullwidth ASCII 1299 { 0x1d000, 0x1d24f, 1 }, // Musical notation 1300 { 0x1d400, 0x1d7ff, 1 }, // Mathematical Alphanumeric Symbols 1301 { 0x1f000, 0x1f2ff, 1 }, // Game pieces; enclosed characters 1302 { 0x1f300, 0x1f9ff, 1 }, // Many symbol blocks 1303 { 0x20000, 0x2a6df, 0x4e00 }, // CJK Ideographs 1304 { 0x2a700, 0x2b73f, 0x4e00 }, // CJK Ideographs 1305 { 0x2b740, 0x2b81f, 0x4e00 }, // CJK 
Ideographs 1306 { 0x2f800, 0x2fa1f, 0x4e00 }, // CJK Ideographs 1307 }; 1308 int bot = 0; 1309 int top = ARRAY_SIZE(classes) - 1; 1310 1311 // First quick check for Latin1 characters, use 'iskeyword'. 1312 if (c < 0x100) { 1313 if (c == ' ' || c == '\t' || c == NUL || c == 0xa0) { 1314 return 0; // blank 1315 } 1316 if (vim_iswordc_tab(c, chartab)) { 1317 return 2; // word character 1318 } 1319 return 1; // punctuation 1320 } 1321 1322 const utf8proc_property_t *prop = utf8proc_get_property(c); 1323 // emoji 1324 if (prop_is_emojilike(prop)) { 1325 return 3; 1326 } 1327 1328 // binary search in table 1329 while (top >= bot) { 1330 int mid = (bot + top) / 2; 1331 if (classes[mid].last < (unsigned)c) { 1332 bot = mid + 1; 1333 } else if (classes[mid].first > (unsigned)c) { 1334 top = mid - 1; 1335 } else { 1336 return (int)classes[mid].cls; 1337 } 1338 } 1339 1340 // most other characters are "word" characters 1341 return 2; 1342 } 1343 1344 bool utf_ambiguous_width(const char *p) 1345 { 1346 // be quick if there is nothing to print or ASCII-only 1347 if (p[0] == NUL || p[1] == NUL) { 1348 return false; 1349 } 1350 1351 CharInfo info = utf_ptr2CharInfo(p); 1352 if (info.value >= 0x80) { 1353 const utf8proc_property_t *prop = utf8proc_get_property(info.value); 1354 if (prop->ambiguous_width || prop_is_emojilike(prop)) { 1355 return true; 1356 } 1357 } 1358 1359 // check if second sequence is 0xFE0F VS-16 which can turn things into emoji, 1360 // safe with NUL (no second sequence) 1361 return memcmp(p + info.len, "\xef\xb8\x8f", 3) == 0; 1362 } 1363 1364 // Return the folded-case equivalent of "a", which is a UCS-4 character. Uses 1365 // full case folding. 1366 int utf_fold(int a) 1367 { 1368 if (a < 0x80) { 1369 // be fast for ASCII 1370 return a >= 0x41 && a <= 0x5a ? a + 32 : a; 1371 } 1372 1373 // TODO(dundargoc): utf8proc only does full case folding, which breaks some tests. This is a 1374 // temporary workaround to circumvent failing tests. 
1375 // 1376 // (0xdf) ß == ss in full casefolding. Using this however breaks the vim spell tests and the error 1377 // E763 is thrown. This is due to the test spells relying on the vim spell files. 1378 // 1379 // (0x130) İ == i̇ in full casefolding. 1380 if (a == 0xdf || a == 0x130) { 1381 return a; 1382 } 1383 1384 utf8proc_int32_t result[1]; 1385 1386 utf8proc_ssize_t res = utf8proc_decompose_char(a, result, 1, UTF8PROC_CASEFOLD, NULL); 1387 1388 return (res == 1) ? result[0] : a; 1389 } 1390 1391 // Vim's own character class functions. These exist because many library 1392 // islower()/toupper() etc. do not work properly: they crash when used with 1393 // invalid values or can't handle latin1 when the locale is C. 1394 // Speed is most important here. 1395 1396 /// Return the upper-case equivalent of "a", which is a UCS-4 character. Use 1397 /// simple case folding. 1398 int mb_toupper(int a) 1399 { 1400 // If 'casemap' contains "keepascii" use ASCII style toupper(). 1401 if (a < 128 && (cmp_flags & kOptCmpFlagKeepascii)) { 1402 return TOUPPER_ASC(a); 1403 } 1404 1405 if (!(cmp_flags & kOptCmpFlagInternal)) { 1406 return (int)towupper((wint_t)a); 1407 } 1408 1409 // For characters below 128 use locale sensitive toupper(). 1410 if (a < 128) { 1411 return TOUPPER_LOC(a); 1412 } 1413 1414 return utf8proc_toupper(a); 1415 } 1416 1417 bool mb_islower(int a) 1418 { 1419 return mb_toupper(a) != a; 1420 } 1421 1422 /// Return the lower-case equivalent of "a", which is a UCS-4 character. Use 1423 /// simple case folding. 1424 int mb_tolower(int a) 1425 { 1426 // If 'casemap' contains "keepascii" use ASCII style tolower(). 1427 if (a < 128 && (cmp_flags & kOptCmpFlagKeepascii)) { 1428 return TOLOWER_ASC(a); 1429 } 1430 1431 if (!(cmp_flags & kOptCmpFlagInternal)) { 1432 return (int)towlower((wint_t)a); 1433 } 1434 1435 // For characters below 128 use locale sensitive tolower(). 
1436 if (a < 128) { 1437 return TOLOWER_LOC(a); 1438 } 1439 1440 return utf8proc_tolower(a); 1441 } 1442 1443 bool mb_isupper(int a) 1444 { 1445 return mb_tolower(a) != a; 1446 } 1447 1448 bool mb_isalpha(int a) 1449 FUNC_ATTR_WARN_UNUSED_RESULT 1450 { 1451 return mb_islower(a) || mb_isupper(a); 1452 } 1453 1454 int utf_strnicmp(const char *s1, const char *s2, size_t n1, size_t n2) 1455 { 1456 int c1, c2; 1457 char buffer[6]; 1458 1459 while (true) { 1460 c1 = utf_safe_read_char_adv(&s1, &n1); 1461 c2 = utf_safe_read_char_adv(&s2, &n2); 1462 1463 if (c1 <= 0 || c2 <= 0) { 1464 break; 1465 } 1466 1467 if (c1 == c2) { 1468 continue; 1469 } 1470 1471 int cdiff = utf_fold(c1) - utf_fold(c2); 1472 if (cdiff != 0) { 1473 return cdiff; 1474 } 1475 } 1476 1477 // some string ended or has an incomplete/illegal character sequence 1478 1479 if (c1 == 0 || c2 == 0) { 1480 // some string ended. shorter string is smaller 1481 if (c1 == 0 && c2 == 0) { 1482 return 0; 1483 } 1484 return c1 == 0 ? -1 : 1; 1485 } 1486 1487 // Continue with bytewise comparison to produce some result that 1488 // would make comparison operations involving this function transitive. 1489 // 1490 // If only one string had an error, comparison should be made with 1491 // folded version of the other string. In this case it is enough 1492 // to fold just one character to determine the result of comparison. 
1493 1494 if (c1 != -1 && c2 == -1) { 1495 n1 = (size_t)utf_char2bytes(utf_fold(c1), buffer); 1496 s1 = buffer; 1497 } else if (c2 != -1 && c1 == -1) { 1498 n2 = (size_t)utf_char2bytes(utf_fold(c2), buffer); 1499 s2 = buffer; 1500 } 1501 1502 while (n1 > 0 && n2 > 0 && *s1 != NUL && *s2 != NUL) { 1503 int cdiff = (int)((uint8_t)(*s1)) - (int)((uint8_t)(*s2)); 1504 if (cdiff != 0) { 1505 return cdiff; 1506 } 1507 1508 s1++; 1509 s2++; 1510 n1--; 1511 n2--; 1512 } 1513 1514 if (n1 > 0 && *s1 == NUL) { 1515 n1 = 0; 1516 } 1517 if (n2 > 0 && *s2 == NUL) { 1518 n2 = 0; 1519 } 1520 1521 if (n1 == 0 && n2 == 0) { 1522 return 0; 1523 } 1524 return n1 == 0 ? -1 : 1; 1525 } 1526 1527 #ifdef MSWIN 1528 # ifndef CP_UTF8 1529 # define CP_UTF8 65001 // magic number from winnls.h 1530 # endif 1531 1532 /// Converts string from UTF-8 to UTF-16. 1533 /// 1534 /// @param utf8 UTF-8 string. 1535 /// @param utf8len Length of `utf8`. May be -1 if `utf8` is NUL-terminated. 1536 /// @param utf16[out,allocated] NUL-terminated UTF-16 string, or NULL on error 1537 /// @return 0 on success, or libuv error code 1538 int utf8_to_utf16(const char *utf8, int utf8len, wchar_t **utf16) 1539 FUNC_ATTR_NONNULL_ALL 1540 { 1541 // Compute the length needed for the converted UTF-16 string. 1542 int bufsize = MultiByteToWideChar(CP_UTF8, 1543 0, // dwFlags: must be 0 for UTF-8 1544 utf8, // -1: process up to NUL 1545 utf8len, 1546 NULL, 1547 0); // 0: get length, don't convert 1548 if (bufsize == 0) { 1549 *utf16 = NULL; 1550 return uv_translate_sys_error(GetLastError()); 1551 } 1552 1553 // Allocate the destination buffer adding an extra byte for the terminating 1554 // NULL. If `utf8len` is not -1 MultiByteToWideChar will not add it, so 1555 // we do it ourselves always, just in case. 1556 *utf16 = xmalloc(sizeof(wchar_t) * (bufsize + 1)); 1557 1558 // Convert to UTF-16. 
1559 bufsize = MultiByteToWideChar(CP_UTF8, 0, utf8, utf8len, *utf16, bufsize); 1560 if (bufsize == 0) { 1561 XFREE_CLEAR(*utf16); 1562 return uv_translate_sys_error(GetLastError()); 1563 } 1564 1565 (*utf16)[bufsize] = L'\0'; 1566 return 0; 1567 } 1568 1569 /// Converts string from UTF-16 to UTF-8. 1570 /// 1571 /// @param utf16 UTF-16 string. 1572 /// @param utf16len Length of `utf16`. May be -1 if `utf16` is NUL-terminated. 1573 /// @param utf8[out,allocated] NUL-terminated UTF-8 string, or NULL on error 1574 /// @return 0 on success, or libuv error code 1575 int utf16_to_utf8(const wchar_t *utf16, int utf16len, char **utf8) 1576 FUNC_ATTR_NONNULL_ALL 1577 { 1578 // Compute the space needed for the converted UTF-8 string. 1579 DWORD bufsize = WideCharToMultiByte(CP_UTF8, 1580 0, 1581 utf16, 1582 utf16len, 1583 NULL, 1584 0, 1585 NULL, 1586 NULL); 1587 if (bufsize == 0) { 1588 *utf8 = NULL; 1589 return uv_translate_sys_error(GetLastError()); 1590 } 1591 1592 // Allocate the destination buffer adding an extra byte for the terminating 1593 // NULL. If `utf16len` is not -1 WideCharToMultiByte will not add it, so 1594 // we do it ourselves always, just in case. 1595 *utf8 = xmalloc(bufsize + 1); 1596 1597 // Convert to UTF-8. 1598 bufsize = WideCharToMultiByte(CP_UTF8, 1599 0, 1600 utf16, 1601 utf16len, 1602 *utf8, 1603 bufsize, 1604 NULL, 1605 NULL); 1606 if (bufsize == 0) { 1607 XFREE_CLEAR(*utf8); 1608 return uv_translate_sys_error(GetLastError()); 1609 } 1610 1611 (*utf8)[bufsize] = NUL; 1612 return 0; 1613 } 1614 1615 #endif 1616 1617 /// Measure the length of a string in corresponding UTF-32 and UTF-16 units. 1618 /// 1619 /// Invalid UTF-8 bytes, or embedded surrogates, count as one code point/unit 1620 /// each. 1621 /// 1622 /// The out parameters are incremented. This is used to measure the size of 1623 /// a buffer region consisting of multiple line segments. 
1624 /// 1625 /// @param s the string 1626 /// @param len maximum length (an earlier NUL terminates) 1627 /// @param[out] codepoints incremented with UTF-32 code point size 1628 /// @param[out] codeunits incremented with UTF-16 code unit size 1629 void mb_utflen(const char *s, size_t len, size_t *codepoints, size_t *codeunits) 1630 FUNC_ATTR_NONNULL_ALL 1631 { 1632 size_t count = 0; 1633 size_t extra = 0; 1634 size_t clen; 1635 for (size_t i = 0; i < len; i += clen) { 1636 clen = (size_t)utf_ptr2len_len(s + i, (int)(len - i)); 1637 // NB: gets the byte value of invalid sequence bytes. 1638 // we only care whether the char fits in the BMP or not 1639 int c = (clen > 1) ? utf_ptr2char(s + i) : (uint8_t)s[i]; 1640 count++; 1641 if (c > 0xFFFF) { 1642 extra++; 1643 } 1644 } 1645 *codepoints += count; 1646 *codeunits += count + extra; 1647 } 1648 1649 ssize_t mb_utf_index_to_bytes(const char *s, size_t len, size_t index, bool use_utf16_units) 1650 FUNC_ATTR_NONNULL_ALL 1651 { 1652 size_t count = 0; 1653 size_t clen; 1654 if (index == 0) { 1655 return 0; 1656 } 1657 for (size_t i = 0; i < len; i += clen) { 1658 clen = (size_t)utf_ptr2len_len(s + i, (int)(len - i)); 1659 // NB: gets the byte value of invalid sequence bytes. 1660 // we only care whether the char fits in the BMP or not 1661 int c = (clen > 1) ? utf_ptr2char(s + i) : (uint8_t)s[i]; 1662 count++; 1663 if (use_utf16_units && c > 0xFFFF) { 1664 count++; 1665 } 1666 if (count >= index) { 1667 return (ssize_t)(i + clen); 1668 } 1669 } 1670 return -1; 1671 } 1672 1673 /// Version of strnicmp() that handles multi-byte characters. 1674 /// Needed for Big5, Shift-JIS and UTF-8 encoding. Other DBCS encodings can 1675 /// probably use strnicmp(), because there are no ASCII characters in the 1676 /// second byte. 1677 /// 1678 /// @return zero if s1 and s2 are equal (ignoring case), the difference between 1679 /// two characters otherwise. 
1680 int mb_strnicmp(const char *s1, const char *s2, const size_t nn) 1681 { 1682 return utf_strnicmp(s1, s2, nn, nn); 1683 } 1684 1685 /// Compare strings case-insensitively 1686 /// 1687 /// @note We need to call mb_stricmp() even when we aren't dealing with 1688 /// a multi-byte encoding because mb_stricmp() takes care of all ASCII and 1689 /// non-ascii encodings, including characters with umlauts in latin1, 1690 /// etc., while STRICMP() only handles the system locale version, which 1691 /// often does not handle non-ascii properly. 1692 /// 1693 /// @param[in] s1 First string to compare, not more then #MAXCOL characters. 1694 /// @param[in] s2 Second string to compare, not more then #MAXCOL characters. 1695 /// 1696 /// @return 0 if strings are equal, <0 if s1 < s2, >0 if s1 > s2. 1697 int mb_stricmp(const char *s1, const char *s2) 1698 { 1699 return mb_strnicmp(s1, s2, MAXCOL); 1700 } 1701 1702 // "g8": show bytes of the UTF-8 char under the cursor. Doesn't matter what 1703 // 'encoding' has been set to. 1704 void show_utf8(void) 1705 { 1706 // Get the byte length of the char under the cursor, including composing 1707 // characters. 1708 char *line = get_cursor_pos_ptr(); 1709 int len = utfc_ptr2len(line); 1710 if (len == 0) { 1711 msg("NUL", 0); 1712 return; 1713 } 1714 1715 size_t rlen = 0; 1716 int clen = 0; 1717 for (int i = 0; i < len; i++) { 1718 if (clen == 0) { 1719 // start of (composing) character, get its length 1720 if (i > 0) { 1721 STRCPY(IObuff + rlen, "+ "); 1722 rlen += 2; 1723 } 1724 clen = utf_ptr2len(line + i); 1725 } 1726 assert(IOSIZE > rlen); 1727 snprintf(IObuff + rlen, IOSIZE - rlen, "%02x ", 1728 (line[i] == NL) ? 
NUL : (uint8_t)line[i]); // NUL is stored as NL 1729 clen--; 1730 rlen += strlen(IObuff + rlen); 1731 if (rlen > IOSIZE - 20) { 1732 break; 1733 } 1734 } 1735 1736 msg(IObuff, 0); 1737 } 1738 1739 /// @return true if boundclass bc always starts a new cluster regardless of what's before 1740 /// false negatives are allowed (perf cost, not correctness) 1741 static bool always_break(int bc) 1742 { 1743 return (bc == UTF8PROC_BOUNDCLASS_CONTROL); 1744 } 1745 1746 /// @return true if bc2 always starts a cluster after bc1 1747 /// false negatives are allowed (perf cost, not correctness) 1748 static bool always_break_two(int bc1, int bc2) 1749 { 1750 // don't check for UTF8PROC_BOUNDCLASS_CONTROL for bc2 as it either has been checked by 1751 // "always_break" on first iteration or when it was bc1 in the previous iteration 1752 return ((bc1 != UTF8PROC_BOUNDCLASS_PREPEND && bc2 == UTF8PROC_BOUNDCLASS_OTHER) 1753 || (bc1 >= UTF8PROC_BOUNDCLASS_CR && bc1 <= UTF8PROC_BOUNDCLASS_CONTROL) 1754 || (bc2 == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC 1755 && (bc1 == UTF8PROC_BOUNDCLASS_OTHER 1756 || bc1 == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC))); 1757 } 1758 1759 /// Return offset from "p" to the start of a character, including composing characters. 1760 /// "base" must be the start of the string, which must be NUL terminated. 1761 /// If "p" points to the NUL at the end of the string return 0. 1762 /// Returns 0 when already at the first byte of a character. 
int utf_head_off(const char *base_in, const char *p_in)
{
  if ((uint8_t)(*p_in) < 0x80) {              // be quick for ASCII
    return 0;
  }

  const uint8_t *base = (uint8_t *)base_in;
  const uint8_t *p = (uint8_t *)p_in;

  const uint8_t *start = p;

  // move start to the first byte of this codepoint (at most 6 bytes back,
  // never before "base")
  // might stop on a continuation byte if overlong, handled by utf_ptr2CharInfo_impl
  while (start > base && (*start & 0xc0) == 0x80 && (p - start) < 6) {
    start--;
  }

  // Decode the codepoint that contains "p". A negative code, or a sequence
  // too short to reach "p", means "p" is not inside a valid character.
  const uint8_t last_len = utf8len_tab[*start];
  int32_t cur_code = utf_ptr2CharInfo_impl(start, (uintptr_t)last_len);
  if (cur_code < 0 || p - start >= last_len) {
    return 0;  // p must be part of an illegal sequence
  }
  // End of the codepoint containing "p"; the forward scan below never looks
  // beyond it.
  const uint8_t * const safe_end = start + last_len;

  // If this codepoint always starts a cluster, or nothing precedes it, the
  // answer is the offset within the codepoint itself.
  int cur_bc = utf8proc_get_property(cur_code)->boundclass;
  if (always_break(cur_bc) || start == base) {
    return (int)(p - start);
  }

  // backtrack to find the start of a cluster. we might go too far, checked in the next loop
  const uint8_t *cur_pos = start;          // start of the latest accepted codepoint
  const uint8_t *const p_start = start;    // start of the codepoint containing "p"

  while (true) {
    if (start[-1] == NUL) {
      break;
    }

    start--;
    if (*start < 0x80) {  // stop on ascii, we are done
      break;
    }

    // Step back to the first byte of the previous codepoint.
    while (start > base && (*start & 0xc0) == 0x80 && (cur_pos - start) < 6) {
      start--;
    }

    int prev_len = utf8len_tab[*start];
    int32_t prev_code = utf_ptr2CharInfo_impl(start, (uintptr_t)prev_len);
    if (prev_code < 0 || prev_len < cur_pos - start) {
      start = cur_pos;  // start at valid sequence after invalid bytes
      break;
    }

    int prev_bc = utf8proc_get_property(prev_code)->boundclass;
    if (always_break_two(prev_bc, cur_bc) && !arabic_combine(prev_code, cur_code)) {
      start = cur_pos;  // prev_code cannot be a part of this cluster
      break;
    } else if (start == base) {
      break;
    }
    // The previous codepoint might belong to the cluster: accept it and
    // continue from there.
    cur_pos = start;
    cur_bc = prev_bc;
    cur_code = prev_code;
  }

  // hot path: we are already on the first codepoint of a sequence
  if (start == p_start && last_len > p - start) {
    return (int)(p - start);
  }

  // Walk forward cluster by cluster from the candidate start; the cluster
  // that extends past "p" is the one "p" belongs to.
  const uint8_t *q = start;
  while (q < p) {
    // don't need to find end of cluster. once we reached the codepoint of p, we are done
    int len = utfc_ptr2len_len((const char *)q, (int)(safe_end - q));

    if (q + len > p) {
      return (int)(p - q);
    }

    q += len;
  }

  return 0;
}

/// Assumes caller already handles ascii.
see `utfc_next` 1850 StrCharInfo utfc_next_impl(StrCharInfo cur) 1851 { 1852 int32_t prev_code = cur.chr.value; 1853 uint8_t *next = (uint8_t *)(cur.ptr + cur.chr.len); 1854 GraphemeState state = GRAPHEME_STATE_INIT; 1855 assert(*next >= 0x80); 1856 1857 while (true) { 1858 uint8_t const next_len = utf8len_tab[*next]; 1859 int32_t const next_code = utf_ptr2CharInfo_impl(next, (uintptr_t)next_len); 1860 if (!utf_iscomposing(prev_code, next_code, &state)) { 1861 return (StrCharInfo){ 1862 .ptr = (char *)next, 1863 .chr = (CharInfo){ .value = next_code, .len = (next_code < 0 ? 1 : next_len) }, 1864 }; 1865 } 1866 1867 prev_code = next_code; 1868 next += next_len; 1869 if (EXPECT(*next < 0x80U, true)) { 1870 return (StrCharInfo){ 1871 .ptr = (char *)next, 1872 .chr = (CharInfo){ .value = *next, .len = 1 }, 1873 }; 1874 } 1875 } 1876 } 1877 1878 // Whether space is NOT allowed before/after 'c'. 1879 bool utf_eat_space(int cc) 1880 FUNC_ATTR_CONST FUNC_ATTR_WARN_UNUSED_RESULT 1881 { 1882 return (cc >= 0x2000 && cc <= 0x206F) // General punctuations 1883 || (cc >= 0x2e00 && cc <= 0x2e7f) // Supplemental punctuations 1884 || (cc >= 0x3000 && cc <= 0x303f) // CJK symbols and punctuations 1885 || (cc >= 0xff01 && cc <= 0xff0f) // Full width ASCII punctuations 1886 || (cc >= 0xff1a && cc <= 0xff20) // .. 1887 || (cc >= 0xff3b && cc <= 0xff40) // .. 1888 || (cc >= 0xff5b && cc <= 0xff65); // .. 1889 } 1890 1891 // Whether line break is allowed before "cc". 
1892 bool utf_allow_break_before(int cc) 1893 FUNC_ATTR_CONST FUNC_ATTR_WARN_UNUSED_RESULT 1894 { 1895 static const int BOL_prohibition_punct[] = { 1896 '!', 1897 '%', 1898 ')', 1899 ',', 1900 ':', 1901 ';', 1902 '>', 1903 '?', 1904 ']', 1905 '}', 1906 0x2019, // ’ right single quotation mark 1907 0x201d, // ” right double quotation mark 1908 0x2020, // † dagger 1909 0x2021, // ‡ double dagger 1910 0x2026, // … horizontal ellipsis 1911 0x2030, // ‰ per mille sign 1912 0x2031, // ‱ per the thousand sign 1913 0x203c, // ‼ double exclamation mark 1914 0x2047, // ⁇ double question mark 1915 0x2048, // ⁈ question exclamation mark 1916 0x2049, // ⁉ exclamation question mark 1917 0x2103, // ℃ degree celsius 1918 0x2109, // ℉ degree fahrenheit 1919 0x3001, // 、 ideographic comma 1920 0x3002, // 。 ideographic full stop 1921 0x3009, // 〉 right angle bracket 1922 0x300b, // 》 right double angle bracket 1923 0x300d, // 」 right corner bracket 1924 0x300f, // 』 right white corner bracket 1925 0x3011, // 】 right black lenticular bracket 1926 0x3015, // 〕 right tortoise shell bracket 1927 0x3017, // 〗 right white lenticular bracket 1928 0x3019, // 〙 right white tortoise shell bracket 1929 0x301b, // 〛 right white square bracket 1930 0xff01, // ! fullwidth exclamation mark 1931 0xff09, // ) fullwidth right parenthesis 1932 0xff0c, // , fullwidth comma 1933 0xff0e, // . fullwidth full stop 1934 0xff1a, // : fullwidth colon 1935 0xff1b, // ; fullwidth semicolon 1936 0xff1f, // ? 
fullwidth question mark 1937 0xff3d, // ] fullwidth right square bracket 1938 0xff5d, // } fullwidth right curly bracket 1939 }; 1940 1941 int first = 0; 1942 int last = ARRAY_SIZE(BOL_prohibition_punct) - 1; 1943 1944 while (first < last) { 1945 const int mid = (first + last) / 2; 1946 1947 if (cc == BOL_prohibition_punct[mid]) { 1948 return false; 1949 } else if (cc > BOL_prohibition_punct[mid]) { 1950 first = mid + 1; 1951 } else { 1952 last = mid - 1; 1953 } 1954 } 1955 1956 return cc != BOL_prohibition_punct[first]; 1957 } 1958 1959 // Whether line break is allowed after "cc". 1960 bool utf_allow_break_after(int cc) 1961 FUNC_ATTR_CONST FUNC_ATTR_WARN_UNUSED_RESULT 1962 { 1963 static const int EOL_prohibition_punct[] = { 1964 '(', 1965 '<', 1966 '[', 1967 '`', 1968 '{', 1969 // 0x2014, // — em dash 1970 0x2018, // ‘ left single quotation mark 1971 0x201c, // “ left double quotation mark 1972 // 0x2053, // ~ swung dash 1973 0x3008, // 〈 left angle bracket 1974 0x300a, // 《 left double angle bracket 1975 0x300c, // 「 left corner bracket 1976 0x300e, // 『 left white corner bracket 1977 0x3010, // 【 left black lenticular bracket 1978 0x3014, // 〔 left tortoise shell bracket 1979 0x3016, // 〖 left white lenticular bracket 1980 0x3018, // 〘 left white tortoise shell bracket 1981 0x301a, // 〚 left white square bracket 1982 0xff08, // ( fullwidth left parenthesis 1983 0xff3b, // [ fullwidth left square bracket 1984 0xff5b, // { fullwidth left curly bracket 1985 }; 1986 1987 int first = 0; 1988 int last = ARRAY_SIZE(EOL_prohibition_punct) - 1; 1989 1990 while (first < last) { 1991 const int mid = (first + last)/2; 1992 1993 if (cc == EOL_prohibition_punct[mid]) { 1994 return false; 1995 } else if (cc > EOL_prohibition_punct[mid]) { 1996 first = mid + 1; 1997 } else { 1998 last = mid - 1; 1999 } 2000 } 2001 2002 return cc != EOL_prohibition_punct[first]; 2003 } 2004 2005 // Whether line break is allowed between "cc" and "ncc". 
2006 bool utf_allow_break(int cc, int ncc) 2007 FUNC_ATTR_CONST FUNC_ATTR_WARN_UNUSED_RESULT 2008 { 2009 // don't break between two-letter punctuations 2010 if (cc == ncc 2011 && (cc == 0x2014 // em dash 2012 || cc == 0x2026)) { // horizontal ellipsis 2013 return false; 2014 } 2015 return utf_allow_break_after(cc) && utf_allow_break_before(ncc); 2016 } 2017 2018 /// Copy a character, advancing the pointers 2019 /// 2020 /// @param[in,out] fp Source of the character to copy. 2021 /// @param[in,out] tp Destination to copy to. 2022 void mb_copy_char(const char **const fp, char **const tp) 2023 { 2024 const size_t l = (size_t)utfc_ptr2len(*fp); 2025 2026 memmove(*tp, *fp, l); 2027 *tp += l; 2028 *fp += l; 2029 } 2030 2031 /// Return the offset from "p" to the first byte of a character. When "p" is 2032 /// at the start of a character 0 is returned, otherwise the offset to the next 2033 /// character. Can start anywhere in a stream of bytes. 2034 int mb_off_next(const char *base, const char *p) 2035 { 2036 int head_off = utf_head_off(base, p); 2037 2038 if (head_off == 0) { 2039 return 0; 2040 } 2041 2042 return utfc_ptr2len(p - head_off) - head_off; 2043 } 2044 2045 /// Returns the offset in bytes from "p_in" to the first and one-past-end bytes 2046 /// of the codepoint it points to. 2047 /// "p_in" can point anywhere in a stream of bytes. 2048 /// "p_len" limits number of bytes after "p_in". 2049 /// Note: Counts individual codepoints of composed characters separately. 
2050 CharBoundsOff utf_cp_bounds_len(char const *base, char const *p_in, int p_len) 2051 FUNC_ATTR_PURE FUNC_ATTR_NONNULL_ALL 2052 { 2053 assert(base <= p_in && p_len > 0); 2054 uint8_t const *const b = (uint8_t *)base; 2055 uint8_t const *const p = (uint8_t *)p_in; 2056 if (*p < 0x80U) { // be quick for ASCII 2057 return (CharBoundsOff){ 0, 1 }; 2058 } 2059 2060 int const max_first_off = -MIN((int)(p - b), MB_MAXCHAR - 1); 2061 int first_off = 0; 2062 for (; utf_is_trail_byte(p[first_off]); first_off--) { 2063 if (first_off == max_first_off) { // failed to find first byte 2064 return (CharBoundsOff){ 0, 1 }; 2065 } 2066 } 2067 2068 int const max_end_off = utf8len_tab[p[first_off]] + first_off; 2069 if (max_end_off <= 0 || max_end_off > p_len) { // illegal or incomplete sequence 2070 return (CharBoundsOff){ 0, 1 }; 2071 } 2072 2073 for (int end_off = 1; end_off < max_end_off; end_off++) { 2074 if (!utf_is_trail_byte(p[end_off])) { // not enough trail bytes 2075 return (CharBoundsOff){ 0, 1 }; 2076 } 2077 } 2078 2079 return (CharBoundsOff){ .begin_off = (int8_t)-first_off, .end_off = (int8_t)max_end_off }; 2080 } 2081 2082 /// Returns the offset in bytes from "p_in" to the first and one-past-end bytes 2083 /// of the codepoint it points to. 2084 /// "p_in" can point anywhere in a stream of bytes. 2085 /// Stream must be NUL-terminated. 2086 /// Note: Counts individual codepoints of composed characters separately. 2087 CharBoundsOff utf_cp_bounds(char const *base, char const *p_in) 2088 FUNC_ATTR_PURE FUNC_ATTR_NONNULL_ALL 2089 { 2090 return utf_cp_bounds_len(base, p_in, INT_MAX); 2091 } 2092 2093 // Find the next illegal byte sequence. 
// The search starts at the cursor position of the current window and runs
// to the end of the buffer.  When an illegal sequence is found the cursor is
// left on it; otherwise the cursor is restored and beep_flush() is called.
void utf_find_illegal(void)
{
  pos_T pos = curwin->w_cursor;  // saved to restore the cursor on failure
  vimconv_T vimconv;
  char *tofree = NULL;           // converted copy of the current line, if any

  vimconv.vc_type = CONV_NONE;
  if (enc_canon_props(curbuf->b_p_fenc) & ENC_8BIT) {
    // 'encoding' is "utf-8" but we are editing an 8-bit encoded file,
    // possibly a utf-8 file with illegal bytes.  Setup for conversion
    // from utf-8 to 'fileencoding'.
    convert_setup(&vimconv, p_enc, curbuf->b_p_fenc);
  }

  curwin->w_cursor.coladd = 0;
  while (true) {
    // Scan from the cursor to the end of the current line.
    char *p = get_cursor_pos_ptr();
    if (vimconv.vc_type != CONV_NONE) {
      // Inspect the bytes as they would be in 'fileencoding'.
      xfree(tofree);
      tofree = string_convert(&vimconv, p, NULL);
      if (tofree == NULL) {
        break;
      }
      p = tofree;
    }

    while (*p != NUL) {
      // Illegal means that there are not enough trail bytes (checked by
      // utf_ptr2len()) or too many of them (overlong sequence).
      int len = utf_ptr2len(p);
      if ((uint8_t)(*p) >= 0x80 && (len == 1 || utf_char2len(utf_ptr2char(p)) != len)) {
        if (vimconv.vc_type == CONV_NONE) {
          // "p" points into the buffer line: advance the cursor by the
          // number of bytes scanned.
          curwin->w_cursor.col += (colnr_T)(p - get_cursor_pos_ptr());
        } else {
          int l;

          // "p" points into the converted copy.  Advance the cursor by one
          // buffer character for every converted byte before the illegal
          // one.
          // NOTE(review): presumably relies on each character converting to
          // exactly one byte in the 8-bit encoding -- confirm.
          len = (int)(p - tofree);
          for (p = get_cursor_pos_ptr(); *p != NUL && len-- > 0; p += l) {
            l = utf_ptr2len(p);
            curwin->w_cursor.col += l;
          }
        }
        goto theend;
      }
      p += len;
    }
    if (curwin->w_cursor.lnum == curbuf->b_ml.ml_line_count) {
      break;  // reached the last line of the buffer
    }
    // Continue at the start of the next line.
    curwin->w_cursor.lnum++;
    curwin->w_cursor.col = 0;
  }

  // didn't find it: don't move and beep
  curwin->w_cursor = pos;
  beep_flush();

theend:
  xfree(tofree);
  // Calling with NULL names resets "vimconv" and closes an iconv descriptor.
  convert_setup(&vimconv, NULL, NULL);
}

/// @return true if string "s" is a valid utf-8 string.
/// When "end" is NULL stop at the first NUL.  Otherwise stop at "end".
///
/// Only the structure is checked: every lead byte must be followed by the
/// number of trail bytes (10xx.xxxx) it announces.
bool utf_valid_string(const char *s, const char *end)
{
  const uint8_t *p = (uint8_t *)s;

  while (end == NULL ? *p != NUL : p < (uint8_t *)end) {
    // Sequence length announced by the lead byte; the table holds 0 for
    // bytes that cannot start a sequence.
    int l = utf8len_tab_zero[*p];
    if (l == 0) {
      return false;  // invalid lead byte
    }
    if (end != NULL && p + l > (uint8_t *)end) {
      return false;  // incomplete byte sequence
    }
    p++;
    while (--l > 0) {
      // A NUL fails this test too, so the scan never runs past the
      // terminator when "end" is NULL.
      if ((*p++ & 0xc0) != 0x80) {
        return false;  // invalid trail byte
      }
    }
  }
  return true;
}

// If the cursor moves on a trail byte, set the cursor on the lead byte.
// Thus it moves left if necessary.
void mb_adjust_cursor(void)
{
  mark_mb_adjustpos(curbuf, &curwin->w_cursor);
}

/// Checks and adjusts cursor column. Not mode-dependent.
/// @see check_cursor_col
///
/// @param  win_  Places cursor on a valid column for this window.
///               Passed as "void *" and cast to "win_T *" inside.
void mb_check_adjust_col(void *win_)
{
  win_T *win = (win_T *)win_;
  colnr_T oldcol = win->w_cursor.col;

  // Column 0 is always valid.
  if (oldcol != 0) {
    char *p = ml_get_buf(win->w_buffer, win->w_cursor.lnum);
    colnr_T len = (colnr_T)strlen(p);

    // Empty line or invalid column?
    if (len == 0 || oldcol < 0) {
      win->w_cursor.col = 0;
    } else {
      // Cursor column too big for line?
      if (oldcol > len) {
        win->w_cursor.col = len - 1;
      }
      // Move the cursor to the head byte.
      win->w_cursor.col -= utf_head_off(p, p + win->w_cursor.col);
    }

    // Reset `coladd` when the cursor would be on the right half of a
    // double-wide character.
    if (win->w_cursor.coladd == 1 && p[win->w_cursor.col] != TAB
        && vim_isprintc(utf_ptr2char(p + win->w_cursor.col))
        && ptr2cells(p + win->w_cursor.col) > 1) {
      win->w_cursor.coladd = 0;
    }
  }
}

/// @param line  start of the string
///
/// @return      a pointer to the character before "*p", if there is one.
2226 char *mb_prevptr(char *line, char *p) 2227 { 2228 if (p > line) { 2229 MB_PTR_BACK(line, p); 2230 } 2231 return p; 2232 } 2233 2234 /// Return the character length of "str". Each multi-byte character (with 2235 /// following composing characters) counts as one. 2236 int mb_charlen(const char *str) 2237 { 2238 const char *p = str; 2239 int count; 2240 2241 if (p == NULL) { 2242 return 0; 2243 } 2244 2245 for (count = 0; *p != NUL; count++) { 2246 p += utfc_ptr2len(p); 2247 } 2248 2249 return count; 2250 } 2251 2252 /// Like mb_charlen() but for a string with specified length. 2253 int mb_charlen_len(const char *str, int len) 2254 { 2255 const char *p = str; 2256 int count; 2257 2258 for (count = 0; *p != NUL && p < str + len; count++) { 2259 p += utfc_ptr2len(p); 2260 } 2261 2262 return count; 2263 } 2264 2265 /// Try to unescape a multibyte character 2266 /// 2267 /// Used for the rhs and lhs of the mappings. 2268 /// 2269 /// @param[in,out] pp String to unescape. Is advanced to just after the bytes 2270 /// that form a multibyte character. 2271 /// 2272 /// @return Unescaped string if it is a multibyte character, NULL if no 2273 /// multibyte character was found. Returns a static buffer, always one 2274 /// and the same. 2275 const char *mb_unescape(const char **const pp) 2276 FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL 2277 { 2278 static char buf[6]; 2279 size_t buf_idx = 0; 2280 uint8_t *str = (uint8_t *)(*pp); 2281 2282 // Must translate K_SPECIAL KS_SPECIAL KE_FILLER to K_SPECIAL. 2283 // Maximum length of a utf-8 character is 4 bytes. 2284 for (size_t str_idx = 0; str[str_idx] != NUL && buf_idx < 4; str_idx++) { 2285 if (str[str_idx] == K_SPECIAL 2286 && str[str_idx + 1] == KS_SPECIAL 2287 && str[str_idx + 2] == KE_FILLER) { 2288 buf[buf_idx++] = (char)K_SPECIAL; 2289 str_idx += 2; 2290 } else if (str[str_idx] == K_SPECIAL) { 2291 break; // A special key can't be a multibyte char. 
2292 } else { 2293 buf[buf_idx++] = (char)str[str_idx]; 2294 } 2295 buf[buf_idx] = NUL; 2296 2297 // Return a multi-byte character if it's found. An illegal sequence 2298 // will result in a 1 here. 2299 if (utf_ptr2len(buf) > 1) { 2300 *pp = (const char *)str + str_idx + 1; 2301 return buf; 2302 } 2303 2304 // Bail out quickly for ASCII. 2305 if ((uint8_t)buf[0] < 128) { 2306 break; 2307 } 2308 } 2309 return NULL; 2310 } 2311 2312 /// Skip the Vim specific head of a 'encoding' name. 2313 char *enc_skip(char *p) 2314 { 2315 if (strncmp(p, "2byte-", 6) == 0) { 2316 return p + 6; 2317 } 2318 if (strncmp(p, "8bit-", 5) == 0) { 2319 return p + 5; 2320 } 2321 return p; 2322 } 2323 2324 /// Find the canonical name for encoding "enc". 2325 /// When the name isn't recognized, returns "enc" itself, but with all lower 2326 /// case characters and '_' replaced with '-'. 2327 /// 2328 /// @return an allocated string. 2329 char *enc_canonize(char *enc) 2330 FUNC_ATTR_NONNULL_RET 2331 { 2332 if (strcmp(enc, "default") == 0) { 2333 // Use the default encoding as found by set_init_1(). 2334 return xstrdup(fenc_default); 2335 } 2336 2337 // copy "enc" to allocated memory, with room for two '-' 2338 char *r = xmalloc(strlen(enc) + 3); 2339 // Make it all lower case and replace '_' with '-'. 2340 char *p = r; 2341 for (char *s = enc; *s != NUL; s++) { 2342 if (*s == '_') { 2343 *p++ = '-'; 2344 } else { 2345 *p++ = (char)TOLOWER_ASC(*s); 2346 } 2347 } 2348 *p = NUL; 2349 2350 // Skip "2byte-" and "8bit-". 2351 p = enc_skip(r); 2352 2353 // Change "microsoft-cp" to "cp". Used in some spell files. 
2354 if (strncmp(p, "microsoft-cp", 12) == 0) { 2355 STRMOVE(p, p + 10); 2356 } 2357 2358 // "iso8859" -> "iso-8859" 2359 if (strncmp(p, "iso8859", 7) == 0) { 2360 STRMOVE(p + 4, p + 3); 2361 p[3] = '-'; 2362 } 2363 2364 // "iso-8859n" -> "iso-8859-n" 2365 if (strncmp(p, "iso-8859", 8) == 0 && p[8] != '-') { 2366 STRMOVE(p + 9, p + 8); 2367 p[8] = '-'; 2368 } 2369 2370 // "latin-N" -> "latinN" 2371 if (strncmp(p, "latin-", 6) == 0) { 2372 STRMOVE(p + 5, p + 6); 2373 } 2374 2375 int i; 2376 if (enc_canon_search(p) >= 0) { 2377 // canonical name can be used unmodified 2378 if (p != r) { 2379 STRMOVE(r, p); 2380 } 2381 } else if ((i = enc_alias_search(p)) >= 0) { 2382 // alias recognized, get canonical name 2383 xfree(r); 2384 r = xstrdup(enc_canon_table[i].name); 2385 } 2386 return r; 2387 } 2388 2389 /// Search for an encoding alias of "name". 2390 /// Returns -1 when not found. 2391 static int enc_alias_search(const char *name) 2392 { 2393 for (int i = 0; enc_alias_table[i].name != NULL; i++) { 2394 if (strcmp(name, enc_alias_table[i].name) == 0) { 2395 return enc_alias_table[i].canon; 2396 } 2397 } 2398 return -1; 2399 } 2400 2401 #ifdef HAVE_LANGINFO_H 2402 # include <langinfo.h> 2403 #endif 2404 2405 // Get the canonicalized encoding of the current locale. 2406 // Returns an allocated string when successful, NULL when not. 2407 char *enc_locale(void) 2408 { 2409 int i; 2410 char buf[50]; 2411 2412 const char *s; 2413 2414 #ifdef HAVE_NL_LANGINFO_CODESET 2415 if (!(s = nl_langinfo(CODESET)) || *s == NUL) 2416 #endif 2417 { 2418 if (!(s = setlocale(LC_CTYPE, NULL)) || *s == NUL) { 2419 if ((s = os_getenv_noalloc("LC_ALL"))) { 2420 if ((s = os_getenv_noalloc("LC_CTYPE"))) { 2421 s = os_getenv_noalloc("LANG"); 2422 } 2423 } 2424 } 2425 } 2426 2427 if (!s) { 2428 return NULL; 2429 } 2430 2431 // The most generic locale format is: 2432 // language[_territory][.codeset][@modifier][+special][,[sponsor][_revision]] 2433 // If there is a '.' remove the part before it. 
2434 // if there is something after the codeset, remove it. 2435 // Make the name lowercase and replace '_' with '-'. 2436 // Exception: "ja_JP.EUC" == "euc-jp", "zh_CN.EUC" = "euc-cn", 2437 // "ko_KR.EUC" == "euc-kr" 2438 const char *p = vim_strchr(s, '.'); 2439 if (p != NULL) { 2440 if (p > s + 2 && !STRNICMP(p + 1, "EUC", 3) 2441 && !isalnum((uint8_t)p[4]) && p[4] != '-' && p[-3] == '_') { 2442 // Copy "XY.EUC" to "euc-XY" to buf[10]. 2443 memmove(buf, "euc-", 4); 2444 buf[4] = (char)(ASCII_ISALNUM(p[-2]) ? TOLOWER_ASC(p[-2]) : 0); 2445 buf[5] = (char)(ASCII_ISALNUM(p[-1]) ? TOLOWER_ASC(p[-1]) : 0); 2446 buf[6] = NUL; 2447 } else { 2448 s = p + 1; 2449 goto enc_locale_copy_enc; 2450 } 2451 } else { 2452 enc_locale_copy_enc: 2453 for (i = 0; i < (int)sizeof(buf) - 1 && s[i] != NUL; i++) { 2454 if (s[i] == '_' || s[i] == '-') { 2455 buf[i] = '-'; 2456 } else if (ASCII_ISALNUM((uint8_t)s[i])) { 2457 buf[i] = (char)TOLOWER_ASC(s[i]); 2458 } else { 2459 break; 2460 } 2461 } 2462 buf[i] = NUL; 2463 } 2464 2465 return enc_canonize(buf); 2466 } 2467 2468 // Call iconv_open() with a check if iconv() works properly (there are broken 2469 // versions). 2470 // Returns (void *)-1 if failed. 2471 // (should return iconv_t, but that causes problems with prototypes). 2472 void *my_iconv_open(char *to, char *from) 2473 { 2474 #define ICONV_TESTLEN 400 2475 char tobuf[ICONV_TESTLEN]; 2476 static WorkingStatus iconv_working = kUnknown; 2477 2478 if (iconv_working == kBroken) { 2479 return (void *)-1; // detected a broken iconv() previously 2480 } 2481 iconv_t fd = iconv_open(enc_skip(to), enc_skip(from)); 2482 2483 if (fd != (iconv_t)-1 && iconv_working == kUnknown) { 2484 // Do a dummy iconv() call to check if it actually works. There is a 2485 // version of iconv() on Linux that is broken. We can't ignore it, 2486 // because it's wide-spread. 
The symptoms are that after outputting 2487 // the initial shift state the "to" pointer is NULL and conversion 2488 // stops for no apparent reason after about 8160 characters. 2489 char *p = tobuf; 2490 size_t tolen = ICONV_TESTLEN; 2491 iconv(fd, NULL, NULL, &p, &tolen); 2492 if (p == NULL) { 2493 iconv_working = kBroken; 2494 iconv_close(fd); 2495 fd = (iconv_t)-1; 2496 } else { 2497 iconv_working = kWorking; 2498 } 2499 } 2500 2501 return (void *)fd; 2502 } 2503 2504 // Convert the string "str[slen]" with iconv(). 2505 // If "unconvlenp" is not NULL handle the string ending in an incomplete 2506 // sequence and set "*unconvlenp" to the length of it. 2507 // Returns the converted string in allocated memory. NULL for an error. 2508 // If resultlenp is not NULL, sets it to the result length in bytes. 2509 static char *iconv_string(const vimconv_T *const vcp, const char *str, size_t slen, 2510 size_t *unconvlenp, size_t *resultlenp) 2511 { 2512 char *to; 2513 size_t len = 0; 2514 size_t done = 0; 2515 char *result = NULL; 2516 2517 const char *from = str; 2518 size_t fromlen = slen; 2519 while (true) { 2520 if (len == 0 || ICONV_ERRNO == ICONV_E2BIG) { 2521 // Allocate enough room for most conversions. When re-allocating 2522 // increase the buffer size. 2523 len = len + fromlen * 2 + 40; 2524 char *p = xmalloc(len); 2525 if (done > 0) { 2526 memmove(p, result, done); 2527 } 2528 xfree(result); 2529 result = p; 2530 } 2531 2532 to = result + done; 2533 size_t tolen = len - done - 2; 2534 // Avoid a warning for systems with a wrong iconv() prototype by 2535 // casting the second argument to void *. 2536 if (iconv(vcp->vc_fd, (void *)&from, &fromlen, &to, &tolen) != SIZE_MAX) { 2537 // Finished, append a NUL. 2538 *to = NUL; 2539 break; 2540 } 2541 2542 // Check both ICONV_EINVAL and EINVAL, because the dynamically loaded 2543 // iconv library may use one of them. 
2544 if (!vcp->vc_fail && unconvlenp != NULL 2545 && (ICONV_ERRNO == ICONV_EINVAL || ICONV_ERRNO == EINVAL)) { 2546 // Handle an incomplete sequence at the end. 2547 *to = NUL; 2548 *unconvlenp = fromlen; 2549 break; 2550 } else if (!vcp->vc_fail 2551 && (ICONV_ERRNO == ICONV_EILSEQ || ICONV_ERRNO == EILSEQ 2552 || ICONV_ERRNO == ICONV_EINVAL || ICONV_ERRNO == EINVAL)) { 2553 // Check both ICONV_EILSEQ and EILSEQ, because the dynamically loaded 2554 // iconv library may use one of them. 2555 2556 // Can't convert: insert a '?' and skip a character. This assumes 2557 // conversion from 'encoding' to something else. In other 2558 // situations we don't know what to skip anyway. 2559 *to++ = '?'; 2560 if (utf_ptr2cells(from) > 1) { 2561 *to++ = '?'; 2562 } 2563 int l = utfc_ptr2len_len(from, (int)fromlen); 2564 from += l; 2565 fromlen -= (size_t)l; 2566 } else if (ICONV_ERRNO != ICONV_E2BIG) { 2567 // conversion failed 2568 XFREE_CLEAR(result); 2569 break; 2570 } 2571 // Not enough room or skipping illegal sequence. 2572 done = (size_t)(to - result); 2573 } 2574 2575 if (resultlenp != NULL && result != NULL) { 2576 *resultlenp = (size_t)(to - result); 2577 } 2578 return result; 2579 } 2580 2581 /// iconv() function 2582 void f_iconv(typval_T *argvars, typval_T *rettv, EvalFuncData fptr) 2583 { 2584 vimconv_T vimconv; 2585 2586 rettv->v_type = VAR_STRING; 2587 rettv->vval.v_string = NULL; 2588 2589 const char *const str = tv_get_string(&argvars[0]); 2590 char buf1[NUMBUFLEN]; 2591 char *const from = enc_canonize(enc_skip((char *)tv_get_string_buf(&argvars[1], buf1))); 2592 char buf2[NUMBUFLEN]; 2593 char *const to = enc_canonize(enc_skip((char *)tv_get_string_buf(&argvars[2], buf2))); 2594 vimconv.vc_type = CONV_NONE; 2595 convert_setup(&vimconv, from, to); 2596 2597 // If the encodings are equal, no conversion needed. 
2598 if (vimconv.vc_type == CONV_NONE) { 2599 rettv->vval.v_string = xstrdup(str); 2600 } else { 2601 rettv->vval.v_string = string_convert(&vimconv, (char *)str, NULL); 2602 } 2603 2604 convert_setup(&vimconv, NULL, NULL); 2605 xfree(from); 2606 xfree(to); 2607 } 2608 2609 /// Setup "vcp" for conversion from "from" to "to". 2610 /// The names must have been made canonical with enc_canonize(). 2611 /// vcp->vc_type must have been initialized to CONV_NONE. 2612 /// Note: cannot be used for conversion from/to ucs-2 and ucs-4 (will use utf-8 2613 /// instead). 2614 /// Afterwards invoke with "from" and "to" equal to NULL to cleanup. 2615 /// 2616 /// @return FAIL when conversion is not supported, OK otherwise. 2617 int convert_setup(vimconv_T *vcp, char *from, char *to) 2618 { 2619 return convert_setup_ext(vcp, from, true, to, true); 2620 } 2621 2622 /// As convert_setup(), but only when from_unicode_is_utf8 is true will all 2623 /// "from" unicode charsets be considered utf-8. Same for "to". 2624 int convert_setup_ext(vimconv_T *vcp, char *from, bool from_unicode_is_utf8, char *to, 2625 bool to_unicode_is_utf8) 2626 { 2627 int from_is_utf8; 2628 int to_is_utf8; 2629 2630 // Reset to no conversion. 2631 if (vcp->vc_type == CONV_ICONV && vcp->vc_fd != (iconv_t)-1) { 2632 iconv_close(vcp->vc_fd); 2633 } 2634 *vcp = (vimconv_T)MBYTE_NONE_CONV; 2635 2636 // No conversion when one of the names is empty or they are equal. 
2637 if (from == NULL || *from == NUL || to == NULL || *to == NUL 2638 || strcmp(from, to) == 0) { 2639 return OK; 2640 } 2641 2642 int from_prop = enc_canon_props(from); 2643 int to_prop = enc_canon_props(to); 2644 if (from_unicode_is_utf8) { 2645 from_is_utf8 = from_prop & ENC_UNICODE; 2646 } else { 2647 from_is_utf8 = from_prop == ENC_UNICODE; 2648 } 2649 if (to_unicode_is_utf8) { 2650 to_is_utf8 = to_prop & ENC_UNICODE; 2651 } else { 2652 to_is_utf8 = to_prop == ENC_UNICODE; 2653 } 2654 2655 if ((from_prop & ENC_LATIN1) && to_is_utf8) { 2656 // Internal latin1 -> utf-8 conversion. 2657 vcp->vc_type = CONV_TO_UTF8; 2658 vcp->vc_factor = 2; // up to twice as long 2659 } else if ((from_prop & ENC_LATIN9) && to_is_utf8) { 2660 // Internal latin9 -> utf-8 conversion. 2661 vcp->vc_type = CONV_9_TO_UTF8; 2662 vcp->vc_factor = 3; // up to three as long (euro sign) 2663 } else if (from_is_utf8 && (to_prop & ENC_LATIN1)) { 2664 // Internal utf-8 -> latin1 conversion. 2665 vcp->vc_type = CONV_TO_LATIN1; 2666 } else if (from_is_utf8 && (to_prop & ENC_LATIN9)) { 2667 // Internal utf-8 -> latin9 conversion. 2668 vcp->vc_type = CONV_TO_LATIN9; 2669 } else { 2670 // Use iconv() for conversion. 2671 vcp->vc_fd = (iconv_t)my_iconv_open(to_is_utf8 ? "utf-8" : to, 2672 from_is_utf8 ? "utf-8" : from); 2673 if (vcp->vc_fd != (iconv_t)-1) { 2674 vcp->vc_type = CONV_ICONV; 2675 vcp->vc_factor = 4; // could be longer too... 2676 } 2677 } 2678 if (vcp->vc_type == CONV_NONE) { 2679 return FAIL; 2680 } 2681 2682 return OK; 2683 } 2684 2685 /// Convert text "ptr[*lenp]" according to "vcp". 2686 /// Returns the result in allocated memory and sets "*lenp". 2687 /// When "lenp" is NULL, use NUL terminated strings. 2688 /// Illegal chars are often changed to "?", unless vcp->vc_fail is set. 2689 /// When something goes wrong, NULL is returned and "*lenp" is unchanged. 
char *string_convert(const vimconv_T *const vcp, char *ptr, size_t *lenp)
{
  return string_convert_ext(vcp, ptr, lenp, NULL);
}

// Like string_convert(), but when "unconvlenp" is not NULL and there is
// an incomplete sequence at the end it is not converted and "*unconvlenp" is
// set to the number of remaining bytes.
//
// An empty input yields an allocated empty string.  NULL is returned when
// vcp->vc_type is none of the conversion types handled below.
char *string_convert_ext(const vimconv_T *const vcp, char *ptr, size_t *lenp, size_t *unconvlenp)
{
  uint8_t *retval = NULL;
  uint8_t *d;  // write position in "retval"
  int c;

  size_t len;
  if (lenp == NULL) {
    len = strlen(ptr);
  } else {
    len = *lenp;
  }
  if (len == 0) {
    return xstrdup("");
  }

  switch (vcp->vc_type) {
  case CONV_TO_UTF8:            // latin1 to utf-8 conversion
    // Every input byte becomes at most two bytes.
    retval = xmalloc(len * 2 + 1);
    d = retval;
    for (size_t i = 0; i < len; i++) {
      c = (uint8_t)ptr[i];
      if (c < 0x80) {
        *d++ = (uint8_t)c;
      } else {
        // Two-byte sequence: lead byte with the top two bits of "c", trail
        // byte with the low six bits.
        *d++ = (uint8_t)(0xc0 + (uint8_t)((unsigned)c >> 6));
        *d++ = (uint8_t)(0x80 + (c & 0x3f));
      }
    }
    *d = NUL;
    if (lenp != NULL) {
      *lenp = (size_t)(d - retval);
    }
    break;

  case CONV_9_TO_UTF8:          // latin9 to utf-8 conversion
    // Every input byte becomes at most three bytes (euro sign).
    retval = xmalloc(len * 3 + 1);
    d = retval;
    for (size_t i = 0; i < len; i++) {
      c = (uint8_t)ptr[i];
      // Map the bytes where latin9 differs from latin1 to their codepoints.
      switch (c) {
      case 0xa4:
        c = 0x20ac; break;                 // euro
      case 0xa6:
        c = 0x0160; break;                 // S hat
      case 0xa8:
        c = 0x0161; break;                 // S -hat
      case 0xb4:
        c = 0x017d; break;                 // Z hat
      case 0xb8:
        c = 0x017e; break;                 // Z -hat
      case 0xbc:
        c = 0x0152; break;                 // OE
      case 0xbd:
        c = 0x0153; break;                 // oe
      case 0xbe:
        c = 0x0178; break;                 // Y
      }
      d += utf_char2bytes(c, (char *)d);
    }
    *d = NUL;
    if (lenp != NULL) {
      *lenp = (size_t)(d - retval);
    }
    break;

  case CONV_TO_LATIN1:          // utf-8 to latin1 conversion
  case CONV_TO_LATIN9:          // utf-8 to latin9 conversion
    retval = xmalloc(len + 1);
    d = retval;
    for (size_t i = 0; i < len; i++) {
      int l = utf_ptr2len_len(ptr + i, (int)(len - i));
      if (l == 0) {
        *d++ = NUL;
      } else if (l == 1) {
        // Single byte: ASCII, an illegal byte or a truncated sequence.
        uint8_t l_w = utf8len_tab_zero[(uint8_t)ptr[i]];

        if (l_w == 0) {
          // Illegal utf-8 byte cannot be converted
          xfree(retval);
          return NULL;
        }
        if (unconvlenp != NULL && l_w > len - i) {
          // Incomplete sequence at the end.
          *unconvlenp = len - i;
          break;  // leaves the for loop; the rest stays unconverted
        }
        *d++ = (uint8_t)ptr[i];
      } else {
        c = utf_ptr2char(ptr + i);
        if (vcp->vc_type == CONV_TO_LATIN9) {
          // Reverse of the mapping used for CONV_9_TO_UTF8.
          switch (c) {
          case 0x20ac:
            c = 0xa4; break;                     // euro
          case 0x0160:
            c = 0xa6; break;                     // S hat
          case 0x0161:
            c = 0xa8; break;                     // S -hat
          case 0x017d:
            c = 0xb4; break;                     // Z hat
          case 0x017e:
            c = 0xb8; break;                     // Z -hat
          case 0x0152:
            c = 0xbc; break;                     // OE
          case 0x0153:
            c = 0xbd; break;                     // oe
          case 0x0178:
            c = 0xbe; break;                     // Y
          case 0xa4:
          case 0xa6:
          case 0xa8:
          case 0xb4:
          case 0xb8:
          case 0xbc:
          case 0xbd:
          case 0xbe:
            // Force the "cannot be represented" path below.
            c = 0x100; break;                   // not in latin9
          }
        }
        if (!utf_iscomposing_legacy(c)) {  // skip composing chars
          if (c < 0x100) {
            *d++ = (uint8_t)c;
          } else if (vcp->vc_fail) {
            xfree(retval);
            return NULL;
          } else {
            // Replacement for a character that cannot be represented
            // (0xbf is the inverted question mark in latin1); a
            // double-width character gets a second filler.
            *d++ = 0xbf;
            if (utf_char2cells(c) > 1) {
              *d++ = '?';
            }
          }
        }
        // Skip the remaining bytes of this character; the loop adds one.
        i += (size_t)l - 1;
      }
    }
    *d = NUL;
    if (lenp != NULL) {
      *lenp = (size_t)(d - retval);
    }
    break;

  case CONV_ICONV:  // conversion with vcp->vc_fd
    retval = (uint8_t *)iconv_string(vcp, ptr, len, unconvlenp, lenp);
    break;
  }

  return (char *)retval;
}

/// Table set by setcellwidths().
typedef struct {
  int64_t first;  ///< first character of the range
  int64_t last;   ///< last character of the range (inclusive)
  char width;     ///< cell width for the range, 1 or 2
} cw_interval_T;

/// Current cell width overrides, sorted on "first"; NULL when none are set.
static cw_interval_T *cw_table = NULL;
/// Number of entries in "cw_table".
static size_t cw_table_size = 0;

/// Return the value of the cellwidth table for the character `c`.
///
/// @param c The source character.
/// @return 1 or 2 when `c` is in the cellwidth table, 0 if not.
static int cw_value(int c)
{
  if (cw_table == NULL) {
    return 0;
  }

  // first quick check for Latin1 etc. characters
  if (c < cw_table[0].first) {
    return 0;
  }

  // binary search in table
  int bot = 0;
  int top = (int)cw_table_size - 1;
  while (top >= bot) {
    int mid = (bot + top) / 2;
    if (cw_table[mid].last < c) {
      bot = mid + 1;
    } else if (cw_table[mid].first > c) {
      top = mid - 1;
    } else {
      return cw_table[mid].width;
    }
  }
  return 0;
}

/// qsort() callback: order two lists by the number stored in their first
/// item.  "a1" and "a2" point to "const list_T *" array elements.
static int tv_nr_compare(const void *a1, const void *a2)
{
  const listitem_T *const li1 = tv_list_first(*(const list_T **)a1);
  const listitem_T *const li2 = tv_list_first(*(const list_T **)a2);
  const varnumber_T n1 = TV_LIST_ITEM_TV(li1)->vval.v_number;
  const varnumber_T n2 = TV_LIST_ITEM_TV(li2)->vval.v_number;

  return n1 == n2 ? 0 : n1 > n2 ? 1 : -1;
}

/// "setcellwidths()" function
///
/// argvars[0] must be a list of [first, last, width] lists.  An empty list
/// clears the table.  On any error a message is given and the current table
/// is left untouched.
void f_setcellwidths(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
{
  if (argvars[0].v_type != VAR_LIST || argvars[0].vval.v_list == NULL) {
    emsg(_(e_listreq));
    return;
  }

  const list_T *const l = argvars[0].vval.v_list;
  cw_interval_T *table = NULL;
  const size_t table_size = (size_t)tv_list_len(l);
  if (table_size == 0) {
    // Clearing the table.
    goto update;
  }

  // Note: use list_T instead of listitem_T so that TV_LIST_ITEM_NEXT can be used properly below.
  const list_T **ptrs = xmalloc(sizeof(const list_T *) * table_size);

  // Check that all entries are a list with three numbers, the range is
  // valid and the cell width is valid.
  int item = 0;
  TV_LIST_ITER_CONST(l, li, {
    const typval_T *const li_tv = TV_LIST_ITEM_TV(li);

    if (li_tv->v_type != VAR_LIST || li_tv->vval.v_list == NULL) {
      semsg(_(e_list_item_nr_is_not_list), item);
      xfree((void *)ptrs);
      return;
    }

    const list_T *const li_l = li_tv->vval.v_list;
    ptrs[item] = li_l;
    const listitem_T *lili = tv_list_first(li_l);
    int i;
    varnumber_T n1;
    for (i = 0; lili != NULL; lili = TV_LIST_ITEM_NEXT(li_l, lili), i++) {
      const typval_T *const lili_tv = TV_LIST_ITEM_TV(lili);
      if (lili_tv->v_type != VAR_NUMBER) {
        break;
      }
      if (i == 0) {
        n1 = lili_tv->vval.v_number;
        if (n1 < 0x80) {
          emsg(_(e_only_values_of_0x80_and_higher_supported));
          xfree((void *)ptrs);
          return;
        }
      } else if (i == 1 && lili_tv->vval.v_number < n1) {
        semsg(_(e_list_item_nr_range_invalid), item);
        xfree((void *)ptrs);
        return;
      } else if (i == 2 && (lili_tv->vval.v_number < 1 || lili_tv->vval.v_number > 2)) {
        semsg(_(e_list_item_nr_cell_width_invalid), item);
        xfree((void *)ptrs);
        return;
      }
    }

    if (i != 3) {
      semsg(_(e_list_item_nr_does_not_contain_3_numbers), item);
      xfree((void *)ptrs);
      return;
    }

    item++;
  });

  // Sort the list on the first number.
  qsort((void *)ptrs, table_size, sizeof(const list_T *), tv_nr_compare);

  table = xmalloc(sizeof(cw_interval_T) * table_size);

  // Store the items in the new table.
  for (item = 0; (size_t)item < table_size; item++) {
    const list_T *const li_l = ptrs[item];
    const listitem_T *lili = tv_list_first(li_l);
    const varnumber_T n1 = TV_LIST_ITEM_TV(lili)->vval.v_number;
    // The entries are sorted, thus a range that starts at or before the end
    // of the previous one overlaps with it.
    if (item > 0 && n1 <= table[item - 1].last) {
      semsg(_(e_overlapping_ranges_for_nr), (size_t)n1);
      xfree((void *)ptrs);
      xfree(table);
      return;
    }
    table[item].first = n1;
    lili = TV_LIST_ITEM_NEXT(li_l, lili);
    table[item].last = TV_LIST_ITEM_TV(lili)->vval.v_number;
    lili = TV_LIST_ITEM_NEXT(li_l, lili);
    table[item].width = (char)TV_LIST_ITEM_TV(lili)->vval.v_number;
  }

  xfree((void *)ptrs);

update:
  ;
  // Install the new table first, because check_chars_options() has to see
  // the new widths; the old table is kept to be able to roll back.
  cw_interval_T *const cw_table_save = cw_table;
  const size_t cw_table_size_save = cw_table_size;
  cw_table = table;
  cw_table_size = table_size;

  // Check that the new value does not conflict with 'listchars' or
  // 'fillchars'.
  const char *const error = check_chars_options();
  if (error != NULL) {
    emsg(_(error));
    cw_table = cw_table_save;
    cw_table_size = cw_table_size_save;
    xfree(table);
    return;
  }

  xfree(cw_table_save);
  changed_window_setting_all();
  redraw_all_later(UPD_NOT_VALID);
}

/// "getcellwidths()" function
///
/// Returns a list with one [first, last, width] list for every entry of the
/// cellwidth table, an empty list when the table is empty.
void f_getcellwidths(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
{
  tv_list_alloc_ret(rettv, (ptrdiff_t)cw_table_size);

  for (size_t i = 0; i < cw_table_size; i++) {
    list_T *entry = tv_list_alloc(3);
    tv_list_append_number(entry, (varnumber_T)cw_table[i].first);
    tv_list_append_number(entry, (varnumber_T)cw_table[i].last);
    tv_list_append_number(entry, (varnumber_T)cw_table[i].width);

    tv_list_append_list(rettv->vval.v_list, entry);
  }
}

/// "charclass()" function
///
/// Stores the result of mb_get_class() for the string argument in "rettv".
/// "rettv" is left untouched when the argument is not a string or is NULL.
void f_charclass(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
{
  if (tv_check_for_string_arg(argvars, 0) == FAIL
      || argvars[0].vval.v_string == NULL) {
    return;
  }
  rettv->vval.v_number = mb_get_class(argvars[0].vval.v_string);
}

/// Function given to ExpandGeneric() to obtain the possible arguments of the
/// encoding options.
///
/// @param xp   unused
/// @param idx  index into enc_canon_table
///
/// @return the name at "idx", NULL when "idx" is past the end of the table.
char *get_encoding_name(expand_T *xp FUNC_ATTR_UNUSED, int idx)
{
  if (idx >= (int)ARRAY_SIZE(enc_canon_table)) {
    return NULL;
  }

  return (char *)enc_canon_table[idx].name;
}

/// Compare strings
///
/// @param[in]  ic  True if case is to be ignored.
///
/// @return 0 if s1 == s2, <0 if s1 < s2, >0 if s1 > s2.
int mb_strcmp_ic(bool ic, const char *s1, const char *s2)
  FUNC_ATTR_NONNULL_ALL FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT
{
  return (ic ? mb_stricmp(s1, s2) : strcmp(s1, s2));
}