mbyte_spec.lua (12936B)
local t = require('test.unit.testutil')
local itp = t.gen_itp(it)

local ffi = t.ffi
local eq = t.eq
local to_cstr = t.to_cstr

local lib = t.cimport(
  './src/nvim/mbyte.h',
  './src/nvim/charset.h',
  './src/nvim/grid.h',
  './src/nvim/option_vars.h'
)

-- Unit tests for the functions imported from mbyte.h (and the helpers from
-- charset.h / grid.h they are checked against).
describe('mbyte', function()
  -- Convert a list of byte values (numbers) to a Lua string.
  local function to_string(bytes)
    local s = {}
    for i = 1, #bytes do
      s[i] = string.char(bytes[i])
    end
    return table.concat(s)
  end

  itp('utf_ptr2char', function()
    -- For strings with length 1 the first byte is returned.
    for c = 0, 255 do
      eq(c, lib.utf_ptr2char(to_string({ c, 0 })))
    end

    -- Some ill formed byte sequences that should not be recognized as UTF-8
    -- First byte: 0xc0 or 0xc1
    -- Second byte: 0x80 .. 0xbf
    --eq(0x00c0, lib.utf_ptr2char(to_string({0xc0, 0x80})))
    --eq(0x00c1, lib.utf_ptr2char(to_string({0xc1, 0xbf})))
    --
    -- Sequences with more than four bytes
  end)

  -- Round-trip every code point in 0x0000 .. 0xFFFF through
  -- utf_char2bytes()/utf_ptr2char(), split into 16 tests of 0x1000 code
  -- points each.
  for n = 0, 0xF do
    itp(('utf_char2bytes for chars 0x%x - 0x%x'):format(n * 0x1000, n * 0x1000 + 0xFFF), function()
      local char_p = ffi.typeof('char[?]')
      for c = n * 0x1000, n * 0x1000 + 0xFFF do
        -- Fresh 4-element char buffer (initializer 0) for each code point.
        local p = char_p(4, 0)
        lib.utf_char2bytes(c, p)
        eq(c, lib.utf_ptr2char(p))
        -- The code point and pointer variants of the word-char check must agree.
        eq(lib.vim_iswordc(c), lib.vim_iswordp(p))
      end
    end)
  end

  describe('utfc_ptr2schar', function()
    -- Flip to true to print the bytes produced for each sequence.
    local dump_result = false

    -- Run utfc_ptr2schar() on the byte list `seq` and return
    -- { text, first_char }: the string schar_get() writes for the resulting
    -- schar, and the value stored through the `firstc` out-parameter.
    local function test_seq(seq)
      local firstc = ffi.new('int[1]')
      local buf = ffi.new('char[32]')
      lib.schar_get(buf, lib.utfc_ptr2schar(to_string(seq), firstc))
      local str = ffi.string(buf)
      if dump_result then -- for debugging
        local tbl = {}
        for i = 1, #str do
          table.insert(tbl, string.format('0x%02x', string.byte(str, i)))
        end
        print('{ ' .. table.concat(tbl, ', ') .. ' }')
        io.stdout:flush()
      end
      return { str, firstc[0] }
    end

    -- Expected result for a single byte that is returned unchanged.
    local function byte(val)
      return { string.char(val), val }
    end

    itp('1-byte sequences', function()
      eq({ '', 0 }, test_seq { 0 })
      for c = 1, 127 do
        eq(byte(c), test_seq { c })
      end
      for c = 128, 255 do
        eq({ '', c }, test_seq { c })
      end
    end)

    itp('2-byte sequences', function()
      -- No combining characters
      eq(byte(0x7f), test_seq { 0x7f, 0x7f })
      -- No combining characters
      eq(byte(0x7f), test_seq { 0x7f, 0x80 })

      -- No UTF-8 sequence
      eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f })
      -- One UTF-8 character
      eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80 })
      -- No UTF-8 sequence
      eq({ '', 0xc2 }, test_seq { 0xc2, 0xc0 })
    end)

    itp('3-byte sequences', function()
      -- No second UTF-8 character
      eq(byte(0x7f), test_seq { 0x7f, 0x80, 0x80 })
      -- No combining character
      eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0x80 })

      -- Combining character is U+0300
      eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80 })
      -- invalid start byte for combining
      eq({ '\x7f', 0x7f }, test_seq { 0x7f, 0xcc, 0x80 })

      -- No UTF-8 sequence
      eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc })
      -- Incomplete combining character
      eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc })

      -- One UTF-8 character (composing only)
      eq({ ' \xe2\x83\x90', 0x20d0 }, test_seq { 0xe2, 0x83, 0x90 })
    end)

    itp('4-byte sequences', function()
      -- No following combining character
      eq(byte(0x7f), test_seq { 0x7f, 0x7f, 0xcc, 0x80 })
      eq(byte(0x29), test_seq { 0x29, 0x29, 0xcc, 0x80 })
      -- No second UTF-8 character
      eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80 })

      -- Combining character U+0300
      eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc })

      -- No UTF-8 sequence
      eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80 })
      -- No following UTF-8 character
      eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc })
      -- Combining character U+0301
      eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81 })
      -- U+0080 : not a valid start char
      eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81 })

      -- One UTF-8 character
      eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80 })
    end)

    itp('5+-byte sequences', function()
      -- No following combining character
      eq(byte(0x7f), test_seq { 0x7f, 0x7f, 0xcc, 0x80, 0x80 })
      -- No second UTF-8 character
      eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80, 0x80 })

      -- Combining character U+0300
      eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x00 })

      -- Combining characters U+0300 and U+0301
      eq({ '\x29\xcc\x80\xcc\x81', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81 })
      -- Combining characters U+0300, U+0301, U+0302
      eq(
        { '\x29\xcc\x80\xcc\x81\xcc\x82', 0x29 },
        test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 }
      )
      -- Combining characters U+0300, U+0301, U+0302, U+0303
      eq(
        { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83', 0x29 },
        test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 }
      )
      -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304
      eq(
        { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84', 0x29 },
        test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 }
      )
      -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305
      eq(
        { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85', 0x29 },
        test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 }
      )

      -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305, U+0306
      eq(
        { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86', 0x29 },
        test_seq {
          0x29,
          0xcc,
          0x80,
          0xcc,
          0x81,
          0xcc,
          0x82,
          0xcc,
          0x83,
          0xcc,
          0x84,
          0xcc,
          0x85,
          0xcc,
          0x86,
        }
      )

      -- Only three following combining characters U+0300, U+0301, U+0302
      eq(
        { '\x29\xcc\x80\xcc\x81\xcc\x82', 0x29 },
        test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 }
      )

      -- No UTF-8 sequence
      eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80, 0x80 })
      -- No following UTF-8 character
      eq({ '\xc2\xbc', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0xcc, 0x80 })
      -- Combining character U+0301
      eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81, 0x7f })
      -- Combining character U+0301
      eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81, 0xcc })

      -- One UTF-8 character
      eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80, 0x7f })

      -- One UTF-8 character
      eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80, 0x80 })
      -- One UTF-8 character
      eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80, 0xcc })

      -- Combining characters U+1AB0 and U+0301
      eq(
        { '\xf4\x80\x80\x80\xe1\xaa\xb0\xcc\x81', 0x100000 },
        test_seq { 0xf4, 0x80, 0x80, 0x80, 0xe1, 0xaa, 0xb0, 0xcc, 0x81 }
      )
    end)
  end)

  describe('utf_cp_bounds_len', function()
    -- Call utf_cp_bounds_len() at each of the first `len` byte positions of
    -- `cstr`, passing the number of bytes remaining from that position, and
    -- return { b = {...}, e = {...} } with the begin_off / end_off reported
    -- for every position.
    local function collect_bounds(cstr, len)
      local b_offsets, e_offsets = {}, {}
      for i = 1, len do
        local result = lib.utf_cp_bounds_len(cstr, cstr + i - 1, len - (i - 1))
        table.insert(b_offsets, result.begin_off)
        table.insert(e_offsets, result.end_off)
      end
      return { b = b_offsets, e = e_offsets }
    end

    -- Each case lists, per byte of `str`, the expected begin (b) and end (e)
    -- offsets.
    local tests = {
      {
        name = 'for valid string',
        str = 'iΓiiβ± iβ± β± πi',
        offsets = {
          b = { 0, 0, 1, 0, 0, 0, 1, 2, 0, 0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 0 },
          e = { 1, 2, 1, 1, 1, 3, 2, 1, 1, 3, 2, 1, 3, 2, 1, 4, 3, 2, 1, 1 },
        },
      },
      {
        name = 'for string with incomplete sequence',
        str = 'i\xC3iΓβ± iΓ\xE2\xB1β± \xF0\x90\x80',
        offsets = {
          b = { 0, 0, 0, 0, 1, 0, 1, 2, 0, 0, 1, 0, 0, 0, 1, 2, 0, 0, 0 },
          e = { 1, 1, 1, 2, 1, 3, 2, 1, 1, 2, 1, 1, 1, 3, 2, 1, 1, 1, 1 },
        },
      },
      {
        name = 'for string with trailing bytes after multibyte',
        str = 'iΓ\xA0β± \xA0β± π\xA0i',
        offsets = {
          b = { 0, 0, 1, 0, 0, 1, 2, 0, 0, 1, 2, 0, 1, 2, 3, 0, 0 },
          e = { 1, 2, 1, 1, 3, 2, 1, 1, 3, 2, 1, 4, 3, 2, 1, 1, 1 },
        },
      },
    }

    for _, test in ipairs(tests) do
      itp(test.name, function()
        eq(test.offsets, collect_bounds(to_cstr(test.str), #test.str))
      end)
    end

    itp('does not read before start', function()
      local str = 'π'
      local expected_offsets = { b = { 0, 0, 0 }, e = { 1, 1, 1 } }
      -- Start one byte into the string, so the first byte lies before the
      -- pointer handed to the function.
      local cstr = to_cstr(str) + 1
      eq(expected_offsets, collect_bounds(cstr, 3))
    end)

    itp('does not read past the end', function()
      local str = 'π'
      local expected_offsets = { b = { 0, 0, 0 }, e = { 1, 1, 1 } }
      -- Only the first three bytes are offered to the function.
      local cstr = to_cstr(str)
      eq(expected_offsets, collect_bounds(cstr, 3))
    end)
  end)

  itp('utf_head_off', function()
    -- Split `str` into pieces using utfc_ptr2len(), check that the multibyte
    -- pieces equal `expected_glyphs`, then check that utf_head_off() at every
    -- byte position returns the distance back to the start of its piece.
    local function check(str, expected_glyphs)
      local len = #str
      local cstr = to_cstr(str)
      local breaks = { 0 } -- SOT
      local pos = 0
      local mb_glyphs = {}
      while pos < len do
        local clen = lib.utfc_ptr2len(cstr + pos)
        if clen == 0 then
          eq(0, string.byte(str, pos + 1)) -- only NUL bytes can have length zero
          clen = 1 -- but skip it, otherwise we get stuck
        end
        if clen > 1 then
          table.insert(mb_glyphs, string.sub(str, pos + 1, pos + clen))
        end
        pos = pos + clen
        table.insert(breaks, pos)
      end
      eq(len, breaks[#breaks]) -- include EOT as break
      -- we could also send in breaks, but this is more human readable
      eq(expected_glyphs, mb_glyphs)

      for i = 1, #breaks - 1 do
        local start, stop = breaks[i], breaks[i + 1]

        for p = start, stop - 1 do
          eq(p - start, lib.utf_head_off(cstr, cstr + p))
        end
      end
      eq(0, lib.utf_head_off(cstr, cstr + len)) -- NUL byte is safe
    end
    -- stylua doesn't like ZWJ chars..
    -- stylua: ignore start
    check('hej och hΓ₯ π§βπΎ!', { 'Γ₯', 'π§βπΎ' })

    -- emoji (various kinds of combinations, use g8 to see them)
    check("π³οΈββ§οΈπ§βπΎβ€οΈππ΄ββ οΈ", {"π³οΈββ§οΈ", "π§βπΎ", "β€οΈ", "π", "π΄ββ οΈ"})
    check('π³οΈββ§οΈxyπ§βπΎ\rβ€οΈπΓ₯π΄ββ οΈΒ', { 'π³οΈββ§οΈ', 'π§βπΎ', 'β€οΈ', 'π', 'Γ₯', 'π΄ββ οΈ', 'Β' })
    check('π³οΈββ§οΈ\000π§βπΎ\000β€οΈ\000π\000Γ₯\000π΄ββ οΈ\000Β', { 'π³οΈββ§οΈ', 'π§βπΎ', 'β€οΈ', 'π', 'Γ₯', 'π΄ββ οΈ', 'Β' })
    check('\195π³οΈββ§οΈ\198π§βπΎ\165β€οΈ\168\195π\255π΄ββ οΈ\129Β\165', { 'π³οΈββ§οΈ', 'π§βπΎ', 'β€οΈ', 'π', 'π΄ββ οΈ', 'Β' })

    check('π¦π ±οΈ π¦π½ π¦π¨π¦ π²π½πΉπ±',{'π¦', 'π ±οΈ', 'π¦π½', 'π¦π¨', 'π¦', 'π²π½', 'πΉπ±'})
    check('π΄σ §σ ’σ ³σ £σ ΄σ Ώπ΄σ §σ ’σ ·σ ¬σ ³σ Ώ', {'π΄σ §σ ’σ ³σ £σ ΄σ Ώ', 'π΄σ §σ ’σ ·σ ¬σ ³σ Ώ'})

    check('Γ₯\165ΓΌ\195aΓ«q\168Ξ²\000\169ζ¬\255', {'Γ₯', 'ΓΌ', 'Γ«', 'Ξ²', 'ζ¬'})

    lib.p_arshape = true -- default
    check('Ψ³ΩΨ§Ω ', { 'Ψ³', 'ΩΨ§', 'Ω ' })
    lib.p_arshape = false
    check('Ψ³ΩΨ§Ω ', { 'Ψ³', 'Ω', 'Ψ§', 'Ω ' })

    check('LΜΜΜΜΜΜoΜΜΜΜΜΜrΜΜΜΜΜΜeΜΜΜ ΜΜΜmΜ ΜΜΜΜΜ', {'LΜΜΜΜΜΜ', 'oΜΜΜΜΜΜ', 'rΜΜΜΜΜΜ', 'eΜΜΜ ΜΜΜ', 'mΜ ΜΜΜΜΜ'})
    -- stylua: ignore end
  end)

  describe('utf_fold', function()
    itp('does not crash with surrogates #30527', function()
      eq(0xddfb, lib.utf_fold(0xddfb)) -- low surrogate, invalid as a character
      eq(0xd800, lib.utf_fold(0xd800)) -- high surrogate, invalid as a character
    end)

    itp("doesn't crash on invalid codepoints", function()
      eq(9000000, lib.utf_fold(9000000))
      eq(0, lib.utf_fold(0))
    end)
  end)
end)