neovim

Neovim text editor
git clone https://git.dasho.dev/neovim.git
Log | Files | Refs | README

mbyte_spec.lua (12936B)


      1 local t = require('test.unit.testutil')
      2 local itp = t.gen_itp(it)
      3 
      4 local ffi = t.ffi
      5 local eq = t.eq
      6 local to_cstr = t.to_cstr
      7 
      8 local lib = t.cimport(
      9  './src/nvim/mbyte.h',
     10  './src/nvim/charset.h',
     11  './src/nvim/grid.h',
     12  './src/nvim/option_vars.h'
     13 )
     14 
     15 describe('mbyte', function()
     16  -- Build a Lua string from a sequence of byte values (one character per entry).
     17  local function to_string(bytes)
     18    local s = {} -- one single-character string per byte, joined once at the end
     19    for i = 1, #bytes do
     20      s[i] = string.char(bytes[i])
     21    end
     22    return table.concat(s)
     23  end
     24 
     25  before_each(function() end) -- empty hook: no per-test setup is performed
     26 
     27  itp('utf_ptr2char', function()
     28    -- For a single byte followed by NUL, the value of that byte is expected back.
     29    for c = 0, 255 do
     30      eq(c, lib.utf_ptr2char(to_string({ c, 0 })))
     31    end
     32 
     33    -- Some ill-formed byte sequences that should not be recognized as UTF-8
     34    -- First byte: 0xc0 or 0xc1
     35    -- Second byte: 0x80 .. 0xbf
     36    --eq(0x00c0, lib.utf_ptr2char(to_string({0xc0, 0x80})))
     37    --eq(0x00c1, lib.utf_ptr2char(to_string({0xc1, 0xbf})))
     38    --
     39    -- Sequences with more than four bytes (no checks are implemented for these here)
     40  end)
     41 
     42  for n = 0, 0xF do -- one test case per 0x1000-wide block, covering 0x0 - 0xFFFF in total
     43    itp(('utf_char2bytes for chars 0x%x - 0x%x'):format(n * 0x1000, n * 0x1000 + 0xFFF), function()
     44      local char_p = ffi.typeof('char[?]')
     45      for c = n * 0x1000, n * 0x1000 + 0xFFF do
     46        local p = char_p(4, 0) -- fresh 4-element char buffer, initialised with 0
     47        lib.utf_char2bytes(c, p)
     48        eq(c, lib.utf_ptr2char(p)) -- encoding and then decoding must give back c
     49        eq(lib.vim_iswordc(c), lib.vim_iswordp(p)) -- char and pointer variants must agree
     50      end
     51    end)
     52  end
     53 
     54  describe('utfc_ptr2schar', function()
     55    local function test_seq(seq) -- returns { string read back from schar_get, value stored in firstc }
     56      local firstc = ffi.new('int[1]') -- one-element int array passed as second argument of utfc_ptr2schar
     57      local buf = ffi.new('char[32]') -- buffer handed to schar_get and read back below
     58      lib.schar_get(buf, lib.utfc_ptr2schar(to_string(seq), firstc))
     59      local str = ffi.string(buf) -- no length given, so buf is read up to its first NUL byte
     60      if 1 > 2 then -- for debugging (never true as written; change the condition to dump the bytes of str)
     61        local tbl = {}
     62        for i = 1, #str do
     63          table.insert(tbl, string.format('0x%02x', string.byte(str, i)))
     64        end
     65        print('{ ' .. table.concat(tbl, ', ') .. ' }')
     66        io.stdout:flush()
     67      end
     68      return { str, firstc[0] }
     69    end
     70 
     71    local function byte(val) -- expected test_seq result for a single byte: { its character, its value }
     72      return { string.char(val), val }
     73    end
     74 
     75    itp('1-byte sequences', function() -- each case: eq({ expected string, expected first char }, test_seq { input bytes })
     76      eq({ '', 0 }, test_seq { 0 }) -- NUL: empty string, first char 0
     77      for c = 1, 127 do -- bytes 1..127 are expected back unchanged
     78        eq(byte(c), test_seq { c })
     79      end
     80      for c = 128, 255 do -- a lone byte >= 0x80: empty string expected, first char is the byte value
     81        eq({ '', c }, test_seq { c })
     82      end
     83    end)
     84 
     85    itp('2-byte sequences', function() -- each case: eq({ expected string, expected first char }, test_seq { input bytes })
     86      -- No combining characters: only the first byte is expected back
     87      eq(byte(0x7f), test_seq { 0x7f, 0x7f })
     88      -- No combining characters: only the first byte is expected back
     89      eq(byte(0x7f), test_seq { 0x7f, 0x80 })
     90 
     91      -- No UTF-8 sequence (0x7f is outside the 0x80 .. 0xbf range of a second byte)
     92      eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f })
     93      -- One UTF-8 character (U+0080)
     94      eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80 })
     95      -- No UTF-8 sequence (0xc0 is outside the 0x80 .. 0xbf range of a second byte)
     96      eq({ '', 0xc2 }, test_seq { 0xc2, 0xc0 })
     97    end)
     98 
     99    itp('3-byte sequences', function() -- each case: eq({ expected string, expected first char }, test_seq { input bytes })
    100      -- No second UTF-8 character
    101      eq(byte(0x7f), test_seq { 0x7f, 0x80, 0x80 })
    102      -- No combining character
    103      eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0x80 })
    104 
    105      -- Combining character is U+0300
    106      eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80 })
    107      -- 0x7f is an invalid start byte for combining: the following U+0300 is expected to be dropped
    108      eq({ '\x7f', 0x7f }, test_seq { 0x7f, 0xcc, 0x80 })
    109 
    110      -- No UTF-8 sequence
    111      eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc })
    112      -- Incomplete combining character
    113      eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc })
    114 
    115      -- One UTF-8 character (composing only); the expected string has a space in front of it
    116      eq({ ' \xe2\x83\x90', 0x20d0 }, test_seq { 0xe2, 0x83, 0x90 })
    117    end)
    118 
    119    itp('4-byte sequences', function() -- each case: eq({ expected string, expected first char }, test_seq { input bytes })
    120      -- No following combining character
    121      eq(byte(0x7f), test_seq { 0x7f, 0x7f, 0xcc, 0x80 })
    122      eq(byte(0x29), test_seq { 0x29, 0x29, 0xcc, 0x80 })
    123      -- No second UTF-8 character
    124      eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80 })
    125 
    126      -- Combining character U+0300 (the trailing lone 0xcc is not part of the expected string)
    127      eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc })
    128 
    129      -- No UTF-8 sequence
    130      eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80 })
    131      -- No following UTF-8 character
    132      eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc })
    133      -- Combining character U+0301
    134      eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81 })
    135      -- U+0080 is not a valid start char: the following U+0301 is expected to be dropped
    136      eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81 })
    137 
    138      -- One UTF-8 character
    139      eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80 })
    140    end)
    141 
    142    itp('5+-byte sequences', function() -- each case: eq({ expected string, expected first char }, test_seq { input bytes })
    143      -- No following combining character
    144      eq(byte(0x7f), test_seq { 0x7f, 0x7f, 0xcc, 0x80, 0x80 })
    145      -- No second UTF-8 character
    146      eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80, 0x80 })
    147 
    148      -- Combining character U+0300 (the 0xcc followed by NUL is not part of the expected string)
    149      eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x00 })
    150 
    151      -- Combining characters U+0300 and U+0301
    152      eq({ '\x29\xcc\x80\xcc\x81', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81 })
    153      -- Combining characters U+0300, U+0301, U+0302
    154      eq(
    155        { '\x29\xcc\x80\xcc\x81\xcc\x82', 0x29 },
    156        test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 }
    157      )
    158      -- Combining characters U+0300, U+0301, U+0302, U+0303
    159      eq(
    160        { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83', 0x29 },
    161        test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 }
    162      )
    163      -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304
    164      eq(
    165        { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84', 0x29 },
    166        test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 }
    167      )
    168      -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305
    169      eq(
    170        { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85', 0x29 },
    171        test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 }
    172      )
    173 
    174      -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305, U+0306
    175      eq(
    176        { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86', 0x29 },
    177        test_seq {
    178          0x29,
    179          0xcc,
    180          0x80,
    181          0xcc,
    182          0x81,
    183          0xcc,
    184          0x82,
    185          0xcc,
    186          0x83,
    187          0xcc,
    188          0x84,
    189          0xcc,
    190          0x85,
    191          0xcc,
    192          0x86,
    193        }
    194      )
    195 
    196      -- Only three following combining characters U+0300, U+0301, U+0302 (nothing from 0xc2 0x80 onwards is expected)
    197      eq(
    198        { '\x29\xcc\x80\xcc\x81\xcc\x82', 0x29 },
    199        test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 }
    200      )
    201 
    202      -- No UTF-8 sequence
    203      eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80, 0x80 })
    204      -- No following UTF-8 character
    205      eq({ '\xc2\xbc', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0xcc, 0x80 })
    206      -- Combining character U+0301, followed by 0x7f
    207      eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81, 0x7f })
    208      -- Combining character U+0301, followed by a lone 0xcc
    209      eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81, 0xcc })
    210 
    211      -- One UTF-8 character, followed by 0x7f
    212      eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80, 0x7f })
    213 
    214      -- One UTF-8 character, followed by 0x80
    215      eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80, 0x80 })
    216      -- One UTF-8 character, followed by a lone 0xcc
    217      eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80, 0xcc })
    218 
    219      -- Combining characters U+1AB0 and U+0301
    220      eq(
    221        { '\xf4\x80\x80\x80\xe1\xaa\xb0\xcc\x81', 0x100000 },
    222        test_seq { 0xf4, 0x80, 0x80, 0x80, 0xe1, 0xaa, 0xb0, 0xcc, 0x81 }
    223      )
    224    end)
    225  end)
    226 
    227  describe('utf_cp_bounds_len', function()
    228    local tests = {
    229      {
    230        name = 'for valid string',
    231        str = 'iΓ€iiβ± iⱠⱠ𐀀i',
    232        offsets = {
    233          b = { 0, 0, 1, 0, 0, 0, 1, 2, 0, 0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 0 },
    234          e = { 1, 2, 1, 1, 1, 3, 2, 1, 1, 3, 2, 1, 3, 2, 1, 4, 3, 2, 1, 1 },
    235        },
    236      },
    237      {
    238        name = 'for string with incomplete sequence',
    239        str = 'i\xC3iΓ€β± iΓ€\xE2\xB1β± \xF0\x90\x80',
    240        offsets = {
    241          b = { 0, 0, 0, 0, 1, 0, 1, 2, 0, 0, 1, 0, 0, 0, 1, 2, 0, 0, 0 },
    242          e = { 1, 1, 1, 2, 1, 3, 2, 1, 1, 2, 1, 1, 1, 3, 2, 1, 1, 1, 1 },
    243        },
    244      },
    245      {
    246        name = 'for string with trailing bytes after multibyte',
    247        str = 'iΓ€\xA0β± \xA0Ⱡ𐀀\xA0i',
    248        offsets = {
    249          b = { 0, 0, 1, 0, 0, 1, 2, 0, 0, 1, 2, 0, 1, 2, 3, 0, 0 },
    250          e = { 1, 2, 1, 1, 3, 2, 1, 1, 3, 2, 1, 4, 3, 2, 1, 1, 1 },
    251        },
    252      },
    253    }
    254 
    255    for _, test in ipairs(tests) do -- one test case per entry of the tests table
    256      itp(test.name, function()
    257        local cstr = to_cstr(test.str)
    258        local b_offsets, e_offsets = {}, {}
    259        for i = 1, #test.str do -- query every byte position of the string in turn
    260          local result = lib.utf_cp_bounds_len(cstr, cstr + i - 1, #test.str - (i - 1)) -- last argument: bytes from the queried position to the end of the string
    261          table.insert(b_offsets, result.begin_off)
    262          table.insert(e_offsets, result.end_off)
    263        end
    264        eq(test.offsets, { b = b_offsets, e = e_offsets }) -- compare all collected offsets at once
    265      end)
    266    end
    267 
    268    itp('does not read before start', function() -- utf_cp_bounds_len must not look at bytes in front of its first argument
    269      local base = to_cstr('𐀀') -- keep the whole buffer in a local: a pointer derived from it does not keep cdata alive in LuaJIT
    270      local expected_offsets = { b = { 0, 0, 0 }, e = { 1, 1, 1 } }
    271      local cstr = base + 1 -- start one byte into the 4-byte sequence, so its first byte lies before cstr
    272      local b_offsets, e_offsets = {}, {}
    273      for i = 1, 3 do -- the three bytes after the first one
    274        local result = lib.utf_cp_bounds_len(cstr, cstr + i - 1, 3 - (i - 1))
    275        table.insert(b_offsets, result.begin_off)
    276        table.insert(e_offsets, result.end_off)
    277      end
    278      eq(expected_offsets, { b = b_offsets, e = e_offsets }) -- every byte is expected to be reported as its own unit
    279    end)
    280 
    281    itp('does not read past the end', function() -- the length argument hides the last byte of a 4-byte sequence
    282      local str = '𐀀'
    283      local expected_offsets = { b = { 0, 0, 0 }, e = { 1, 1, 1 } }
    284      local cstr = to_cstr(str)
    285      local b_offsets, e_offsets = {}, {}
    286      for i = 1, 3 do -- only the first three bytes are queried
    287        local result = lib.utf_cp_bounds_len(cstr, cstr + i - 1, 3 - (i - 1)) -- lengths 3, 2, 1: never reaches the fourth byte
    288        table.insert(b_offsets, result.begin_off)
    289        table.insert(e_offsets, result.end_off)
    290      end
    291      eq(expected_offsets, { b = b_offsets, e = e_offsets }) -- every byte is expected to be reported as its own unit
    292    end)
    293  end)
    294 
    295  itp('utf_head_off', function()
    296    local function check(str, expected_glyphs) -- splits str with utfc_ptr2len, compares the multibyte pieces with expected_glyphs, then checks utf_head_off at every byte
    297      local len = #str
    298      local cstr = to_cstr(str)
    299      local breaks = { 0 } -- SOT (start of text); collects the byte offset where each piece starts
    300      local pos = 0
    301      local mb_glyphs = {}
    302      while pos < len do
    303        local clen = lib.utfc_ptr2len(cstr + pos)
    304        if clen == 0 then
    305          eq(0, string.byte(str, pos + 1)) -- only NUL bytes can have length zero
    306          clen = 1 -- but skip it, otherwise we get stuck
    307        end
    308        if clen > 1 then -- only pieces longer than one byte are collected
    309          table.insert(mb_glyphs, string.sub(str, pos + 1, pos + clen))
    310        end
    311        pos = pos + clen
    312        table.insert(breaks, pos)
    313      end
    314      eq(breaks[#breaks], len) -- include EOT as break
    315      -- we could also send in breaks, but this is more human readable
    316      eq(mb_glyphs, expected_glyphs)
    317 
    318      for i = 1, #breaks - 1 do
    319        local start, next = breaks[i], breaks[i + 1]
    320 
    321        for p = start, next - 1 do -- every byte of the piece must report its distance back to the piece's first byte
    322          eq(p - start, lib.utf_head_off(cstr, cstr + p))
    323        end
    324      end
    325      eq(0, lib.utf_head_off(cstr, cstr + len)) -- NUL byte is safe
    326    end
    327    -- stylua doesn't like ZWJ chars..
    328    -- stylua: ignore start
    329    check('hej och hΓ₯ πŸ§‘β€πŸŒΎ!', { 'Γ₯', 'πŸ§‘β€πŸŒΎ' })
    330 
    331    -- emoji (various kinds of combinations, use g8 to see them)
    332    check("πŸ³οΈβ€βš§οΈπŸ§‘β€πŸŒΎβ€οΈπŸ˜‚πŸ΄β€β˜ οΈ", {"πŸ³οΈβ€βš§οΈ", "πŸ§‘β€πŸŒΎ", "❀️", "πŸ˜‚", "πŸ΄β€β˜ οΈ"})
    333    check('πŸ³οΈβ€βš§οΈxyπŸ§‘β€πŸŒΎ\rβ€οΈπŸ˜‚Γ₯πŸ΄β€β˜ οΈΒ€', { 'πŸ³οΈβ€βš§οΈ', 'πŸ§‘β€πŸŒΎ', '❀️', 'πŸ˜‚', 'Γ₯', 'πŸ΄β€β˜ οΈ', 'Β€' })
    334    check('πŸ³οΈβ€βš§οΈ\000πŸ§‘β€πŸŒΎ\000❀️\000πŸ˜‚\000Γ₯\000πŸ΄β€β˜ οΈ\000Β€', { 'πŸ³οΈβ€βš§οΈ', 'πŸ§‘β€πŸŒΎ', '❀️', 'πŸ˜‚', 'Γ₯', 'πŸ΄β€β˜ οΈ', 'Β€' })
    335    check('\195πŸ³οΈβ€βš§οΈ\198πŸ§‘β€πŸŒΎ\165❀️\168\195πŸ˜‚\255πŸ΄β€β˜ οΈ\129Β€\165', { 'πŸ³οΈβ€βš§οΈ', 'πŸ§‘β€πŸŒΎ', '❀️', 'πŸ˜‚', 'πŸ΄β€β˜ οΈ', 'Β€' })
    336 
    337    check('πŸ‡¦πŸ…±οΈ πŸ‡¦πŸ‡½ πŸ‡¦πŸ‡¨πŸ‡¦ πŸ‡²πŸ‡½πŸ‡ΉπŸ‡±',{'πŸ‡¦', 'πŸ…±οΈ', 'πŸ‡¦πŸ‡½', 'πŸ‡¦πŸ‡¨', 'πŸ‡¦', 'πŸ‡²πŸ‡½', 'πŸ‡ΉπŸ‡±'})
    338    check('🏴󠁧󠁒󠁳󠁣󠁴󠁿🏴󠁧󠁒󠁷󠁬󠁳󠁿', {'🏴󠁧󠁒󠁳󠁣󠁴󠁿', '🏴󠁧󠁒󠁷󠁬󠁳󠁿'})
    339 
    340    check('Γ₯\165ΓΌ\195aΓ«q\168Ξ²\000\169本\255', {'Γ₯', 'ΓΌ', 'Γ«', 'Ξ²', '本'})
    341 
    342    lib.p_arshape = true -- default
    343    check('Ψ³Ω„Ψ§Ω…', { 'Ψ³', 'Ω„Ψ§', 'Ω…' })
    344    lib.p_arshape = false
    345    check('Ψ³Ω„Ψ§Ω…', { 'Ψ³', 'Ω„', 'Ψ§', 'Ω…' })
    346 
    347    check('LΜ“Μ‰Μ‘Μ’ΜŒΜšoΜŒΜ’Μ—Μ„Μ›Μ€rΜΜˆΜ•ΜˆΜŽΜè̇̅̄̄̐mΜ…Μ–ΜŸΜ„ΜŸΜš', {'LΜ“Μ‰Μ‘Μ’ΜŒΜš', 'oΜŒΜ’Μ—Μ„Μ›Μ€', 'rΜΜˆΜ•ΜˆΜŽΜ', 'è̇̅̄̄̐', 'mΜ…Μ–ΜŸΜ„ΜŸΜš'})
    348    -- stylua: ignore end
    349  end)
    350 
    351  describe('utf_fold', function() -- every input below is expected to come back unchanged
    352    itp('does not crash with surrogates #30527', function()
    353      eq(0xddfb, lib.utf_fold(0xddfb)) -- low surrogate, invalid as a character
    354      eq(0xd800, lib.utf_fold(0xd800)) -- high surrogate, invalid as a character
    355    end)
    356 
    357    itp("doesn't crash on invalid codepoints", function()
    358      eq(9000000, lib.utf_fold(9000000)) -- larger than 0x10FFFF, the highest Unicode code point
    359      eq(0, lib.utf_fold(0)) -- NUL
    360    end)
    361  end)
    362 end)