neovim

Neovim text editor
git clone https://git.dasho.dev/neovim.git
Log | Files | Refs | README

glob.lua (12681B)


      1 --- @brief Glob-to-LPeg Converter (Peglob)
      2 --- This module converts glob patterns to LPeg patterns according to the LSP 3.17 specification:
      3 --- https://microsoft.github.io/language-server-protocol/specifications/lsp/3.17/specification/#pattern
      4 ---
      5 --- Glob grammar overview:
      6 --- - `*` to match zero or more characters in a path segment
      7 --- - `?` to match on one character in a path segment
      8 --- - `**` to match any number of path segments, including none
      9 --- - `{}` to group conditions (e.g. `*.{ts,js}` matches TypeScript and JavaScript files)
     10 --- - `[]` to declare a range of characters to match in a path segment
     11 ---   (e.g., `example.[0-9]` to match on `example.0`, `example.1`, …)
     12 --- - `[!...]` to negate a range of characters to match in a path segment
     13 ---   (e.g., `example.[!0-9]` to match on `example.a`, `example.b`, but not `example.0`)
     14 ---
     15 --- Additional constraints:
     16 --- - A Glob pattern must match an entire path, with partial matches
     17 ---   considered failures.
     18 --- - The pattern only determines success or failure, without specifying
     19 ---   which parts correspond to which characters.
     20 --- - A *path segment* is the portion of a path between two adjacent path
     21 ---   separators (`/`), or between the start/end of the path and the nearest
     22 ---   separator.
     23 --- - The `**` (*globstar*) pattern matches zero or more path segments,
     24 ---   including intervening separators (`/`). Within pattern strings, `**`
     25 ---   must be delimited by path separators (`/`) or pattern boundaries and
     26 ---   cannot be adjacent to any characters other than `/`. If `**` is not
     27 ---   the final element, it must be followed by `/`.
     28 --- - `{}` (*braced conditions*) contains valid Glob patterns as branches,
     29 ---   separated by commas. Commas are exclusively used for separating
     30 ---   branches and cannot appear within a branch for any other purpose.
     31 ---   Nested `{}` structures are allowed, but `{}` must contain at least two
     32 ---   branches—zero or one branch is not permitted.
     33 --- - In `[]` or `[!...]`, a *character range* consists of character
     34 ---   intervals (e.g., `a-z`) or individual characters (e.g., `w`). A range
     35 ---   including `/` won’t match that character.
     36 
     37 --- @diagnostic disable: missing-fields
     38 
     39 local m = vim.lpeg
     40 local mt = getmetatable(m.P(0))
     41 local re = vim.re
     42 local bit = require('bit')
     43 
     44 local M = {}
     45 
     46 -- Basic patterns for matching glob components
     47 local letter = m.P(1) - m.S('*?[]{}/\\') -- Any character except special glob characters
     48 local slash = m.P '/' * m.Cc(m.P '/') -- Path separator with capture
     49 local notslash = m.P(1) - m.P '/' -- Any character except path separator
     50 local notcomma = m.P(1) - m.S(',\\') -- Any character except comma and backslash
     51 
     52 --- Handle EOF, considering whether we're in a segment or not
     53 --- @type vim.lpeg.Pattern
     54 local eof = -1
     55  * m.Cb('inseg')
     56  / function(flag)
     57    if flag then
     58      return #m.P '/'
     59    else
     60      return m.P(-1)
     61    end
     62  end
     63 
     64 ---@alias pat_table { F: string?, [1]: string, [2]: vim.lpeg.Pattern }
     65 ---@alias seg_part { [string]: any, [integer]: pat_table }
     66 
     67 --- @param p pat_table Initial segment pattern data
     68 --- @return seg_part Segment structure with start pattern
     69 local function start_seg(p)
     70  return { s = p[2], e = true, n = 0 }
     71 end
     72 
     73 --- @param t seg_part Segment structure
     74 --- @param p pat_table Pattern to look for
     75 --- @return table Updated segment structure
     76 local function lookfor(t, p)
     77  t.n = t.n + 1
     78  t[t.n] = p
     79  return t
     80 end
     81 
     82 --- @param t seg_part Segment structure
     83 --- @return table Segment structure with end pattern
     84 local function to_seg_end(t)
     85  t.e = notslash ^ 0
     86  return t
     87 end
     88 
     89 --- Constructs a segment matching pattern from collected components
     90 ---
     91 --- @param t seg_part Segment structure with patterns
     92 --- @return vim.lpeg.Pattern Complete segment match pattern
     93 local function end_seg(t)
     94  --- @type table<any,any>
     95  local seg_grammar = { 's' }
     96  if t.n > 0 then
     97    seg_grammar.s = t.s
     98    for i = 1, t.n do
     99      local rname = '_' .. t[i][1]
    100      if not seg_grammar[rname] then
    101        -- Optimize search when deterministic first character is available
    102        if t[i].F then
    103          seg_grammar[rname] = t[i][2] + notslash * (notslash - m.P(t[i].F)) ^ 0 * m.V(rname)
    104        else
    105          seg_grammar[rname] = t[i][2] + notslash * m.V(rname)
    106        end
    107      end
    108      seg_grammar.s = seg_grammar.s * m.V(rname)
    109    end
    110    if t.e then
    111      seg_grammar.s = seg_grammar.s * t.e
    112    end
    113    return m.P(seg_grammar)
    114  else
    115    seg_grammar.s = t.s
    116    if t.e then
    117      seg_grammar.s = seg_grammar.s * t.e
    118    end
    119    return seg_grammar.s
    120  end
    121 end
    122 
    123 --- @param p vim.lpeg.Pattern Pattern directly after `**/`
    124 --- @return vim.lpeg.Pattern LPeg pattern for `**/p`
    125 local function dseg(p)
    126  return m.P { p + notslash ^ 0 * m.P '/' * m.V(1) }
    127 end
    128 
    129 --- @type (vim.lpeg.Pattern|table)
    130 local g = nil
    131 
    132 --- Multiplies conditions for braced expansion (Cartesian product)
    133 ---
    134 --- @param a string|string[] First part
    135 --- @param b string|string[] Second part
    136 --- @return string|string[] Cartesian product of values
    137 local function mul_cond(a, b)
    138  if type(a) == 'string' then
    139    if type(b) == 'string' then
    140      return a .. b
    141    elseif type(b) == 'table' then
    142      for i = 1, #b do
    143        b[i] = a .. b[i]
    144      end
    145      return b
    146    else
    147      return a
    148    end
    149  elseif type(a) == 'table' then
    150    if type(b) == 'string' then
    151      for i = 1, #a do
    152        a[i] = a[i] .. b
    153      end
    154      return a
    155    elseif type(b) == 'table' then
    156      --- @type string[]
    157      local res = {}
    158      local idx = 0
    159      for i = 1, #a do
    160        for j = 1, #b do
    161          idx = idx + 1
    162          res[idx] = a[i] .. b[j]
    163        end
    164      end
    165      return res
    166    else
    167      return a
    168    end
    169  else
    170    return b
    171  end
    172 end
    173 
    174 --- Combines alternatives in braced patterns
    175 ---
    176 --- @param a string|table First part
    177 --- @param b string|table Second part
    178 --- @return table #Combined alternatives
    179 local function add_cond(a, b)
    180  if type(a) == 'string' then
    181    if type(b) == 'string' then
    182      return { a, b }
    183    elseif type(b) == 'table' then
    184      table.insert(b, 1, a)
    185      return b
    186    end
    187  elseif type(a) == 'table' then
    188    if type(b) == 'string' then
    189      table.insert(a, b)
    190      return a
    191    elseif type(b) == 'table' then
    192      for i = 1, #b do
    193        table.insert(a, b[i])
    194      end
    195      return a
    196    end
    197    --- @diagnostic disable-next-line: missing-return
    198  end
    199 end
    200 
    201 --- Expands patterns handling segment boundaries
    202 --- `#` prefix is added for sub-grammar to detect in-segment flag
    203 ---
    204 ---@param a (any[]|vim.lpeg.Pattern[]) Array of patterns
    205 ---@param b string Tail string
    206 ---@param inseg boolean Whether inside a path segment
    207 ---@return vim.lpeg.Pattern #Expanded pattern
    208 local function expand(a, b, inseg)
    209  for i = 1, #a do
    210    if inseg then
    211      a[i] = '#' .. a[i]
    212    end
    213    a[i] = g:match(a[i] .. b)
    214  end
    215  local res = a[1]
    216  for i = 2, #a do
    217    res = res + a[i]
    218  end
    219  return res
    220 end
    221 
    222 --- Converts a UTF-8 character to its Unicode codepoint
    223 ---
    224 --- @param utf8_str string UTF-8 character
    225 --- @return number #Codepoint value
    226 local function to_codepoint(utf8_str)
    227  local codepoint = 0
    228  local byte_count = 0
    229 
    230  for i = 1, #utf8_str do
    231    local byte = utf8_str:byte(i)
    232 
    233    if byte_count ~= 0 then
    234      codepoint = bit.bor(bit.lshift(codepoint, 6), bit.band(byte, 0x3F))
    235      byte_count = byte_count - 1
    236    else
    237      if byte < 0x80 then
    238        codepoint = byte
    239      elseif byte < 0xE0 then
    240        byte_count = 1
    241        codepoint = bit.band(byte, 0x1F)
    242      elseif byte < 0xF0 then
    243        byte_count = 2
    244        codepoint = bit.band(byte, 0x0F)
    245      else
    246        byte_count = 3
    247        codepoint = bit.band(byte, 0x07)
    248      end
    249    end
    250 
    251    if byte_count == 0 then
    252      break
    253    end
    254  end
    255 
    256  return codepoint
    257 end
    258 
    259 --- Pattern for matching UTF-8 characters
    260 local cont = m.R('\128\191')
    261 local any_utf8 = m.R('\0\127')
    262  + m.R('\194\223') * cont
    263  + m.R('\224\239') * cont * cont
    264  + m.R('\240\244') * cont * cont * cont
    265 
    266 --- Creates a character class pattern for glob ranges
    267 --- @param inv string Inversion flag ('!' or '')
    268 --- @param ranges (string|string[])[] Character ranges
    269 --- @return vim.lpeg.Pattern #Character class pattern
    270 local function class(inv, ranges)
    271  local patt = m.P(false)
    272  if #ranges == 0 then
    273    if inv == '!' then
    274      return m.P '[!]'
    275    else
    276      return m.P '[]'
    277    end
    278  end
    279  for _, v in ipairs(ranges) do
    280    patt = patt + (type(v) == 'table' and m.utfR(to_codepoint(v[1]), to_codepoint(v[2])) or m.P(v))
    281  end
    282  if inv == '!' then
    283    patt = m.P(1) - patt --[[@as vim.lpeg.Pattern]]
    284  end
    285  return patt - m.P '/'
    286 end
    287 
    288 -- Parse constraints for optimizing braced conditions
    289 local noopt_condlist = re.compile [[
    290  s <- '/' / '**' / . [^/*]* s
    291 ]]
    292 
    293 local opt_tail = re.compile [[
    294  s <- (!'**' [^{/])* &'/'
    295 ]]
    296 
    297 -- stylua: ignore start
    298 --- @nodoc
    299 --- @diagnostic disable
    300 --- Main grammar for glob pattern matching
    301 g = {
    302  'Glob',
    303  Glob     = (m.P'#' * m.Cg(m.Cc(true), 'inseg') + m.Cg(m.Cc(false), 'inseg')) *
    304             m.Cf(m.V'Element'^-1 * (slash * m.V'Element')^0 * (slash^-1 * eof), mt.__mul),
    305  -- Elements handle segments, globstar patterns
    306  Element  = m.V'DSeg' + m.V'DSEnd' + m.Cf(m.V'Segment' * (slash * m.V'Segment')^0 * (slash * eof + eof^-1), mt.__mul),
    307  -- Globstar patterns
    308  DSeg     = m.P'**/' * ((m.V'Element' + eof) / dseg),
    309  DSEnd    = m.P'**' * -1 * m.Cc(m.P(1)^0),
    310  -- Segment handling with word and star patterns
    311  Segment  = (m.V'Word' / start_seg + m.Cc({ '', true }) / start_seg * (m.V'Star' * m.V'Word' % lookfor)) *
    312              (m.V'Star' * m.V'Word' % lookfor)^0 * (m.V'Star' * m.V'CheckBnd' % to_seg_end)^-1 / end_seg
    313             + m.V'Star' * m.V'CheckBnd' * m.Cc(notslash^0),
    314  CheckBnd = #m.P'/' + -1,  -- Boundary constraint
    315 
    316  -- Word patterns for fixed-length matching
    317  Word     = -m.P'*' * m.Ct( m.V('FIRST')^-1 * m.C(m.V'WordAux') ),
    318  WordAux  = m.V'Branch' + m.Cf(m.V'Simple'^1 * m.V'Branch'^-1, mt.__mul),
    319  Simple   = m.Cg( m.V'Token' * (m.V'Token' % mt.__mul)^0 * (m.V'Boundary' % mt.__mul)^-1),
    320  Boundary = #m.P'/' * m.Cc(#m.P'/') + eof,
    321  Token    = m.V'Ques' + m.V'Class' + m.V'Escape' + m.V'Literal',
    322  Star     = m.P'*',
    323  Ques     = m.P'?' * m.Cc(notslash),
    324  Escape   = m.P'\\' * m.C(1) / m.P,
    325  Literal  = m.C(letter^1) / m.P,
    326 
    327  -- Branch handling for braced conditions
    328  Branch   = m.Cmt(m.C(m.V'CondList'), function(s, i, p1, p2)
    329                                         -- Optimize brace expansion when possible
    330                                         -- p1: string form of condition list, p2: transformed lua table
    331                                         if noopt_condlist:match(p1) then
    332                                           -- Cannot optimize, match till the end
    333                                           return #s + 1, p2, s:sub(i)
    334                                         end
    335                                         -- Find point to cut for optimization
    336                                         local cut = opt_tail:match(s, i)
    337                                         if cut then
    338                                           -- Can optimize: match till cut point
    339                                           -- true flag tells expand to transform EOF matches to &'/' predicates
    340                                           return cut, p2, s:sub(i, cut - 1), true
    341                                         else
    342                                           -- Cannot optimize
    343                                           return #s + 1, p2, s:sub(i)
    344                                         end
    345                                       end) / expand,
    346  -- Brace expansion handling
    347  CondList = m.Cf(m.P'{' * m.V'Cond' * (m.P',' * m.V'Cond')^1 * m.P'}', add_cond),
    348  Cond     = m.Cf((m.C((notcomma + m.P'\\' * 1 - m.S'{}')^1) + m.V'CondList')^1, mul_cond) + m.C(true),
    349 
    350  -- Character class handling
    351  Class    = m.P'[' * m.C(m.P'!'^-1) * m.Ct(
    352              (m.Ct(m.C(any_utf8) * m.P'-' * m.C(any_utf8 - m.P']')) + m.C(any_utf8 - m.P']'))^0
    353            ) * m.P']' / class,
    354 
    355  -- Deterministic first character extraction for optimization
    356  FIRST    = m.Cg(m.P(function(s, i)
    357                        if letter:match(s, i) then return true, s:sub(i, i)
    358                        else return false end
    359                      end), 'F')
    360 }
    361 -- stylua: ignore end
    362 --- @diagnostic enable
    363 
    364 --- @nodoc
    365 g = m.P(g)
    366 
    367 --- Parses a raw glob into an |lua-lpeg| pattern.
    368 ---
    369 ---@param pattern string The raw glob pattern
    370 ---@return vim.lpeg.Pattern #An |lua-lpeg| representation of the pattern
    371 function M.to_lpeg(pattern)
    372  local lpeg_pattern = g:match(pattern) --[[@as vim.lpeg.Pattern?]]
    373  assert(lpeg_pattern, string.format('Invalid glob: %s', pattern))
    374  return lpeg_pattern
    375 end
    376 
    377 return M