glob.lua (12681B)
--- @brief Glob-to-LPeg Converter (Peglob)
--- This module converts glob patterns to LPeg patterns according to the LSP 3.17 specification:
--- https://microsoft.github.io/language-server-protocol/specifications/lsp/3.17/specification/#pattern
---
--- Glob grammar overview:
--- - `*` to match zero or more characters in a path segment
--- - `?` to match on one character in a path segment
--- - `**` to match any number of path segments, including none
--- - `{}` to group conditions (e.g. `*.{ts,js}` matches TypeScript and JavaScript files)
--- - `[]` to declare a range of characters to match in a path segment
---   (e.g., `example.[0-9]` to match on `example.0`, `example.1`, …)
--- - `[!...]` to negate a range of characters to match in a path segment
---   (e.g., `example.[!0-9]` to match on `example.a`, `example.b`, but not `example.0`)
---
--- Additional constraints:
--- - A Glob pattern must match an entire path, with partial matches
---   considered failures.
--- - The pattern only determines success or failure, without specifying
---   which parts correspond to which characters.
--- - A *path segment* is the portion of a path between two adjacent path
---   separators (`/`), or between the start/end of the path and the nearest
---   separator.
--- - The `**` (*globstar*) pattern matches zero or more path segments,
---   including intervening separators (`/`). Within pattern strings, `**`
---   must be delimited by path separators (`/`) or pattern boundaries and
---   cannot be adjacent to any characters other than `/`. If `**` is not
---   the final element, it must be followed by `/`.
--- - `{}` (*braced conditions*) contains valid Glob patterns as branches,
---   separated by commas. Commas are exclusively used for separating
---   branches and cannot appear within a branch for any other purpose.
---   Nested `{}` structures are allowed, but `{}` must contain at least two
---   branches—zero or one branch is not permitted.
--- - In `[]` or `[!...]`, a *character range* consists of character
---   intervals (e.g., `a-z`) or individual characters (e.g., `w`). A range
---   including `/` won’t match that character.

--- @diagnostic disable: missing-fields

local m = vim.lpeg
-- Metatable shared by LPeg patterns; `mt.__mul` is passed around below as a
-- plain two-argument "concatenate these patterns" function for fold captures.
local mt = getmetatable(m.P(0))
local re = vim.re
local bit = require('bit')

local M = {}

-- Basic patterns for matching glob components
local letter = m.P(1) - m.S('*?[]{}/\\') -- Any character except special glob characters
local slash = m.P '/' * m.Cc(m.P '/') -- Path separator with capture
local notslash = m.P(1) - m.P '/' -- Any character except path separator
local notcomma = m.P(1) - m.S(',\\') -- Any character except comma and backslash

--- Handle EOF, considering whether we're in a segment or not
---
--- Matches only at the end of the glob string. The value produced is itself a
--- pattern, chosen from the `inseg` group capture set by the `Glob` rule:
--- a `/` lookahead when the flag is truthy, a real end-of-subject test otherwise.
--- @type vim.lpeg.Pattern
local eof = -1
  * m.Cb('inseg')
  / function(flag)
    if flag then
      return #m.P '/'
    else
      return m.P(-1)
    end
  end

---@alias pat_table { F: string?, [1]: string, [2]: vim.lpeg.Pattern }
---@alias seg_part { [string]: any, [integer]: pat_table }

--- Starts a segment accumulator.
---
--- Fields of the result: `s` is the leading pattern, `e` the trailing pattern
--- (initially `true`), `n` the number of "look for" entries stored at `[1..n]`.
---
--- @param p pat_table Initial segment pattern data
--- @return seg_part Segment structure with start pattern
local function start_seg(p)
  return { s = p[2], e = true, n = 0 }
end

--- Appends a pattern that has to be searched for (it followed a `*`).
---
--- @param t seg_part Segment structure
--- @param p pat_table Pattern to look for
--- @return table Updated segment structure
local function lookfor(t, p)
  t.n = t.n + 1
  t[t.n] = p
  return t
end

--- Marks the segment as ending with a `*`: the tail may be any run of
--- non-separator characters.
---
--- @param t seg_part Segment structure
--- @return table Segment structure with end pattern
local function to_seg_end(t)
  t.e = notslash ^ 0
  return t
end

--- Constructs a segment matching pattern from collected components
---
--- With no "look for" entries the result is simply `t.s * t.e`. Otherwise a
--- small grammar is built with one search rule per distinct word; the rule
--- either matches the word here or skips one non-separator character and
--- retries.
---
--- @param t seg_part Segment structure with patterns
--- @return vim.lpeg.Pattern Complete segment match pattern
local function end_seg(t)
  --- @type table<any,any>
  local seg_grammar = { 's' }
  if t.n > 0 then
    seg_grammar.s = t.s
    for i = 1, t.n do
      -- Rule names are the word text prefixed with `_`.
      local rname = '_' .. t[i][1]
      if not seg_grammar[rname] then
        -- Optimize search when deterministic first character is available
        if t[i].F then
          seg_grammar[rname] = t[i][2] + notslash * (notslash - m.P(t[i].F)) ^ 0 * m.V(rname)
        else
          seg_grammar[rname] = t[i][2] + notslash * m.V(rname)
        end
      end
      seg_grammar.s = seg_grammar.s * m.V(rname)
    end
    if t.e then
      seg_grammar.s = seg_grammar.s * t.e
    end
    return m.P(seg_grammar)
  else
    seg_grammar.s = t.s
    if t.e then
      seg_grammar.s = seg_grammar.s * t.e
    end
    return seg_grammar.s
  end
end

--- Builds the pattern for a globstar followed by `p`: either `p` matches
--- here, or one whole segment plus its `/` is skipped and the rule retries.
---
--- @param p vim.lpeg.Pattern Pattern directly after `**/`
--- @return vim.lpeg.Pattern LPeg pattern for `**/p`
local function dseg(p)
  return m.P { p + notslash ^ 0 * m.P '/' * m.V(1) }
end

-- Forward declaration: `expand` below calls `g:match`, and the grammar that
-- is assigned to `g` later refers back to `expand`.
--- @type (vim.lpeg.Pattern|table)
local g = nil

--- Multiplies conditions for braced expansion (Cartesian product)
---
--- For the string/table combinations the table argument is updated in place
--- and returned. When `a` is neither a string nor a table, `b` is returned
--- unchanged; when only `b` is, `a` is returned unchanged.
---
--- @param a string|string[] First part
--- @param b string|string[] Second part
--- @return string|string[] Cartesian product of values
local function mul_cond(a, b)
  if type(a) == 'string' then
    if type(b) == 'string' then
      return a .. b
    elseif type(b) == 'table' then
      for i = 1, #b do
        b[i] = a .. b[i]
      end
      return b
    else
      return a
    end
  elseif type(a) == 'table' then
    if type(b) == 'string' then
      for i = 1, #a do
        a[i] = a[i] .. b
      end
      return a
    elseif type(b) == 'table' then
      --- @type string[]
      local res = {}
      local idx = 0
      for i = 1, #a do
        for j = 1, #b do
          idx = idx + 1
          res[idx] = a[i] .. b[j]
        end
      end
      return res
    else
      return a
    end
  else
    return b
  end
end

--- Combines alternatives in braced patterns
---
--- Whenever one argument is a table it is extended in place and returned.
--- Falls through (returning nothing) when an argument is neither a string
--- nor a table.
---
--- @param a string|table First part
--- @param b string|table Second part
--- @return table #Combined alternatives
local function add_cond(a, b)
  if type(a) == 'string' then
    if type(b) == 'string' then
      return { a, b }
    elseif type(b) == 'table' then
      table.insert(b, 1, a)
      return b
    end
  elseif type(a) == 'table' then
    if type(b) == 'string' then
      table.insert(a, b)
      return a
    elseif type(b) == 'table' then
      for i = 1, #b do
        table.insert(a, b[i])
      end
      return a
    end
    --- @diagnostic disable-next-line: missing-return
  end
end

--- Expands patterns handling segment boundaries
--- `#` prefix is added for sub-grammar to detect in-segment flag
---
--- Each alternative in `a` is concatenated with the tail `b`, parsed with the
--- main grammar `g`, and written back into `a`; the results are then joined
--- with ordered choice.
---
---@param a (any[]|vim.lpeg.Pattern[]) Array of patterns
---@param b string Tail string
---@param inseg boolean Whether inside a path segment
---@return vim.lpeg.Pattern #Expanded pattern
local function expand(a, b, inseg)
  for i = 1, #a do
    if inseg then
      a[i] = '#' .. a[i]
    end
    a[i] = g:match(a[i] .. b)
  end
  local res = a[1]
  for i = 2, #a do
    res = res + a[i]
  end
  return res
end

--- Converts a UTF-8 character to its Unicode codepoint
---
--- Only the first encoded character of `utf8_str` is decoded; the loop stops
--- as soon as no continuation bytes remain. The input is not validated.
---
--- @param utf8_str string UTF-8 character
--- @return number #Codepoint value
local function to_codepoint(utf8_str)
  local codepoint = 0
  local byte_count = 0 -- continuation bytes still expected

  for i = 1, #utf8_str do
    local byte = utf8_str:byte(i)

    if byte_count ~= 0 then
      -- Continuation byte: shift in its low 6 bits.
      codepoint = bit.bor(bit.lshift(codepoint, 6), bit.band(byte, 0x3F))
      byte_count = byte_count - 1
    else
      -- Lead byte: its value range decides how many bytes follow.
      if byte < 0x80 then
        codepoint = byte
      elseif byte < 0xE0 then
        byte_count = 1
        codepoint = bit.band(byte, 0x1F)
      elseif byte < 0xF0 then
        byte_count = 2
        codepoint = bit.band(byte, 0x0F)
      else
        byte_count = 3
        codepoint = bit.band(byte, 0x07)
      end
    end

    if byte_count == 0 then
      break
    end
  end

  return codepoint
end

--- Pattern for matching UTF-8 characters
local cont = m.R('\128\191')
local any_utf8 = m.R('\0\127')
  + m.R('\194\223') * cont
  + m.R('\224\239') * cont * cont
  + m.R('\240\244') * cont * cont * cont

--- Creates a character class pattern for glob ranges
---
--- An empty range list yields a pattern for the literal text `[]` / `[!]`.
--- `/` is always removed from the resulting class.
--- NOTE(review): negation is built on `m.P(1)`, which looks like it consumes
--- one byte rather than one UTF-8 character — confirm for multi-byte input.
---
--- @param inv string Inversion flag ('!' or '')
--- @param ranges (string|string[])[] Character ranges
--- @return vim.lpeg.Pattern #Character class pattern
local function class(inv, ranges)
  local patt = m.P(false)
  if #ranges == 0 then
    if inv == '!' then
      return m.P '[!]'
    else
      return m.P '[]'
    end
  end
  for _, v in ipairs(ranges) do
    -- Two-element tables are intervals; plain strings are single characters.
    patt = patt + (type(v) == 'table' and m.utfR(to_codepoint(v[1]), to_codepoint(v[2])) or m.P(v))
  end
  if inv == '!' then
    patt = m.P(1) - patt --[[@as vim.lpeg.Pattern]]
  end
  return patt - m.P '/'
end

-- Parse constraints for optimizing braced conditions
-- Succeeds when the condition list text contains `/` or `**`.
local noopt_condlist = re.compile [[
  s <- '/' / '**' / . [^/*]* s
]]

-- Scans forward over characters that are neither `{` nor `/` (and do not
-- start `**`) and succeeds only if the scan stops right before a `/`.
local opt_tail = re.compile [[
  s <- (!'**' [^{/])* &'/'
]]

-- stylua: ignore start
--- @nodoc
--- @diagnostic disable
--- Main grammar for glob pattern matching
g = {
  'Glob',
  -- A leading `#` (added by `expand`) sets the `inseg` flag read by `eof`.
  Glob     = (m.P'#' * m.Cg(m.Cc(true), 'inseg') + m.Cg(m.Cc(false), 'inseg')) *
             m.Cf(m.V'Element'^-1 * (slash * m.V'Element')^0 * (slash^-1 * eof), mt.__mul),
  -- Elements handle segments, globstar patterns
  Element  = m.V'DSeg' + m.V'DSEnd' + m.Cf(m.V'Segment' * (slash * m.V'Segment')^0 * (slash * eof + eof^-1), mt.__mul),
  -- Globstar patterns
  DSeg     = m.P'**/' * ((m.V'Element' + eof) / dseg),
  DSEnd    = m.P'**' * -1 * m.Cc(m.P(1)^0),
  -- Segment handling with word and star patterns
  Segment  = (m.V'Word' / start_seg + m.Cc({ '', true }) / start_seg * (m.V'Star' * m.V'Word' % lookfor)) *
             (m.V'Star' * m.V'Word' % lookfor)^0 * (m.V'Star' * m.V'CheckBnd' % to_seg_end)^-1 / end_seg
             + m.V'Star' * m.V'CheckBnd' * m.Cc(notslash^0),
  CheckBnd = #m.P'/' + -1,  -- Boundary constraint

  -- Word patterns for fixed-length matching
  Word     = -m.P'*' * m.Ct( m.V('FIRST')^-1 * m.C(m.V'WordAux') ),
  WordAux  = m.V'Branch' + m.Cf(m.V'Simple'^1 * m.V'Branch'^-1, mt.__mul),
  Simple   = m.Cg( m.V'Token' * (m.V'Token' % mt.__mul)^0 * (m.V'Boundary' % mt.__mul)^-1),
  Boundary = #m.P'/' * m.Cc(#m.P'/') + eof,
  Token    = m.V'Ques' + m.V'Class' + m.V'Escape' + m.V'Literal',
  Star     = m.P'*',
  Ques     = m.P'?' * m.Cc(notslash),
  Escape   = m.P'\\' * m.C(1) / m.P,
  Literal  = m.C(letter^1) / m.P,

  -- Branch handling for braced conditions
  Branch   = m.Cmt(m.C(m.V'CondList'), function(s, i, p1, p2)
    -- Optimize brace expansion when possible
    -- p1: string form of condition list, p2: transformed lua table
    if noopt_condlist:match(p1) then
      -- Cannot optimize, match till the end
      return #s + 1, p2, s:sub(i)
    end
    -- Find point to cut for optimization
    local cut = opt_tail:match(s, i)
    if cut then
      -- Can optimize: match till cut point
      -- true flag tells expand to transform EOF matches to &'/' predicates
      return cut, p2, s:sub(i, cut - 1), true
    else
      -- Cannot optimize
      return #s + 1, p2, s:sub(i)
    end
  end) / expand,
  -- Brace expansion handling
  CondList = m.Cf(m.P'{' * m.V'Cond' * (m.P',' * m.V'Cond')^1 * m.P'}', add_cond),
  Cond     = m.Cf((m.C((notcomma + m.P'\\' * 1 - m.S'{}')^1) + m.V'CondList')^1, mul_cond) + m.C(true),

  -- Character class handling
  Class    = m.P'[' * m.C(m.P'!'^-1) * m.Ct(
    (m.Ct(m.C(any_utf8) * m.P'-' * m.C(any_utf8 - m.P']')) + m.C(any_utf8 - m.P']'))^0
  ) * m.P']' / class,

  -- Deterministic first character extraction for optimization
  FIRST    = m.Cg(m.P(function(s, i)
    if letter:match(s, i) then return true, s:sub(i, i)
    else return false end
  end), 'F')
}
-- stylua: ignore end
--- @diagnostic enable

--- @nodoc
g = m.P(g)

--- Parses a raw glob into an |lua-lpeg| pattern.
---
--- Raises an error of the form `Invalid glob: <pattern>` when the grammar
--- does not accept the input.
---
---@param pattern string The raw glob pattern
---@return vim.lpeg.Pattern #An |lua-lpeg| representation of the pattern
function M.to_lpeg(pattern)
  local lpeg_pattern = g:match(pattern) --[[@as vim.lpeg.Pattern?]]
  assert(lpeg_pattern, string.format('Invalid glob: %s', pattern))
  return lpeg_pattern
end

return M