neovim

Neovim text editor
git clone https://git.dasho.dev/neovim.git
Log | Files | Refs | README

formatc.lua (9044B)


      1 --[[ Copyright (c) 2009 Peter "Corsix" Cawley
      2 
      3 Permission is hereby granted, free of charge, to any person obtaining a copy of
      4 this software and associated documentation files (the "Software"), to deal in
      5 the Software without restriction, including without limitation the rights to
      6 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
      7 of the Software, and to permit persons to whom the Software is furnished to do
      8 so, subject to the following conditions:
      9 
     10 The above copyright notice and this permission notice shall be included in all
     11 copies or substantial portions of the Software.
     12 
     13 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     14 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     15 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     16 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     17 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     18 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
     19 SOFTWARE. --]]
     20 
     21 -- this C parser was taken from Corsix-TH, I'm sure this could be done much
     22 -- better (i.e.: I think everything I do could be substitutions made with LPeg
     23 -- during parsing), but I've just learned enough basic LPeg to make this
     24 -- work.
     25 -- see: http://lua-users.org/wiki/LpegRecipes
     26 
     27 local lpeg = require 'lpeg'
     28 
     29 local C, P, R, S, V = lpeg.C, lpeg.P, lpeg.R, lpeg.S, lpeg.V
     30 local Carg, Cc, Cp, Ct = lpeg.Carg, lpeg.Cc, lpeg.Cp, lpeg.Ct
     31 
     32 local tokens = P {
     33  'tokens',
     34  -- Comment of form /* ... */
     35  comment = Ct(P '/*' * C((V 'newline' + (1 - P '*/')) ^ 0) * P '*/' * Cc 'comment'),
     36 
     37  -- Single line comment
     38  line_comment = Ct(P '//' * C((1 - V 'newline') ^ 0) * Cc 'comment_line'),
     39 
     40  -- Single platform independent line break which increments line number
     41  newline = (P '\r\n' + P '\n\r' + S '\r\n') * (Cp() * Carg(1)) / function(pos, state)
     42    state.line = state.line + 1
     43    state.line_start = pos
     44  end,
     45 
     46  -- Line continuation
     47  line_extend = Ct(C(P [[\]] * V 'newline') * Cc 'line_extend'),
     48 
     49  -- Whitespace of any length (includes newlines)
     50  whitespace = Ct(C((S ' \t' + V 'newline') ^ 1) * Cc 'whitespace'),
     51 
     52  -- Special form of #include with filename followed in angled brackets (matches 3 tokens)
     53  include = Ct(C(P '#include') * Cc 'preprocessor') * Ct(C(S ' \t' ^ 1) * Cc 'whitespace') * Ct(
     54    C(P '<' * (1 - P '>') ^ 1 * P '>') * Cc 'string'
     55  ),
     56 
     57  -- Preprocessor instruction
     58  preprocessor = V 'include'
     59    + Ct(
     60      C(
     61        P '#'
     62          * P ' ' ^ 0
     63          * (P 'define' + P 'elif' + P 'else' + P 'endif' + P '#' + P 'error' + P 'ifdef' + P 'ifndef' + P 'if' + P 'import' + P 'include' + P 'line' + P 'pragma' + P 'undef' + P 'using' + P 'pragma')
     64          * #S ' \r\n\t'
     65      ) * Cc 'preprocessor'
     66    ),
     67 
     68  -- Identifier of form [a-zA-Z_][a-zA-Z0-9_]*
     69  identifier = Ct(C(R('az', 'AZ', '__') * R('09', 'az', 'AZ', '__') ^ 0) * Cc 'identifier'),
     70 
     71  -- Single character in a string
     72  sstring_char = R('\001&', '([', ']\255') + (P '\\' * S [[ntvbrfa\?'"0x]]),
     73  dstring_char = R('\001!', '#[', ']\255') + (P '\\' * S [[ntvbrfa\?'"0x]]),
     74 
     75  -- String literal
     76  string = Ct(
     77    C(
     78      P "'" * (V 'sstring_char' + P '"') ^ 0 * P "'"
     79        + P '"' * (V 'dstring_char' + P "'") ^ 0 * P '"'
     80    ) * Cc 'string'
     81  ),
     82 
     83  -- Operator
     84  operator = Ct(
     85    C(
     86      P '>>='
     87        + P '<<='
     88        + P '...'
     89        + P '::'
     90        + P '<<'
     91        + P '>>'
     92        + P '<='
     93        + P '>='
     94        + P '=='
     95        + P '!='
     96        + P '||'
     97        + P '&&'
     98        + P '++'
     99        + P '--'
    100        + P '->'
    101        + P '+='
    102        + P '-='
    103        + P '*='
    104        + P '/='
    105        + P '|='
    106        + P '&='
    107        + P '^='
    108        + S '+-*/=<>%^|&.?:!~,'
    109    ) * Cc 'operator'
    110  ),
    111 
    112  -- Misc. char (token type is the character itself)
    113  char = Ct(C(S '[]{}();') / function(x)
    114    return x, x
    115  end),
    116 
    117  -- Hex, octal or decimal number
    118  int = Ct(
    119    C((P '0x' * R('09', 'af', 'AF') ^ 1) + (P '0' * R '07' ^ 0) + R '09' ^ 1) * Cc 'integer'
    120  ),
    121 
    122  -- Floating point number
    123  f_exponent = S 'eE' + S '+-' ^ -1 * R '09' ^ 1,
    124  f_terminator = S 'fFlL',
    125  float = Ct(
    126    C(
    127      R '09' ^ 1 * V 'f_exponent' * V 'f_terminator' ^ -1
    128        + R '09' ^ 0 * P '.' * R '09' ^ 1 * V 'f_exponent' ^ -1 * V 'f_terminator' ^ -1
    129        + R '09' ^ 1 * P '.' * R '09' ^ 0 * V 'f_exponent' ^ -1 * V 'f_terminator' ^ -1
    130    ) * Cc 'float'
    131  ),
    132 
    133  -- Any token
    134  token = V 'comment'
    135    + V 'line_comment'
    136    + V 'identifier'
    137    + V 'whitespace'
    138    + V 'line_extend'
    139    + V 'preprocessor'
    140    + V 'string'
    141    + V 'char'
    142    + V 'operator'
    143    + V 'float'
    144    + V 'int',
    145 
    146  -- Error for when nothing else matches
    147  error = (Cp() * C(P(1) ^ -8) * Carg(1)) / function(pos, where, state)
    148    error(
    149      ("Tokenising error on line %i, position %i, near '%s'"):format(
    150        state.line,
    151        pos - state.line_start + 1,
    152        where
    153      )
    154    )
    155  end,
    156 
    157  -- Match end of input or throw error
    158  finish = -P(1) + V 'error',
    159 
    160  -- Match stream of tokens into a table
    161  tokens = Ct(V 'token' ^ 0) * V 'finish',
    162 }
    163 
    164 local function TokeniseC(str)
    165  return tokens:match(str, 1, { line = 1, line_start = 1 })
    166 end
    167 
    168 local function set(t)
    169  local s = {}
    170  for _, v in ipairs(t) do
    171    s[v] = true
    172  end
    173  return s
    174 end
    175 
    176 local C_keywords = set { -- luacheck: ignore
    177  'break',
    178  'case',
    179  'char',
    180  'const',
    181  'continue',
    182  'default',
    183  'do',
    184  'double',
    185  'else',
    186  'enum',
    187  'extern',
    188  'float',
    189  'for',
    190  'goto',
    191  'if',
    192  'int',
    193  'long',
    194  'register',
    195  'return',
    196  'short',
    197  'signed',
    198  'sizeof',
    199  'static',
    200  'struct',
    201  'switch',
    202  'typedef',
    203  'union',
    204  'unsigned',
    205  'void',
    206  'volatile',
    207  'while',
    208 }
    209 
    210 -- Very primitive C formatter that tries to put "things" inside braces on one
    211 -- line. This is a step done after preprocessing the C source to ensure that
    212 -- the duplicate line detector can more reliably pick out identical declarations.
    213 --
    214 -- an example:
    215 --   struct mystruct
    216 --   {
    217 --      int a;
    218 --      int b;
    219 --   };
    220 --
    221 -- would become:
    222 --  struct mystruct { int a; int b; };
    223 --
    224 --  The first one will have a lot of false positives (the line '{' for
    225 --  example), the second one is more unique.
    226 --- @param string
    227 --- @return string
    228 local function formatc(str)
    229  local toks = TokeniseC(str)
    230  local result = {}
    231  local block_level = 0
    232  local allow_one_nl = false
    233  local end_at_brace = false
    234 
    235  for _, token in ipairs(toks) do
    236    local typ = token[2]
    237    if typ == '{' then
    238      block_level = block_level + 1
    239    elseif typ == '}' then
    240      block_level = block_level - 1
    241 
    242      if block_level == 0 and end_at_brace then
    243        -- if we're not inside a block, we're at the basic statement level,
    244        -- and ';' indicates we're at the end of a statement, so we put end
    245        -- it with a newline.
    246        token[1] = token[1] .. '\n'
    247        end_at_brace = false
    248      end
    249    elseif typ == 'identifier' then
    250      -- static and/or inline usually indicate an inline header function,
    251      -- which has no trailing ';', so we have to add a newline after the
    252      -- '}' ourselves.
    253      local tok = token[1]
    254      if tok == 'static' or tok == 'inline' or tok == '__inline' then
    255        end_at_brace = true
    256      end
    257    elseif typ == 'preprocessor' then
    258      -- preprocessor directives don't end in ';' but need their newline, so
    259      -- we're going to allow the next newline to pass.
    260      allow_one_nl = true
    261    elseif typ == ';' then
    262      if block_level == 0 then
    263        -- if we're not inside a block, we're at the basic statement level,
    264        -- and ';' indicates we're at the end of a statement, so we put end
    265        -- it with a newline.
    266        token[1] = ';\n'
    267        end_at_brace = false
    268      end
    269    elseif typ == 'whitespace' then
    270      -- replace all whitespace by one space
    271      local repl = ' '
    272 
    273      -- except when allow_on_nl is true and there's a newline in the whitespace
    274      if string.find(token[1], '[\r\n]+') and allow_one_nl == true then
    275        -- in that case we replace all whitespace by one newline
    276        repl = '\n'
    277        allow_one_nl = false
    278      end
    279 
    280      token[1] = string.gsub(token[1], '%s+', repl)
    281    end
    282    result[#result + 1] = token[1]
    283  end
    284 
    285  return table.concat(result)
    286 end
    287 
    288 -- standalone operation (very handy for debugging)
    289 local function standalone(...) -- luacheck: ignore
    290  local Preprocess = require('preprocess')
    291  Preprocess.add_to_include_path('./../../src')
    292  Preprocess.add_to_include_path('./../../build/include')
    293  Preprocess.add_to_include_path('./../../.deps/usr/include')
    294 
    295  local raw = Preprocess.preprocess('', arg[1])
    296 
    297  local formatted
    298  if #arg == 2 and arg[2] == 'no' then
    299    formatted = raw
    300  else
    301    formatted = formatc(raw)
    302  end
    303 
    304  print(formatted)
    305 end
-- For standalone debugging, uncomment the `standalone(...)` call below and
-- comment out the `return formatc` line that follows it.
-- example usage:
--    ../../.deps/usr/bin/luajit formatc.lua ../../include/fileio.h.generated.h
--    ../../.deps/usr/bin/luajit formatc.lua /usr/include/malloc.h
-- standalone(...)
    311 return formatc