formatc.lua (9044B)
--[[ Copyright (c) 2009 Peter "Corsix" Cawley

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE. --]]

-- this C parser was taken from Corsix-TH, I'm sure this could be done much
-- better (i.e.: I think everything I do could be substitutions made with LPeg
-- during parsing), but I've just learned enough basic LPeg to make this
-- work.
-- see: http://lua-users.org/wiki/LpegRecipes

local lpeg = require 'lpeg'

local C, P, R, S, V = lpeg.C, lpeg.P, lpeg.R, lpeg.S, lpeg.V
local Carg, Cc, Cp, Ct = lpeg.Carg, lpeg.Cc, lpeg.Cp, lpeg.Ct

-- LPeg grammar that splits C source into a list of tokens. Every token is
-- captured as a two-element table: { matched_text, token_type }.
-- The first extra argument passed to `match` must be a state table with the
-- numeric fields `line` and `line_start`; the `newline` rule updates both so
-- that the `error` rule can report where tokenising stopped.
local tokens = P {
  'tokens',
  -- Comment of form /* ... */
  comment = Ct(P '/*' * C((V 'newline' + (1 - P '*/')) ^ 0) * P '*/' * Cc 'comment'),

  -- Single line comment (the terminating newline is not part of the token)
  line_comment = Ct(P '//' * C((1 - V 'newline') ^ 0) * Cc 'comment_line'),

  -- Single platform independent line break which increments line number.
  -- The function returns nothing, so this rule adds no capture values.
  newline = (P '\r\n' + P '\n\r' + S '\r\n') * (Cp() * Carg(1)) / function(pos, state)
    state.line = state.line + 1
    state.line_start = pos
  end,

  -- Line continuation
  line_extend = Ct(C(P [[\]] * V 'newline') * Cc 'line_extend'),

  -- Whitespace of any length (includes newlines)
  whitespace = Ct(C((S ' \t' + V 'newline') ^ 1) * Cc 'whitespace'),

  -- Special form of #include with filename followed in angled brackets (matches 3 tokens)
  include = Ct(C(P '#include') * Cc 'preprocessor')
    * Ct(C(S ' \t' ^ 1) * Cc 'whitespace')
    * Ct(C(P '<' * (1 - P '>') ^ 1 * P '>') * Cc 'string'),

  -- Preprocessor instruction. Longer names are listed before their prefixes
  -- ('ifdef' / 'ifndef' before 'if') because LPeg choice is ordered.
  preprocessor = V 'include'
    + Ct(
      C(
        P '#'
          * P ' ' ^ 0
          * (
            P 'define'
            + P 'elif'
            + P 'else'
            + P 'endif'
            + P '#'
            + P 'error'
            + P 'ifdef'
            + P 'ifndef'
            + P 'if'
            + P 'import'
            + P 'include'
            + P 'line'
            + P 'pragma'
            + P 'undef'
            + P 'using'
          )
          -- the directive name must be followed by whitespace or by the end
          -- of the input (e.g. a final '#endif' without a trailing newline)
          * (#S ' \r\n\t' + -P(1))
      ) * Cc 'preprocessor'
    ),

  -- Identifier of form [a-zA-Z_][a-zA-Z0-9_]*
  identifier = Ct(C(R('az', 'AZ', '__') * R('09', 'az', 'AZ', '__') ^ 0) * Cc 'identifier'),

  -- Single character in a string: anything except the closing quote and the
  -- backslash, or a backslash followed by one of the listed escape characters
  sstring_char = R('\001&', '([', ']\255') + (P '\\' * S [[ntvbrfa\?'"0x]]),
  dstring_char = R('\001!', '#[', ']\255') + (P '\\' * S [[ntvbrfa\?'"0x]]),

  -- String literal
  string = Ct(
    C(
      P "'" * (V 'sstring_char' + P '"') ^ 0 * P "'"
        + P '"' * (V 'dstring_char' + P "'") ^ 0 * P '"'
    ) * Cc 'string'
  ),

  -- Operator (multi-character operators first, ordered choice again)
  operator = Ct(
    C(
      P '>>='
        + P '<<='
        + P '...'
        + P '::'
        + P '<<'
        + P '>>'
        + P '<='
        + P '>='
        + P '=='
        + P '!='
        + P '||'
        + P '&&'
        + P '++'
        + P '--'
        + P '->'
        + P '+='
        + P '-='
        + P '*='
        + P '/='
        + P '|='
        + P '&='
        + P '^='
        + S '+-*/=<>%^|&.?:!~,'
    ) * Cc 'operator'
  ),

  -- Misc. char (token type is the character itself)
  char = Ct(C(S '[]{}();') / function(x)
    return x, x
  end),

  -- Hex, octal or decimal number
  int = Ct(
    C((P '0x' * R('09', 'af', 'AF') ^ 1) + (P '0' * R '07' ^ 0) + R '09' ^ 1) * Cc 'integer'
  ),

  -- Floating point number.
  -- The exponent is a sequence: 'e' or 'E', an optional sign, then digits.
  -- It has to be written with '*' throughout; using '+' after the S 'eE' part
  -- would turn it into a choice between a lone 'e' and "sign digits", which
  -- makes input such as `1+2` lex as a single float token.
  f_exponent = S 'eE' * S '+-' ^ -1 * R '09' ^ 1,
  f_terminator = S 'fFlL',
  float = Ct(
    C(
      R '09' ^ 1 * V 'f_exponent' * V 'f_terminator' ^ -1
        + R '09' ^ 0 * P '.' * R '09' ^ 1 * V 'f_exponent' ^ -1 * V 'f_terminator' ^ -1
        + R '09' ^ 1 * P '.' * R '09' ^ 0 * V 'f_exponent' ^ -1 * V 'f_terminator' ^ -1
    ) * Cc 'float'
  ),

  -- Any token
  token = V 'comment'
    + V 'line_comment'
    + V 'identifier'
    + V 'whitespace'
    + V 'line_extend'
    + V 'preprocessor'
    + V 'string'
    + V 'char'
    + V 'operator'
    + V 'float'
    + V 'int',

  -- Error for when nothing else matches; reports up to 8 characters of context
  error = (Cp() * C(P(1) ^ -8) * Carg(1)) / function(pos, where, state)
    error(
      ("Tokenising error on line %i, position %i, near '%s'"):format(
        state.line,
        pos - state.line_start + 1,
        where
      )
    )
  end,

  -- Match end of input or throw error
  finish = -P(1) + V 'error',

  -- Match stream of tokens into a table
  tokens = Ct(V 'token' ^ 0) * V 'finish',
}

--- Tokenise C source text with the `tokens` grammar.
--- Raises an error (via the grammar's `error` rule) when some part of the
--- input cannot be matched by any token rule.
--- @param str string C source text
--- @return table list of { text, type } token tables
local function TokeniseC(str)
  return tokens:match(str, 1, { line = 1, line_start = 1 })
end

--- Turn a list into a set: every list value becomes a key mapped to `true`.
--- @param t table list of values
--- @return table set keyed by the list values
local function set(t)
  local s = {}
  for _, v in ipairs(t) do
    s[v] = true
  end
  return s
end

-- Set of C keywords; not referenced by the code in this file.
local C_keywords = set { -- luacheck: ignore
  'break',
  'case',
  'char',
  'const',
  'continue',
  'default',
  'do',
  'double',
  'else',
  'enum',
  'extern',
  'float',
  'for',
  'goto',
  'if',
  'int',
  'long',
  'register',
  'return',
  'short',
  'signed',
  'sizeof',
  'static',
  'struct',
  'switch',
  'typedef',
  'union',
  'unsigned',
  'void',
  'volatile',
  'while',
}

-- Very primitive C formatter that tries to put "things" inside braces on one
-- line. This is a step done after preprocessing the C source to ensure that
-- the duplicate line detector can more reliably pick out identical declarations.
--
-- an example:
--  struct mystruct
--  {
--    int a;
--    int b;
--  };
--
-- would become:
--  struct mystruct { int a; int b; };
--
-- The first one will have a lot of false positives (the line '{' for
-- example), the second one is more unique.
--
-- Raises an error when the input cannot be tokenised.
--- @param str string C source text
--- @return string the reformatted source
local function formatc(str)
  local toks = TokeniseC(str)
  local result = {}
  local block_level = 0
  local allow_one_nl = false
  local end_at_brace = false

  for _, token in ipairs(toks) do
    local typ = token[2]
    if typ == '{' then
      block_level = block_level + 1
    elseif typ == '}' then
      block_level = block_level - 1

      if block_level == 0 and end_at_brace then
        -- we just closed the outermost block of a definition that will not
        -- be followed by ';' (see the 'identifier' branch below), so we end
        -- it with a newline ourselves.
        token[1] = token[1] .. '\n'
        end_at_brace = false
      end
    elseif typ == 'identifier' then
      -- static and/or inline usually indicate an inline header function,
      -- which has no trailing ';', so we have to add a newline after the
      -- '}' ourselves.
      local tok = token[1]
      if tok == 'static' or tok == 'inline' or tok == '__inline' then
        end_at_brace = true
      end
    elseif typ == 'preprocessor' or typ == 'comment_line' then
      -- preprocessor directives don't end in ';' but need their newline, so
      -- we're going to allow the next newline to pass. The same goes for
      -- '//' comments: without their newline the code that follows would
      -- become part of the comment.
      allow_one_nl = true
    elseif typ == ';' then
      if block_level == 0 then
        -- if we're not inside a block, we're at the basic statement level,
        -- and ';' indicates we're at the end of a statement, so we end
        -- it with a newline.
        token[1] = ';\n'
        end_at_brace = false
      end
    elseif typ == 'whitespace' then
      -- replace all whitespace by one space
      local repl = ' '

      -- except when allow_one_nl is true and there's a newline in the whitespace
      if allow_one_nl and string.find(token[1], '[\r\n]') then
        -- in that case we replace all whitespace by one newline
        repl = '\n'
        allow_one_nl = false
      end

      -- a whitespace token consists of nothing but spaces, tabs and line
      -- breaks, so the whole token collapses into the replacement
      token[1] = repl
    end
    result[#result + 1] = token[1]
  end

  return table.concat(result)
end

-- standalone operation (very handy for debugging)
-- Reads the file named by the first command line argument (global `arg`) via
-- the `preprocess` module and prints it formatted, or prints the raw
-- preprocessor output when the second argument is 'no'.
local function standalone(...) -- luacheck: ignore
  local Preprocess = require('preprocess')
  Preprocess.add_to_include_path('./../../src')
  Preprocess.add_to_include_path('./../../build/include')
  Preprocess.add_to_include_path('./../../.deps/usr/include')

  local raw = Preprocess.preprocess('', arg[1])

  local formatted
  if #arg == 2 and arg[2] == 'no' then
    formatted = raw
  else
    formatted = formatc(raw)
  end

  print(formatted)
end
-- uncomment this line (and comment the `return`) for standalone debugging
-- example usage:
--    ../../.deps/usr/bin/luajit formatc.lua ../../include/fileio.h.generated.h
--    ../../.deps/usr/bin/luajit formatc.lua /usr/include/malloc.h
-- standalone(...)
return formatc