#!/usr/bin/env python3
# A tool to automatically generate documentation for the config options of the
# clang static analyzer by reading `AnalyzerOptions.def`.
import argparse
from collections import namedtuple
from enum import Enum, auto
import re
import sys
import textwrap
# The following code implements a trivial parser for the narrow subset of C++
# which is used in AnalyzerOptions.def. This supports the following features:
# - ignores preprocessor directives, even if they are continued with \ at EOL
# - ignores comments: both /* ... */ and // ...
# - parses string literals (even if they contain \" escapes)
# - concatenates adjacent string literals
# - parses numbers even if they contain ' as a thousands separator
# - recognizes MACRO(arg1, arg2, ..., argN) calls
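# For example, the (hypothetical, not taken from AnalyzerOptions.def) input
#   SOME_MACRO(Kind, /* ignored */ "part one " "part two", 1'000)
# is tokenized into the identifier SOME_MACRO, punctuation, the identifier
# Kind, two string tokens (which join_strings() merges later) and the number
# 1'000; the comment and the whitespace produce no tokens.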
class TT(Enum):
"Token type enum."
number = auto()
ident = auto()
string = auto()
punct = auto()
TOKENS = [
(re.compile(r"-?[0-9']+"), TT.number),
(re.compile(r"\w+"), TT.ident),
(re.compile(r'"([^\\"]|\\.)*"'), TT.string),
(re.compile(r"[(),]"), TT.punct),
(re.compile(r"/\*((?!\*/).)*\*/", re.S), None), # C-style comment
(re.compile(r"//.*\n"), None), # C++ style oneline comment
(re.compile(r"#.*(\\\n.*)*(?<!\\)\n"), None), # preprocessor directive
(re.compile(r"\s+"), None), # whitespace
]
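# Note that tokenize() below tries these patterns in the order listed above;
# e.g. a digit sequence is claimed by the number pattern before the more
# general identifier pattern (\w+) gets a chance to match it.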
Token = namedtuple("Token", "kind code")
class ErrorHandler:
    def __init__(self):
        self.seen_errors = False
        # This script uses some heuristical tweaks to modify the documentation
        # of some analyzer options. As this code is fragile, we record the use
        # of these tweaks and report them if they become obsolete:
        self.unused_tweaks = [
            "escape star",
            "escape underline",
            "accepted values",
            "example file content",
        ]
    def record_use_of_tweak(self, tweak_name):
        try:
            self.unused_tweaks.remove(tweak_name)
        except ValueError:
            pass
    def replace_as_tweak(self, string, pattern, repl, tweak_name):
        res = string.replace(pattern, repl)
        if res != string:
            self.record_use_of_tweak(tweak_name)
        return res
    def report_error(self, msg):
        print("Error:", msg, file=sys.stderr)
        self.seen_errors = True
    def report_unexpected_char(self, s, pos):
        lines = (s[:pos] + "X").split("\n")
        lineno, col = (len(lines), len(lines[-1]))
        self.report_error(
            "unexpected character %r in AnalyzerOptions.def at line %d column %d"
            % (s[pos], lineno, col),
        )
    def report_unused_tweaks(self):
        if not self.unused_tweaks:
            return
        _is = " is" if len(self.unused_tweaks) == 1 else "s are"
        names = ", ".join(self.unused_tweaks)
        self.report_error(f"textual tweak{_is} unused in script: {names}")
err_handler = ErrorHandler()
def tokenize(s):
    result = []
    pos = 0
    while pos < len(s):
        for regex, kind in TOKENS:
            if m := regex.match(s, pos):
                if kind is not None:
                    result.append(Token(kind, m.group(0)))
                pos = m.end()
                break
        else:
            err_handler.report_unexpected_char(s, pos)
            pos += 1
    return result
def join_strings(tokens):
    result = []
    for tok in tokens:
        if tok.kind == TT.string and result and result[-1].kind == TT.string:
            # If this token is a string, and the previous non-ignored token is
            # also a string, then merge them into a single token. We need to
            # discard the closing " of the previous string and the opening " of
            # this string.
            prev = result.pop()
            result.append(Token(TT.string, prev.code[:-1] + tok.code[1:]))
        else:
            result.append(tok)
    return result
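# For example, the two string tokens produced by the adjacent literals
# "foo " "bar" are merged into the single token Token(TT.string, '"foo bar"').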
MacroCall = namedtuple("MacroCall", "name args")
class State(Enum):
"States of the state machine used for parsing the macro calls."
init = auto()
after_ident = auto()
before_arg = auto()
after_arg = auto()
def get_calls(tokens, macro_names):
    state = State.init
    result = []
    current = None
    for tok in tokens:
        if state == State.init and tok.kind == TT.ident and tok.code in macro_names:
            current = MacroCall(tok.code, [])
            state = State.after_ident
        elif state == State.after_ident and tok == Token(TT.punct, "("):
            state = State.before_arg
        elif state == State.before_arg:
            if current is not None:
                current.args.append(tok)
            state = State.after_arg
        elif state == State.after_arg and tok.kind == TT.punct:
            if tok.code == ")":
                result.append(current)
                current = None
                state = State.init
            elif tok.code == ",":
                state = State.before_arg
            else:
                current = None
                state = State.init
    return result
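# As an illustration, for the token stream of the (hypothetical) input
#   FOO(1, "x") BAR(2)
# with macro_names={"FOO"}, get_calls() returns a single
# MacroCall("FOO", [Token(TT.number, "1"), Token(TT.string, '"x"')]), while
# the unknown macro BAR and its arguments are skipped.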
# The information will be extracted from calls to these two macros:
# #define ANALYZER_OPTION(TYPE, NAME, CMDFLAG, DESC, DEFAULT_VAL)
# #define ANALYZER_OPTION_DEPENDS_ON_USER_MODE(TYPE, NAME, CMDFLAG, DESC,
# SHALLOW_VAL, DEEP_VAL)
MACRO_NAMES_PARAMCOUNTS = {
"ANALYZER_OPTION": 5,
"ANALYZER_OPTION_DEPENDS_ON_USER_MODE": 6,
}
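# Only the CMDFLAG, DESC and default value argument(s) of these macro calls
# are turned into documentation; the TYPE and NAME arguments are discarded by
# macro_call_to_rst_paragraphs() below.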
def string_value(tok):
    if tok.kind != TT.string:
        raise ValueError(f"expected a string token, got {tok.kind.name}")
    text = tok.code[1:-1] # Remove quotes
    text = re.sub(r"\\(.)", r"\1", text) # Resolve backslash escapes
    return text
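# E.g. string_value(Token(TT.string, '"say \\"hi\\""')) returns 'say "hi"'.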
def cmdflag_to_rst_title(cmdflag_tok):
    text = string_value(cmdflag_tok)
    underline = "-" * len(text)
    ref = f".. _analyzer-option-{text}:"
    return f"{ref}\n\n{text}\n{underline}\n\n"
def desc_to_rst_paragraphs(tok):
    desc = string_value(tok)
    # Escape some characters that have special meaning in RST:
    desc = err_handler.replace_as_tweak(desc, "*", r"\*", "escape star")
    desc = err_handler.replace_as_tweak(desc, "_", r"\_", "escape underline")
    # Many descriptions end with "Value: <list of accepted values>", which is
    # OK for a terse command line printout, but should be prettified for web
    # documentation.
    # Moreover, the option ctu-invocation-list shows some example file content
    # which is formatted as a preformatted block.
    paragraphs = [desc]
    extra = ""
    if m := re.search(r"(^|\s)Value:", desc):
        err_handler.record_use_of_tweak("accepted values")
        paragraphs = [desc[: m.start()], "Accepted values:" + desc[m.end() :]]
    elif m := re.search(r"\s*Example file.content:", desc):
        err_handler.record_use_of_tweak("example file content")
        paragraphs = [desc[: m.start()]]
        extra = "Example file content::\n\n " + desc[m.end() :] + "\n\n"
    wrapped = [textwrap.fill(p, width=80) for p in paragraphs if p.strip()]
    return "\n\n".join(wrapped + [""]) + extra
def default_to_rst(tok):
    if tok.kind == TT.string:
        if tok.code == '""':
            return "(empty string)"
        return tok.code
    if tok.kind == TT.ident:
        return tok.code
    if tok.kind == TT.number:
        return tok.code.replace("'", "")
    raise ValueError(f"unexpected token as default value: {tok.kind.name}")
def defaults_to_rst_paragraph(defaults):
    strs = [default_to_rst(d) for d in defaults]
    if len(strs) == 1:
        return f"Default value: {strs[0]}\n\n"
    if len(strs) == 2:
        return (
            f"Default value: {strs[0]} (in shallow mode) / {strs[1]} (in deep mode)\n\n"
        )
    raise ValueError("unexpected count of default values: %d" % len(defaults))
def macro_call_to_rst_paragraphs(macro_call):
    try:
        arg_count = len(macro_call.args)
        param_count = MACRO_NAMES_PARAMCOUNTS[macro_call.name]
        if arg_count != param_count:
            raise ValueError(
                f"expected {param_count} arguments for {macro_call.name}, found {arg_count}"
            )
        _, _, cmdflag, desc, *defaults = macro_call.args
        return (
            cmdflag_to_rst_title(cmdflag)
            + desc_to_rst_paragraphs(desc)
            + defaults_to_rst_paragraph(defaults)
        )
    except ValueError as ve:
        err_handler.report_error(ve.args[0])
        return ""
def get_option_list(input_file):
    with open(input_file, encoding="utf-8") as f:
        contents = f.read()
    tokens = join_strings(tokenize(contents))
    macro_calls = get_calls(tokens, MACRO_NAMES_PARAMCOUNTS)
    result = ""
    for mc in macro_calls:
        result += macro_call_to_rst_paragraphs(mc)
    return result
p = argparse.ArgumentParser()
p.add_argument("--options-def", help="path to AnalyzerOptions.def")
p.add_argument("--template", help="template file")
p.add_argument("--out", help="output file")
opts = p.parse_args()
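# A typical invocation looks roughly like the following (the script name and
# the paths are placeholders; the real values are supplied by the build
# system):
#   python3 <this script> --options-def .../AnalyzerOptions.def \
#       --template .../Options.rst.in --out .../Options.rst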
with open(opts.template, encoding="utf-8") as f:
    doc_template = f.read()
PLACEHOLDER = ".. OPTIONS_LIST_PLACEHOLDER\n"
rst_output = doc_template.replace(PLACEHOLDER, get_option_list(opts.options_def))
err_handler.report_unused_tweaks()
with open(opts.out, "w", newline="", encoding="utf-8") as f:
    f.write(rst_output)
if err_handler.seen_errors:
    sys.exit(1)