#!/usr/bin/env python3
# A tool to automatically generate documentation for the config options of the
# clang static analyzer by reading `AnalyzerOptions.def`.
import argparse
from collections import namedtuple
from enum import Enum, auto
import re
import sys
import textwrap
# The following code implements a trivial parser for the narrow subset of C++
# which is used in AnalyzerOptions.def. This supports the following features:
# - ignores preprocessor directives, even if they are continued with \ at EOL
# - ignores comments: both /* ... */ and // ...
# - parses string literals (even if they contain \" escapes)
# - concatenates adjacent string literals
# - parses numbers even if they contain ' as a thousands separator
# - recognizes MACRO(arg1, arg2, ..., argN) calls
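# For example, the (hypothetical, not taken from AnalyzerOptions.def) input
#   SOME_MACRO(Kind, /* ignored */ "part one " "part two", 1'000)
# is tokenized into the identifier SOME_MACRO, punctuation, the identifier
# Kind, two string tokens (which join_strings() merges later) and the number
# 1'000; the comment and the whitespace produce no tokens.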
class TT(Enum):
"Token type enum."
number = auto()
ident = auto()
string = auto()
punct = auto()
TOKENS = [
(re.compile(r"-?[0-9']+"), TT.number),
(re.compile(r"\w+"), TT.ident),
(re.compile(r'"([^\\"]|\\.)*"'), TT.string),
(re.compile(r"[(),]"), TT.punct),
(re.compile(r"/\*((?!\*/).)*\*/", re.S), None), # C-style comment
(re.compile(r"//.*\n"), None), # C++ style oneline comment
(re.compile(r"#.*(\\\n.*)*(?<!\\)\n"), None), # preprocessor directive
(re.compile(r"\s+"), None), # whitespace
]
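# Note that tokenize() below tries these patterns in the order listed above;
# e.g. a digit sequence is claimed by the number pattern before the more
# general identifier pattern (\w+) gets a chance to match it.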
Token = namedtuple("Token", "kind code")
class ErrorHandler:
    def __init__(self):
        self.seen_errors = False
        # This script uses some heuristical tweaks to modify the documentation
        # of some analyzer options. As this code is fragile, we record the use
        # of these tweaks and report them if they become obsolete:
        self.unused_tweaks = [
            "escape star",
            "escape underline",
            "accepted values",
            "example file content",
        ]
    def record_use_of_tweak(self, tweak_name):
        try:
            self.unused_tweaks.remove(tweak_name)
        except ValueError:
            pass
    def replace_as_tweak(self, string, pattern, repl, tweak_name):
        res = string.replace(pattern, repl)
        if res != string:
            self.record_use_of_tweak(tweak_name)
        return res
    def report_error(self, msg):
        print("Error:", msg, file=sys.stderr)
        self.seen_errors = True
    def report_unexpected_char(self, s, pos):
        lines = (s[:pos] + "X").split("\n")
        lineno, col = (len(lines), len(lines[-1]))
        self.report_error(
            "unexpected character %r in AnalyzerOptions.def at line %d column %d"
            % (s[pos], lineno, col),
        )
    def report_unused_tweaks(self):
        if not self.unused_tweaks:
            return
        _is = " is" if len(self.unused_tweaks) == 1 else "s are"
        names = ", ".join(self.unused_tweaks)
        self.report_error(f"textual tweak{_is} unused in script: {names}")
err_handler = ErrorHandler()
def tokenize(s):
    result = []
    pos = 0
    while pos < len(s):
        for regex, kind in TOKENS:
            if m := regex.match(s, pos):
                if kind is not None:
                    result.append(Token(kind, m.group(0)))
                pos = m.end()
                break
        else:
            err_handler.report_unexpected_char(s, pos)
            pos += 1
    return result
def join_strings(tokens):
    result = []
    for tok in tokens:
        if tok.kind == TT.string and result and result[-1].kind == TT.string:
            # If this token is a string, and the previous non-ignored token is
            # also a string, then merge them into a single token. We need to
            # discard the closing " of the previous string and the opening " of
            # this string.
            prev = result.pop()
            result.append(Token(TT.string, prev.code[:-1] + tok.code[1:]))
        else:
            result.append(tok)
    return result
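# For example, the two string tokens produced by the adjacent literals
# "foo " "bar" are merged into the single token Token(TT.string, '"foo bar"').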
MacroCall = namedtuple("MacroCall", "name args")
class State(Enum):
"States of the state machine used for parsing the macro calls."
init = auto()
after_ident = auto()
before_arg = auto()
after_arg = auto()
def get_calls(tokens, macro_names):
    state = State.init
    result = []
    current = None
    for tok in tokens:
        if state == State.init and tok.kind == TT.ident and tok.code in macro_names:
            current = MacroCall(tok.code, [])
            state = State.after_ident
        elif state == State.after_ident and tok == Token(TT.punct, "("):
            state = State.before_arg
        elif state == State.before_arg:
            if current is not None:
                current.args.append(tok)
            state = State.after_arg
        elif state == State.after_arg and tok.kind == TT.punct:
            if tok.code == ")":
                result.append(current)
                current = None
                state = State.init
            elif tok.code == ",":
                state = State.before_arg
            else:
                current = None
                state = State.init
    return result
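# As an illustration, for the token stream of the (hypothetical) input
#   FOO(1, "x") BAR(2)
# with macro_names={"FOO"}, get_calls() returns a single
# MacroCall("FOO", [Token(TT.number, "1"), Token(TT.string, '"x"')]), while
# the unknown macro BAR and its arguments are skipped.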
# The information will be extracted from calls to these two macros:
# #define ANALYZER_OPTION(TYPE, NAME, CMDFLAG, DESC, DEFAULT_VAL)
# #define ANALYZER_OPTION_DEPENDS_ON_USER_MODE(TYPE, NAME, CMDFLAG, DESC,
# SHALLOW_VAL, DEEP_VAL)
MACRO_NAMES_PARAMCOUNTS = {
"ANALYZER_OPTION": 5,
"ANALYZER_OPTION_DEPENDS_ON_USER_MODE": 6,
}
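# Only the CMDFLAG, DESC and default value argument(s) of these macro calls
# are turned into documentation; the TYPE and NAME arguments are discarded by
# macro_call_to_rst_paragraphs() below.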
def string_value(tok):
    if tok.kind != TT.string:
        raise ValueError(f"expected a string token, got {tok.kind.name}")
    text = tok.code[1:-1] # Remove quotes
    text = re.sub(r"\\(.)", r"\1", text) # Resolve backslash escapes
    return text
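# E.g. string_value(Token(TT.string, '"say \\"hi\\""')) returns 'say "hi"'.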
def cmdflag_to_rst_title(cmdflag_tok):
    text = string_value(cmdflag_tok)
    underline = "-" * len(text)
    ref = f".. _analyzer-option-{text}:"
    return f"{ref}\n\n{text}\n{underline}\n\n"
def desc_to_rst_paragraphs(tok):
    desc = string_value(tok)
    # Escape some characters that have special meaning in RST:
    desc = err_handler.replace_as_tweak(desc, "*", r"\*", "escape star")
    desc = err_handler.replace_as_tweak(desc, "_", r"\_", "escape underline")
    # Many descriptions end with "Value: <list of accepted values>", which is
    # OK for a terse command line printout, but should be prettified for web
    # documentation.
    # Moreover, the option ctu-invocation-list shows some example file content
    # which is formatted as a preformatted block.
    paragraphs = [desc]
    extra = ""
    if m := re.search(r"(^|\s)Value:", desc):
        err_handler.record_use_of_tweak("accepted values")
        paragraphs = [desc[: m.start()], "Accepted values:" + desc[m.end() :]]
    elif m := re.search(r"\s*Example file.content:", desc):
        err_handler.record_use_of_tweak("example file content")
        paragraphs = [desc[: m.start()]]
        extra = "Example file content::\n\n " + desc[m.end() :] + "\n\n"
    wrapped = [textwrap.fill(p, width=80) for p in paragraphs if p.strip()]
    return "\n\n".join(wrapped + [""]) + extra
def default_to_rst(tok):
    if tok.kind == TT.string:
        if tok.code == '""':
            return "(empty string)"
        return tok.code
    if tok.kind == TT.ident:
        return tok.code
    if tok.kind == TT.number:
        return tok.code.replace("'", "")
    raise ValueError(f"unexpected token as default value: {tok.kind.name}")
def defaults_to_rst_paragraph(defaults):
    strs = [default_to_rst(d) for d in defaults]
    if len(strs) == 1:
        return f"Default value: {strs[0]}\n\n"
    if len(strs) == 2:
        return (
            f"Default value: {strs[0]} (in shallow mode) / {strs[1]} (in deep mode)\n\n"
        )
    raise ValueError("unexpected count of default values: %d" % len(defaults))
def macro_call_to_rst_paragraphs(macro_call):
    try:
        arg_count = len(macro_call.args)
        param_count = MACRO_NAMES_PARAMCOUNTS[macro_call.name]
        if arg_count != param_count:
            raise ValueError(
                f"expected {param_count} arguments for {macro_call.name}, found {arg_count}"
            )
        _, _, cmdflag, desc, *defaults = macro_call.args
        return (
            cmdflag_to_rst_title(cmdflag)
            + desc_to_rst_paragraphs(desc)
            + defaults_to_rst_paragraph(defaults)
        )
    except ValueError as ve:
        err_handler.report_error(ve.args[0])
        return ""
def get_option_list(input_file):
    with open(input_file, encoding="utf-8") as f:
        contents = f.read()
    tokens = join_strings(tokenize(contents))
    macro_calls = get_calls(tokens, MACRO_NAMES_PARAMCOUNTS)
    result = ""
    for mc in macro_calls:
        result += macro_call_to_rst_paragraphs(mc)
    return result
p = argparse.ArgumentParser()
p.add_argument("--options-def", help="path to AnalyzerOptions.def")
p.add_argument("--template", help="template file")
p.add_argument("--out", help="output file")
opts = p.parse_args()
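# A typical invocation looks roughly like the following (the script name and
# the paths are placeholders; the real values are supplied by the build
# system):
#   python3 <this script> --options-def .../AnalyzerOptions.def \
#       --template .../Options.rst.in --out .../Options.rst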
with open(opts.template, encoding="utf-8") as f:
    doc_template = f.read()
PLACEHOLDER = ".. OPTIONS_LIST_PLACEHOLDER\n"
rst_output = doc_template.replace(PLACEHOLDER, get_option_list(opts.options_def))
err_handler.report_unused_tweaks()
with open(opts.out, "w", newline="", encoding="utf-8") as f:
    f.write(rst_output)
if err_handler.seen_errors:
    sys.exit(1)