# blob: dc054ab76a098e2f3e20e4eb2f83dd6bf082921f
# ===----------------------------------------------------------------------===##
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===----------------------------------------------------------------------===##
"""A linter that detects potential typos in FileCheck directive names.
Consider a broken test foo.cpp:
// RUN: clang -cc1 -ast-dump %s | FileCheck %s --check-prefix=NEW
// RUN: clang -cc1 -ast-dump %s -std=c++98 | FileCheck %s --check-prefix=OLD
auto x = 42;
// NEWW: auto is a c++11 extension
// ODL-NOT: auto is a c++11 extension
We first detect the locally valid FileCheck directive prefixes by parsing the
--check-prefix flags. Here we get {CHECK, NEW, OLD}, so our directive names are
{CHECK, NEW, OLD, CHECK-NOT, NEW-NOT, ...}.
Then we look for lines that look like directives. These are of the form 'FOO:',
usually at the beginning of a line or a comment. If any of these are a
"near-miss" for a directive name, then we suspect this is a typo and report it.
Usage: filecheck_lint path/to/test/file/1 ... path/to/test/file/n
"""
import itertools
import logging
import pathlib
import re
import sys
from typing import Generator, Sequence, Tuple
# Inclusive maximum edit distance at which a candidate string is still
# considered a near-miss (and hence reported as a typo) for a directive name.
_distance_threshold = 3
# FileCheck's built-in check prefix. Custom prefixes parsed from
# -check-prefix(es) flags are added to this set per file.
_prefixes = {"CHECK"}
# Suffixes that may be appended to any check prefix to form a directive,
# e.g. 'CHECK' + '-NOT' -> 'CHECK-NOT'.
_suffixes = {"-DAG", "-COUNT", "-EMPTY", "-LABEL", "-NEXT", "-NOT", "-SAME"}
# 'NOTE' and 'TODO' are not directives, but are likely to be false positives
# if encountered and to generate noise as a result. We filter them out also to
# avoid this.
# Directive names that belong to the lit test runner rather than FileCheck.
_lit_directives = {
    "RUN",
    "REQUIRES",
    "UNSUPPORTED",
    "XFAIL",
    "DEFINE",
    "REDEFINE",
}
# 'COM' and 'RUN' are default comment prefixes for FileCheck.
_comment_prefixes = {"COM", "RUN"}
# Names that may legitimately appear directive-like; they are never reported
# as typos nor suggested as fixes.
_ignore = _lit_directives.union(_comment_prefixes).union({"NOTE", "TODO"})
def levenshtein(s1: str, s2: str) -> int:
    """Returns the edit distance between two strings.

    Insertions, deletions, and substitutions each count as one operation.
    """
    if not s1:
        return len(s2)
    if not s2:
        return len(s1)
    # Classic dynamic programming over the edit-distance matrix, keeping only
    # the previous row in memory.
    previous_row = list(range(len(s2) + 1))
    for row, ch1 in enumerate(s1, start=1):
        current_row = [row]
        for col, ch2 in enumerate(s2, start=1):
            substitution = previous_row[col - 1] + (ch1 != ch2)
            deletion = previous_row[col] + 1
            insertion = current_row[-1] + 1
            current_row.append(min(substitution, deletion, insertion))
        previous_row = current_row
    return previous_row[-1]
class FileRange:
    """A span of columns on a single line within a file.

    Attributes:
      line: the 1-based line number of the span
      start_column: the (inclusive) column where the span starts
      end_column: the (inclusive) column where the span ends
    """

    line: int
    start_column: int
    end_column: int

    def __init__(self, content: str, start_byte: int, end_byte: int):
        """Converts a [start_byte, end_byte) span into line/column coordinates.

        Both byte offsets are assumed to fall on the same line of `content`.
        """
        prefix = content[:start_byte]
        # rfind returns -1 when the span is on the first line, which makes the
        # column arithmetic below come out 1-based in every case.
        last_newline = prefix.rfind("\n")
        self.line = prefix.count("\n") + 1
        self.start_column = start_byte - last_newline
        self.end_column = end_byte - last_newline - 1

    def __str__(self) -> str:
        return "{}:{}-{}".format(self.line, self.start_column, self.end_column)
class Diagnostic:
    """A single typo report together with a suggested correction.

    Attributes:
      filepath: the path to the file in which the typo was found
      filerange: the position at which the typo was found in the file
      typo: the misspelled directive as written
      fix: the directive name we believe was intended
    """

    filepath: pathlib.Path
    filerange: FileRange
    typo: str
    fix: str

    def __init__(
        self,
        filepath: pathlib.Path,
        filerange: FileRange,
        typo: str,
        fix: str,  # pylint: disable=redefined-outer-name
    ):
        self.filepath = filepath
        self.filerange = filerange
        self.typo = typo
        self.fix = fix

    def __str__(self) -> str:
        return f"{self.filepath}:{self.filerange}: {self.summary()}"

    def summary(self) -> str:
        return (
            f'Found potentially misspelled directive "{self.typo}". '
            f'Did you mean "{self.fix}"?'
        )
def find_potential_directives(
    content: str,
) -> Generator[Tuple[FileRange, str], None, None]:
    """Extracts every substring of `content` that could be a directive.

    What constitutes a potential directive is loosely defined---we err on the
    side of capturing more strings than is necessary, rather than missing any.

    Args:
      content: the string in which to look for directives

    Yields:
      Tuples (p, d) where p is the span where the potential directive occurs
      within the string and d is the potential directive.
    """
    # A candidate starts at the beginning of a line or after a comment leader
    # (//, ;, #) and is a run of word characters/dashes ending with a colon.
    directive_pattern = re.compile(
        r"(?:^|//|;|#)[^\d\w\-_]*([\d\w\-_][\s\d\w\-_]*):", re.MULTILINE
    )
    for match in directive_pattern.finditer(content):
        start, end = match.span(1)
        yield FileRange(content, start, end), match.group(1)
# TODO(bchetioui): also parse comment prefixes to ignore.
def parse_custom_prefixes(content: str) -> Generator[str, None, None]:
    """Yields the custom check prefixes defined in the string provided.

    For example, given the following file content:
      RUN: something | FileCheck %s -check-prefixes CHECK1,CHECK2
      RUN: something_else | FileCheck %s -check-prefix 'CHECK3'
    the custom prefixes are CHECK1, CHECK2, and CHECK3.
    """
    # A prefix argument is single-quoted, double-quoted, or a bare word.
    param_re = "|".join((r"'[^']*'", r'"[^"]*"', r'[^\'"\s]+'))
    pattern = re.compile(r"-check-prefix(?:es)?(?:\s+|=)(" + param_re + ")")
    for match in pattern.finditer(content):
        arg = match.group(1)
        # Strip the surrounding quotes, if any.
        if arg[0] in ("'", '"'):
            arg = arg[1:-1]
        yield from arg.split(",")
def find_directive_typos(
    content: str,
    filepath: pathlib.Path,
    threshold: int = 3,
) -> Generator[Diagnostic, None, None]:
    """Detects potential typos in FileCheck directives.

    Args:
      content: the content of the file
      filepath: the path to the file to check for typos in directives
      threshold: the (inclusive) maximum edit distance between a potential
        directive and an actual directive, such that the potential directive
        is classified as a typo

    Yields:
      Diagnostics, in order from the top of the file.
    """
    prefixes = _prefixes | set(parse_custom_prefixes(content))
    directives = [
        prefix + suffix
        for prefix, suffix in itertools.product(prefixes, _suffixes)
    ]
    directives.extend(_ignore)
    directives.extend(prefixes)
    longest_directive = max(map(len, directives))
    # TODO(bchetioui): match count directives more finely. We skip directives
    # starting with 'CHECK-COUNT-' for the moment as they require more complex
    # logic to be handled correctly.
    count_prefixes = tuple(f"{prefix}-COUNT-" for prefix in prefixes)

    def best_match(candidate):
        # The sentinel keeps the result well-defined when nothing is close;
        # its score (threshold + 1) is rejected by the caller.
        best = (threshold + 1, candidate)
        for directive in directives:
            # A length gap above the threshold cannot yield a close match.
            if abs(len(directive) - len(candidate)) > threshold:
                continue
            score = levenshtein(candidate, directive)
            if score < best[0]:
                best = (score, directive)
        return best

    for filerange, candidate in find_potential_directives(content):
        if candidate.startswith(count_prefixes):
            continue
        # Ignoring potential typos that will not be matched later due to a too
        # low threshold, in order to avoid potentially long computation times.
        if len(candidate) > longest_directive + threshold:
            continue
        score, suggestion = best_match(candidate)
        if score == 0:
            # This is an actual directive, ignore.
            continue
        if score <= threshold and suggestion not in _ignore:
            yield Diagnostic(filepath, filerange, candidate, suggestion)
def main(argv: Sequence[str]) -> None:
    """Runs the linter over every file path given on the command line.

    Args:
      argv: process arguments; argv[0] is the program name and the remaining
        entries are paths of test files to check. Diagnostics are printed to
        stdout.

    Raises:
      SystemExit: with exit code 1 when no file paths are provided.
    """
    if len(argv) < 2:
        print(f"Usage: {argv[0]} path/to/file/1 ... path/to/file/n")
        # Use sys.exit rather than the site-provided exit() builtin: the
        # latter is only installed by the site module and is absent e.g.
        # under `python -S`.
        sys.exit(1)
    for filepath in argv[1:]:
        logging.info("Checking %s", filepath)
        with open(filepath, "rt") as f:
            content = f.read()
        for diagnostic in find_directive_typos(
            content,
            pathlib.Path(filepath),
            threshold=_distance_threshold,
        ):
            print(diagnostic)


if __name__ == "__main__":
    main(sys.argv)