# blob: cae4b3396b3d2c6cf45dba592227b105d152f818 [file] [log] [blame]
# ===----------------------------------------------------------------------===##
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===----------------------------------------------------------------------===##
"""A linter that detects potential typos in FileCheck directive names.
Consider a broken test foo.cpp:
// RUN: clang -cc1 -ast-dump %s | FileCheck %s --check-prefix=NEW
// RUN: clang -cc1 -ast-dump %s -std=c++98 | FileCheck %s --check-prefix=OLD
auto x = 42;
// NEWW: auto is a c++11 extension
// ODL-NOT: auto is a c++11 extension
We first detect the locally valid FileCheck directive prefixes by parsing the
--check-prefix flags. Here we get {CHECK, NEW, OLD}, so our directive names are
{CHECK, NEW, OLD, CHECK-NOT, NEW-NOT, ...}.
Then we look for lines that look like directives. These are of the form 'FOO:',
usually at the beginning of a line or a comment. If any of these are a
"near-miss" for a directive name, then we suspect this is a typo and report it.
Usage: filecheck_lint path/to/test/file/1 ... path/to/test/file/n
"""
import itertools
import logging
import pathlib
import re
import sys
from typing import Generator, Sequence, Tuple
# Maximum edit distance (inclusive) that `main` passes to the typo detection.
_distance_threshold = 3

# Prefixes that are always considered valid, in addition to the ones parsed
# from the check-prefix flags found in the linted file.
_prefixes = {'CHECK'}

# Suffixes that are appended to every valid prefix to build directive names.
_suffixes = {'-NOT', '-NEXT', '-SAME', '-EMPTY', '-LABEL', '-DAG', '-COUNT'}

# Names grouped as lit directives; near-misses of these are never reported.
_lit_directives = {
    'DEFINE',
    'REDEFINE',
    'REQUIRES',
    'RUN',
    'UNSUPPORTED',
    'XFAIL',
}

# 'COM' and 'RUN' are default comment prefixes for FileCheck.
_comment_prefixes = {'COM', 'RUN'}

# 'NOTE' and 'TODO' are not directives, but are likely to be false positives
# if encountered and to generate noise as a result. We filter them out also to
# avoid this.
_ignore = _lit_directives | _comment_prefixes | {'NOTE', 'TODO'}
def levenshtein(s1: str, s2: str) -> int:  # pylint: disable=g-doc-args
  """Returns the edit distance between `s1` and `s2`.

  Inserting, deleting, or substituting one character each cost one operation.
  """
  if not s1 or not s2:
    # One of the lengths is zero, so the sum is the length of the other string,
    # i.e. the number of insertions needed to build it from nothing.
    return len(s1) + len(s2)

  # previous_row[k] is the distance between the prefix of s1 processed so far
  # and the first k characters of s2.
  previous_row = list(range(len(s2) + 1))
  for row, char1 in enumerate(s1, start=1):
    current_row = [row]
    for column, char2 in enumerate(s2):
      substitution = previous_row[column] + int(char1 != char2)
      deletion = previous_row[column + 1] + 1
      insertion = current_row[column] + 1
      current_row.append(min(substitution, deletion, insertion))
    previous_row = current_row
  return previous_row[-1]
class FileRange:
  """Locates a span of text that sits on a single line of a file.

  Attributes:
    line: the line number, counted from 1
    start_column: the (inclusive) column where the span starts, counted from 1
    end_column: the (inclusive) column where the span ends
  """

  line: int
  start_column: int
  end_column: int

  def __init__(self, content: str, start_byte: int, end_byte: int):  # pylint: disable=g-doc-args
    """Computes line/column coordinates from offsets into `content`.

    `start_byte` and `end_byte` are used as indices into `content` and are
    assumed to be on the same line.
    """
    # Index of the first character of the line holding the span. `rfind`
    # returns -1 when no newline precedes the span, which maps to index 0.
    line_start = content.rfind('\n', 0, start_byte) + 1
    span_length = end_byte - start_byte

    self.line = content.count('\n', 0, start_byte) + 1
    self.start_column = start_byte - line_start + 1
    self.end_column = self.start_column + span_length - 1

  def __str__(self) -> str:
    columns = f'{self.start_column}-{self.end_column}'
    return f'{self.line}:{columns}'
class Diagnostic:
  """Describes a single suspected typo together with its suggested fix.

  Attributes:
    filepath: the path to the file in which the typo was found
    filerange: the position at which the typo was found in the file
    typo: the text suspected to be a misspelled directive
    fix: the directive name suggested as a replacement
  """

  filepath: pathlib.Path
  filerange: FileRange
  typo: str
  fix: str

  def __init__(
      self,
      filepath: pathlib.Path,
      filerange: FileRange,
      typo: str,
      fix: str  # pylint: disable=redefined-outer-name
  ):
    self.filepath, self.filerange = filepath, filerange
    self.typo, self.fix = typo, fix

  def __str__(self) -> str:
    # Rendered as `<path>:<range>: <summary>`.
    return f'{self.filepath}:{self.filerange!s}: {self.summary()}'

  def summary(self) -> str:
    """Returns the human-readable message, without any location prefix."""
    found = f'Found potentially misspelled directive "{self.typo}".'
    suggestion = f'Did you mean "{self.fix}"?'
    return f'{found} {suggestion}'
def find_potential_directives(
    content: str,) -> Generator[Tuple[FileRange, str], None, None]:
  """Finds every string in `content` that looks like a FileCheck directive.

  The notion of a potential directive is deliberately loose: capturing too many
  strings is preferred over missing one. A candidate starts at the beginning of
  a line or after one of '//', ';' or '#', optionally preceded by characters
  that are neither word characters nor hyphens, and runs up to a ':'. It may
  contain word characters, hyphens, and whitespace.

  Args:
    content: the string in which to look for directives

  Yields:
    Tuples (p, d) where p is the span where the potential directive occurs
    within the string and d is the potential directive.
  """
  directive_pattern = re.compile(
      r'(?:^|//|;|#)[^\d\w\-_]*([\d\w\-_][\s\d\w\-_]*):', re.MULTILINE)
  for match in directive_pattern.finditer(content):
    # Group 1 is the candidate itself, without the leading comment marker and
    # without the trailing colon.
    start_byte, end_byte = match.span(1)
    yield FileRange(content, start_byte, end_byte), match.group(1)
# TODO(bchetioui): also parse comment prefixes to ignore.
def parse_custom_prefixes(content: str) -> Generator[str, None, None]:  # pylint: disable=g-doc-args
  """Parses custom prefixes defined in the string provided.

  For example, given the following file content:
    RUN: something | FileCheck %s -check-prefixes CHECK1,CHECK2
    RUN: something_else | FileCheck %s -check-prefix 'CHECK3'
  the custom prefixes are CHECK1, CHECK2, and CHECK3.

  The flag value may be separated from the flag by whitespace or '=', and may
  be single-quoted, double-quoted, or unquoted. Empty prefixes, such as the one
  produced by a trailing comma or by an empty quoted value, are skipped: an
  empty prefix would otherwise be treated as a valid directive name and cause
  arbitrary short strings to be reported as typos of it.
  """
  # A flag value is either a quoted string or a run of characters containing
  # neither quotes nor whitespace.
  param_re = r'|'.join([r"'[^']*'", r'"[^"]*"', r'[^\'"\s]+'])
  flag_re = re.compile(rf'-check-prefix(?:es)?(?:\s+|=)({param_re})')
  for m in flag_re.finditer(content):
    prefixes = m.group(1)
    if prefixes.startswith(('\'', '"')):
      # Strip the surrounding quotes.
      prefixes = prefixes[1:-1]
    for prefix in prefixes.split(','):
      if prefix:
        yield prefix
def find_directive_typos(
    content: str,
    filepath: pathlib.Path,
    threshold: int = 3,
) -> Generator[Diagnostic, None, None]:
  """Detects potential typos in FileCheck directives.

  The valid directive names are built from the default and custom prefixes,
  each optionally followed by one of the known suffixes, plus the ignored
  names. Every potential directive found in `content` is compared against them
  and reported when it is close to, but not equal to, a name that is not
  ignored.

  Args:
    content: the content of the file
    filepath: the path to the file to check for typos in directives
    threshold: the (inclusive) maximum edit distance between a potential
      directive and an actual directive, such that the potential directive is
      classified as a typo

  Yields:
    Diagnostics, in order from the top of the file.
  """
  all_prefixes = _prefixes.union(parse_custom_prefixes(content))
  # Sorted so that a tie between equally close directives is always broken the
  # same way. Iterating over the sets directly would make the suggested fix
  # depend on string hash randomization and thus vary between runs.
  all_directives = sorted({
      f'{prefix}{suffix}'
      for prefix, suffix in itertools.product(all_prefixes, _suffixes)
  } | _ignore | all_prefixes)

  # Loop invariants, computed once rather than for each potential directive.
  max_directive_length = max(map(len, all_directives))
  count_prefixes = tuple(f'{prefix}-COUNT-' for prefix in all_prefixes)

  def find_best_match(typo):
    # The leading sentinel scores just above the threshold, so it is returned
    # whenever no directive is close enough.
    candidates = [(threshold + 1, typo)]
    candidates.extend((levenshtein(typo, d), d)
                      for d in all_directives
                      if abs(len(d) - len(typo)) <= threshold)
    return min(candidates, key=lambda tup: tup[0])

  for filerange, potential_directive in find_potential_directives(content):
    # TODO(bchetioui): match count directives more finely. We skip directives
    # starting with 'CHECK-COUNT-' for the moment as they require more complex
    # logic to be handled correctly.
    if potential_directive.startswith(count_prefixes):
      continue

    # Ignoring potential typos that will not be matched later due to a too low
    # threshold, in order to avoid potentially long computation times.
    if len(potential_directive) > max_directive_length + threshold:
      continue

    score, best_match = find_best_match(potential_directive)
    # A score of 0 means this is an actual directive, not a typo.
    if 0 < score <= threshold and best_match not in _ignore:
      yield Diagnostic(filepath, filerange, potential_directive, best_match)
def main(argv: Sequence[str]):
  """Lints every file named on the command line and prints the diagnostics.

  Prints a usage message and exits with status 1 when no file is given.

  Args:
    argv: the command line; the first element is the program name and the
      remaining elements are the paths of the files to check.
  """
  if len(argv) < 2:
    # Fall back to the tool name when even the program name is missing.
    program = argv[0] if argv else 'filecheck_lint'
    print(f'Usage: {program} path/to/file/1 ... path/to/file/n')
    # `sys.exit` rather than the `exit` builtin: the latter is injected by the
    # `site` module for interactive use and is not guaranteed to exist.
    sys.exit(1)

  for filepath in argv[1:]:
    logging.info('Checking %s', filepath)
    with open(filepath, 'rt') as f:
      content = f.read()
    for diagnostic in find_directive_typos(
        content,
        pathlib.Path(filepath),
        threshold=_distance_threshold,
    ):
      print(diagnostic)


if __name__ == '__main__':
  main(sys.argv)