| # ===----------------------------------------------------------------------===## |
| # |
| # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| # See https://llvm.org/LICENSE.txt for license information. |
| # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| # |
| # ===----------------------------------------------------------------------===## |
| """A linter that detects potential typos in FileCheck directive names. |
| |
| Consider a broken test foo.cpp: |
| |
| // RUN: clang -cc1 -ast-dump %s | FileCheck %s --check-prefix=NEW |
| // RUN: clang -cc1 -ast-dump %s -std=c++98 | FileCheck %s --check-prefix=OLD |
| auto x = 42; |
| // NEWW: auto is a c++11 extension |
| // ODL-NOT: auto is a c++11 extension |
| |
| We first detect the locally valid FileCheck directive prefixes by parsing the |
| --check-prefix flags. Here we get {CHECK, NEW, OLD}, so our directive names are |
| {CHECK, NEW, OLD, CHECK-NOT, NEW-NOT, ...}. |
| |
| Then we look for lines that look like directives. These are of the form 'FOO:', |
| usually at the beginning of a line or a comment. If any of these are a |
| "near-miss" for a directive name, then we suspect this is a typo and report it. |
| |
| Usage: filecheck_lint path/to/test/file/1 ... path/to/test/file/n |
| """ |
| |
| import itertools |
| import logging |
| import pathlib |
| import re |
| import sys |
| from typing import Generator, Sequence, Tuple |
| |
# Maximum (inclusive) edit distance between a suspicious string and a known
# directive for the string to be reported as a likely typo; see
# `find_directive_typos` below, which is called with this value from `main`.
_distance_threshold = 3
# Prefix that FileCheck always recognizes, even without --check-prefix flags.
_prefixes = {"CHECK"}
# Directive suffixes that may be appended to any valid prefix (e.g. CHECK-NOT).
_suffixes = {"-DAG", "-COUNT", "-EMPTY", "-LABEL", "-NEXT", "-NOT", "-SAME"}
# 'NOTE' and 'TODO' are not directives, but are likely to be false positives
# if encountered and to generate noise as a result. We filter them out also to
# avoid this.
# Directives interpreted by lit rather than FileCheck; never typo suggestions.
_lit_directives = {
    "RUN",
    "REQUIRES",
    "UNSUPPORTED",
    "XFAIL",
    "DEFINE",
    "REDEFINE",
}
# 'COM' and 'RUN' are default comment prefixes for FileCheck.
_comment_prefixes = {"COM", "RUN"}
# Names that look like directives but should never be reported as typos.
_ignore = _lit_directives.union(_comment_prefixes).union({"NOTE", "TODO"})
| |
| |
def levenshtein(s1: str, s2: str) -> int:  # pylint: disable=g-doc-args
    """Computes the edit distance between two strings.

    Additions, deletions, and substitutions all count as a single operation.
    """
    # Trivial cases: transforming to/from the empty string takes as many
    # insertions/deletions as the other string has characters.
    if not s1:
        return len(s2)
    if not s2:
        return len(s1)

    # Classic dynamic programming over one row at a time: `previous_row[j]` is
    # the distance between s1[:row-1] and s2[:j].
    previous_row = list(range(len(s2) + 1))
    for row, char_a in enumerate(s1, start=1):
        current_row = [row]
        for col, char_b in enumerate(s2, start=1):
            substitution = previous_row[col - 1] + (char_a != char_b)
            deletion = previous_row[col] + 1
            insertion = current_row[-1] + 1
            current_row.append(min(substitution, deletion, insertion))
        previous_row = current_row
    return previous_row[-1]
| |
| |
class FileRange:
    """Stores the coordinates of a span on a single line within a file.

    Attributes:
      line: the line number
      start_column: the (inclusive) column where the span starts
      end_column: the (inclusive) column where the span ends
    """

    line: int
    start_column: int
    end_column: int

    def __init__(
        self, content: str, start_byte: int, end_byte: int
    ):  # pylint: disable=g-doc-args
        """Derives a span's coordinates based on a string and start/end bytes.

        `start_byte` and `end_byte` are assumed to be on the same line.
        """
        prefix = content[:start_byte]
        # One more line than the number of newlines before the span; columns
        # are 1-based and count from the last newline (rfind yields -1 when the
        # span is on the first line, which makes the arithmetic work out).
        last_newline = prefix.rfind("\n")
        self.line = 1 + prefix.count("\n")
        self.start_column = start_byte - last_newline
        self.end_column = self.start_column + (end_byte - start_byte) - 1

    def __str__(self) -> str:
        return "{}:{}-{}".format(self.line, self.start_column, self.end_column)
| |
| |
class Diagnostic:
    """Stores information about one typo and a suggested fix.

    Attributes:
      filepath: the path to the file in which the typo was found
      filerange: the position at which the typo was found in the file
      typo: the typo
      fix: a suggested fix
    """

    filepath: pathlib.Path
    filerange: FileRange
    typo: str
    fix: str

    def __init__(
        self,
        filepath: pathlib.Path,
        filerange: FileRange,
        typo: str,
        fix: str,  # pylint: disable=redefined-outer-name
    ):
        self.filepath = filepath
        self.filerange = filerange
        self.typo = typo
        self.fix = fix

    def __str__(self) -> str:
        # Emitted in the conventional compiler style: "file:line:cols: message".
        location = f"{self.filepath}:{self.filerange}"
        return f"{location}: {self.summary()}"

    def summary(self) -> str:
        return (
            f'Found potentially misspelled directive "{self.typo}". Did you mean '
            f'"{self.fix}"?'
        )
| |
| |
def find_potential_directives(
    content: str,
) -> Generator[Tuple[FileRange, str], None, None]:
    """Extracts all the potential FileCheck directives from a string.

    What constitutes a potential directive is loosely defined---we err on the side
    of capturing more strings than is necessary, rather than missing any.

    Args:
      content: the string in which to look for directives

    Yields:
      Tuples (p, d) where p is the span where the potential directive occurs
      within the string and d is the potential directive.
    """
    # A potential directive is a colon-terminated word at the start of a line
    # or following a comment marker, possibly preceded by punctuation.
    pattern = re.compile(
        r"(?:^|//|;|#)[^\d\w\-_]*([\d\w\-_][\s\d\w\-_]*):", re.MULTILINE
    )
    for match in pattern.finditer(content):
        start_byte, end_byte = match.span(1)
        yield FileRange(content, start_byte, end_byte), match.group(1)
| |
| |
# TODO(bchetioui): also parse comment prefixes to ignore.
def parse_custom_prefixes(
    content: str,
) -> Generator[str, None, None]:  # pylint: disable=g-doc-args
    """Parses custom prefixes defined in the string provided.

    For example, given the following file content:
      RUN: something | FileCheck %s -check-prefixes CHECK1,CHECK2
      RUN: something_else | FileCheck %s -check-prefix 'CHECK3'

    the custom prefixes are CHECK1, CHECK2, and CHECK3.
    """
    # The flag argument may be single-quoted, double-quoted, or bare.
    single_quoted = r"'[^']*'"
    double_quoted = r'"[^"]*"'
    unquoted = r'[^\'"\s]+'
    param_re = "|".join((single_quoted, double_quoted, unquoted))
    flag_re = r"-check-prefix(?:es)?(?:\s+|=)({})".format(param_re)
    for match in re.finditer(flag_re, content):
        argument = match.group(1)
        # Strip surrounding quotes, if any, before splitting the list.
        if argument.startswith(("'", '"')):
            argument = argument[1:-1]
        yield from argument.split(",")
| |
| |
def find_directive_typos(
    content: str,
    filepath: pathlib.Path,
    threshold: int = 3,
) -> Generator[Diagnostic, None, None]:
    """Detects potential typos in FileCheck directives.

    Args:
      content: the content of the file
      filepath: the path to the file to check for typos in directives
      threshold: the (inclusive) maximum edit distance between a potential
        directive and an actual directive, such that the potential directive is
        classified as a typo

    Yields:
      Diagnostics, in order from the top of the file.
    """
    all_prefixes = _prefixes.union(set(parse_custom_prefixes(content)))
    # Candidate directives: every prefix+suffix combination, the ignored
    # names, and the bare prefixes themselves. The order matters: ties in
    # edit distance are broken by position in this list.
    all_directives = [
        f"{prefix}{suffix}"
        for prefix, suffix in itertools.product(all_prefixes, _suffixes)
    ]
    all_directives.extend(_ignore)
    all_directives.extend(all_prefixes)

    def find_best_match(typo):
        # Start from a sentinel one past the threshold so that "no match" is
        # representable; keep the first candidate with the lowest distance.
        best = (threshold + 1, typo)
        for directive in all_directives:
            # A length difference greater than the threshold already implies
            # an edit distance above it; skip the expensive computation.
            if abs(len(directive) - len(typo)) > threshold:
                continue
            score = levenshtein(typo, directive)
            if score < best[0]:
                best = (score, directive)
        return best

    count_prefixes = tuple(f"{prefix}-COUNT-" for prefix in all_prefixes)
    longest_directive = max(map(len, all_directives))

    for filerange, potential_directive in find_potential_directives(content):
        # TODO(bchetioui): match count directives more finely. We skip directives
        # starting with 'CHECK-COUNT-' for the moment as they require more complex
        # logic to be handled correctly.
        if potential_directive.startswith(count_prefixes):
            continue

        # Ignoring potential typos that will not be matched later due to a too low
        # threshold, in order to avoid potentially long computation times.
        if len(potential_directive) > longest_directive + threshold:
            continue

        score, best_match = find_best_match(potential_directive)
        if score == 0:  # This is an actual directive, ignore.
            continue
        if score <= threshold and best_match not in _ignore:
            yield Diagnostic(filepath, filerange, potential_directive, best_match)
| |
| |
def main(argv: Sequence[str]):
    """Runs the linter over every file path given on the command line.

    Args:
      argv: program arguments; argv[0] is the program name and each remaining
        entry is a path to a test file to check. Diagnostics are printed to
        stdout, one per line.
    """
    if len(argv) < 2:
        print(f"Usage: {argv[0]} path/to/file/1 ... path/to/file/n")
        # Use sys.exit rather than the `exit` builtin: the latter is injected
        # by the `site` module and is not available when Python runs with -S.
        sys.exit(1)

    for filepath in argv[1:]:
        logging.info("Checking %s", filepath)
        with open(filepath, "rt") as f:
            content = f.read()
        for diagnostic in find_directive_typos(
            content,
            pathlib.Path(filepath),
            threshold=_distance_threshold,
        ):
            print(diagnostic)
| |
| |
# Allow running the linter directly as a script.
if __name__ == "__main__":
    main(sys.argv)