|  | //===--- FuzzyMatch.h - Approximate identifier matching  ---------*- C++-*-===// | 
|  | // | 
|  | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | 
|  | // See https://llvm.org/LICENSE.txt for license information. | 
|  | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | 
|  | // | 
|  | //===----------------------------------------------------------------------===// | 
|  | // | 
|  | // This file implements fuzzy-matching of strings against identifiers. | 
|  | // It indicates both the existence and quality of a match: | 
|  | // 'eb' matches both 'emplace_back' and 'embed', the former has a better score. | 
|  | // | 
|  | //===----------------------------------------------------------------------===// | 
|  |  | 
|  | #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H | 
|  | #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H | 
|  |  | 
|  | #include "llvm/ADT/ArrayRef.h" | 
|  | #include "llvm/ADT/SmallString.h" | 
|  | #include "llvm/ADT/StringRef.h" | 
|  | #include "llvm/Support/raw_ostream.h" | 
|  | #include <optional> | 
|  |  | 
|  | namespace clang { | 
|  | namespace clangd { | 
|  |  | 
|  | // Utilities for word segmentation. | 
|  | // FuzzyMatcher already incorporates this logic, so most users don't need this. | 
|  | // | 
|  | // A name like "fooBar_baz" consists of several parts foo, bar, baz. | 
|  | // Aligning segmentation of word and pattern improves the fuzzy-match. | 
|  | // For example: [lol] matches "LaughingOutLoud" better than "LionPopulation" | 
|  | // | 
|  | // First we classify each character into types (uppercase, lowercase, etc). | 
|  | // Then we look at the sequence: e.g. [upper, lower] is the start of a segment. | 
|  |  | 
|  | // We distinguish the types of characters that affect segmentation. | 
|  | // It's not obvious how to segment digits, we treat them as lowercase letters. | 
|  | // As we don't decode UTF-8, we treat bytes over 127 as lowercase too. | 
|  | // This means we require exact (case-sensitive) match for those characters. | 
|  | enum CharType : unsigned char { | 
|  | Empty = 0,       // Before-the-start and after-the-end (and control chars). | 
|  | Lower = 1,       // Lowercase letters, digits, and non-ASCII bytes. | 
|  | Upper = 2,       // Uppercase letters. | 
|  | Punctuation = 3, // ASCII punctuation (including Space) | 
|  | }; | 
|  | // A CharTypeSet is a bitfield representing all the character types in a word. | 
|  | // Its bits are 1<<Empty, 1<<Lower, etc. | 
|  | using CharTypeSet = unsigned char; | 
|  |  | 
|  | // Each character's Role is the Head or Tail of a segment, or a Separator. | 
|  | // e.g. XMLHttpRequest_Async | 
|  | //      +--+---+------ +---- | 
|  | //      ^Head   ^Tail ^Separator | 
|  | enum CharRole : unsigned char { | 
|  | Unknown = 0,   // Stray control characters or impossible states. | 
|  | Tail = 1,      // Part of a word segment, but not the first character. | 
|  | Head = 2,      // The first character of a word segment. | 
|  | Separator = 3, // Punctuation characters that separate word segments. | 
|  | }; | 
|  |  | 
|  | // Compute segmentation of Text. | 
|  | // Character roles are stored in Roles (Roles.size() must equal Text.size()). | 
|  | // The set of character types encountered is returned, this may inform | 
|  | // heuristics for dealing with poorly-segmented identifiers like "strndup". | 
|  | CharTypeSet calculateRoles(llvm::StringRef Text, | 
|  | llvm::MutableArrayRef<CharRole> Roles); | 
|  |  | 
|  | // A matcher capable of matching and scoring strings against a single pattern. | 
|  | // It's optimized for matching against many strings - match() does not allocate. | 
|  | class FuzzyMatcher { | 
|  | public: | 
|  | // Characters beyond MaxPat are ignored. | 
|  | FuzzyMatcher(llvm::StringRef Pattern); | 
|  |  | 
|  | // If Word matches the pattern, return a score indicating the quality match. | 
|  | // Scores usually fall in a [0,1] range, with 1 being a very good score. | 
|  | // "Super" scores in (1,2] are possible if the pattern is the full word. | 
|  | // Characters beyond MaxWord are ignored. | 
|  | std::optional<float> match(llvm::StringRef Word); | 
|  |  | 
|  | llvm::StringRef pattern() const { return llvm::StringRef(Pat, PatN); } | 
|  | bool empty() const { return PatN == 0; } | 
|  |  | 
|  | // Dump internal state from the last match() to the stream, for debugging. | 
|  | // Returns the pattern with [] around matched characters, e.g. | 
|  | //   [u_p] + "unique_ptr" --> "[u]nique[_p]tr" | 
|  | llvm::SmallString<256> dumpLast(llvm::raw_ostream &) const; | 
|  |  | 
|  | private: | 
|  | // We truncate the pattern and the word to bound the cost of matching. | 
|  | constexpr static int MaxPat = 63, MaxWord = 127; | 
|  | // Action describes how a word character was matched to the pattern. | 
|  | // It should be an enum, but this causes bitfield problems: | 
|  | //   - for MSVC the enum type must be explicitly unsigned for correctness | 
|  | //   - GCC 4.8 complains not all values fit if the type is unsigned | 
|  | using Action = bool; | 
|  | constexpr static Action Miss = false; // Word character was skipped. | 
|  | constexpr static Action Match = true; // Matched against a pattern character. | 
|  |  | 
|  | bool init(llvm::StringRef Word); | 
|  | void buildGraph(); | 
|  | bool allowMatch(int P, int W, Action Last) const; | 
|  | int skipPenalty(int W, Action Last) const; | 
|  | int matchBonus(int P, int W, Action Last) const; | 
|  |  | 
|  | // Pattern data is initialized by the constructor, then constant. | 
|  | char Pat[MaxPat];         // Pattern data | 
|  | int PatN;                 // Length | 
|  | char LowPat[MaxPat];      // Pattern in lowercase | 
|  | CharRole PatRole[MaxPat]; // Pattern segmentation info | 
|  | CharTypeSet PatTypeSet;   // Bitmask of 1<<CharType for all Pattern characters | 
|  | float ScoreScale;         // Normalizes scores for the pattern length. | 
|  |  | 
|  | // Word data is initialized on each call to match(), mostly by init(). | 
|  | char Word[MaxWord];         // Word data | 
|  | int WordN;                  // Length | 
|  | char LowWord[MaxWord];      // Word in lowercase | 
|  | CharRole WordRole[MaxWord]; // Word segmentation info | 
|  | CharTypeSet WordTypeSet;    // Bitmask of 1<<CharType for all Word characters | 
|  | bool WordContainsPattern;   // Simple substring check | 
|  |  | 
|  | // Cumulative best-match score table. | 
|  | // Boundary conditions are filled in by the constructor. | 
|  | // The rest is repopulated for each match(), by buildGraph(). | 
|  | struct ScoreInfo { | 
|  | signed int Score : 15; | 
|  | Action Prev : 1; | 
|  | }; | 
|  | ScoreInfo Scores[MaxPat + 1][MaxWord + 1][/* Last Action */ 2]; | 
|  | }; | 
|  |  | 
|  | } // namespace clangd | 
|  | } // namespace clang | 
|  |  | 
|  | #endif |