//===--- FuzzyMatch.cpp - Approximate identifier matching --------*- C++-*-===//
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // To check for a match between a Pattern ('u_p') and a Word ('unique_ptr'), |
| // we consider the possible partial match states: |
| // |
//          u n i q u e _ p t r
//        +---------------------
//        |A . . . . . . . . . .
//       u|
//        |. . . . . . . . . . .
//       _|
//        |. . . . . . . O . . .
//       p|
//        |. . . . . . . . . . B
| // |
| // Each dot represents some prefix of the pattern being matched against some |
| // prefix of the word. |
| // - A is the initial state: '' matched against '' |
| // - O is an intermediate state: 'u_' matched against 'unique_' |
| // - B is the target state: 'u_p' matched against 'unique_ptr' |
| // |
| // We aim to find the best path from A->B. |
| // - Moving right (consuming a word character) |
| // Always legal: not all word characters must match. |
| // - Moving diagonally (consuming both a word and pattern character) |
| // Legal if the characters match. |
// - Moving down (consuming a pattern character)
| // Never legal: all pattern characters must match something. |
| // Characters are matched case-insensitively. |
| // The first pattern character may only match the start of a word segment. |
| // |
| // The scoring is based on heuristics: |
| // - when matching a character, apply a bonus or penalty depending on the |
| // match quality (does case match, do word segments align, etc) |
| // - when skipping a character, apply a penalty if it hurts the match |
| // (it starts a word segment, or splits the matched region, etc) |
| // |
| // These heuristics require the ability to "look backward" one character, to |
| // see whether it was matched or not. Therefore the dynamic-programming matrix |
| // has an extra dimension (last character matched). |
| // Each entry also has an additional flag indicating whether the last-but-one |
| // character matched, which is needed to trace back through the scoring table |
| // and reconstruct the match. |
| // |
| // We treat strings as byte-sequences, so only ASCII has first-class support. |
| // |
| // This algorithm was inspired by VS code's client-side filtering, and aims |
| // to be mostly-compatible. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "FuzzyMatch.h" |
| #include "llvm/ADT/Optional.h" |
| #include "llvm/Support/Format.h" |
| |
| namespace clang { |
| namespace clangd { |
| |
| constexpr int FuzzyMatcher::MaxPat; |
| constexpr int FuzzyMatcher::MaxWord; |
| |
// Maps an ASCII uppercase letter to its lowercase counterpart.
// Every other byte value is returned unchanged.
static char lower(char C) {
  if (C < 'A' || C > 'Z')
    return C;
  return C + ('a' - 'A');
}
// Sentinel "negative infinity" score for unreachable states and forbidden
// solutions.
// Per the original note, the Score field is 15 bits wide (minimum -2^14); the
// sentinel is half of that minimum so that adjusting it cannot overflow.
static constexpr int AwfulScore = -(1 << 14) / 2;

// Returns true if S is low enough that it can only derive from AwfulScore.
// The cutoff sits halfway between AwfulScore and zero, so a sentinel that has
// had bonuses or penalties applied is still recognized.
static bool isAwful(int S) {
  const int Cutoff = AwfulScore / 2;
  return S < Cutoff;
}
// Score awarded per pattern character by a perfect match. Final scores are
// normalized against PerfectBonus * PatN.
static constexpr int PerfectBonus{4};
| |
| FuzzyMatcher::FuzzyMatcher(llvm::StringRef Pattern) |
| : PatN(std::min<int>(MaxPat, Pattern.size())), |
| ScoreScale(PatN ? float{1} / (PerfectBonus * PatN) : 0), WordN(0) { |
| std::copy(Pattern.begin(), Pattern.begin() + PatN, Pat); |
| for (int I = 0; I < PatN; ++I) |
| LowPat[I] = lower(Pat[I]); |
| Scores[0][0][Miss] = {0, Miss}; |
| Scores[0][0][Match] = {AwfulScore, Miss}; |
| for (int P = 0; P <= PatN; ++P) |
| for (int W = 0; W < P; ++W) |
| for (Action A : {Miss, Match}) |
| Scores[P][W][A] = {AwfulScore, Miss}; |
| PatTypeSet = calculateRoles(llvm::StringRef(Pat, PatN), |
| llvm::makeMutableArrayRef(PatRole, PatN)); |
| } |
| |
| llvm::Optional<float> FuzzyMatcher::match(llvm::StringRef Word) { |
| if (!(WordContainsPattern = init(Word))) |
| return llvm::None; |
| if (!PatN) |
| return 1; |
| buildGraph(); |
| auto Best = std::max(Scores[PatN][WordN][Miss].Score, |
| Scores[PatN][WordN][Match].Score); |
| if (isAwful(Best)) |
| return llvm::None; |
| float Score = |
| ScoreScale * std::min(PerfectBonus * PatN, std::max<int>(0, Best)); |
| // If the pattern is as long as the word, we have an exact string match, |
| // since every pattern character must match something. |
| if (WordN == PatN) |
| Score *= 2; // May not be perfect 2 if case differs in a significant way. |
| return Score; |
| } |
| |
// CharTypes come from a packed lookup table: each entry is 2 bits, so one
// byte describes 4 consecutive byte values.
// The top 6 bits of the char select the byte, the bottom 2 select the slot
// within it (slot 0 is the least significant bit pair).
// e.g. 'q' = 011100 01 = byte 28 (0x55), bits 3-2 (01) -> Lower.
static constexpr uint8_t CharTypes[] = {
    0x00, 0x00, 0x00, 0x00, // 0x00-0x0f: control characters
    0x00, 0x00, 0x00, 0x00, // 0x10-0x1f: control characters
    0xff, 0xff, 0xff, 0xff, // 0x20-0x2f: punctuation
    0x55, 0x55, 0xf5, 0xff, // 0x30-0x3f: digits -> Lower, more punctuation
    0xab, 0xaa, 0xaa, 0xaa, // 0x40-0x4f: @ and A-O
    0xaa, 0xaa, 0xea, 0xff, // 0x50-0x5f: P-Z, more punctuation
    0x57, 0x55, 0x55, 0x55, // 0x60-0x6f: ` and a-o
    0x55, 0x55, 0xd5, 0x3f, // 0x70-0x7f: p-z, punctuation, DEL
    // 0x80-0xff: bytes over 127 -> Lower (probably UTF-8).
    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // 0x80-0x9f
    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // 0xa0-0xbf
    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // 0xc0-0xdf
    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // 0xe0-0xff
};
| |
// A character's Role follows from its own Type and the Types of its
// neighbors:
//
//   Example  | Chars | Type | Role
//   ---------+-------+------+-----
//   F(o)oBar | Foo   | Ull  | Tail
//   Foo(B)ar | oBa   | lUl  | Head
//   (f)oo    | ^fo   | Ell  | Head
//   H(T)TP   | HTT   | UUU  | Tail
//
// The table maps a 6-bit key (Prev, Curr, Next) to a 2-bit Role and packs 4
// Roles per byte: (Prev, Curr) selects the byte, Next selects the slot.
// e.g. Lower, Upper, Lower -> 01 10 01 -> byte 6 (0xaa), bits 3-2 (10) -> Head.
static constexpr uint8_t CharRoles[] = {
    // clang-format off
    //         Curr= Empty Lower Upper Separ
    /* Prev=Empty */ 0x00, 0xaa, 0xaa, 0xff, // At start, Lower|Upper->Head
    /* Prev=Lower */ 0x00, 0x55, 0xaa, 0xff, // In word, Upper->Head;Lower->Tail
    /* Prev=Upper */ 0x00, 0x55, 0x59, 0xff, // Ditto, but U(U)U->Tail
    /* Prev=Separ */ 0x00, 0xaa, 0xaa, 0xff, // After separator, like at start
    // clang-format on
};
| |
// Reads the I-th 2-bit entry from a table that packs four entries per byte,
// entry 0 in the least significant bits, and returns it converted to T.
// I must be a valid non-negative entry index for Data.
template <typename T> static T packedLookup(const uint8_t *Data, int I) {
  const int ByteIndex = I >> 2;
  const int BitOffset = (I & 3) * 2;
  return static_cast<T>((Data[ByteIndex] >> BitOffset) & 3);
}
| CharTypeSet calculateRoles(llvm::StringRef Text, |
| llvm::MutableArrayRef<CharRole> Roles) { |
| assert(Text.size() == Roles.size()); |
| if (Text.size() == 0) |
| return 0; |
| CharType Type = packedLookup<CharType>(CharTypes, Text[0]); |
| CharTypeSet TypeSet = 1 << Type; |
| // Types holds a sliding window of (Prev, Curr, Next) types. |
| // Initial value is (Empty, Empty, type of Text[0]). |
| int Types = Type; |
| // Rotate slides in the type of the next character. |
| auto Rotate = [&](CharType T) { Types = ((Types << 2) | T) & 0x3f; }; |
| for (unsigned I = 0; I < Text.size() - 1; ++I) { |
| // For each character, rotate in the next, and look up the role. |
| Type = packedLookup<CharType>(CharTypes, Text[I + 1]); |
| TypeSet |= 1 << Type; |
| Rotate(Type); |
| Roles[I] = packedLookup<CharRole>(CharRoles, Types); |
| } |
| // For the last character, the "next character" is Empty. |
| Rotate(Empty); |
| Roles[Text.size() - 1] = packedLookup<CharRole>(CharRoles, Types); |
| return TypeSet; |
| } |
| |
| // Sets up the data structures matching Word. |
| // Returns false if we can cheaply determine that no match is possible. |
| bool FuzzyMatcher::init(llvm::StringRef NewWord) { |
| WordN = std::min<int>(MaxWord, NewWord.size()); |
| if (PatN > WordN) |
| return false; |
| std::copy(NewWord.begin(), NewWord.begin() + WordN, Word); |
| if (PatN == 0) |
| return true; |
| for (int I = 0; I < WordN; ++I) |
| LowWord[I] = lower(Word[I]); |
| |
| // Cheap subsequence check. |
| for (int W = 0, P = 0; P != PatN; ++W) { |
| if (W == WordN) |
| return false; |
| if (LowWord[W] == LowPat[P]) |
| ++P; |
| } |
| |
| // FIXME: some words are hard to tokenize algorithmically. |
| // e.g. vsprintf is V S Print F, and should match [pri] but not [int]. |
| // We could add a tokenization dictionary for common stdlib names. |
| WordTypeSet = calculateRoles(llvm::StringRef(Word, WordN), |
| llvm::makeMutableArrayRef(WordRole, WordN)); |
| return true; |
| } |
| |
| // The forwards pass finds the mappings of Pattern onto Word. |
| // Score = best score achieved matching Word[..W] against Pat[..P]. |
| // Unlike other tables, indices range from 0 to N *inclusive* |
| // Matched = whether we chose to match Word[W] with Pat[P] or not. |
| // |
| // Points are mostly assigned to matched characters, with 1 being a good score |
| // and 3 being a great one. So we treat the score range as [0, 3 * PatN]. |
| // This range is not strict: we can apply larger bonuses/penalties, or penalize |
| // non-matched characters. |
| void FuzzyMatcher::buildGraph() { |
| for (int W = 0; W < WordN; ++W) { |
| Scores[0][W + 1][Miss] = {Scores[0][W][Miss].Score - skipPenalty(W, Miss), |
| Miss}; |
| Scores[0][W + 1][Match] = {AwfulScore, Miss}; |
| } |
| for (int P = 0; P < PatN; ++P) { |
| for (int W = P; W < WordN; ++W) { |
| auto &Score = Scores[P + 1][W + 1], &PreMiss = Scores[P + 1][W]; |
| |
| auto MatchMissScore = PreMiss[Match].Score; |
| auto MissMissScore = PreMiss[Miss].Score; |
| if (P < PatN - 1) { // Skipping trailing characters is always free. |
| MatchMissScore -= skipPenalty(W, Match); |
| MissMissScore -= skipPenalty(W, Miss); |
| } |
| Score[Miss] = (MatchMissScore > MissMissScore) |
| ? ScoreInfo{MatchMissScore, Match} |
| : ScoreInfo{MissMissScore, Miss}; |
| |
| auto &PreMatch = Scores[P][W]; |
| auto MatchMatchScore = |
| allowMatch(P, W, Match) |
| ? PreMatch[Match].Score + matchBonus(P, W, Match) |
| : AwfulScore; |
| auto MissMatchScore = allowMatch(P, W, Miss) |
| ? PreMatch[Miss].Score + matchBonus(P, W, Miss) |
| : AwfulScore; |
| Score[Match] = (MatchMatchScore > MissMatchScore) |
| ? ScoreInfo{MatchMatchScore, Match} |
| : ScoreInfo{MissMatchScore, Miss}; |
| } |
| } |
| } |
| |
| bool FuzzyMatcher::allowMatch(int P, int W, Action Last) const { |
| if (LowPat[P] != LowWord[W]) |
| return false; |
| // We require a "strong" match: |
| // - for the first pattern character. [foo] !~ "barefoot" |
| // - after a gap. [pat] !~ "patnther" |
| if (Last == Miss) { |
| // We're banning matches outright, so conservatively accept some other cases |
| // where our segmentation might be wrong: |
| // - allow matching B in ABCDef (but not in NDEBUG) |
| // - we'd like to accept print in sprintf, but too many false positives |
| if (WordRole[W] == Tail && |
| (Word[W] == LowWord[W] || !(WordTypeSet & 1 << Lower))) |
| return false; |
| } |
| return true; |
| } |
| |
| int FuzzyMatcher::skipPenalty(int W, Action Last) const { |
| if (W == 0) // Skipping the first character. |
| return 3; |
| if (WordRole[W] == Head) // Skipping a segment. |
| return 1; // We want to keep this lower than a consecutive match bonus. |
| // Instead of penalizing non-consecutive matches, we give a bonus to a |
| // consecutive match in matchBonus. This produces a better score distribution |
| // than penalties in case of small patterns, e.g. 'up' for 'unique_ptr'. |
| return 0; |
| } |
| |
| int FuzzyMatcher::matchBonus(int P, int W, Action Last) const { |
| assert(LowPat[P] == LowWord[W]); |
| int S = 1; |
| bool IsPatSingleCase = |
| (PatTypeSet == 1 << Lower) || (PatTypeSet == 1 << Upper); |
| // Bonus: case matches, or a Head in the pattern aligns with one in the word. |
| // Single-case patterns lack segmentation signals and we assume any character |
| // can be a head of a segment. |
| if (Pat[P] == Word[W] || |
| (WordRole[W] == Head && (IsPatSingleCase || PatRole[P] == Head))) |
| ++S; |
| // Bonus: a consecutive match. First character match also gets a bonus to |
| // ensure prefix final match score normalizes to 1.0. |
| if (W == 0 || Last == Match) |
| S += 2; |
| // Penalty: matching inside a segment (and previous char wasn't matched). |
| if (WordRole[W] == Tail && P && Last == Miss) |
| S -= 3; |
| // Penalty: a Head in the pattern matches in the middle of a word segment. |
| if (PatRole[P] == Head && WordRole[W] == Tail) |
| --S; |
| // Penalty: matching the first pattern character in the middle of a segment. |
| if (P == 0 && WordRole[W] == Tail) |
| S -= 4; |
| assert(S <= PerfectBonus); |
| return S; |
| } |
| |
| llvm::SmallString<256> FuzzyMatcher::dumpLast(llvm::raw_ostream &OS) const { |
| llvm::SmallString<256> Result; |
| OS << "=== Match \"" << llvm::StringRef(Word, WordN) << "\" against [" |
| << llvm::StringRef(Pat, PatN) << "] ===\n"; |
| if (PatN == 0) { |
| OS << "Pattern is empty: perfect match.\n"; |
| return Result = llvm::StringRef(Word, WordN); |
| } |
| if (WordN == 0) { |
| OS << "Word is empty: no match.\n"; |
| return Result; |
| } |
| if (!WordContainsPattern) { |
| OS << "Substring check failed.\n"; |
| return Result; |
| } |
| if (isAwful(std::max(Scores[PatN][WordN][Match].Score, |
| Scores[PatN][WordN][Miss].Score))) { |
| OS << "Substring check passed, but all matches are forbidden\n"; |
| } |
| if (!(PatTypeSet & 1 << Upper)) |
| OS << "Lowercase query, so scoring ignores case\n"; |
| |
| // Traverse Matched table backwards to reconstruct the Pattern/Word mapping. |
| // The Score table has cumulative scores, subtracting along this path gives |
| // us the per-letter scores. |
| Action Last = |
| (Scores[PatN][WordN][Match].Score > Scores[PatN][WordN][Miss].Score) |
| ? Match |
| : Miss; |
| int S[MaxWord]; |
| Action A[MaxWord]; |
| for (int W = WordN - 1, P = PatN - 1; W >= 0; --W) { |
| A[W] = Last; |
| const auto &Cell = Scores[P + 1][W + 1][Last]; |
| if (Last == Match) |
| --P; |
| const auto &Prev = Scores[P + 1][W][Cell.Prev]; |
| S[W] = Cell.Score - Prev.Score; |
| Last = Cell.Prev; |
| } |
| for (int I = 0; I < WordN; ++I) { |
| if (A[I] == Match && (I == 0 || A[I - 1] == Miss)) |
| Result.push_back('['); |
| if (A[I] == Miss && I > 0 && A[I - 1] == Match) |
| Result.push_back(']'); |
| Result.push_back(Word[I]); |
| } |
| if (A[WordN - 1] == Match) |
| Result.push_back(']'); |
| |
| for (char C : llvm::StringRef(Word, WordN)) |
| OS << " " << C << " "; |
| OS << "\n"; |
| for (int I = 0, J = 0; I < WordN; I++) |
| OS << " " << (A[I] == Match ? Pat[J++] : ' ') << " "; |
| OS << "\n"; |
| for (int I = 0; I < WordN; I++) |
| OS << llvm::format("%2d ", S[I]); |
| OS << "\n"; |
| |
| OS << "\nSegmentation:"; |
| OS << "\n'" << llvm::StringRef(Word, WordN) << "'\n "; |
| for (int I = 0; I < WordN; ++I) |
| OS << "?-+ "[static_cast<int>(WordRole[I])]; |
| OS << "\n[" << llvm::StringRef(Pat, PatN) << "]\n "; |
| for (int I = 0; I < PatN; ++I) |
| OS << "?-+ "[static_cast<int>(PatRole[I])]; |
| OS << "\n"; |
| |
| OS << "\nScoring table (last-Miss, last-Match):\n"; |
| OS << " | "; |
| for (char C : llvm::StringRef(Word, WordN)) |
| OS << " " << C << " "; |
| OS << "\n"; |
| OS << "-+----" << std::string(WordN * 4, '-') << "\n"; |
| for (int I = 0; I <= PatN; ++I) { |
| for (Action A : {Miss, Match}) { |
| OS << ((I && A == Miss) ? Pat[I - 1] : ' ') << "|"; |
| for (int J = 0; J <= WordN; ++J) { |
| if (!isAwful(Scores[I][J][A].Score)) |
| OS << llvm::format("%3d%c", Scores[I][J][A].Score, |
| Scores[I][J][A].Prev == Match ? '*' : ' '); |
| else |
| OS << " "; |
| } |
| OS << "\n"; |
| } |
| } |
| |
| return Result; |
| } |
| |
| } // namespace clangd |
| } // namespace clang |