blob: 7ebd5f685685ec0022b18075bdb2b0cfef613926 [file] [log] [blame]
//===--- Dex.cpp - Dex Symbol Index Implementation --------------*- C++ -*-===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "Dex.h"
#include "FileDistance.h"
#include "FuzzyMatch.h"
#include "Quality.h"
#include "index/Index.h"
#include "index/dex/Iterator.h"
#include "support/Logger.h"
#include "support/Trace.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/Support/ScopedPrinter.h"
#include <algorithm>
#include <cstdint>
#include <limits>
#include <memory>
#include <queue>
#include <string>
#include <utility>
#include <vector>
namespace clang {
namespace clangd {
namespace dex {
// Builds a Dex index over the given slabs. The slabs are moved into the index
// so the Symbol/Ref data they own stays alive as long as the index does.
std::unique_ptr<SymbolIndex> Dex::build(SymbolSlab Symbols, RefSlab Refs,
                                        RelationSlab Rels) {
  auto Size = Symbols.bytes() + Refs.bytes();
  // There is no need to include "Rels" in Data because the relations are self-
  // contained, without references into a backing store.
  auto Data = std::make_pair(std::move(Symbols), std::move(Refs));
  return std::make_unique<Dex>(Data.first, Data.second, Rels, std::move(Data),
                               Size);
}
namespace {
// Mark symbols which are can be used for code completion.
const Token RestrictedForCodeCompletion =
Token(Token::Kind::Sentinel, "Restricted For Code Completion");
// Helper to efficiently assemble the inverse index (token -> matching docs).
// The output is a nice uniform structure keyed on Token, but constructing
// the Token object every time we want to insert into the map is wasteful.
// Instead we have various maps keyed on things that are cheap to compute,
// and produce the Token keys once at the end.
class IndexBuilder {
llvm::DenseMap<Trigram, std::vector<DocID>> TrigramDocs;
std::vector<DocID> RestrictedCCDocs;
llvm::StringMap<std::vector<DocID>> TypeDocs;
llvm::StringMap<std::vector<DocID>> ScopeDocs;
llvm::StringMap<std::vector<DocID>> ProximityDocs;
std::vector<Trigram> TrigramScratch;
// Add the tokens which are given symbol's characteristics.
// This includes fuzzy matching trigrams, symbol's scope, etc.
// FIXME(kbobyrev): Support more token types:
// * Namespace proximity
void add(const Symbol &Sym, DocID D) {
generateIdentifierTrigrams(Sym.Name, TrigramScratch);
for (Trigram T : TrigramScratch)
if (!llvm::StringRef(Sym.CanonicalDeclaration.FileURI).empty())
for (const auto &ProximityURI :
if (Sym.Flags & Symbol::IndexedForCodeCompletion)
if (!Sym.Type.empty())
// Assemble the final compressed posting lists for the added symbols.
llvm::DenseMap<Token, PostingList> build() {
llvm::DenseMap<Token, PostingList> Result(/*InitialReserve=*/
TrigramDocs.size() +
RestrictedCCDocs.size() +
TypeDocs.size() +
ScopeDocs.size() +
for (const auto &E : TrigramDocs)
Result.try_emplace(Token(Token::Kind::Trigram, E.first.str()), E.second);
for (const auto &E : TypeDocs)
Result.try_emplace(Token(Token::Kind::Type, E.first()), E.second);
for (const auto &E : ScopeDocs)
Result.try_emplace(Token(Token::Kind::Scope, E.first()), E.second);
for (const auto &E : ProximityDocs)
Result.try_emplace(Token(Token::Kind::ProximityURI, E.first()), E.second);
if (!RestrictedCCDocs.empty())
Result.try_emplace(RestrictedForCodeCompletion, RestrictedCCDocs);
return Result;
} // namespace
// Builds all internal lookup structures: the ID lookup table, the quality
// scores, and the token -> posting-list inverted index.
void Dex::buildIndex() {
  this->Corpus = dex::Corpus(Symbols.size());
  std::vector<std::pair<float, const Symbol *>> ScoredSymbols(Symbols.size());

  for (size_t I = 0; I < Symbols.size(); ++I) {
    const Symbol *Sym = Symbols[I];
    LookupTable[Sym->ID] = Sym;
    ScoredSymbols[I] = {quality(*Sym), Sym};
  }

  // Symbols are sorted by symbol qualities so that items in the posting lists
  // are stored in the descending order of symbol quality.
  llvm::sort(ScoredSymbols, std::greater<std::pair<float, const Symbol *>>());

  // SymbolQuality was empty up until now.
  SymbolQuality.resize(Symbols.size());
  // Populate internal storage using Symbol + Score pairs.
  for (size_t I = 0; I < ScoredSymbols.size(); ++I) {
    SymbolQuality[I] = ScoredSymbols[I].first;
    Symbols[I] = ScoredSymbols[I].second;
  }

  // Build posting lists for symbols.
  IndexBuilder Builder;
  for (DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank)
    Builder.add(*Symbols[SymbolRank], SymbolRank);
  InvertedIndex = std::move(Builder).build();
}
// Returns the posting-list iterator for Tok, or an empty (FALSE) iterator
// when the token is not present in the index.
std::unique_ptr<Iterator> Dex::iterator(const Token &Tok) const {
  auto It = InvertedIndex.find(Tok);
  return It == InvertedIndex.end() ? Corpus.none()
                                   : It->second.iterator(&It->first);
}
// Constructs BOOST iterators for Path Proximities. The result matches all
// symbols, boosting those whose declaration is close to a ProximityPath.
std::unique_ptr<Iterator> Dex::createFileProximityIterator(
    llvm::ArrayRef<std::string> ProximityPaths) const {
  std::vector<std::unique_ptr<Iterator>> BoostingIterators;
  // Deduplicate parent URIs extracted from the ProximityPaths.
  llvm::StringSet<> ParentURIs;
  llvm::StringMap<SourceParams> Sources;
  for (const auto &Path : ProximityPaths) {
    Sources[Path] = SourceParams();
    auto PathURI = URI::create(Path);
    const auto PathProximityURIs = generateProximityURIs(PathURI.toString());
    for (const auto &ProximityURI : PathProximityURIs)
      ParentURIs.insert(ProximityURI);
  }
  // Use SymbolRelevanceSignals for symbol relevance evaluation: use defaults
  // for all parameters except for Proximity Path distance signal.
  SymbolRelevanceSignals PathProximitySignals;
  // DistanceCalculator will find the shortest distance from ProximityPaths to
  // any URI extracted from the ProximityPaths.
  URIDistance DistanceCalculator(Sources);
  PathProximitySignals.FileProximityMatch = &DistanceCalculator;
  // Try to build BOOST iterator for each Proximity Path provided by
  // ProximityPaths. Boosting factor should depend on the distance to the
  // Proximity Path: the closer processed path is, the higher boosting factor.
  for (const auto &ParentURI : ParentURIs.keys()) {
    // FIXME(kbobyrev): Append LIMIT on top of every BOOST iterator.
    auto It = iterator(Token(Token::Kind::ProximityURI, ParentURI));
    if (It->kind() != Iterator::Kind::False) {
      PathProximitySignals.SymbolURI = ParentURI;
      BoostingIterators.push_back(Corpus.boost(
          std::move(It), PathProximitySignals.evaluateHeuristics()));
    }
  }
  // Match all symbols so that unboosted ones are still retrievable.
  BoostingIterators.push_back(Corpus.all());
  return Corpus.unionOf(std::move(BoostingIterators));
}
// Constructs BOOST iterators for preferred types.
Dex::createTypeBoostingIterator(llvm::ArrayRef<std::string> Types) const {
std::vector<std::unique_ptr<Iterator>> BoostingIterators;
SymbolRelevanceSignals PreferredTypeSignals;
PreferredTypeSignals.TypeMatchesPreferred = true;
auto Boost = PreferredTypeSignals.evaluateHeuristics();
for (const auto &T : Types)
Corpus.boost(iterator(Token(Token::Kind::Type, T)), Boost));
return Corpus.unionOf(std::move(BoostingIterators));
/// Constructs iterators over tokens extracted from the query and exhausts it
/// while applying Callback to each symbol in the order of decreasing quality
/// of the matched symbols.
bool Dex::fuzzyFind(const FuzzyFindRequest &Req,
llvm::function_ref<void(const Symbol &)> Callback) const {
assert(!StringRef(Req.Query).contains("::") &&
"There must be no :: in query.");
trace::Span Tracer("Dex fuzzyFind");
FuzzyMatcher Filter(Req.Query);
// For short queries we use specialized trigrams that don't yield all results.
// Prevent clients from postfiltering them for longer queries.
bool More = !Req.Query.empty() && Req.Query.size() < 3;
std::vector<std::unique_ptr<Iterator>> Criteria;
const auto TrigramTokens = generateQueryTrigrams(Req.Query);
// Generate query trigrams and construct AND iterator over all query
// trigrams.
std::vector<std::unique_ptr<Iterator>> TrigramIterators;
for (const auto &Trigram : TrigramTokens)
// Generate scope tokens for search query.
std::vector<std::unique_ptr<Iterator>> ScopeIterators;
for (const auto &Scope : Req.Scopes)
ScopeIterators.push_back(iterator(Token(Token::Kind::Scope, Scope)));
if (Req.AnyScope)
Corpus.boost(Corpus.all(), ScopeIterators.empty() ? 1.0 : 0.2));
// Add proximity paths boosting (all symbols, some boosted).
// Add boosting for preferred types.
if (Req.RestrictForCodeCompletion)
// Use TRUE iterator if both trigrams and scopes from the query are not
// present in the symbol index.
auto Root = Corpus.intersect(move(Criteria));
// Retrieve more items than it was requested: some of the items with high
// final score might not be retrieved otherwise.
// FIXME(kbobyrev): Tune this ratio.
if (Req.Limit)
Root = Corpus.limit(move(Root), *Req.Limit * 100);
SPAN_ATTACH(Tracer, "query", llvm::to_string(*Root));
vlog("Dex query tree: {0}", *Root);
using IDAndScore = std::pair<DocID, float>;
std::vector<IDAndScore> IDAndScores = consume(*Root);
auto Compare = [](const IDAndScore &LHS, const IDAndScore &RHS) {
return LHS.second > RHS.second;
TopN<IDAndScore, decltype(Compare)> Top(
Req.Limit ? *Req.Limit : std::numeric_limits<size_t>::max(), Compare);
for (const auto &IDAndScore : IDAndScores) {
const DocID SymbolDocID = IDAndScore.first;
const auto *Sym = Symbols[SymbolDocID];
const llvm::Optional<float> Score = Filter.match(Sym->Name);
if (!Score)
// Combine Fuzzy Matching score, precomputed symbol quality and boosting
// score for a cumulative final symbol score.
const float FinalScore =
(*Score) * SymbolQuality[SymbolDocID] * IDAndScore.second;
// If Top.push(...) returns true, it means that it had to pop an item. In
// this case, it is possible to retrieve more symbols.
if (Top.push({SymbolDocID, FinalScore}))
More = true;
// Apply callback to the top Req.Limit items in the descending
// order of cumulative score.
for (const auto &Item : std::move(Top).items())
return More;
void Dex::lookup(const LookupRequest &Req,
llvm::function_ref<void(const Symbol &)> Callback) const {
trace::Span Tracer("Dex lookup");
for (const auto &ID : Req.IDs) {
auto I = LookupTable.find(ID);
if (I != LookupTable.end())
bool Dex::refs(const RefsRequest &Req,
llvm::function_ref<void(const Ref &)> Callback) const {
trace::Span Tracer("Dex refs");
uint32_t Remaining =
for (const auto &ID : Req.IDs)
for (const auto &Ref : Refs.lookup(ID)) {
if (!static_cast<int>(Req.Filter & Ref.Kind))
if (Remaining == 0)
return true; // More refs were available.
return false; // We reported all refs.
void Dex::relations(
const RelationsRequest &Req,
llvm::function_ref<void(const SymbolID &, const Symbol &)> Callback) const {
trace::Span Tracer("Dex relations");
uint32_t Remaining =
for (const SymbolID &Subject : Req.Subjects) {
LookupRequest LookupReq;
auto It = Relations.find(
std::make_pair(Subject, static_cast<uint8_t>(Req.Predicate)));
if (It != Relations.end()) {
for (const auto &Object : It->second) {
if (Remaining > 0) {
lookup(LookupReq, [&](const Symbol &Object) { Callback(Subject, Object); });
// Returns a callback telling whether a file URI is covered by this index.
// The callback captures `this`, so it must not outlive the index.
llvm::unique_function<IndexContents(llvm::StringRef) const>
Dex::indexedFiles() const {
  return [this](llvm::StringRef FileURI) {
    return Files.contains(FileURI) ? IdxContents : IndexContents::None;
  };
}
// Rough estimate of the index's heap footprint: internal tables plus the
// posting lists plus the backing symbol/ref storage.
size_t Dex::estimateMemoryUsage() const {
  size_t Bytes = Symbols.size() * sizeof(const Symbol *);
  Bytes += SymbolQuality.size() * sizeof(float);
  Bytes += LookupTable.getMemorySize();
  Bytes += InvertedIndex.getMemorySize();
  for (const auto &TokenToPostingList : InvertedIndex)
    Bytes += TokenToPostingList.second.bytes();
  Bytes += Refs.getMemorySize();
  Bytes += Relations.getMemorySize();
  return Bytes + BackingDataSize;
}
// Returns URIPath itself followed by the URIs of up to 4 of its parent
// directories, used for file-proximity boosting tokens.
std::vector<std::string> generateProximityURIs(llvm::StringRef URIPath) {
  std::vector<std::string> Result;
  auto ParsedURI = URI::parse(URIPath);
  assert(ParsedURI &&
         "Non-empty argument of generateProximityURIs() should be a valid "
         "URI.");
  llvm::StringRef Body = ParsedURI->body();
  // FIXME(kbobyrev): Currently, this is a heuristic which defines the maximum
  // size of resulting vector. Some projects might want to have higher limit if
  // the file hierarchy is deeper. For the generic case, it would be useful to
  // calculate Limit in the index build stage by calculating the maximum depth
  // of the project source tree at runtime.
  size_t Limit = 5;
  // Insert original URI before the loop: this would save a redundant iteration
  // with a URI parse.
  Result.emplace_back(URIPath);
  while (!Body.empty() && --Limit > 0) {
    // FIXME(kbobyrev): Parsing and encoding path to URIs is not necessary and
    // could be optimized.
    Body = llvm::sys::path::parent_path(Body, llvm::sys::path::Style::posix);
    if (!Body.empty())
      Result.emplace_back(
          URI(ParsedURI->scheme(), ParsedURI->authority(), Body).toString());
  }
  return Result;
}
} // namespace dex
} // namespace clangd
} // namespace clang