[clangd] Record information about non self-contained headers in IncludeStructure

This will be useful for IncludeCleaner.

Reviewed By: sammccall

Differential Revision: https://reviews.llvm.org/D114370

GitOrigin-RevId: cd0ca5a0eaa1b75b445e82753ea093bbb8e7e85c
diff --git a/clangd/CodeComplete.cpp b/clangd/CodeComplete.cpp
index ac6c3b0..63111c2 100644
--- a/clangd/CodeComplete.cpp
+++ b/clangd/CodeComplete.cpp
@@ -1272,8 +1272,7 @@
   // Force them to be deserialized so SemaCodeComplete sees them.
   loadMainFilePreambleMacros(Clang->getPreprocessor(), Input.Preamble);
   if (Includes)
-    Clang->getPreprocessor().addPPCallbacks(
-        Includes->collect(Clang->getSourceManager()));
+    Clang->getPreprocessor().addPPCallbacks(Includes->collect(*Clang));
   if (llvm::Error Err = Action.Execute()) {
     log("Execute() failed when running codeComplete for {0}: {1}",
         Input.FileName, toString(std::move(Err)));
diff --git a/clangd/Headers.cpp b/clangd/Headers.cpp
index 9f5ab24..dc11306 100644
--- a/clangd/Headers.cpp
+++ b/clangd/Headers.cpp
@@ -22,12 +22,12 @@
 
 namespace clang {
 namespace clangd {
-namespace {
 
-class RecordHeaders : public PPCallbacks {
+class IncludeStructure::RecordHeaders : public PPCallbacks {
 public:
-  RecordHeaders(const SourceManager &SM, IncludeStructure *Out)
-      : SM(SM), Out(Out) {}
+  RecordHeaders(const SourceManager &SM, HeaderSearch &HeaderInfo,
+                IncludeStructure *Out)
+      : SM(SM), HeaderInfo(HeaderInfo), Out(Out) {}
 
   // Record existing #includes - both written and resolved paths. Only #includes
   // in the main file are collected.
@@ -85,10 +85,17 @@
         InBuiltinFile = true;
       }
       break;
-    case PPCallbacks::ExitFile:
+    case PPCallbacks::ExitFile: {
       if (PrevFID == BuiltinFile)
         InBuiltinFile = false;
+      // At file exit time HeaderSearchInfo is valid and can be used to
+      // determine whether the file was a self-contained header or not.
+      if (const FileEntry *FE = SM.getFileEntryForID(PrevFID))
+        if (!isSelfContainedHeader(FE, PrevFID, SM, HeaderInfo))
+          Out->NonSelfContained.insert(
+              *Out->getID(SM.getFileEntryForID(PrevFID)));
       break;
+    }
     case PPCallbacks::RenameFile:
     case PPCallbacks::SystemHeaderPragma:
       break;
@@ -97,6 +104,7 @@
 
 private:
   const SourceManager &SM;
+  HeaderSearch &HeaderInfo;
   // Set after entering the <built-in> file.
   FileID BuiltinFile;
   // Indicates whether <built-in> file is part of include stack.
@@ -105,8 +113,6 @@
   IncludeStructure *Out;
 };
 
-} // namespace
-
 bool isLiteralInclude(llvm::StringRef Include) {
   return Include.startswith("<") || Include.startswith("\"");
 }
@@ -152,9 +158,11 @@
 }
 
 std::unique_ptr<PPCallbacks>
-IncludeStructure::collect(const SourceManager &SM) {
+IncludeStructure::collect(const CompilerInstance &CI) {
+  auto &SM = CI.getSourceManager();
   MainFileEntry = SM.getFileEntryForID(SM.getMainFileID());
-  return std::make_unique<RecordHeaders>(SM, this);
+  return std::make_unique<RecordHeaders>(
+      SM, CI.getPreprocessor().getHeaderSearchInfo(), this);
 }
 
 llvm::Optional<IncludeStructure::HeaderID>
diff --git a/clangd/Headers.h b/clangd/Headers.h
index 7b42598..b6c6747 100644
--- a/clangd/Headers.h
+++ b/clangd/Headers.h
@@ -17,10 +17,12 @@
 #include "clang/Basic/FileEntry.h"
 #include "clang/Basic/TokenKinds.h"
 #include "clang/Format/Format.h"
+#include "clang/Frontend/CompilerInstance.h"
 #include "clang/Lex/HeaderSearch.h"
 #include "clang/Lex/PPCallbacks.h"
 #include "clang/Tooling/Inclusions/HeaderIncludes.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSet.h"
 #include "llvm/Support/Error.h"
@@ -123,7 +125,7 @@
 
   // Returns a PPCallback that visits all inclusions in the main file and
   // populates the structure.
-  std::unique_ptr<PPCallbacks> collect(const SourceManager &SM);
+  std::unique_ptr<PPCallbacks> collect(const CompilerInstance &CI);
 
   // HeaderID identifies file in the include graph. It corresponds to a
   // FileEntry rather than a FileID, but stays stable across preamble & main
@@ -138,6 +140,10 @@
     return RealPathNames[static_cast<unsigned>(ID)];
   }
 
+  bool isSelfContained(HeaderID ID) const {
+    return !NonSelfContained.contains(ID); // Unlisted IDs default to true.
+  }
+
   // Return all transitively reachable files.
   llvm::ArrayRef<std::string> allHeaders() const { return RealPathNames; }
 
@@ -158,6 +164,8 @@
   // content of the main file changes.
   static const HeaderID MainFileID = HeaderID(0u);
 
+  class RecordHeaders;
+
 private:
   // MainFileEntry will be used to check if the queried file is the main file
   // or not.
@@ -170,6 +178,9 @@
   // and RealPathName and UniqueID are not preserved in
   // the preamble.
   llvm::DenseMap<llvm::sys::fs::UniqueID, HeaderID> UIDToIndex;
+  // Contains HeaderIDs of all non self-contained entries in the
+  // IncludeStructure.
+  llvm::DenseSet<HeaderID> NonSelfContained;
 };
 
 // Calculates insertion edit for including a new header in a file.
diff --git a/clangd/ParsedAST.cpp b/clangd/ParsedAST.cpp
index 719c374..5239bc3 100644
--- a/clangd/ParsedAST.cpp
+++ b/clangd/ParsedAST.cpp
@@ -446,8 +446,7 @@
   // Important: collectIncludeStructure is registered *after* ReplayPreamble!
   // Otherwise we would collect the replayed includes again...
   // (We can't *just* use the replayed includes, they don't have Resolved path).
-  Clang->getPreprocessor().addPPCallbacks(
-      Includes.collect(Clang->getSourceManager()));
+  Clang->getPreprocessor().addPPCallbacks(Includes.collect(*Clang));
   // Copy over the macros in the preamble region of the main file, and combine
   // with non-preamble macros below.
   MainFileMacros Macros;
diff --git a/clangd/Preamble.cpp b/clangd/Preamble.cpp
index 8b80ee1..79165e3 100644
--- a/clangd/Preamble.cpp
+++ b/clangd/Preamble.cpp
@@ -23,6 +23,7 @@
 #include "clang/Basic/TokenKinds.h"
 #include "clang/Frontend/CompilerInvocation.h"
 #include "clang/Frontend/FrontendActions.h"
+#include "clang/Lex/HeaderSearch.h"
 #include "clang/Lex/Lexer.h"
 #include "clang/Lex/PPCallbacks.h"
 #include "clang/Lex/Preprocessor.h"
@@ -98,6 +99,7 @@
     CanonIncludes.addSystemHeadersMapping(CI.getLangOpts());
     LangOpts = &CI.getLangOpts();
     SourceMgr = &CI.getSourceManager();
+    Compiler = &CI;
   }
 
   std::unique_ptr<PPCallbacks> createPPCallbacks() override {
@@ -105,7 +107,7 @@
            "SourceMgr and LangOpts must be set at this point");
 
     return std::make_unique<PPChainedCallbacks>(
-        Includes.collect(*SourceMgr),
+        Includes.collect(*Compiler),
         std::make_unique<PPChainedCallbacks>(
             std::make_unique<CollectMainFileMacros>(*SourceMgr, Macros),
             collectPragmaMarksCallback(*SourceMgr, Marks)));
@@ -140,6 +142,7 @@
   std::unique_ptr<CommentHandler> IWYUHandler = nullptr;
   const clang::LangOptions *LangOpts = nullptr;
   const SourceManager *SourceMgr = nullptr;
+  const CompilerInstance *Compiler = nullptr;
 };
 
 // Represents directives other than includes, where basic textual information is
@@ -283,10 +286,9 @@
   PreprocessOnlyAction Action;
   if (!Action.BeginSourceFile(*Clang, Clang->getFrontendOpts().Inputs[0]))
     return error("failed BeginSourceFile");
-  const auto &SM = Clang->getSourceManager();
   Preprocessor &PP = Clang->getPreprocessor();
   IncludeStructure Includes;
-  PP.addPPCallbacks(Includes.collect(SM));
+  PP.addPPCallbacks(Includes.collect(*Clang));
   ScannedPreamble SP;
   SP.Bounds = Bounds;
   PP.addPPCallbacks(
diff --git a/clangd/SourceCode.cpp b/clangd/SourceCode.cpp
index 16072f3..f3a60d6 100644
--- a/clangd/SourceCode.cpp
+++ b/clangd/SourceCode.cpp
@@ -1182,5 +1182,58 @@
   return SM.getBufferData(FID).startswith(PROTO_HEADER_COMMENT);
 }
 
+namespace {
+
+// Is Line an #if-family directive (#if, #ifdef, #ifndef)? Prefix match only.
+// FIXME: This makes headers with #ifdef LINUX/WINDOWS/MACOS marked as non
+// self-contained and is probably not what we want.
+bool isIf(llvm::StringRef Line) {
+  Line = Line.ltrim(); // Directives may be indented.
+  if (!Line.consume_front("#"))
+    return false;
+  Line = Line.ltrim(); // Whitespace may separate '#' from the keyword.
+  return Line.startswith("if");
+}
+
+// Is Line an #error directive whose text mentions includes (case-insensitive)?
+bool isErrorAboutInclude(llvm::StringRef Line) {
+  Line = Line.ltrim(); // Directives may be indented.
+  if (!Line.consume_front("#"))
+    return false;
+  Line = Line.ltrim(); // Accepts "# error" as well as "#error".
+  if (!Line.startswith("error"))
+    return false;
+  return Line.contains_insensitive(
+      "includ"); // Matches "include" or "including".
+}
+
+// Heuristically detects headers that only want to be included via an umbrella.
+bool isDontIncludeMeHeader(llvm::StringRef Content) {
+  llvm::StringRef Line;
+  // Only sniff the first 10000 bytes, and at most 100 lines of those.
+  Content = Content.take_front(100 * 100);
+  for (unsigned I = 0; I < 100 && !Content.empty(); ++I) {
+    std::tie(Line, Content) = Content.split('\n'); // Pop the next line.
+    if (isIf(Line) && isErrorAboutInclude(Content.split('\n').first))
+      return true; // #if directly followed by an include-related #error.
+  }
+  return false;
+}
+
+} // namespace
+
+bool isSelfContainedHeader(const FileEntry *FE, FileID FID,
+                           const SourceManager &SM, HeaderSearch &HeaderInfo) {
+  // FIXME: Should files that have been #import'd be considered
+  // self-contained? That's really a property of the includer,
+  // not of the file.
+  if (!HeaderInfo.isFileMultipleIncludeGuarded(FE) &&
+      !HeaderInfo.hasFileBeenImported(FE))
+    return false; // Not guarded against multiple inclusion, never #import'd.
+  // An #if line directly followed by an include-related #error suggests the
+  // header needs particular preprocessor state, usually set by another header.
+  return !isDontIncludeMeHeader(SM.getBufferData(FID));
+}
+
 } // namespace clangd
 } // namespace clang
diff --git a/clangd/SourceCode.h b/clangd/SourceCode.h
index 459c943..315d79a 100644
--- a/clangd/SourceCode.h
+++ b/clangd/SourceCode.h
@@ -21,6 +21,7 @@
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Format/Format.h"
+#include "clang/Lex/HeaderSearch.h"
 #include "clang/Tooling/Core/Replacement.h"
 #include "clang/Tooling/Syntax/Tokens.h"
 #include "llvm/ADT/StringRef.h"
@@ -324,6 +325,11 @@
 /// Returns true if the given location is in a generated protobuf file.
 bool isProtoFile(SourceLocation Loc, const SourceManager &SourceMgr);
 
+/// This scans source code, and should not be called when using a preamble.
+/// Prefer to access the cache in IncludeStructure::isSelfContained if you can.
+bool isSelfContainedHeader(const FileEntry *FE, FileID ID,
+                           const SourceManager &SM, HeaderSearch &HeaderInfo);
+
 } // namespace clangd
 } // namespace clang
 #endif
diff --git a/clangd/index/SymbolCollector.cpp b/clangd/index/SymbolCollector.cpp
index dfcd4cb..5171e79 100644
--- a/clangd/index/SymbolCollector.cpp
+++ b/clangd/index/SymbolCollector.cpp
@@ -266,7 +266,8 @@
         return toURI(Canonical);
       }
     }
-    if (!isSelfContainedHeader(FID, FE)) {
+    if (!isSelfContainedHeader(FE, FID, PP->getSourceManager(),
+                               PP->getHeaderSearchInfo())) {
       // A .inc or .def file is often included into a real header to define
       // symbols (e.g. LLVM tablegen files).
       if (Filename.endswith(".inc") || Filename.endswith(".def"))
@@ -278,54 +279,6 @@
     // Standard case: just insert the file itself.
     return toURI(FE);
   }
-
-  bool isSelfContainedHeader(FileID FID, const FileEntry *FE) {
-    // FIXME: Should files that have been #import'd be considered
-    // self-contained? That's really a property of the includer,
-    // not of the file.
-    if (!PP->getHeaderSearchInfo().isFileMultipleIncludeGuarded(FE) &&
-        !PP->getHeaderSearchInfo().hasFileBeenImported(FE))
-      return false;
-    // This pattern indicates that a header can't be used without
-    // particular preprocessor state, usually set up by another header.
-    if (isDontIncludeMeHeader(SM.getBufferData(FID)))
-      return false;
-    return true;
-  }
-
-  // Is Line an #if or #ifdef directive?
-  static bool isIf(llvm::StringRef Line) {
-    Line = Line.ltrim();
-    if (!Line.consume_front("#"))
-      return false;
-    Line = Line.ltrim();
-    return Line.startswith("if");
-  }
-
-  // Is Line an #error directive mentioning includes?
-  static bool isErrorAboutInclude(llvm::StringRef Line) {
-    Line = Line.ltrim();
-    if (!Line.consume_front("#"))
-      return false;
-    Line = Line.ltrim();
-    if (!Line.startswith("error"))
-      return false;
-    return Line.contains_insensitive(
-        "includ"); // Matches "include" or "including".
-  }
-
-  // Heuristically headers that only want to be included via an umbrella.
-  static bool isDontIncludeMeHeader(llvm::StringRef Content) {
-    llvm::StringRef Line;
-    // Only sniff up to 100 lines or 10KB.
-    Content = Content.take_front(100 * 100);
-    for (unsigned I = 0; I < 100 && !Content.empty(); ++I) {
-      std::tie(Line, Content) = Content.split('\n');
-      if (isIf(Line) && isErrorAboutInclude(Content.split('\n').first))
-        return true;
-    }
-    return false;
-  }
 };
 
 // Return the symbol location of the token at \p TokLoc.
diff --git a/clangd/unittests/HeadersTests.cpp b/clangd/unittests/HeadersTests.cpp
index 85c3e88..b9e1d2c 100644
--- a/clangd/unittests/HeadersTests.cpp
+++ b/clangd/unittests/HeadersTests.cpp
@@ -80,8 +80,7 @@
     EXPECT_TRUE(
         Action.BeginSourceFile(*Clang, Clang->getFrontendOpts().Inputs[0]));
     IncludeStructure Includes;
-    Clang->getPreprocessor().addPPCallbacks(
-        Includes.collect(Clang->getSourceManager()));
+    Clang->getPreprocessor().addPPCallbacks(Includes.collect(*Clang));
     EXPECT_FALSE(Action.Execute());
     Action.EndSourceFile();
     return Includes;
@@ -363,6 +362,39 @@
   EXPECT_THAT(collectIncludes().MainFileIncludes,
               Contains(AllOf(IncludeLine(2), Written("<a.h>"))));
 }
+
+TEST_F(HeadersTest, SelfContainedHeaders) {
+  // One header per case: #pragma once, include guard, no guard, #error guard.
+  FS.Files[MainFile] = R"cpp(
+#include "includeguarded.h"
+#include "nonguarded.h"
+#include "pp_depend.h"
+#include "pragmaguarded.h"
+)cpp";
+  FS.Files["pragmaguarded.h"] = R"cpp(
+#pragma once
+)cpp";
+  FS.Files["includeguarded.h"] = R"cpp(
+#ifndef INCLUDE_GUARDED_H
+#define INCLUDE_GUARDED_H
+void foo();
+#endif // INCLUDE_GUARDED_H
+)cpp";
+  FS.Files["nonguarded.h"] = R"cpp(
+)cpp";
+  FS.Files["pp_depend.h"] = R"cpp(
+  #ifndef REQUIRED_PP_DIRECTIVE
+  # error You have to have PP directive set to include this one!
+  #endif
+)cpp";
+
+  auto Includes = collectIncludes();
+  EXPECT_TRUE(Includes.isSelfContained(getID("pragmaguarded.h", Includes)));
+  EXPECT_TRUE(Includes.isSelfContained(getID("includeguarded.h", Includes)));
+  EXPECT_FALSE(Includes.isSelfContained(getID("nonguarded.h", Includes)));
+  EXPECT_FALSE(Includes.isSelfContained(getID("pp_depend.h", Includes)));
+}
+
 } // namespace
 } // namespace clangd
 } // namespace clang
diff --git a/clangd/unittests/PreambleTests.cpp b/clangd/unittests/PreambleTests.cpp
index 3d02197..2e3dec9 100644
--- a/clangd/unittests/PreambleTests.cpp
+++ b/clangd/unittests/PreambleTests.cpp
@@ -82,8 +82,7 @@
     return {};
   }
   IncludeStructure Includes;
-  Clang->getPreprocessor().addPPCallbacks(
-      Includes.collect(Clang->getSourceManager()));
+  Clang->getPreprocessor().addPPCallbacks(Includes.collect(*Clang));
   if (llvm::Error Err = Action.Execute()) {
     ADD_FAILURE() << "failed to execute action: " << std::move(Err);
     return {};