 // tools/llvm-exegesis/lib/BenchmarkRunner.cpp - llvm-project/llvm - Git at Google

 //===-- BenchmarkRunner.cpp -------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//

 #include <cmath>
 #include <memory>
 #include <string>

 #include "Assembler.h"
 #include "BenchmarkRunner.h"
 #include "Error.h"
 #include "MCInstrDescView.h"
 #include "MmapUtils.h"
 #include "PerfHelper.h"
 #include "SubprocessMemory.h"
 #include "Target.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/CrashRecoveryContext.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Program.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/SystemZ/zOSSupport.h"

 #ifdef __linux__
 #ifdef HAVE_LIBPFM
 #include <perfmon/perf_event.h>
 #endif
 #include <sys/mman.h>
 #include <sys/ptrace.h>
 #include <sys/resource.h>
 #include <sys/socket.h>
 #include <sys/syscall.h>
 #include <sys/wait.h>
 #include <unistd.h>

 #if defined(__GLIBC__) && __has_include(<sys/rseq.h>) && defined(HAVE_BUILTIN_THREAD_POINTER)
 #include <sys/rseq.h>
 #if defined(RSEQ_SIG) && defined(SYS_rseq)
 #define GLIBC_INITS_RSEQ
 #endif
 #endif
 #endif // __linux__

 namespace llvm {
 namespace exegesis {

 // Builds a runner bound to State. The benchmark mode, phase selector,
 // execution mode and requested validation counters are stored as given, and a
 // fresh ScratchSpace is allocated for this runner.
 BenchmarkRunner::BenchmarkRunner(const LLVMState &State, Benchmark::ModeE Mode,
                                  BenchmarkPhaseSelectorE BenchmarkPhaseSelector,
                                  ExecutionModeE ExecutionMode,
                                  ArrayRef<ValidationEvent> ValCounters)
     : State(State), Mode(Mode), BenchmarkPhaseSelector(BenchmarkPhaseSelector),
       ExecutionMode(ExecutionMode), ValidationCounters(ValCounters),
       Scratch(std::make_unique<ScratchSpace>()) {}

 // Defined out of line; the compiler-generated destructor is sufficient.
 BenchmarkRunner::~BenchmarkRunner() = default;

 void BenchmarkRunner::FunctionExecutor::accumulateCounterValues(
     const SmallVectorImpl<int64_t> &NewValues,
     SmallVectorImpl<int64_t> *Result) {
   const size_t NumValues = std::max(NewValues.size(), Result->size());
   if (NumValues > Result->size())
     Result->resize(NumValues, 0);
   for (size_t I = 0, End = NewValues.size(); I < End; ++I)
     (*Result)[I] += NewValues[I];
 }

 // Runs the benchmark once per counter listed in Counters (a '+'-separated
 // list) and returns the element-wise sum of the values read. The first
 // failing run aborts the loop and its error is returned.
 Expected<SmallVector<int64_t, 4>>
 BenchmarkRunner::FunctionExecutor::runAndSample(
     const char *Counters, ArrayRef<const char *> ValidationCounters,
     SmallVectorImpl<int64_t> &ValidationCounterValues) const {
   // A single ProcRes may be described by several counters (e.g. P23 on
   // SandyBridge); their counts are summed.
   SmallVector<StringRef, 2> SplitNames;
   StringRef(Counters).split(SplitNames, '+');

   SmallVector<int64_t, 4> Totals;
   for (const StringRef RawName : SplitNames) {
     Expected<SmallVector<int64_t, 4>> SampleOrErr = runWithCounter(
         RawName.trim(), ValidationCounters, ValidationCounterValues);
     if (Error E = SampleOrErr.takeError())
       return std::move(E);
     accumulateCounterValues(*SampleOrErr, &Totals);
   }
   return Totals;
 }

 namespace {
 // Function executor that runs the assembled snippet inside the llvm-exegesis
 // process itself. The call is wrapped in a CrashRecoveryContext so that a
 // crashing snippet is reported as an error rather than taking the tool down.
 class InProcessFunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor {
 public:
   // Makes the function contained in Obj executable and wraps it in a new
   // executor that hands Scratch to the snippet. Propagates the error from
   // ExecutableFunction::create on failure.
   static Expected<std::unique_ptr<InProcessFunctionExecutorImpl>>
   create(const LLVMState &State, object::OwningBinary<object::ObjectFile> Obj,
          BenchmarkRunner::ScratchSpace *Scratch) {
     Expected<ExecutableFunction> EF =
         ExecutableFunction::create(State.createTargetMachine(), std::move(Obj));

     if (!EF)
       return EF.takeError();

     // The constructor is private, so std::make_unique cannot be used here.
     return std::unique_ptr<InProcessFunctionExecutorImpl>(
         new InProcessFunctionExecutorImpl(State, std::move(*EF), Scratch));
   }

 private:
   // Instances are only built through create().
   InProcessFunctionExecutorImpl(const LLVMState &State,
                                 ExecutableFunction Function,
                                 BenchmarkRunner::ScratchSpace *Scratch)
       : State(State), Function(std::move(Function)), Scratch(Scratch) {}

   // Runs the function once with the counter group for CounterName started
   // right before and stopped right after the call.
   //
   // On success the validation counter readings are copied into the leading
   // entries of ValidationCounterValues and the main counter values are
   // returned. If the snippet crashes, a SnippetSignal error is returned
   // instead; counter creation and read failures are propagated unchanged.
   Expected<SmallVector<int64_t, 4>> runWithCounter(
       StringRef CounterName, ArrayRef<const char *> ValidationCounters,
       SmallVectorImpl<int64_t> &ValidationCounterValues) const override {
     const ExegesisTarget &ET = State.getExegesisTarget();
     char *const ScratchPtr = Scratch->ptr();
     auto CounterOrError =
         ET.createCounter(CounterName, State, ValidationCounters);

     if (!CounterOrError)
       return CounterOrError.takeError();

     pfm::CounterGroup *Counter = CounterOrError.get().get();
     Scratch->clear();
     {
       auto PS = ET.withSavedState();
       CrashRecoveryContext CRC;
       CrashRecoveryContext::Enable();
       const bool Crashed = !CRC.RunSafely([this, Counter, ScratchPtr]() {
         Counter->start();
         this->Function(ScratchPtr);
         Counter->stop();
       });
       CrashRecoveryContext::Disable();
       PS.reset();
       if (Crashed) {
 #ifdef LLVM_ON_UNIX
         // See "Exit Status for Commands":
         // https://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xcu_chap02.html
         constexpr const int kSigOffset = 128;
         return make_error<SnippetSignal>(CRC.RetCode - kSigOffset);
 #else
         // The exit code of the process on windows is not meaningful as a
         // signal, so simply pass in -1 as the signal into the error.
         return make_error<SnippetSignal>(-1);
 #endif // LLVM_ON_UNIX
       }
     }

     auto ValidationValuesOrErr = Counter->readValidationCountersOrError();
     if (!ValidationValuesOrErr)
       return ValidationValuesOrErr.takeError();

     ArrayRef RealValidationValues = *ValidationValuesOrErr;
     for (size_t I = 0; I < RealValidationValues.size(); ++I)
       ValidationCounterValues[I] = RealValidationValues[I];

     return Counter->readOrError(Function.getFunctionBytes());
   }

   // LLVM/target state used to create counters; referenced, not owned.
   const LLVMState &State;
   // The executable snippet; invoked with the scratch pointer as argument.
   const ExecutableFunction Function;
   // Scratch memory handed to the snippet; not owned by this executor.
   BenchmarkRunner::ScratchSpace *const Scratch;
 };

 #ifdef __linux__
 // The following class implements a function executor that executes the
 // benchmark code within a subprocess rather than within the main llvm-exegesis
 // process. This allows for much more control over the execution context of the
 // snippet, particularly with regard to memory. This class performs all the
 // necessary functions to create the subprocess, execute the snippet in the
 // subprocess, and report results/handle errors.
 class SubProcessFunctionExecutorImpl
     : public BenchmarkRunner::FunctionExecutor {
 public:
   // Makes the function contained in Obj executable and wraps it in a new
   // executor tied to State and Key. Propagates the error from
   // ExecutableFunction::create on failure.
   static Expected<std::unique_ptr<SubProcessFunctionExecutorImpl>>
   create(const LLVMState &State, object::OwningBinary<object::ObjectFile> Obj,
          const BenchmarkKey &Key) {
     Expected<ExecutableFunction> FunctionOrErr =
         ExecutableFunction::create(State.createTargetMachine(), std::move(Obj));
     if (Error E = FunctionOrErr.takeError())
       return std::move(E);

     // The constructor is private, so std::make_unique cannot be used here.
     std::unique_ptr<SubProcessFunctionExecutorImpl> Executor(
         new SubProcessFunctionExecutorImpl(State, std::move(*FunctionOrErr),
                                            Key));
     return std::move(Executor);
   }

 private:
   // Instances are only built through create(). State and Key are kept by
   // reference (the members are references), Function is moved into the
   // executor.
   SubProcessFunctionExecutorImpl(const LLVMState &State,
                                  ExecutableFunction Function,
                                  const BenchmarkKey &Key)
       : State(State), Function(std::move(Function)), Key(Key) {}

   // Non-zero exit codes used by the benchmarking child process to tell the
   // parent which setup step failed. childProcessExitCodeToString() maps each
   // of them to a human-readable message.
   enum ChildProcessExitCodeE {
     CounterFDReadFailed = 1,
     RSeqDisableFailed,
     FunctionDataMappingFailed,
     AuxiliaryMemorySetupFailed
   };

   // Translates an exit code of the benchmarking child process into a
   // description of the step that failed. Codes that are not part of
   // ChildProcessExitCodeE map to a generic "unknown exit code" message.
   StringRef childProcessExitCodeToString(int ExitCode) const {
     if (ExitCode == ChildProcessExitCodeE::CounterFDReadFailed)
       return "Counter file descriptor read failed";
     if (ExitCode == ChildProcessExitCodeE::RSeqDisableFailed)
       return "Disabling restartable sequences failed";
     if (ExitCode == ChildProcessExitCodeE::FunctionDataMappingFailed)
       return "Failed to map memory for assembled snippet";
     if (ExitCode == ChildProcessExitCodeE::AuxiliaryMemorySetupFailed)
       return "Failed to setup auxiliary memory";
     return "Child process returned with unknown exit code";
   }

   // Sends the file descriptor FD to the peer of SocketFD as SCM_RIGHTS
   // ancillary data. Returns a Failure carrying strerror(errno) if sendmsg()
   // fails, success otherwise.
   Error sendFileDescriptorThroughSocket(int SocketFD, int FD) const {
     // The ancillary buffer is accessed through struct cmsghdr, so it has to be
     // suitably aligned for that type (see cmsg(3)); a plain char array does
     // not guarantee this. The buffer is zero-initialized so that no
     // uninitialized padding is sent.
     alignas(struct cmsghdr) char Buffer[CMSG_SPACE(sizeof(FD))] = {};

     struct msghdr Message = {};
     Message.msg_control = Buffer;
     Message.msg_controllen = sizeof(Buffer);

     struct cmsghdr *ControlMessage = CMSG_FIRSTHDR(&Message);
     ControlMessage->cmsg_level = SOL_SOCKET;
     ControlMessage->cmsg_type = SCM_RIGHTS;
     ControlMessage->cmsg_len = CMSG_LEN(sizeof(FD));

     memcpy(CMSG_DATA(ControlMessage), &FD, sizeof(FD));

     ssize_t BytesWritten = sendmsg(SocketFD, &Message, 0);

     if (BytesWritten < 0)
       return make_error<Failure>("Failed to write FD to socket: " +
                                  Twine(strerror(errno)));

     return Error::success();
   }

   // Receives one file descriptor sent as SCM_RIGHTS ancillary data over
   // SocketFD and returns it. Returns a Failure if recvmsg() fails, if the
   // message carries no control data, or if the control data is not a single
   // SCM_RIGHTS file descriptor.
   Expected<int> getFileDescriptorFromSocket(int SocketFD) const {
     struct msghdr Message = {};

     // The buffer is read through struct cmsghdr and therefore has to be
     // aligned for it (see cmsg(3)).
     alignas(struct cmsghdr) char ControlBuffer[256];
     Message.msg_control = ControlBuffer;
     Message.msg_controllen = sizeof(ControlBuffer);

     ssize_t BytesRead = recvmsg(SocketFD, &Message, 0);

     if (BytesRead < 0)
       return make_error<Failure>("Failed to read FD from socket: " +
                                  Twine(strerror(errno)));

     // CMSG_FIRSTHDR yields a null pointer when no ancillary data was received,
     // e.g. when the peer closed its end without sending a descriptor.
     struct cmsghdr *ControlMessage = CMSG_FIRSTHDR(&Message);
     if (ControlMessage == nullptr)
       return make_error<Failure>(
           "Failed to read FD from socket: no control message was received.");

     if (ControlMessage->cmsg_level != SOL_SOCKET ||
         ControlMessage->cmsg_type != SCM_RIGHTS)
       return make_error<Failure>("Failed to read FD from socket: unexpected "
                                  "control message type.");

     int FD;

     if (ControlMessage->cmsg_len != CMSG_LEN(sizeof(FD)))
       return make_error<Failure>("Failed to get correct number of bytes for "
                                  "file descriptor from socket.");

     memcpy(&FD, CMSG_DATA(ControlMessage), sizeof(FD));

     return FD;
   }

   // Parent-side half of a subprocess benchmark run.
   //
   // Creates the counter group for CounterName (passing ChildPID to
   // createCounter), ptrace-attaches to the child, sends it the counter file
   // descriptor over WriteFD and then waits for the child to finish.
   //
   // If the child exits with status 0, CounterValues is overwritten with the
   // counter readings and the validation readings are copied into the leading
   // entries of ValidationCounterValues. A non-zero exit status is turned into
   // a Failure describing the exit code. In every other case the child is
   // killed and reaped and a SnippetSegmentationFault / SnippetSignal error is
   // returned based on the signal information obtained through ptrace.
   //
   // WriteFD is closed on every return path.
   Error
   runParentProcess(pid_t ChildPID, int WriteFD, StringRef CounterName,
                    SmallVectorImpl<int64_t> &CounterValues,
                    ArrayRef<const char *> ValidationCounters,
                    SmallVectorImpl<int64_t> &ValidationCounterValues) const {
     auto WriteFDClose = make_scope_exit([WriteFD]() { close(WriteFD); });
     const ExegesisTarget &ET = State.getExegesisTarget();
     auto CounterOrError =
         ET.createCounter(CounterName, State, ValidationCounters, ChildPID);

     if (!CounterOrError)
       return CounterOrError.takeError();

     pfm::CounterGroup *Counter = CounterOrError.get().get();

     // Make sure to attach to the process (and wait for the sigstop to be
     // delivered and for the process to continue) before we write to the counter
     // file descriptor. Attaching to the process before writing to the socket
     // ensures that the subprocess at most has blocked on the read call. If we
     // attach afterwards, the subprocess might exit before we get to the attach
     // call due to effects like scheduler contention, introducing transient
     // failures.
     if (ptrace(PTRACE_ATTACH, ChildPID, NULL, NULL) != 0)
       return make_error<Failure>("Failed to attach to the child process: " +
                                  Twine(strerror(errno)));

     // First wait: consumes the stop caused by attaching.
     if (waitpid(ChildPID, NULL, 0) == -1) {
       return make_error<Failure>(
           "Failed to wait for child process to stop after attaching: " +
           Twine(strerror(errno)));
     }

     if (ptrace(PTRACE_CONT, ChildPID, NULL, NULL) != 0)
       return make_error<Failure>(
           "Failed to continue execution of the child process: " +
           Twine(strerror(errno)));

     // Hand the counter over to the child, which is blocked waiting for it.
     int CounterFileDescriptor = Counter->getFileDescriptor();
     Error SendError =
         sendFileDescriptorThroughSocket(WriteFD, CounterFileDescriptor);

     if (SendError)
       return SendError;

     // Second wait: returns when the child exits or stops again.
     int ChildStatus;
     if (waitpid(ChildPID, &ChildStatus, 0) == -1) {
       return make_error<Failure>(
           "Waiting for the child process to complete failed: " +
           Twine(strerror(errno)));
     }

     if (WIFEXITED(ChildStatus)) {
       int ChildExitCode = WEXITSTATUS(ChildStatus);
       if (ChildExitCode == 0) {
         // The child exited successfully, read counter values and return
         // success.
         auto CounterValueOrErr = Counter->readOrError();
         if (!CounterValueOrErr)
           return CounterValueOrErr.takeError();
         CounterValues = std::move(*CounterValueOrErr);

         auto ValidationValuesOrErr = Counter->readValidationCountersOrError();
         if (!ValidationValuesOrErr)
           return ValidationValuesOrErr.takeError();

         ArrayRef RealValidationValues = *ValidationValuesOrErr;
         for (size_t I = 0; I < RealValidationValues.size(); ++I)
           ValidationCounterValues[I] = RealValidationValues[I];

         return Error::success();
       }
       // The child exited, but not successfully.
       return make_error<Failure>(
           "Child benchmarking process exited with non-zero exit code: " +
           childProcessExitCodeToString(ChildExitCode));
     }

     // An error was encountered running the snippet, process it
     // NOTE(review): this path is taken for every status that is not a normal
     // exit; the code below assumes the child is stopped under ptrace so that
     // PTRACE_GETSIGINFO can succeed — confirm this also holds when the child
     // was terminated by a signal.
     siginfo_t ChildSignalInfo;
     if (ptrace(PTRACE_GETSIGINFO, ChildPID, NULL, &ChildSignalInfo) == -1) {
       return make_error<Failure>("Getting signal info from the child failed: " +
                                  Twine(strerror(errno)));
     }

     // Send SIGKILL rather than SIGTERM as the child process has no SIGTERM
     // handlers to run, and calling SIGTERM would mean that ptrace will force
     // it to block in the signal-delivery-stop for the SIGSEGV/other signals,
     // and upon exit.
     // NOTE(review): "proces" in the message below is a typo for "process".
     if (kill(ChildPID, SIGKILL) == -1)
       return make_error<Failure>("Failed to kill child benchmarking proces: " +
                                  Twine(strerror(errno)));

     // Wait for the process to exit so that there are no zombie processes left
     // around.
     if (waitpid(ChildPID, NULL, 0) == -1)
       return make_error<Failure>("Failed to wait for process to die: " +
                                  Twine(strerror(errno)));

     // Segmentation faults additionally report the faulting address.
     if (ChildSignalInfo.si_signo == SIGSEGV)
       return make_error<SnippetSegmentationFault>(
           reinterpret_cast<intptr_t>(ChildSignalInfo.si_addr));

     return make_error<SnippetSignal>(ChildSignalInfo.si_signo);
   }

   // Sets up a socket pair and the subprocess memory, forks the benchmarking
   // child and runs one measurement with the counter named by CounterName.
   //
   // In the child, control is handed to runChildSubprocess(), which never
   // returns. In the parent, the result of runParentProcess() is returned; on
   // success it has filled CounterValues and ValidationCounterValues.
   //
   // Returns a Failure if the socket pair, the memory setup or the fork fails.
   // Both socket descriptors are closed on those paths so they do not leak.
   Error createSubProcessAndRunBenchmark(
       StringRef CounterName, SmallVectorImpl<int64_t> &CounterValues,
       ArrayRef<const char *> ValidationCounters,
       SmallVectorImpl<int64_t> &ValidationCounterValues) const {
     int PipeFiles[2];
     int PipeSuccessOrErr = socketpair(AF_UNIX, SOCK_DGRAM, 0, PipeFiles);
     if (PipeSuccessOrErr != 0) {
       return make_error<Failure>(
           "Failed to create a pipe for interprocess communication between "
           "llvm-exegesis and the benchmarking subprocess: " +
           Twine(strerror(errno)));
     }

     // Until the fork has succeeded this process owns both ends of the socket
     // pair, so every early error return has to release them.
     auto ClosePipeFiles = [&PipeFiles]() {
       close(PipeFiles[0]);
       close(PipeFiles[1]);
     };

     SubprocessMemory SPMemory;
     Error MemoryInitError = SPMemory.initializeSubprocessMemory(getpid());
     if (MemoryInitError) {
       ClosePipeFiles();
       return MemoryInitError;
     }

     Error AddMemDefError =
         SPMemory.addMemoryDefinition(Key.MemoryValues, getpid());
     if (AddMemDefError) {
       ClosePipeFiles();
       return AddMemDefError;
     }

     long ParentTID = SubprocessMemory::getCurrentTID();
     pid_t ParentOrChildPID = fork();

     if (ParentOrChildPID == -1) {
       // Save errno first: close() may overwrite it.
       const int ForkErrno = errno;
       ClosePipeFiles();
       return make_error<Failure>("Failed to create child process: " +
                                  Twine(strerror(ForkErrno)));
     }

     if (ParentOrChildPID == 0) {
       // We are in the child process, close the write end of the pipe.
       close(PipeFiles[1]);
       // Unregister handlers, signal handling is now handled through ptrace in
       // the host process.
       sys::unregisterHandlers();
       runChildSubprocess(PipeFiles[0], Key, ParentTID);
       // The child process terminates in the above function, so we should never
       // get to this point.
       llvm_unreachable("Child process didn't exit when expected.");
     }

     // Close the read end of the pipe as we only need to write to the subprocess
     // from the parent process. The write end is closed by runParentProcess().
     close(PipeFiles[0]);
     return runParentProcess(ParentOrChildPID, PipeFiles[1], CounterName,
                             CounterValues, ValidationCounters,
                             ValidationCounterValues);
   }

   // Sets the soft RLIMIT_CORE limit of the calling process to zero so that a
   // crashing snippet does not leave a core dump behind. This is best effort:
   // a failing setrlimit() is ignored.
   void disableCoreDumps() const {
     struct rlimit CoreLimit;

     // setrlimit() reads both rlim_cur and rlim_max, so start from the current
     // limits rather than passing an uninitialized hard limit (which can make
     // the call fail with EINVAL or EPERM). If the current limits cannot be
     // queried, zero the hard limit as well; lowering it is always permitted.
     if (getrlimit(RLIMIT_CORE, &CoreLimit) != 0)
       CoreLimit.rlim_max = 0;

     CoreLimit.rlim_cur = 0;
     setrlimit(RLIMIT_CORE, &CoreLimit);
   }

   // Body of the benchmarking child process; never returns.
   //
   // Receives the counter file descriptor from the parent over Pipe, copies
   // the assembled snippet into a fresh mapping (at Key.SnippetAddress when
   // that is non-zero), makes it executable, sets up the auxiliary memory and
   // finally calls into the snippet. Every setup failure terminates the
   // process with the matching ChildProcessExitCodeE value; a snippet that
   // returns normally ends the process with exit code 0.
   [[noreturn]] void runChildSubprocess(int Pipe, const BenchmarkKey &Key,
                                        long ParentTID) const {
     // Disable core dumps in the child process as otherwise everytime we
     // encounter an execution failure like a segmentation fault, we will create
     // a core dump. We report the information directly rather than require the
     // user inspect a core dump.
     disableCoreDumps();

     // The following occurs within the benchmarking subprocess.
     pid_t ParentPID = getppid();

     Expected<int> CounterFileDescriptorOrError =
         getFileDescriptorFromSocket(Pipe);

     if (!CounterFileDescriptorOrError)
       exit(ChildProcessExitCodeE::CounterFDReadFailed);

     int CounterFileDescriptor = *CounterFileDescriptorOrError;

 // Glibc versions greater than 2.35 automatically call rseq during
 // initialization. Unmapping the region that glibc sets up for this causes
 // segfaults in the program. Unregister the rseq region so that we can safely
 // unmap it later
 #ifdef GLIBC_INITS_RSEQ
     long RseqDisableOutput =
         syscall(SYS_rseq, (intptr_t)__builtin_thread_pointer() + __rseq_offset,
                 __rseq_size, RSEQ_FLAG_UNREGISTER, RSEQ_SIG);
     if (RseqDisableOutput != 0)
       exit(ChildProcessExitCodeE::RSeqDisableFailed);
 #endif // GLIBC_INITS_RSEQ

     // The frontend that generates the memory annotation structures should
     // validate that the address to map the snippet in at is a multiple of
     // the page size. Assert that this is true here.
     assert(Key.SnippetAddress % getpagesize() == 0 &&
            "The snippet address needs to be aligned to a page boundary.");

     size_t FunctionDataCopySize = this->Function.FunctionBytes.size();
     void *MapAddress = nullptr;
     int MapFlags = MAP_PRIVATE | MAP_ANONYMOUS;

     if (Key.SnippetAddress != 0) {
       MapAddress = reinterpret_cast<void *>(Key.SnippetAddress);
       MapFlags |= MAP_FIXED_NOREPLACE;
     }

     // Map the region writable first so the snippet bytes can be copied in.
     void *Mapping = mmap(MapAddress, FunctionDataCopySize,
                          PROT_READ | PROT_WRITE, MapFlags, 0, 0);
     if (Mapping == MAP_FAILED)
       exit(ChildProcessExitCodeE::FunctionDataMappingFailed);

     char *FunctionDataCopy = static_cast<char *>(Mapping);
     memcpy(FunctionDataCopy, this->Function.FunctionBytes.data(),
            this->Function.FunctionBytes.size());

     // Switch the mapping to read+execute. If this fails, report it as a
     // mapping failure instead of jumping into non-executable memory, which the
     // parent would misreport as a fault inside the snippet.
     if (mprotect(FunctionDataCopy, FunctionDataCopySize,
                  PROT_READ | PROT_EXEC) != 0)
       exit(ChildProcessExitCodeE::FunctionDataMappingFailed);

     Expected<int> AuxMemFDOrError =
         SubprocessMemory::setupAuxiliaryMemoryInSubprocess(
             Key.MemoryValues, ParentPID, ParentTID, CounterFileDescriptor);
     if (!AuxMemFDOrError)
       exit(ChildProcessExitCodeE::AuxiliaryMemorySetupFailed);

     // Jump into the copied snippet, passing the mapping size and the
     // auxiliary memory file descriptor as its two arguments.
     ((void (*)(size_t, int))(intptr_t)FunctionDataCopy)(FunctionDataCopySize,
                                                         *AuxMemFDOrError);

     exit(0);
   }

   // Runs the snippet once in a freshly forked subprocess with the counter
   // named by CounterName. Returns the counter values on success, or the error
   // reported by createSubProcessAndRunBenchmark().
   Expected<SmallVector<int64_t, 4>> runWithCounter(
       StringRef CounterName, ArrayRef<const char *> ValidationCounters,
       SmallVectorImpl<int64_t> &ValidationCounterValues) const override {
     // Start with a single zeroed slot; it is replaced by the values read from
     // the counter when the run succeeds.
     SmallVector<int64_t, 4> Measured(1, 0);
     if (Error RunError = createSubProcessAndRunBenchmark(
             CounterName, Measured, ValidationCounters,
             ValidationCounterValues))
       return std::move(RunError);

     return Measured;
   }

   // LLVM/target state used to create counters; referenced, not owned.
   const LLVMState &State;
   // The assembled snippet; its bytes are copied into the child's mapping.
   const ExecutableFunction Function;
   // Benchmark key providing the snippet address and memory values. Held by
   // reference, so it has to outlive this executor.
   const BenchmarkKey &Key;
 };
 #endif // __linux__
 } // namespace

 // Assembles the instructions of BC, repeated by Repetitor according to
 // MinInstructions and LoopBodySize, into an in-memory buffer and returns that
 // buffer. Returns the error from assembleToStream() on failure.
 Expected<SmallString<0>> BenchmarkRunner::assembleSnippet(
     const BenchmarkCode &BC, const SnippetRepetitor &Repetitor,
     unsigned MinInstructions, unsigned LoopBodySize,
     bool GenerateMemoryInstructions) const {
   const std::vector<MCInst> &SnippetInsts = BC.Key.Instructions;

   SmallString<0> ObjectBytes;
   raw_svector_ostream ObjectStream(ObjectBytes);
   Error AssembleError = assembleToStream(
       State.getExegesisTarget(), State.createTargetMachine(), BC.LiveIns,
       Repetitor.Repeat(SnippetInsts, MinInstructions, LoopBodySize,
                        GenerateMemoryInstructions),
       ObjectStream, BC.Key, GenerateMemoryInstructions);
   if (AssembleError)
     return std::move(AssembleError);

   return ObjectBytes;
 }

 // Prepares everything needed to run BC: fills in the descriptive fields of
 // the benchmark result and, depending on BenchmarkPhaseSelector, assembles a
 // short snippet for display and the full object file to execute. Returns the
 // first assembly error encountered.
 Expected<BenchmarkRunner::RunnableConfiguration>
 BenchmarkRunner::getRunnableConfiguration(
     const BenchmarkCode &BC, unsigned MinInstructions, unsigned LoopBodySize,
     const SnippetRepetitor &Repetitor) const {
   RunnableConfiguration RC;

   Benchmark &Result = RC.BenchmarkResult;
   Result.Mode = Mode;
   Result.CpuName = std::string(State.getTargetMachine().getTargetCPU());
   Result.LLVMTriple = State.getTargetMachine().getTargetTriple().normalize();
   Result.MinInstructions = MinInstructions;
   Result.Info = BC.Info;
   Result.Key = BC.Key;

   const std::vector<MCInst> &SnippetInsts = BC.Key.Instructions;
   const bool GenerateMemoryInstructions =
       ExecutionMode == ExecutionModeE::SubProcess;

   // For debug/analysis output, assemble a small number of repetitions of the
   // snippet (four times its length, with a loop body of twice its length) so
   // that the user clearly sees that the inner instructions are repeated.
   if (BenchmarkPhaseSelector > BenchmarkPhaseSelectorE::PrepareSnippet) {
     const int MinInstructionsForSnippet = 4 * SnippetInsts.size();
     const int LoopBodySizeForSnippet = 2 * SnippetInsts.size();
     auto DisplaySnippet =
         assembleSnippet(BC, Repetitor, MinInstructionsForSnippet,
                         LoopBodySizeForSnippet, GenerateMemoryInstructions);
     if (!DisplaySnippet)
       return DisplaySnippet.takeError();

     if (auto BytesErr =
             getBenchmarkFunctionBytes(*DisplaySnippet, Result.AssembledSnippet))
       return std::move(BytesErr);
   }

   // Assemble the snippet that is actually measured: enough repetitions to
   // reach at least MinInstructions instructions.
   if (BenchmarkPhaseSelector >
       BenchmarkPhaseSelectorE::PrepareAndAssembleSnippet) {
     auto MeasuredSnippet =
         assembleSnippet(BC, Repetitor, Result.MinInstructions, LoopBodySize,
                         GenerateMemoryInstructions);
     if (!MeasuredSnippet)
       return MeasuredSnippet.takeError();
     RC.ObjectFile = getObjectFromBuffer(*MeasuredSnippet);
   }

   return std::move(RC);
 }

 // Creates the executor matching ExecutionMode for the given object file.
 // In-process executors share this runner's scratch space; subprocess
 // executors are tied to Key and are only available on Linux. Returns the
 // error from the chosen executor's create() on failure.
 Expected<std::unique_ptr<BenchmarkRunner::FunctionExecutor>>
 BenchmarkRunner::createFunctionExecutor(
     object::OwningBinary<object::ObjectFile> ObjectFile,
     const BenchmarkKey &Key) const {
   switch (ExecutionMode) {
   case ExecutionModeE::InProcess: {
     auto ExecutorOrErr = InProcessFunctionExecutorImpl::create(
         State, std::move(ObjectFile), Scratch.get());
     if (Error E = ExecutorOrErr.takeError())
       return std::move(E);

     return std::move(*ExecutorOrErr);
   }
   case ExecutionModeE::SubProcess: {
 #ifdef __linux__
     auto ExecutorOrErr = SubProcessFunctionExecutorImpl::create(
         State, std::move(ObjectFile), Key);
     if (Error E = ExecutorOrErr.takeError())
       return std::move(E);

     return std::move(*ExecutorOrErr);
 #else
     return make_error<Failure>(
         "The subprocess execution mode is only supported on Linux");
 #endif
   }
   }
   llvm_unreachable("ExecutionMode is outside expected range");
 }

 // Runs a prepared configuration and returns the (possibly partial) benchmark
 // result together with an error status.
 //
 // If DumpFile is set and an object file was assembled, the object is written
 // out first. When the phase selector stops before measuring, the result is
 // returned with an explanatory Error string and no measurements. Otherwise
 // the snippet is executed and the measurements are scaled per instruction and
 // per snippet repetition.
 std::pair<Error, Benchmark> BenchmarkRunner::runConfiguration(
     RunnableConfiguration &&RC,
     const std::optional<StringRef> &DumpFile) const {
   Benchmark &Result = RC.BenchmarkResult;
   object::OwningBinary<object::ObjectFile> &Object = RC.ObjectFile;

   const bool ObjectWasAssembled =
       BenchmarkPhaseSelector >
       BenchmarkPhaseSelectorE::PrepareAndAssembleSnippet;
   if (DumpFile && ObjectWasAssembled) {
     auto DumpPathOrErr =
         writeObjectFile(Object.getBinary()->getData(), *DumpFile);
     if (!DumpPathOrErr)
       return {DumpPathOrErr.takeError(), std::move(Result)};
     outs() << "Check generated assembly with: /usr/bin/objdump -d "
            << *DumpPathOrErr << "\n";
   }

   if (BenchmarkPhaseSelector < BenchmarkPhaseSelectorE::Measure) {
     Result.Error = "actual measurements skipped.";
     return {Error::success(), std::move(Result)};
   }

   Expected<std::unique_ptr<BenchmarkRunner::FunctionExecutor>> Executor =
       createFunctionExecutor(std::move(Object), RC.BenchmarkResult.Key);
   if (!Executor)
     return {Executor.takeError(), std::move(Result)};

   auto MeasurementsOrErr = runMeasurements(**Executor);
   if (!MeasurementsOrErr)
     return {MeasurementsOrErr.takeError(), std::move(Result)};

   assert(Result.MinInstructions > 0 && "invalid MinInstructions");
   for (BenchmarkMeasure &Measure : *MeasurementsOrErr) {
     // Scale the measurements by the number of instructions.
     Measure.PerInstructionValue /= Result.MinInstructions;
     // Scale the measurements by the number of times the entire snippet is
     // repeated.
     const double SnippetLength =
         static_cast<double>(Result.Key.Instructions.size());
     Measure.PerSnippetValue /=
         std::ceil(Result.MinInstructions / SnippetLength);
   }
   Result.Measurements = std::move(*MeasurementsOrErr);

   return {Error::success(), std::move(Result)};
 }

 // Writes Buffer to FileName, or to a newly created temporary "snippet" file
 // with suffix "o" when FileName is empty, and returns the path written to.
 // Returns the error from creating/opening the file on failure.
 Expected<std::string>
 BenchmarkRunner::writeObjectFile(StringRef Buffer, StringRef FileName) const {
   int FD = 0;
   SmallString<256> Path = FileName;

   std::error_code OpenEC;
   if (FileName.empty())
     OpenEC = sys::fs::createTemporaryFile("snippet", "o", FD, Path);
   else
     OpenEC = sys::fs::openFileForReadWrite(
         FileName, FD, sys::fs::CD_CreateAlways, sys::fs::OF_None);
   if (Error E = errorCodeToError(OpenEC))
     return std::move(E);

   // The stream takes ownership of the descriptor and closes it on destruction.
   raw_fd_ostream Out(FD, /*ShouldClose=*/true);
   Out.write(Buffer.data(), Buffer.size());
   Out.flush();
   return std::string(Path);
 }

 // Ordering predicate for lower_bound over (event, counter name) pairs:
 // compares the pair's event against RHS by their integer values.
 static bool EventLessThan(const std::pair<ValidationEvent, const char *> LHS,
                           const ValidationEvent RHS) {
   const int LeftOrdinal = static_cast<int>(LHS.first);
   const int RightOrdinal = static_cast<int>(RHS);
   return LeftOrdinal < RightOrdinal;
 }

 // Resolves every requested validation event to the target's counter name and
 // appends the names to ValCountersToRun, in request order.
 //
 // The target's validation event table is searched with lower_bound, so it is
 // expected to be sorted by event. Returns a Failure as soon as a requested
 // event has no entry in the table; names appended before that point remain
 // in ValCountersToRun.
 Error BenchmarkRunner::getValidationCountersToRun(
     SmallVector<const char *> &ValCountersToRun) const {
   const PfmCountersInfo &PCI = State.getPfmCounters();
   ValCountersToRun.reserve(ValidationCounters.size());

   ArrayRef TargetValidationEvents(PCI.ValidationEvents,
                                   PCI.NumValidationEvents);
   for (const ValidationEvent RequestedValEvent : ValidationCounters) {
     auto ValCounterIt =
         lower_bound(TargetValidationEvents, RequestedValEvent, EventLessThan);
     // lower_bound only yields a candidate position; the event is supported
     // only if that position holds exactly the requested event.
     if (ValCounterIt == TargetValidationEvents.end() ||
         ValCounterIt->first != RequestedValEvent)
       return make_error<Failure>("Cannot create validation counter");

     ValCountersToRun.push_back(ValCounterIt->second);
   }

   return Error::success();
 }

 // Out-of-line definition of the executor base class destructor.
 BenchmarkRunner::FunctionExecutor::~FunctionExecutor() = default;

 } // namespace exegesis
 } // namespace llvm