|  | //===-- Benchmark function --------------------------------------*- C++ -*-===// | 
|  | // | 
|  | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | 
|  | // See https://llvm.org/LICENSE.txt for license information. | 
|  | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | 
|  | // | 
|  | //===----------------------------------------------------------------------===// | 
|  |  | 
|  | // This file mainly defines a `Benchmark` function. | 
|  | // | 
|  | // The benchmarking process is as follows: | 
|  | // - We start by measuring the time it takes to run the function | 
|  | // `InitialIterations` times. This is called a Sample. From this we can derive | 
|  | // the time it took to run a single iteration. | 
|  | // | 
|  | // - We repeat the previous step with a greater number of iterations to lower | 
|  | // the impact of the measurement. We can derive a more precise estimation of the | 
|  | // runtime for a single iteration. | 
|  | // | 
|  | // - Each sample gives a more accurate estimation of the runtime for a single | 
|  | // iteration but also takes more time to run. We stop the process when: | 
|  | //   * The measure stabilize under a certain precision (Epsilon), | 
|  | //   * The overall benchmarking time is greater than MaxDuration, | 
|  | //   * The overall sample count is greater than MaxSamples, | 
|  | //   * The last sample used more than MaxIterations iterations. | 
|  | // | 
|  | // - We also makes sure that the benchmark doesn't run for a too short period of | 
|  | // time by defining MinDuration and MinSamples. | 
|  |  | 
|  | #ifndef LLVM_LIBC_UTILS_BENCHMARK_BENCHMARK_H | 
|  | #define LLVM_LIBC_UTILS_BENCHMARK_BENCHMARK_H | 
|  |  | 
|  | #include "benchmark/benchmark.h" | 
|  | #include "llvm/ADT/ArrayRef.h" | 
|  | #include "llvm/ADT/SmallVector.h" | 
|  | #include <array> | 
|  | #include <chrono> | 
|  | #include <cmath> | 
|  | #include <cstdint> | 
|  | #include <optional> | 
|  |  | 
|  | namespace llvm { | 
|  | namespace libc_benchmarks { | 
|  |  | 
|  | using Duration = std::chrono::duration<double>; | 
|  |  | 
|  | enum class BenchmarkLog { | 
|  | None, // Don't keep the internal state of the benchmark. | 
|  | Last, // Keep only the last batch. | 
|  | Full  // Keep all iterations states, useful for testing or debugging. | 
|  | }; | 
|  |  | 
|  | // An object to configure the benchmark stopping conditions. | 
|  | // See documentation at the beginning of the file for the overall algorithm and | 
|  | // meaning of each field. | 
|  | struct BenchmarkOptions { | 
|  | // The minimum time for which the benchmark is running. | 
|  | Duration MinDuration = std::chrono::seconds(0); | 
|  | // The maximum time for which the benchmark is running. | 
|  | Duration MaxDuration = std::chrono::seconds(10); | 
|  | // The number of iterations in the first sample. | 
|  | uint32_t InitialIterations = 1; | 
|  | // The maximum number of iterations for any given sample. | 
|  | uint32_t MaxIterations = 10000000; | 
|  | // The minimum number of samples. | 
|  | uint32_t MinSamples = 4; | 
|  | // The maximum number of samples. | 
|  | uint32_t MaxSamples = 1000; | 
|  | // The benchmark will stop if the relative difference between the current and | 
|  | // the last estimation is less than epsilon. This is 1% by default. | 
|  | double Epsilon = 0.01; | 
|  | // The number of iterations grows exponentially between each sample. | 
|  | // Must be greater or equal to 1. | 
|  | double ScalingFactor = 1.4; | 
|  | BenchmarkLog Log = BenchmarkLog::None; | 
|  | }; | 
|  |  | 
|  | // The state of a benchmark. | 
|  | enum class BenchmarkStatus { | 
|  | Running, | 
|  | MaxDurationReached, | 
|  | MaxIterationsReached, | 
|  | MaxSamplesReached, | 
|  | PrecisionReached, | 
|  | }; | 
|  |  | 
|  | // The internal state of the benchmark, useful to debug, test or report | 
|  | // statistics. | 
|  | struct BenchmarkState { | 
|  | size_t LastSampleIterations; | 
|  | Duration LastBatchElapsed; | 
|  | BenchmarkStatus CurrentStatus; | 
|  | Duration CurrentBestGuess; // The time estimation for a single run of `foo`. | 
|  | double ChangeRatio; // The change in time estimation between previous and | 
|  | // current samples. | 
|  | }; | 
|  |  | 
|  | // A lightweight result for a benchmark. | 
|  | struct BenchmarkResult { | 
|  | BenchmarkStatus TerminationStatus = BenchmarkStatus::Running; | 
|  | Duration BestGuess = {}; | 
|  | std::optional<llvm::SmallVector<BenchmarkState, 16>> MaybeBenchmarkLog; | 
|  | }; | 
|  |  | 
|  | // Stores information about a cache in the host memory system. | 
|  | struct CacheInfo { | 
|  | std::string Type; //  e.g. "Instruction", "Data", "Unified". | 
|  | int Level;        // 0 is closest to processing unit. | 
|  | int Size;         // In bytes. | 
|  | int NumSharing;   // The number of processing units (Hyper-Threading Thread) | 
|  | // with which this cache is shared. | 
|  | }; | 
|  |  | 
|  | // Stores information about the host. | 
|  | struct HostState { | 
|  | std::string CpuName; // returns a string compatible with the -march option. | 
|  | double CpuFrequency; // in Hertz. | 
|  | std::vector<CacheInfo> Caches; | 
|  |  | 
|  | static HostState get(); | 
|  | }; | 
|  |  | 
|  | namespace internal { | 
|  |  | 
|  | struct Measurement { | 
|  | size_t Iterations = 0; | 
|  | Duration Elapsed = {}; | 
|  | }; | 
|  |  | 
|  | // Updates the estimation of the elapsed time for a single iteration. | 
|  | class RefinableRuntimeEstimation { | 
|  | Duration TotalTime = {}; | 
|  | size_t TotalIterations = 0; | 
|  |  | 
|  | public: | 
|  | Duration update(const Measurement &M) { | 
|  | assert(M.Iterations > 0); | 
|  | // Duration is encoded as a double (see definition). | 
|  | // `TotalTime` and `M.Elapsed` are of the same magnitude so we don't expect | 
|  | // loss of precision due to radically different scales. | 
|  | TotalTime += M.Elapsed; | 
|  | TotalIterations += M.Iterations; | 
|  | return TotalTime / TotalIterations; | 
|  | } | 
|  | }; | 
|  |  | 
|  | // This class tracks the progression of the runtime estimation. | 
|  | class RuntimeEstimationProgression { | 
|  | RefinableRuntimeEstimation RRE; | 
|  |  | 
|  | public: | 
|  | Duration CurrentEstimation = {}; | 
|  |  | 
|  | // Returns the change ratio between our best guess so far and the one from the | 
|  | // new measurement. | 
|  | double computeImprovement(const Measurement &M) { | 
|  | const Duration NewEstimation = RRE.update(M); | 
|  | const double Ratio = fabs(((CurrentEstimation / NewEstimation) - 1.0)); | 
|  | CurrentEstimation = NewEstimation; | 
|  | return Ratio; | 
|  | } | 
|  | }; | 
|  |  | 
|  | } // namespace internal | 
|  |  | 
|  | // Measures the runtime of `foo` until conditions defined by `Options` are met. | 
|  | // | 
|  | // To avoid measurement's imprecisions we measure batches of `foo`. | 
|  | // The batch size is growing by `ScalingFactor` to minimize the effect of | 
|  | // measuring. | 
|  | // | 
|  | // Note: The benchmark is not responsible for serializing the executions of | 
|  | // `foo`. It is not suitable for measuring, very small & side effect free | 
|  | // functions, as the processor is free to execute several executions in | 
|  | // parallel. | 
|  | // | 
|  | // - Options: A set of parameters controlling the stopping conditions for the | 
|  | //     benchmark. | 
|  | // - foo: The function under test. It takes one value and returns one value. | 
|  | //     The input value is used to randomize the execution of `foo` as part of a | 
|  | //     batch to mitigate the effect of the branch predictor. Signature: | 
|  | //     `ProductType foo(ParameterProvider::value_type value);` | 
|  | //     The output value is a product of the execution of `foo` and prevents the | 
|  | //     compiler from optimizing out foo's body. | 
|  | // - ParameterProvider: An object responsible for providing a range of | 
|  | //     `Iterations` values to use as input for `foo`. The `value_type` of the | 
|  | //     returned container has to be compatible with `foo` argument. | 
|  | //     Must implement one of: | 
|  | //     `Container<ParameterType> generateBatch(size_t Iterations);` | 
|  | //     `const Container<ParameterType>& generateBatch(size_t Iterations);` | 
|  | // - Clock: An object providing the current time. Must implement: | 
|  | //     `std::chrono::time_point now();` | 
|  | template <typename Function, typename ParameterProvider, | 
|  | typename BenchmarkClock = const std::chrono::high_resolution_clock> | 
|  | BenchmarkResult benchmark(const BenchmarkOptions &Options, | 
|  | ParameterProvider &PP, Function foo, | 
|  | BenchmarkClock &Clock = BenchmarkClock()) { | 
|  | BenchmarkResult Result; | 
|  | internal::RuntimeEstimationProgression REP; | 
|  | Duration TotalBenchmarkDuration = {}; | 
|  | size_t Iterations = std::max(Options.InitialIterations, uint32_t(1)); | 
|  | size_t Samples = 0; | 
|  | if (Options.ScalingFactor < 1.0) | 
|  | report_fatal_error("ScalingFactor should be >= 1"); | 
|  | if (Options.Log != BenchmarkLog::None) | 
|  | Result.MaybeBenchmarkLog.emplace(); | 
|  | for (;;) { | 
|  | // Request a new Batch of size `Iterations`. | 
|  | const auto &Batch = PP.generateBatch(Iterations); | 
|  |  | 
|  | // Measuring this Batch. | 
|  | const auto StartTime = Clock.now(); | 
|  | for (const auto Parameter : Batch) { | 
|  | const auto Production = foo(Parameter); | 
|  | benchmark::DoNotOptimize(Production); | 
|  | } | 
|  | const auto EndTime = Clock.now(); | 
|  | const Duration Elapsed = EndTime - StartTime; | 
|  |  | 
|  | // Updating statistics. | 
|  | ++Samples; | 
|  | TotalBenchmarkDuration += Elapsed; | 
|  | const double ChangeRatio = REP.computeImprovement({Iterations, Elapsed}); | 
|  | Result.BestGuess = REP.CurrentEstimation; | 
|  |  | 
|  | // Stopping condition. | 
|  | if (TotalBenchmarkDuration >= Options.MinDuration && | 
|  | Samples >= Options.MinSamples && ChangeRatio < Options.Epsilon) | 
|  | Result.TerminationStatus = BenchmarkStatus::PrecisionReached; | 
|  | else if (Samples >= Options.MaxSamples) | 
|  | Result.TerminationStatus = BenchmarkStatus::MaxSamplesReached; | 
|  | else if (TotalBenchmarkDuration >= Options.MaxDuration) | 
|  | Result.TerminationStatus = BenchmarkStatus::MaxDurationReached; | 
|  | else if (Iterations >= Options.MaxIterations) | 
|  | Result.TerminationStatus = BenchmarkStatus::MaxIterationsReached; | 
|  |  | 
|  | if (Result.MaybeBenchmarkLog) { | 
|  | auto &BenchmarkLog = *Result.MaybeBenchmarkLog; | 
|  | if (Options.Log == BenchmarkLog::Last && !BenchmarkLog.empty()) | 
|  | BenchmarkLog.pop_back(); | 
|  | BenchmarkState BS; | 
|  | BS.LastSampleIterations = Iterations; | 
|  | BS.LastBatchElapsed = Elapsed; | 
|  | BS.CurrentStatus = Result.TerminationStatus; | 
|  | BS.CurrentBestGuess = Result.BestGuess; | 
|  | BS.ChangeRatio = ChangeRatio; | 
|  | BenchmarkLog.push_back(BS); | 
|  | } | 
|  |  | 
|  | if (Result.TerminationStatus != BenchmarkStatus::Running) | 
|  | return Result; | 
|  |  | 
|  | if (Options.ScalingFactor > 1 && | 
|  | Iterations * Options.ScalingFactor == Iterations) | 
|  | report_fatal_error( | 
|  | "`Iterations *= ScalingFactor` is idempotent, increase ScalingFactor " | 
|  | "or InitialIterations."); | 
|  |  | 
|  | Iterations *= Options.ScalingFactor; | 
|  | } | 
|  | } | 
|  |  | 
|  | // Interprets `Array` as a circular buffer of `Size` elements. | 
|  | template <typename T> class CircularArrayRef { | 
|  | llvm::ArrayRef<T> Array; | 
|  | size_t Size; | 
|  |  | 
|  | public: | 
|  | using value_type = T; | 
|  | using reference = T &; | 
|  | using const_reference = const T &; | 
|  | using difference_type = ssize_t; | 
|  | using size_type = size_t; | 
|  |  | 
|  | class const_iterator { | 
|  | using iterator_category = std::input_iterator_tag; | 
|  | llvm::ArrayRef<T> Array; | 
|  | size_t Index; | 
|  | size_t Offset; | 
|  |  | 
|  | public: | 
|  | explicit const_iterator(llvm::ArrayRef<T> Array, size_t Index = 0) | 
|  | : Array(Array), Index(Index), Offset(Index % Array.size()) {} | 
|  | const_iterator &operator++() { | 
|  | ++Index; | 
|  | ++Offset; | 
|  | if (Offset == Array.size()) | 
|  | Offset = 0; | 
|  | return *this; | 
|  | } | 
|  | bool operator==(const_iterator Other) const { return Index == Other.Index; } | 
|  | bool operator!=(const_iterator Other) const { return !(*this == Other); } | 
|  | const T &operator*() const { return Array[Offset]; } | 
|  | }; | 
|  |  | 
|  | CircularArrayRef(llvm::ArrayRef<T> Array, size_t Size) | 
|  | : Array(Array), Size(Size) { | 
|  | assert(Array.size() > 0); | 
|  | } | 
|  |  | 
|  | const_iterator begin() const { return const_iterator(Array); } | 
|  | const_iterator end() const { return const_iterator(Array, Size); } | 
|  | }; | 
|  |  | 
|  | // A convenient helper to produce a CircularArrayRef from an ArrayRef. | 
|  | template <typename T> | 
|  | CircularArrayRef<T> cycle(llvm::ArrayRef<T> Array, size_t Size) { | 
|  | return {Array, Size}; | 
|  | } | 
|  |  | 
|  | // Creates an std::array which storage size is constrained under `Bytes`. | 
|  | template <typename T, size_t Bytes> | 
|  | using ByteConstrainedArray = std::array<T, Bytes / sizeof(T)>; | 
|  |  | 
|  | // A convenient helper to produce a CircularArrayRef from a | 
|  | // ByteConstrainedArray. | 
|  | template <typename T, size_t N> | 
|  | CircularArrayRef<T> cycle(const std::array<T, N> &Container, size_t Size) { | 
|  | return {llvm::ArrayRef<T>(Container.cbegin(), Container.cend()), Size}; | 
|  | } | 
|  |  | 
|  | // Makes sure the binary was compiled in release mode and that frequency | 
|  | // governor is set on performance. | 
|  | void checkRequirements(); | 
|  |  | 
|  | } // namespace libc_benchmarks | 
|  | } // namespace llvm | 
|  |  | 
|  | #endif // LLVM_LIBC_UTILS_BENCHMARK_BENCHMARK_H |