| #ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H |
| #define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H |
| |
| #include "benchmarks/gpu/Random.h" |
| |
| #include "benchmarks/gpu/timing/timing.h" |
| |
| #include "hdr/stdint_proxy.h" |
| #include "src/__support/CPP/algorithm.h" |
| #include "src/__support/CPP/array.h" |
| #include "src/__support/CPP/string_view.h" |
| #include "src/__support/CPP/type_traits.h" |
| #include "src/__support/FPUtil/FPBits.h" |
| #include "src/__support/FPUtil/sqrt.h" |
| #include "src/__support/macros/config.h" |
| |
| namespace LIBC_NAMESPACE_DECL { |
| |
| namespace benchmarks { |
| |
// Tunable knobs handed to benchmark(). The two duration limits are expressed
// in nanoseconds.
// NOTE(review): the exact role of each limit is inferred from its name;
// confirm against the benchmark() implementation.
struct BenchmarkOptions {
  uint32_t initial_iterations{1};
  uint32_t min_iterations{1};
  uint32_t max_iterations{10'000'000};
  uint32_t min_samples{4};
  uint32_t max_samples{1000};
  int64_t min_duration{500'000};       // 500 * 1000 ns = 500 us
  int64_t max_duration{1'000'000'000}; // 1e9 ns = 1 second
  double epsilon{0.0001};
  double scaling_factor{1.4};
};
| |
| class RefinableRuntimeEstimator { |
| uint32_t iterations = 0; |
| uint64_t sum_of_cycles = 0; |
| uint64_t sum_of_squared_cycles = 0; |
| |
| public: |
| void update(uint64_t cycles) noexcept { |
| iterations += 1; |
| sum_of_cycles += cycles; |
| sum_of_squared_cycles += cycles * cycles; |
| } |
| |
| void update(const RefinableRuntimeEstimator &other) noexcept { |
| iterations += other.iterations; |
| sum_of_cycles += other.sum_of_cycles; |
| sum_of_squared_cycles += other.sum_of_squared_cycles; |
| } |
| |
| double get_mean() const noexcept { |
| if (iterations == 0) |
| return 0.0; |
| |
| return static_cast<double>(sum_of_cycles) / iterations; |
| } |
| |
| double get_variance() const noexcept { |
| if (iterations == 0) |
| return 0.0; |
| |
| const double num = static_cast<double>(iterations); |
| const double sum_x = static_cast<double>(sum_of_cycles); |
| const double sum_x2 = static_cast<double>(sum_of_squared_cycles); |
| |
| const double mean_of_squares = sum_x2 / num; |
| const double mean = sum_x / num; |
| const double mean_squared = mean * mean; |
| const double variance = mean_of_squares - mean_squared; |
| |
| return variance < 0.0 ? 0.0 : variance; |
| } |
| |
| double get_stddev() const noexcept { |
| return fputil::sqrt<double>(get_variance()); |
| } |
| |
| uint32_t get_iterations() const noexcept { return iterations; } |
| }; |
| |
| // Tracks the progression of the runtime estimation |
| class RuntimeEstimationProgression { |
| RefinableRuntimeEstimator estimator; |
| double current_mean = 0.0; |
| |
| public: |
| const RefinableRuntimeEstimator &get_estimator() const noexcept { |
| return estimator; |
| } |
| |
| double |
| compute_improvement(const RefinableRuntimeEstimator &sample_estimator) { |
| if (sample_estimator.get_iterations() == 0) |
| return 1.0; |
| |
| estimator.update(sample_estimator); |
| |
| const double new_mean = estimator.get_mean(); |
| if (current_mean == 0.0 || new_mean == 0.0) { |
| current_mean = new_mean; |
| return 1.0; |
| } |
| |
| double ratio = (current_mean / new_mean) - 1.0; |
| if (ratio < 0) |
| ratio = -ratio; |
| |
| current_mean = new_mean; |
| return ratio; |
| } |
| }; |
| |
// Summary returned by benchmark(). `min` starts at UINT64_MAX and `max` at 0.
// NOTE(review): presumably so that the first observed measurement replaces
// both bounds — confirm against the code that fills this struct in.
struct BenchmarkResult {
  uint64_t total_iterations{0};
  double cycles{0.0};
  double standard_deviation{0.0};
  uint64_t min{UINT64_MAX};
  uint64_t max{0};
};
| |
| struct BenchmarkTarget { |
| using IndexedFnPtr = uint64_t (*)(uint32_t); |
| using IndexlessFnPtr = uint64_t (*)(); |
| |
| enum class Kind : uint8_t { Indexed, Indexless } kind; |
| union { |
| IndexedFnPtr indexed_fn_ptr; |
| IndexlessFnPtr indexless_fn_ptr; |
| }; |
| |
| LIBC_INLINE BenchmarkTarget(IndexedFnPtr func) |
| : kind(Kind::Indexed), indexed_fn_ptr(func) {} |
| LIBC_INLINE BenchmarkTarget(IndexlessFnPtr func) |
| : kind(Kind::Indexless), indexless_fn_ptr(func) {} |
| |
| LIBC_INLINE uint64_t operator()([[maybe_unused]] uint32_t call_index) const { |
| return kind == Kind::Indexed ? indexed_fn_ptr(call_index) |
| : indexless_fn_ptr(); |
| } |
| }; |
| |
| BenchmarkResult benchmark(const BenchmarkOptions &options, |
| const BenchmarkTarget &target); |
| |
| class Benchmark { |
| const BenchmarkTarget target; |
| const cpp::string_view suite_name; |
| const cpp::string_view test_name; |
| const uint32_t num_threads; |
| |
| public: |
| Benchmark(uint64_t (*f)(), const char *suite, const char *test, |
| uint32_t threads) |
| : target(BenchmarkTarget(f)), suite_name(suite), test_name(test), |
| num_threads(threads) { |
| add_benchmark(this); |
| } |
| |
| Benchmark(uint64_t (*f)(uint32_t), char const *suite_name, |
| char const *test_name, uint32_t num_threads) |
| : target(BenchmarkTarget(f)), suite_name(suite_name), |
| test_name(test_name), num_threads(num_threads) { |
| add_benchmark(this); |
| } |
| |
| static void run_benchmarks(); |
| const cpp::string_view get_suite_name() const { return suite_name; } |
| const cpp::string_view get_test_name() const { return test_name; } |
| |
| protected: |
| static void add_benchmark(Benchmark *benchmark); |
| |
| private: |
| BenchmarkResult run() { |
| BenchmarkOptions options; |
| return benchmark(options, target); |
| } |
| }; |
| |
| template <typename T> class MathPerf { |
| static LIBC_INLINE uint64_t make_seed(uint64_t base_seed, uint64_t salt) { |
| const uint64_t tid = gpu::get_thread_id(); |
| return base_seed ^ (salt << 32) ^ (tid * 0x9E3779B97F4A7C15ULL); |
| } |
| |
| public: |
| // Returns cycles-per-call (lower is better) |
| template <size_t N = 1, typename Dist> |
| static uint64_t run_throughput(T (*f)(T), const Dist &dist, |
| uint32_t call_index) { |
| cpp::array<T, N> inputs; |
| |
| uint64_t base_seed = static_cast<uint64_t>(call_index); |
| uint64_t salt = static_cast<uint64_t>(N); |
| RandomGenerator rng(make_seed(base_seed, salt)); |
| |
| for (size_t i = 0; i < N; ++i) |
| inputs[i] = dist(rng); |
| |
| uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs); |
| |
| return total_time / N; |
| } |
| |
| // Returns cycles-per-call (lower is better) |
| template <size_t N = 1, typename Dist1, typename Dist2> |
| static uint64_t run_throughput(T (*f)(T, T), const Dist1 &dist1, |
| const Dist2 &dist2, uint32_t call_index) { |
| cpp::array<T, N> inputs1; |
| cpp::array<T, N> inputs2; |
| |
| uint64_t base_seed = static_cast<uint64_t>(call_index); |
| uint64_t salt = static_cast<uint64_t>(N); |
| RandomGenerator rng(make_seed(base_seed, salt)); |
| |
| for (size_t i = 0; i < N; ++i) { |
| inputs1[i] = dist1(rng); |
| inputs2[i] = dist2(rng); |
| } |
| |
| uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs1, inputs2); |
| |
| return total_time / N; |
| } |
| }; |
| |
| } // namespace benchmarks |
| } // namespace LIBC_NAMESPACE_DECL |
| |
// Defines a Benchmark object named <SuiteName>_<TestName>_Instance that wraps
// Func and carries an explicit thread count. The suite and test names are
// stringified and passed to the Benchmark constructor.
#define BENCHMARK_N_THREADS(SuiteName, TestName, Func, NumThreads)             \
  LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance(     \
      Func, #SuiteName, #TestName, NumThreads)

// Same as BENCHMARK_N_THREADS but with a thread count of -1. Passing -1
// indicates the benchmark should be run with as many threads as allocated by
// the user in the benchmark's CMake.
#define BENCHMARK(SuiteName, TestName, Func)                                   \
  LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance(     \
      Func, #SuiteName, #TestName, -1)

// Registers the benchmark with a thread count of one.
#define SINGLE_THREADED_BENCHMARK(SuiteName, TestName, Func)                   \
  BENCHMARK_N_THREADS(SuiteName, TestName, Func, 1)

// Registers the benchmark with a thread count of gpu::get_lane_size().
#define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func)                       \
  BENCHMARK_N_THREADS(SuiteName, TestName, Func,                               \
                      LIBC_NAMESPACE::gpu::get_lane_size())
| |
| #endif // LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H |