| #ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H |
| #define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H |
| |
| #include "benchmarks/gpu/timing/timing.h" |
| |
| #include "hdr/stdint_proxy.h" |
| #include "src/__support/CPP/algorithm.h" |
| #include "src/__support/CPP/array.h" |
| #include "src/__support/CPP/string_view.h" |
| #include "src/__support/CPP/type_traits.h" |
| #include "src/__support/FPUtil/FPBits.h" |
| #include "src/__support/FPUtil/sqrt.h" |
| #include "src/__support/macros/config.h" |
| |
| namespace LIBC_NAMESPACE_DECL { |
| |
| namespace benchmarks { |
| |
| struct BenchmarkOptions { |
| uint32_t initial_iterations = 1; |
| uint32_t min_iterations = 1; |
| uint32_t max_iterations = 10000000; |
| uint32_t min_samples = 4; |
| uint32_t max_samples = 1000; |
| int64_t min_duration = 500 * 1000; // 500 * 1000 nanoseconds = 500 us |
| int64_t max_duration = 1000 * 1000 * 1000; // 1e9 nanoseconds = 1 second |
| double epsilon = 0.0001; |
| double scaling_factor = 1.4; |
| }; |
| |
| class RefinableRuntimeEstimator { |
| uint32_t iterations = 0; |
| uint64_t sum_of_cycles = 0; |
| uint64_t sum_of_squared_cycles = 0; |
| |
| public: |
| void update(uint64_t cycles) noexcept { |
| iterations += 1; |
| sum_of_cycles += cycles; |
| sum_of_squared_cycles += cycles * cycles; |
| } |
| |
| void update(const RefinableRuntimeEstimator &other) noexcept { |
| iterations += other.iterations; |
| sum_of_cycles += other.sum_of_cycles; |
| sum_of_squared_cycles += other.sum_of_squared_cycles; |
| } |
| |
| double get_mean() const noexcept { |
| if (iterations == 0) |
| return 0.0; |
| |
| return static_cast<double>(sum_of_cycles) / iterations; |
| } |
| |
| double get_variance() const noexcept { |
| if (iterations == 0) |
| return 0.0; |
| |
| const double num = static_cast<double>(iterations); |
| const double sum_x = static_cast<double>(sum_of_cycles); |
| const double sum_x2 = static_cast<double>(sum_of_squared_cycles); |
| |
| const double mean_of_squares = sum_x2 / num; |
| const double mean = sum_x / num; |
| const double mean_squared = mean * mean; |
| const double variance = mean_of_squares - mean_squared; |
| |
| return variance < 0.0 ? 0.0 : variance; |
| } |
| |
| double get_stddev() const noexcept { |
| return fputil::sqrt<double>(get_variance()); |
| } |
| |
| uint32_t get_iterations() const noexcept { return iterations; } |
| }; |
| |
| // Tracks the progression of the runtime estimation |
| class RuntimeEstimationProgression { |
| RefinableRuntimeEstimator estimator; |
| double current_mean = 0.0; |
| |
| public: |
| const RefinableRuntimeEstimator &get_estimator() const noexcept { |
| return estimator; |
| } |
| |
| double |
| compute_improvement(const RefinableRuntimeEstimator &sample_estimator) { |
| if (sample_estimator.get_iterations() == 0) |
| return 1.0; |
| |
| estimator.update(sample_estimator); |
| |
| const double new_mean = estimator.get_mean(); |
| if (current_mean == 0.0 || new_mean == 0.0) { |
| current_mean = new_mean; |
| return 1.0; |
| } |
| |
| double ratio = (current_mean / new_mean) - 1.0; |
| if (ratio < 0) |
| ratio = -ratio; |
| |
| current_mean = new_mean; |
| return ratio; |
| } |
| }; |
| |
| struct BenchmarkResult { |
| uint64_t total_iterations = 0; |
| double cycles = 0; |
| double standard_deviation = 0; |
| uint64_t min = UINT64_MAX; |
| uint64_t max = 0; |
| }; |
| |
| struct BenchmarkTarget { |
| using IndexedFnPtr = uint64_t (*)(uint32_t); |
| using IndexlessFnPtr = uint64_t (*)(); |
| |
| enum class Kind : uint8_t { Indexed, Indexless } kind; |
| union { |
| IndexedFnPtr indexed_fn_ptr; |
| IndexlessFnPtr indexless_fn_ptr; |
| }; |
| |
| LIBC_INLINE BenchmarkTarget(IndexedFnPtr func) |
| : kind(Kind::Indexed), indexed_fn_ptr(func) {} |
| LIBC_INLINE BenchmarkTarget(IndexlessFnPtr func) |
| : kind(Kind::Indexless), indexless_fn_ptr(func) {} |
| |
| LIBC_INLINE uint64_t operator()([[maybe_unused]] uint32_t call_index) const { |
| return kind == Kind::Indexed ? indexed_fn_ptr(call_index) |
| : indexless_fn_ptr(); |
| } |
| }; |
| |
| BenchmarkResult benchmark(const BenchmarkOptions &options, |
| const BenchmarkTarget &target); |
| |
| class Benchmark { |
| const BenchmarkTarget target; |
| const cpp::string_view suite_name; |
| const cpp::string_view test_name; |
| const uint32_t num_threads; |
| |
| public: |
| Benchmark(uint64_t (*f)(), const char *suite, const char *test, |
| uint32_t threads) |
| : target(BenchmarkTarget(f)), suite_name(suite), test_name(test), |
| num_threads(threads) { |
| add_benchmark(this); |
| } |
| |
| Benchmark(uint64_t (*f)(uint32_t), char const *suite_name, |
| char const *test_name, uint32_t num_threads) |
| : target(BenchmarkTarget(f)), suite_name(suite_name), |
| test_name(test_name), num_threads(num_threads) { |
| add_benchmark(this); |
| } |
| |
| static void run_benchmarks(); |
| const cpp::string_view get_suite_name() const { return suite_name; } |
| const cpp::string_view get_test_name() const { return test_name; } |
| |
| protected: |
| static void add_benchmark(Benchmark *benchmark); |
| |
| private: |
| BenchmarkResult run() { |
| BenchmarkOptions options; |
| return benchmark(options, target); |
| } |
| }; |
| |
| class RandomGenerator { |
| uint64_t state; |
| |
| static LIBC_INLINE uint64_t splitmix64(uint64_t x) noexcept { |
| x += 0x9E3779B97F4A7C15ULL; |
| x = (x ^ (x >> 30)) * 0xBF58476D1CE4E5B9ULL; |
| x = (x ^ (x >> 27)) * 0x94D049BB133111EBULL; |
| x = (x ^ (x >> 31)); |
| return x ? x : 0x9E3779B97F4A7C15ULL; |
| } |
| |
| public: |
| explicit LIBC_INLINE RandomGenerator(uint64_t seed) noexcept |
| : state(splitmix64(seed)) {} |
| |
| LIBC_INLINE uint64_t next64() noexcept { |
| uint64_t x = state; |
| x ^= x >> 12; |
| x ^= x << 25; |
| x ^= x >> 27; |
| state = x; |
| return x * 0x2545F4914F6CDD1DULL; |
| } |
| |
| LIBC_INLINE uint32_t next32() noexcept { |
| return static_cast<uint32_t>(next64() >> 32); |
| } |
| }; |
| |
| // We want random floating-point values whose *unbiased* exponent e is |
| // approximately uniform in [min_exp, max_exp]. That is, |
| // 2^min_exp <= |value| < 2^(max_exp + 1). |
| // Caveats / boundaries: |
| // - e = -EXP_BIAS ==> subnormal range (biased exponent = 0). We ensure a |
| // non-zero mantissa so we don't accidentally produce 0. |
| // - e in [1 - EXP_BIAS, EXP_BIAS] ==> normal numbers. |
| // - e = EXP_BIAS + 1 ==> Inf/NaN. We do not include it by default; max_exp |
| // defaults to EXP_BIAS. |
| template <typename T> |
| static T |
| get_rand_input(RandomGenerator &rng, |
| int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS, |
| int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) { |
| using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>; |
| using Storage = typename FPBits::StorageType; |
| |
| // Sanitize and clamp requested range to what the format supports |
| if (min_exp > max_exp) { |
| auto tmp = min_exp; |
| min_exp = max_exp; |
| max_exp = tmp; |
| }; |
| min_exp = cpp::max(min_exp, -FPBits::EXP_BIAS); |
| max_exp = cpp::min(max_exp, FPBits::EXP_BIAS); |
| |
| // Sample unbiased exponent e uniformly in [min_exp, max_exp] without modulo |
| // bias |
| auto sample_in_range = [&](uint64_t r) -> int32_t { |
| const uint64_t range = static_cast<uint64_t>( |
| static_cast<int64_t>(max_exp) - static_cast<int64_t>(min_exp) + 1); |
| const uint64_t threshold = (-range) % range; |
| while (r < threshold) |
| r = rng.next64(); |
| return static_cast<int32_t>(min_exp + static_cast<int64_t>(r % range)); |
| }; |
| const int32_t e = sample_in_range(rng.next64()); |
| |
| // Start from random bits to get random sign and mantissa |
| FPBits xbits([&] { |
| if constexpr (cpp::is_same_v<T, double>) |
| return FPBits(rng.next64()); |
| else |
| return FPBits(rng.next32()); |
| }()); |
| |
| if (e == -FPBits::EXP_BIAS) { |
| // Subnormal: biased exponent must be 0; ensure mantissa != 0 to avoid 0 |
| xbits.set_biased_exponent(Storage(0)); |
| if (xbits.get_mantissa() == Storage(0)) |
| xbits.set_mantissa(Storage(1)); |
| } else { |
| // Normal: biased exponent in [1, 2 * FPBits::EXP_BIAS] |
| const int32_t biased = e + FPBits::EXP_BIAS; |
| xbits.set_biased_exponent(static_cast<Storage>(biased)); |
| } |
| return xbits.get_val(); |
| } |
| |
| template <typename T> class MathPerf { |
| static LIBC_INLINE uint64_t make_seed(uint64_t base_seed, uint64_t salt) { |
| const uint64_t tid = gpu::get_thread_id(); |
| return base_seed ^ (salt << 32) ^ (tid * 0x9E3779B97F4A7C15ULL); |
| } |
| |
| public: |
| // Returns cycles-per-call (lower is better) |
| template <size_t N = 1> |
| static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp, |
| uint32_t call_index) { |
| cpp::array<T, N> inputs; |
| |
| uint64_t base_seed = static_cast<uint64_t>(call_index); |
| uint64_t salt = static_cast<uint64_t>(N); |
| RandomGenerator rng(make_seed(base_seed, salt)); |
| |
| for (size_t i = 0; i < N; ++i) |
| inputs[i] = get_rand_input<T>(rng, min_exp, max_exp); |
| |
| uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs); |
| |
| return total_time / N; |
| } |
| |
| // Returns cycles-per-call (lower is better) |
| template <size_t N = 1> |
| static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp, |
| int arg1_max_exp, int arg2_min_exp, |
| int arg2_max_exp, |
| uint32_t call_index) { |
| cpp::array<T, N> inputs1; |
| cpp::array<T, N> inputs2; |
| |
| uint64_t base_seed = static_cast<uint64_t>(call_index); |
| uint64_t salt = static_cast<uint64_t>(N); |
| RandomGenerator rng(make_seed(base_seed, salt)); |
| |
| for (size_t i = 0; i < N; ++i) { |
| inputs1[i] = get_rand_input<T>(rng, arg1_min_exp, arg1_max_exp); |
| inputs2[i] = get_rand_input<T>(rng, arg2_min_exp, arg2_max_exp); |
| } |
| |
| uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs1, inputs2); |
| |
| return total_time / N; |
| } |
| }; |
| |
| } // namespace benchmarks |
| } // namespace LIBC_NAMESPACE_DECL |
| |
| // Passing -1 indicates the benchmark should be run with as many threads as |
| // allocated by the user in the benchmark's CMake. |
| #define BENCHMARK(SuiteName, TestName, Func) \ |
| LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \ |
| Func, #SuiteName, #TestName, -1) |
| |
| #define BENCHMARK_N_THREADS(SuiteName, TestName, Func, NumThreads) \ |
| LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \ |
| Func, #SuiteName, #TestName, NumThreads) |
| |
| #define SINGLE_THREADED_BENCHMARK(SuiteName, TestName, Func) \ |
| BENCHMARK_N_THREADS(SuiteName, TestName, Func, 1) |
| |
| #define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func) \ |
| BENCHMARK_N_THREADS(SuiteName, TestName, Func, \ |
| LIBC_NAMESPACE::gpu::get_lane_size()) |
| |
| #endif // LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H |