blob: b310d49a60fd1671300b2869a45c6a418044cdb5 [file] [log] [blame]
#ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
#define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
#include "benchmarks/gpu/Random.h"
#include "benchmarks/gpu/timing/timing.h"
#include "hdr/stdint_proxy.h"
#include "src/__support/CPP/algorithm.h"
#include "src/__support/CPP/array.h"
#include "src/__support/CPP/string_view.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/FPUtil/sqrt.h"
#include "src/__support/macros/config.h"
namespace LIBC_NAMESPACE_DECL {
namespace benchmarks {
// Tunable limits for a benchmark run. Every field has a default, so a
// default-constructed object is a complete configuration.
struct BenchmarkOptions {
  // Iteration-count settings.
  uint32_t initial_iterations{1};
  uint32_t min_iterations{1};
  uint32_t max_iterations{10'000'000};

  // Sample-count settings.
  uint32_t min_samples{4};
  uint32_t max_samples{1'000};

  // Duration settings, expressed in nanoseconds.
  int64_t min_duration{500'000};       // 500 us
  int64_t max_duration{1'000'000'000}; // 1 second

  // Floating-point tuning knobs.
  double epsilon{0.0001};
  double scaling_factor{1.4};
};
// Keeps running totals of cycle measurements (sample count, sum, and sum of
// squares) from which the mean, variance, and standard deviation are derived
// on demand. Two estimators can be combined by adding their totals.
class RefinableRuntimeEstimator {
  uint32_t iterations = 0;
  uint64_t sum_of_cycles = 0;
  uint64_t sum_of_squared_cycles = 0;

public:
  // Folds one measurement into the totals. The square is formed in 64-bit
  // unsigned arithmetic, so it wraps around once `cycles` reaches 2^32.
  void update(uint64_t cycles) noexcept {
    sum_of_squared_cycles += cycles * cycles;
    sum_of_cycles += cycles;
    ++iterations;
  }

  // Merges every measurement recorded by `other` into this estimator.
  void update(const RefinableRuntimeEstimator &other) noexcept {
    sum_of_squared_cycles += other.sum_of_squared_cycles;
    sum_of_cycles += other.sum_of_cycles;
    iterations += other.iterations;
  }

  // Arithmetic mean of the recorded measurements; 0.0 when nothing has been
  // recorded yet.
  double get_mean() const noexcept {
    return iterations == 0 ? 0.0
                           : static_cast<double>(sum_of_cycles) /
                                 static_cast<double>(iterations);
  }

  // Population variance, computed as E[x^2] - (E[x])^2 and clamped so it is
  // never negative; 0.0 when nothing has been recorded yet.
  double get_variance() const noexcept {
    if (iterations == 0)
      return 0.0;

    const double count = static_cast<double>(iterations);
    const double mean_of_squares =
        static_cast<double>(sum_of_squared_cycles) / count;
    const double mean = static_cast<double>(sum_of_cycles) / count;
    const double squared_mean = mean * mean;
    const double difference = mean_of_squares - squared_mean;

    // The subtraction can come out below zero (e.g. through floating-point
    // rounding); report zero rather than a negative variance.
    if (difference < 0.0)
      return 0.0;
    return difference;
  }

  // Square root of get_variance().
  double get_stddev() const noexcept {
    const double variance = get_variance();
    return fputil::sqrt<double>(variance);
  }

  // Number of measurements recorded so far, including merged ones.
  uint32_t get_iterations() const noexcept { return iterations; }
};
// Follows how the estimated mean runtime moves as new samples are merged in,
// so a caller can tell how much the latest sample changed the estimate.
class RuntimeEstimationProgression {
  RefinableRuntimeEstimator estimator;
  double current_mean = 0.0;

public:
  // Read-only access to the estimator holding all samples merged so far.
  const RefinableRuntimeEstimator &get_estimator() const noexcept {
    return estimator;
  }

  // Merges `sample_estimator` into the accumulated estimator and returns the
  // absolute relative change of the mean, |old_mean / new_mean - 1|.
  // Returns 1.0 when the sample is empty (nothing is merged in that case) or
  // when either the previous or the updated mean is zero.
  double
  compute_improvement(const RefinableRuntimeEstimator &sample_estimator) {
    if (sample_estimator.get_iterations() == 0)
      return 1.0;

    estimator.update(sample_estimator);

    // Remember the old mean before overwriting it with the refreshed one.
    const double previous_mean = current_mean;
    current_mean = estimator.get_mean();

    if (previous_mean == 0.0 || current_mean == 0.0)
      return 1.0;

    const double relative_change = (previous_mean / current_mean) - 1.0;
    return relative_change < 0 ? -relative_change : relative_change;
  }
};
// Summary statistics produced by a benchmark run. The defaults describe an
// empty result: `min` starts at the largest representable value and `max` at
// zero, so the first observed measurement replaces both.
struct BenchmarkResult {
  uint64_t total_iterations{0};
  double cycles{0.0};
  double standard_deviation{0.0};
  uint64_t min{UINT64_MAX};
  uint64_t max{0};
};
// Holds a pointer to a benchmark body in one of two shapes: a function that
// receives the call index, or a function that takes no arguments. `kind`
// records which union member is active.
struct BenchmarkTarget {
  using IndexedFnPtr = uint64_t (*)(uint32_t);
  using IndexlessFnPtr = uint64_t (*)();

  enum class Kind : uint8_t { Indexed, Indexless };

  Kind kind;
  union {
    IndexedFnPtr indexed_fn_ptr;
    IndexlessFnPtr indexless_fn_ptr;
  };

  // Wraps a body that wants to be told the call index.
  LIBC_INLINE BenchmarkTarget(IndexedFnPtr func)
      : kind(Kind::Indexed), indexed_fn_ptr(func) {}

  // Wraps a body that takes no arguments.
  LIBC_INLINE BenchmarkTarget(IndexlessFnPtr func)
      : kind(Kind::Indexless), indexless_fn_ptr(func) {}

  // Invokes the stored function and returns its result. `call_index` is
  // forwarded only to the indexed flavour and ignored otherwise.
  LIBC_INLINE uint64_t operator()([[maybe_unused]] uint32_t call_index) const {
    if (kind == Kind::Indexed)
      return indexed_fn_ptr(call_index);
    return indexless_fn_ptr();
  }
};
// Declaration only; the definition is not part of this header.
// Takes the run configuration in `options` and the body to measure in
// `target`, and returns a BenchmarkResult.
// NOTE(review): presumably this invokes `target` repeatedly within the limits
// given by `options` and summarises the measurements -- confirm against the
// definition.
BenchmarkResult benchmark(const BenchmarkOptions &options,
const BenchmarkTarget &target);
// One named benchmark: the body to measure, the suite and test names, and the
// requested thread count. Each constructor passes the new object to
// add_benchmark(), which is declared here and defined elsewhere.
class Benchmark {
  const BenchmarkTarget target;
  const cpp::string_view suite_name;
  const cpp::string_view test_name;
  const uint32_t num_threads;

public:
  // Creates a benchmark around a body that takes no arguments.
  Benchmark(uint64_t (*f)(), const char *suite, const char *test,
            uint32_t threads)
      : target(f), suite_name(suite), test_name(test), num_threads(threads) {
    add_benchmark(this);
  }

  // Creates a benchmark around a body that receives the call index.
  Benchmark(uint64_t (*f)(uint32_t), const char *suite, const char *test,
            uint32_t threads)
      : target(f), suite_name(suite), test_name(test), num_threads(threads) {
    add_benchmark(this);
  }

  // Declared here; defined elsewhere.
  static void run_benchmarks();

  // Accessors for the names supplied at construction.
  const cpp::string_view get_suite_name() const { return suite_name; }
  const cpp::string_view get_test_name() const { return test_name; }

protected:
  // Declared here; defined elsewhere. Called by both constructors with `this`.
  static void add_benchmark(Benchmark *benchmark);

private:
  // Measures `target` using default-constructed BenchmarkOptions.
  BenchmarkResult run() {
    const BenchmarkOptions default_options{};
    return benchmark(default_options, target);
  }
};
template <typename T> class MathPerf {
static LIBC_INLINE uint64_t make_seed(uint64_t base_seed, uint64_t salt) {
const uint64_t tid = gpu::get_thread_id();
return base_seed ^ (salt << 32) ^ (tid * 0x9E3779B97F4A7C15ULL);
}
public:
// Returns cycles-per-call (lower is better)
template <size_t N = 1, typename Dist>
static uint64_t run_throughput(T (*f)(T), const Dist &dist,
uint32_t call_index) {
cpp::array<T, N> inputs;
uint64_t base_seed = static_cast<uint64_t>(call_index);
uint64_t salt = static_cast<uint64_t>(N);
RandomGenerator rng(make_seed(base_seed, salt));
for (size_t i = 0; i < N; ++i)
inputs[i] = dist(rng);
uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs);
return total_time / N;
}
// Returns cycles-per-call (lower is better)
template <size_t N = 1, typename Dist1, typename Dist2>
static uint64_t run_throughput(T (*f)(T, T), const Dist1 &dist1,
const Dist2 &dist2, uint32_t call_index) {
cpp::array<T, N> inputs1;
cpp::array<T, N> inputs2;
uint64_t base_seed = static_cast<uint64_t>(call_index);
uint64_t salt = static_cast<uint64_t>(N);
RandomGenerator rng(make_seed(base_seed, salt));
for (size_t i = 0; i < N; ++i) {
inputs1[i] = dist1(rng);
inputs2[i] = dist2(rng);
}
uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs1, inputs2);
return total_time / N;
}
};
} // namespace benchmarks
} // namespace LIBC_NAMESPACE_DECL
// Benchmark registration macros. Each one defines a
// LIBC_NAMESPACE::benchmarks::Benchmark object named
// <SuiteName>_<TestName>_Instance, passing Func, the stringized suite and test
// names, and a thread count to the Benchmark constructor.

// Passing -1 indicates the benchmark should be run with as many threads as
// allocated by the user in the benchmark's CMake.
// The constructor's thread-count parameter is a uint32_t, so the -1 is
// converted to that type on the way in.
#define BENCHMARK(SuiteName, TestName, Func) \
LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \
Func, #SuiteName, #TestName, -1)
// Same as BENCHMARK, but with an explicit thread count given by NumThreads.
#define BENCHMARK_N_THREADS(SuiteName, TestName, Func, NumThreads) \
LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \
Func, #SuiteName, #TestName, NumThreads)
// Registers the benchmark with a thread count of 1.
#define SINGLE_THREADED_BENCHMARK(SuiteName, TestName, Func) \
BENCHMARK_N_THREADS(SuiteName, TestName, Func, 1)
// Registers the benchmark with a thread count equal to the value returned by
// LIBC_NAMESPACE::gpu::get_lane_size().
#define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func) \
BENCHMARK_N_THREADS(SuiteName, TestName, Func, \
LIBC_NAMESPACE::gpu::get_lane_size())
#endif // LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H