blob: a4a0ff4ec46e5fa61478125b4fdbb3cc8d0478e4 [file] [log] [blame]
#include "LibcGpuBenchmark.h"
#include "hdr/stdint_proxy.h"
#include "src/__support/CPP/algorithm.h"
#include "src/__support/CPP/atomic.h"
#include "src/__support/CPP/string.h"
#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/FPUtil/NearestIntegerOperations.h"
#include "src/__support/FPUtil/sqrt.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/fixedvector.h"
#include "src/__support/macros/config.h"
#include "src/__support/time/gpu/time_utils.h"
#include "src/stdio/printf.h"
#include "src/time/clock.h"
namespace LIBC_NAMESPACE_DECL {
namespace benchmarks {
// Registry of benchmarks, holding at most 64 entries. Entries are appended by
// Benchmark::add_benchmark and iterated in order by Benchmark::run_benchmarks.
FixedVector<Benchmark *, 64> benchmarks;
/// Appends `benchmark` to the global registry consumed by run_benchmarks().
///
/// The registry is a fixed-capacity vector whose push_back reports failure
/// once it is full. Ignoring that result would silently drop the benchmark, so
/// a diagnostic is printed instead; the benchmark is still not registered.
void Benchmark::add_benchmark(Benchmark *benchmark) {
  if (!benchmarks.push_back(benchmark))
    LIBC_NAMESPACE::printf(
        "warning: benchmark registry is full; benchmark was not registered\n");
}
/// Atomically adds `value` to a double whose bit pattern lives in
/// `atomic_bits`.
///
/// The accumulator is stored as raw 64-bit integer storage, so the addition is
/// done with a compare-exchange retry loop: decode the observed bits, add,
/// re-encode, and try to publish the new bits.
static void atomic_add_double(cpp::Atomic<uint64_t> &atomic_bits,
                              double value) {
  using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;

  uint64_t observed = atomic_bits.load(cpp::MemoryOrder::RELAXED);
  bool published = false;
  do {
    const double decoded = FPBits(observed).get_val();
    const double updated = decoded + value;
    const uint64_t encoded = FPBits(updated).uintval();
    // A failed exchange leaves the currently stored bits in `observed`, so
    // the next pass recomputes the sum from the fresh value.
    published = atomic_bits.compare_exchange_strong(
        observed, encoded, cpp::MemoryOrder::ACQUIRE,
        cpp::MemoryOrder::RELAXED);
  } while (!published);
}
/// Shared accumulator that threads fold their per-thread BenchmarkResult into.
///
/// The two floating-point sums are kept as the bit patterns of doubles inside
/// 64-bit atomics and are updated through atomic_add_double:
///   - weighted_cycles_sum_bits:         sum of n_i * mean_i
///   - weighted_squared_cycles_sum_bits: sum of n_i * (stddev_i^2 + mean_i^2)
/// where n_i, mean_i and stddev_i come from each contributed result.
struct AtomicBenchmarkSums {
  cpp::Atomic<uint32_t> active_threads = 0;
  cpp::Atomic<uint64_t> iterations_sum = 0;
  cpp::Atomic<uint64_t> weighted_cycles_sum_bits = 0;
  cpp::Atomic<uint64_t> weighted_squared_cycles_sum_bits = 0;
  cpp::Atomic<uint64_t> min = UINT64_MAX;
  cpp::Atomic<uint64_t> max = 0;

  /// Restores every field to its initial value. The relaxed stores are
  /// bracketed by release fences.
  void reset() {
    constexpr auto RELAXED = cpp::MemoryOrder::RELAXED;

    cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
    active_threads.store(0, RELAXED);
    iterations_sum.store(0, RELAXED);
    weighted_cycles_sum_bits.store(0, RELAXED);
    weighted_squared_cycles_sum_bits.store(0, RELAXED);
    min.store(UINT64_MAX, RELAXED);
    max.store(0, RELAXED);
    cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
  }

  /// Folds one thread's `result` into the shared totals.
  void update(const BenchmarkResult &result) {
    constexpr auto RELAXED = cpp::MemoryOrder::RELAXED;
    constexpr auto ACQUIRE = cpp::MemoryOrder::ACQUIRE;

    cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);

    // Count this contributor and its iterations.
    active_threads.fetch_add(1, RELAXED);
    iterations_sum.fetch_add(result.total_iterations, RELAXED);

    // Contribute the iteration-weighted first and second moments.
    const double count = static_cast<double>(result.total_iterations);
    const double mean = result.cycles;
    const double stddev = result.standard_deviation;
    const double variance = stddev * stddev;
    atomic_add_double(weighted_cycles_sum_bits, count * mean);
    atomic_add_double(weighted_squared_cycles_sum_bits,
                      count * (variance + mean * mean));

    // Lower the shared minimum: retry until the exchange succeeds against the
    // value that was actually observed.
    uint64_t seen_min = min.load(RELAXED);
    for (;;) {
      if (min.compare_exchange_strong(seen_min,
                                      cpp::min(seen_min, result.min), ACQUIRE,
                                      RELAXED))
        break;
    }

    // Raise the shared maximum the same way.
    uint64_t seen_max = max.load(RELAXED);
    for (;;) {
      if (max.compare_exchange_strong(seen_max,
                                      cpp::max(seen_max, result.max), ACQUIRE,
                                      RELAXED))
        break;
    }

    cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
  }
};
// Shared accumulator for the benchmark currently being run; reset before each
// benchmark in Benchmark::run_benchmarks and read back by print_results.
AtomicBenchmarkSums all_results;
// ANSI escape sequences: switch the foreground colour to green, and reset all
// attributes.
constexpr auto GREEN = "\033[32m";
constexpr auto RESET = "\033[0m";
/// Prints one table row for benchmark `b` from the totals in `all_results`.
///
/// The mean is S1 / N and the standard deviation is
/// sqrt(max(0, S2 / N - mean^2)), where N is the summed iteration count and
/// S1 / S2 are the weighted sums accumulated by AtomicBenchmarkSums::update.
/// With zero iterations both values are reported as 0.
void print_results(Benchmark *b) {
  using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;
  constexpr auto RELAXED = cpp::MemoryOrder::RELAXED;

  BenchmarkResult final_result;

  cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
  const uint32_t num_threads = all_results.active_threads.load(RELAXED);
  final_result.total_iterations = all_results.iterations_sum.load(RELAXED);

  if (final_result.total_iterations > 0) {
    // Decode the two accumulators from their stored bit patterns.
    const uint64_t first_moment_bits =
        all_results.weighted_cycles_sum_bits.load(RELAXED);
    const uint64_t second_moment_bits =
        all_results.weighted_squared_cycles_sum_bits.load(RELAXED);
    const double weighted_sum = FPBits(first_moment_bits).get_val();
    const double weighted_square_sum = FPBits(second_moment_bits).get_val();

    const double count = static_cast<double>(final_result.total_iterations);
    const double mean = weighted_sum / count;
    const double mean_of_squares = weighted_square_sum / count;
    const double variance = mean_of_squares - (mean * mean);

    // Rounding can push the variance slightly below zero; clamp before the
    // square root.
    double non_negative_variance = variance;
    if (variance < 0.0)
      non_negative_variance = 0.0;

    final_result.cycles = mean;
    final_result.standard_deviation =
        fputil::sqrt<double>(non_negative_variance);
  } else {
    final_result.cycles = 0.0;
    final_result.standard_deviation = 0.0;
  }

  final_result.min = all_results.min.load(RELAXED);
  final_result.max = all_results.max.load(RELAXED);
  cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);

  const auto min_cycles = static_cast<unsigned long long>(final_result.min);
  const auto max_cycles = static_cast<unsigned long long>(final_result.max);
  const auto iteration_count =
      static_cast<unsigned long long>(final_result.total_iterations);
  const auto thread_count = static_cast<unsigned>(num_threads);

  LIBC_NAMESPACE::printf(
      "%-24s |%15.0f |%9.0f |%8llu |%8llu |%15llu |%9u |\n",
      b->get_test_name().data(), final_result.cycles,
      final_result.standard_deviation, min_cycles, max_cycles, iteration_count,
      thread_count);
}
/// Prints the suite banner (in green), the column titles and a dashed
/// separator line.
///
/// The suite name is taken from the first registered benchmark. When no
/// benchmark is registered the name is printed as an empty string instead of
/// reading an unset registry slot.
void print_header() {
  LIBC_NAMESPACE::printf("%s", GREEN);
  if (benchmarks.empty())
    LIBC_NAMESPACE::printf("Running Suite: %-10s\n", "");
  else
    LIBC_NAMESPACE::printf("Running Suite: %-10s\n",
                           benchmarks[0]->get_suite_name().data());
  LIBC_NAMESPACE::printf("%s", RESET);

  // Column widths mirror the row format used by print_results
  // ("%-24s |%15.0f |%9.0f |%8llu |%8llu |%15llu |%9u |") so that the column
  // separators of the header line up with those of every result row.
  cpp::string titles = "Benchmark                |"
                       "  Cycles (Mean) |"
                       "   Stddev |"
                       "     Min |"
                       "     Max |"
                       "     Iterations |"
                       "  Threads |\n";
  // Print through "%s" so the text is never interpreted as a format string.
  LIBC_NAMESPACE::printf("%s", titles.data());

  // A run of dashes as wide as the title line, ending in a newline.
  cpp::string separator(titles.size(), '-');
  separator[titles.size() - 1] = '\n';
  LIBC_NAMESPACE::printf("%s", separator.data());
}
/// Runs every registered benchmark and prints one result row per benchmark.
///
/// The thread whose id is 0 prints the header, resets `all_results` before
/// each benchmark and prints the aggregated row afterwards. Calls to
/// gpu::sync_threads() separate the reset, measurement and reporting phases.
void Benchmark::run_benchmarks() {
  const uint64_t id = gpu::get_thread_id();
  const bool is_reporter = (id == 0);

  if (is_reporter)
    print_header();
  gpu::sync_threads();

  for (Benchmark *b : benchmarks) {
    // Phase 1: clear the shared accumulator.
    if (is_reporter)
      all_results.reset();
    gpu::sync_threads();

    // Phase 2: measure. A num_threads of uint32_t(-1) places no limit on the
    // thread id; otherwise only ids below num_threads take part.
    const bool unrestricted = (b->num_threads == static_cast<uint32_t>(-1));
    if (unrestricted || id < b->num_threads)
      all_results.update(b->run());
    gpu::sync_threads();

    // Phase 3: report the aggregated numbers.
    if (is_reporter)
      print_results(b);
  }

  gpu::sync_threads();
}
/// Repeatedly samples `target` and returns the aggregated measurements.
///
/// Each sample calls `target` until the per-sample estimator has seen the
/// current batch size of iterations, then merges the sample into a
/// RuntimeEstimationProgression. Sampling stops when
///   - the time budget (options.max_duration) goes negative,
///   - options.max_samples or options.max_iterations is reached, or
///   - the minimum duration / sample / iteration requirements are met and the
///     reported improvement ratio falls below options.epsilon.
/// Otherwise the batch size is multiplied by options.scaling_factor (rounded
/// up) and another sample is taken.
///
/// The result carries the iteration count, mean and standard deviation from
/// the progression's estimator, plus the smallest and largest value returned
/// by any single call to `target`.
BenchmarkResult benchmark(const BenchmarkOptions &options,
                          const BenchmarkTarget &target) {
  BenchmarkResult result;
  RuntimeEstimationProgression progression;

  // Always run at least one iteration per sample.
  uint32_t batch_size = options.initial_iterations;
  if (batch_size == 0)
    batch_size = 1;

  uint32_t sample_count = 0;
  uint64_t elapsed_ns = 0;
  uint64_t fastest = UINT64_MAX;
  uint64_t slowest = 0;
  uint32_t call_index = 0;

  int64_t time_budget = options.max_duration;
  while (time_budget >= 0) {
    RefinableRuntimeEstimator sample_estimator;

    const clock_t start = clock();
    while (sample_estimator.get_iterations() < batch_size) {
      auto measured = target(call_index++);
      slowest = cpp::max(slowest, measured);
      fastest = cpp::min(fastest, measured);
      sample_estimator.update(measured);
    }
    const clock_t end = clock();

    // Convert the elapsed clock ticks to nanoseconds.
    const clock_t duration_ns =
        ((end - start) * 1000 * 1000 * 1000) / CLOCKS_PER_SEC;
    elapsed_ns += duration_ns;
    time_budget -= duration_ns;
    ++sample_count;

    const double change_ratio =
        progression.compute_improvement(sample_estimator);

    // Hard limits: stop regardless of how stable the estimate is.
    const bool hit_hard_limit = sample_count >= options.max_samples ||
                                batch_size >= options.max_iterations;
    if (hit_hard_limit)
      break;

    // Soft limits: stop once all minimums are satisfied and the estimate has
    // stopped changing by more than epsilon.
    const auto total_iterations = progression.get_estimator().get_iterations();
    const bool converged = elapsed_ns >= options.min_duration &&
                           sample_count >= options.min_samples &&
                           total_iterations >= options.min_iterations &&
                           change_ratio < options.epsilon;
    if (converged)
      break;

    batch_size = static_cast<uint32_t>(
        fputil::ceil(batch_size * options.scaling_factor));
  }

  const auto &estimator = progression.get_estimator();
  result.total_iterations = estimator.get_iterations();
  result.cycles = estimator.get_mean();
  result.standard_deviation = estimator.get_stddev();
  result.min = fastest;
  result.max = slowest;
  return result;
}
} // namespace benchmarks
} // namespace LIBC_NAMESPACE_DECL