blob: 778d09dbdf4e07d5c8239ac7eb00779cf857743a [file] [log] [blame] [edit]
// This program tests the performance impact of epilogue vectorization
// for varying epilogue lengths and vector widths.
#include <cstdint>
#include <limits>
#include <memory>
#include <random>
#include <type_traits>
#include <utility>
#include "benchmark/benchmark.h"
// Pseudo-random generator used to fill the benchmark input arrays. It is
// default-constructed and not explicitly seeded in the code visible here, so
// it starts from std::mt19937's default seed.
static std::mt19937 rng;
// Running total of the values returned by the benchmarked functions.
// NOTE(review): presumably a non-static global so that the results stay
// observable and the benchmarked calls cannot be discarded -- confirm.
uint64_t g_sum = 0;
// Fill the first \p N elements of \p A with pseudo-random values drawn
// uniformly from the full range of Ty, using the file-level generator `rng`.
//
// std::uniform_int_distribution is only specified for short, int, long,
// long long and their unsigned counterparts. Instantiating it with a
// character-sized type such as uint8_t is undefined behaviour and is rejected
// outright by some standard library implementations. Draw from a 64-bit
// distribution of matching signedness instead and narrow the result; the
// bounds are Ty's own limits, so the narrowing cast never changes a value.
template <typename Ty>
static void init_data(const std::unique_ptr<Ty[]> &A, unsigned N) {
  static_assert(std::is_integral<Ty>::value,
                "init_data requires an integral element type");
  using WideTy = typename std::conditional<std::is_signed<Ty>::value, long long,
                                           unsigned long long>::type;
  std::uniform_int_distribution<WideTy> distrib(
      std::numeric_limits<Ty>::min(), std::numeric_limits<Ty>::max());
  for (unsigned I = 0; I < N; I++)
    A[I] = static_cast<Ty>(distrib(rng));
}
// Calls \p f with \p args from a function marked optnone, so that the callee
// is not optimized based on the arguments it receives here. The callee's
// result is converted to uint64_t and handed back to the caller.
template <typename F, typename... Args>
__attribute__((optnone)) static uint64_t callThroughOptnone(F &&f, Args &&...args) {
  const uint64_t Result = f(std::forward<Args>(args)...);
  return Result;
}
// Shared benchmark driver. Reads the trip count from state.range(0), builds
// three arrays of that many randomly initialized Ty elements, and then, for
// every benchmark iteration, calls \p Fn on them through callThroughOptnone
// and adds the returned value to g_sum.
template <typename Ty>
static void __attribute__((always_inline))
runBenchForEpilogueVectorization(benchmark::State &state,
                                 uint64_t (*Fn)(Ty *, Ty *, Ty *, int)) {
  auto TripCount = state.range(0);

  // Allocate one input array and fill it from the shared generator. Invoked
  // three times in A, B, C order, so allocation and random-number consumption
  // happen in that order.
  auto MakeInput = [&TripCount]() -> std::unique_ptr<Ty[]> {
    std::unique_ptr<Ty[]> Buffer(new Ty[TripCount]);
    init_data(Buffer, TripCount);
    return Buffer;
  };
  std::unique_ptr<Ty[]> A = MakeInput();
  std::unique_ptr<Ty[]> B = MakeInput();
  std::unique_ptr<Ty[]> C = MakeInput();

  for (auto _ : state) {
    // Compiler barriers from the benchmark library, applied to the three
    // arrays and to memory before each timed call.
    benchmark::DoNotOptimize(A);
    benchmark::DoNotOptimize(B);
    benchmark::DoNotOptimize(C);
    benchmark::ClobberMemory();
    const uint64_t Result =
        callThroughOptnone(Fn, A.get(), B.get(), C.get(), TripCount);
    g_sum += Result;
  }
}
// Element-wise add kernel: stores B[K] + C[K] into A[K] for every K in
// [0, Iterations). Always returns 0; the return type only matches the kernel
// signature used by the benchmark driver.
// The function is noinline and the loop is deliberately left as a plain
// counted loop, since the loop itself is what the benchmark measures.
template <typename Ty>
static uint64_t __attribute__((noinline))
loopAutoVec(Ty *A, Ty *B, Ty *C, int Iterations) {
  for (int K = 0; K < Iterations; ++K)
    A[K] = B[K] + C[K];
  return 0;
}
// Reduction kernel: returns the sum of A[0] .. A[Iterations - 1], accumulated
// in a 64-bit unsigned total. \p B and \p C are not read; they are present so
// the function has the same signature as loopAutoVec.
// The function is noinline and the loop is deliberately left as a plain
// counted loop, since the loop itself is what the benchmark measures.
template <typename Ty>
static uint64_t __attribute__((noinline))
loopWithReductionAutoVec(Ty *A, Ty *B, Ty *C, int Iterations) {
  uint64_t Total = 0;
  for (int K = 0; K < Iterations; ++K)
    Total += A[K];
  return Total;
}
// Benchmark entry point: runs the shared driver with the element-wise add
// kernel loopAutoVec instantiated for element type Ty.
template <typename Ty> void benchAutoVec(benchmark::State &state) {
  uint64_t (*Kernel)(Ty *, Ty *, Ty *, int) = loopAutoVec<Ty>;
  runBenchForEpilogueVectorization<Ty>(state, Kernel);
}
// Benchmark entry point: runs the shared driver with the reduction kernel
// loopWithReductionAutoVec instantiated for element type Ty.
template <typename Ty> void benchReductionAutoVec(benchmark::State &state) {
  uint64_t (*Kernel)(Ty *, Ty *, Ty *, int) = loopWithReductionAutoVec<Ty>;
  runBenchForEpilogueVectorization<Ty>(state, Kernel);
}
// Benchmark registration. Each kernel is registered for 8-, 16- and 32-bit
// unsigned element types; the registered argument becomes the trip count read
// via state.range(0).
// NOTE(review): the 65..127 range presumably leaves every remainder from 1 to
// 63 after a 64-iteration vector body -- confirm the intended rationale.
#ifdef ALL_LOOP_EPILOGUE_TESTS
// Exhaustive sweep: every trip count from 65 through 127 in steps of 1.
BENCHMARK_TEMPLATE(benchAutoVec, uint8_t)->DenseRange(65, 127, 1);
BENCHMARK_TEMPLATE(benchReductionAutoVec, uint8_t)->DenseRange(65, 127, 1);
BENCHMARK_TEMPLATE(benchAutoVec, uint16_t)->DenseRange(65, 127, 1);
BENCHMARK_TEMPLATE(benchReductionAutoVec, uint16_t)->DenseRange(65, 127, 1);
BENCHMARK_TEMPLATE(benchAutoVec, uint32_t)->DenseRange(65, 127, 1);
BENCHMARK_TEMPLATE(benchReductionAutoVec, uint32_t)->DenseRange(65, 127, 1);
#else
// Default: only the two endpoint trip counts, 65 and 127.
BENCHMARK_TEMPLATE(benchAutoVec, uint8_t)->Arg(65)->Arg(127);
BENCHMARK_TEMPLATE(benchReductionAutoVec, uint8_t)->Arg(65)->Arg(127);
BENCHMARK_TEMPLATE(benchAutoVec, uint16_t)->Arg(65)->Arg(127);
BENCHMARK_TEMPLATE(benchReductionAutoVec, uint16_t)->Arg(65)->Arg(127);
BENCHMARK_TEMPLATE(benchAutoVec, uint32_t)->Arg(65)->Arg(127);
BENCHMARK_TEMPLATE(benchReductionAutoVec, uint32_t)->Arg(65)->Arg(127);
#endif