blob: 5e4c00a4f705fc43d73bd3f24a174cb81630b50f [file] [log] [blame] [edit]
//===- EarlyExit.cpp - Early Exit Vectorization Benchmarks ------*- C++ -*-===//
///
/// \file
/// Benchmarks for early exit loop vectorization with varying trip counts
/// and element types.
///
//===----------------------------------------------------------------------===//
#include <algorithm>
#include <cstdint>
#include <memory>
#include "benchmark/benchmark.h"
// Check for builtin availability using __has_builtin when available.
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
#if __has_builtin(__builtin_assume_aligned)
#define ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned(ptr, align)
#else
#define ASSUME_ALIGNED(ptr, align) (ptr)
#endif
#if __has_builtin(__builtin_assume_dereferenceable)
#define ASSUME_DEREFERENCEABLE(ptr, size) __builtin_assume_dereferenceable(ptr, size)
#else
#define ASSUME_DEREFERENCEABLE(ptr, size) ((void)0)
#endif
/// Initialize array \p A with 1s.
template <typename Ty>
static void init_data(const std::unique_ptr<Ty[]> &A, unsigned N) {
std::fill_n(A.get(), N, static_cast<Ty>(1));
}
// Helper to block optimizing \p F based on its arguments.
template <typename F, typename... Args>
__attribute__((optnone)) static uint64_t callThroughOptnone(F &&f,
Args &&...args) {
return f(std::forward<Args>(args)...);
}
enum class EarlyExitPosition { None, First, Mid };
template <typename Ty>
static void __attribute__((always_inline))
runEarlyExitBenchmark(benchmark::State &state,
uint64_t (*Fn)(Ty *, Ty *, Ty *, int),
EarlyExitPosition ExitPos) {
auto Iterations = state.range(0);
std::unique_ptr<Ty[]> A(new Ty[Iterations]);
std::unique_ptr<Ty[]> B(new Ty[Iterations]);
std::unique_ptr<Ty[]> C(new Ty[Iterations]);
init_data(A, Iterations);
init_data(B, Iterations);
init_data(C, Iterations);
switch (ExitPos) {
case EarlyExitPosition::None:
break;
case EarlyExitPosition::First:
A[0] = 0;
break;
case EarlyExitPosition::Mid:
A[Iterations / 2] = 0;
break;
}
uint64_t Sum = 0;
for (auto _ : state) {
benchmark::DoNotOptimize(A);
benchmark::DoNotOptimize(B);
benchmark::DoNotOptimize(C);
benchmark::DoNotOptimize(Sum);
benchmark::ClobberMemory();
Sum += callThroughOptnone(Fn, &A[0], &B[0], &C[0], Iterations);
}
}
template <typename Ty>
static uint64_t __attribute__((noinline))
singleEarlyExitWithSingleLoad(Ty *A, Ty *B, Ty *C, int Iterations) {
auto AlignedA = static_cast<Ty *>(ASSUME_ALIGNED(A, sizeof(Ty)));
ASSUME_DEREFERENCEABLE(AlignedA, Iterations * sizeof(Ty));
for (int J = 0; J < Iterations; J++) {
if (AlignedA[J] == 0)
return J;
}
return 0;
}
template <typename Ty>
static uint64_t __attribute__((noinline))
singleEarlyExitWithTwoLoads(Ty *A, Ty *B, Ty *C, int Iterations) {
auto AlignedA = static_cast<Ty *>(ASSUME_ALIGNED(A, sizeof(Ty)));
auto AlignedB = static_cast<Ty *>(ASSUME_ALIGNED(B, sizeof(Ty)));
ASSUME_DEREFERENCEABLE(AlignedA, Iterations * sizeof(Ty));
ASSUME_DEREFERENCEABLE(AlignedB, Iterations * sizeof(Ty));
for (int J = 0; J < Iterations; J++) {
if (AlignedA[J] != AlignedB[J])
return J;
}
return 0;
}
template <typename Ty>
static uint64_t __attribute__((noinline))
singleEarlyExitWithThreeLoadsAndCompute(Ty *A, Ty *B, Ty *C, int Iterations) {
auto AlignedA = static_cast<Ty *>(ASSUME_ALIGNED(A, sizeof(Ty)));
auto AlignedB = static_cast<Ty *>(ASSUME_ALIGNED(B, sizeof(Ty)));
auto AlignedC = static_cast<Ty *>(ASSUME_ALIGNED(C, sizeof(Ty)));
ASSUME_DEREFERENCEABLE(AlignedA, Iterations * sizeof(Ty));
ASSUME_DEREFERENCEABLE(AlignedB, Iterations * sizeof(Ty));
ASSUME_DEREFERENCEABLE(AlignedC, Iterations * sizeof(Ty));
for (int J = 0; J < Iterations; J++) {
// Exits when A[J] == 0: (0 + 1) * 1 == 1
// Continues when A[J] == 1: (1 + 1) * 1 == 2
if ((AlignedA[J] + AlignedB[J]) * AlignedC[J] == 1)
return J;
}
return 0;
}
template <typename Ty>
void autovec_no_early_exit_single_load(benchmark::State &state) {
runEarlyExitBenchmark<Ty>(state, &singleEarlyExitWithSingleLoad<Ty>,
EarlyExitPosition::None);
}
template <typename Ty>
void autovec_no_early_exit_two_loads(benchmark::State &state) {
runEarlyExitBenchmark<Ty>(state, &singleEarlyExitWithTwoLoads<Ty>,
EarlyExitPosition::None);
}
template <typename Ty>
void autovec_early_exit_taken_first_single_load(benchmark::State &state) {
runEarlyExitBenchmark<Ty>(state, &singleEarlyExitWithSingleLoad<Ty>,
EarlyExitPosition::First);
}
template <typename Ty>
void autovec_early_exit_taken_mid_single_load(benchmark::State &state) {
runEarlyExitBenchmark<Ty>(state, &singleEarlyExitWithSingleLoad<Ty>,
EarlyExitPosition::Mid);
}
template <typename Ty>
void autovec_early_exit_taken_first_two_loads(benchmark::State &state) {
runEarlyExitBenchmark<Ty>(state, &singleEarlyExitWithTwoLoads<Ty>,
EarlyExitPosition::First);
}
template <typename Ty>
void autovec_early_exit_taken_mid_two_loads(benchmark::State &state) {
runEarlyExitBenchmark<Ty>(state, &singleEarlyExitWithTwoLoads<Ty>,
EarlyExitPosition::Mid);
}
template <typename Ty>
void autovec_no_early_exit_three_loads(benchmark::State &state) {
runEarlyExitBenchmark<Ty>(state, &singleEarlyExitWithThreeLoadsAndCompute<Ty>,
EarlyExitPosition::None);
}
template <typename Ty>
void autovec_early_exit_taken_first_three_loads(benchmark::State &state) {
runEarlyExitBenchmark<Ty>(state, &singleEarlyExitWithThreeLoadsAndCompute<Ty>,
EarlyExitPosition::First);
}
template <typename Ty>
void autovec_early_exit_taken_mid_three_loads(benchmark::State &state) {
runEarlyExitBenchmark<Ty>(state, &singleEarlyExitWithThreeLoadsAndCompute<Ty>,
EarlyExitPosition::Mid);
}
#define BENCHMARK_ARGS ->Arg(10)->Arg(16)->Arg(28)->Arg(51)->Arg(256)->Arg(999)
#define REGISTER_BENCHMARK_FOR_TYPES(BenchFunc) \
BENCHMARK_TEMPLATE(BenchFunc, uint8_t) BENCHMARK_ARGS; \
BENCHMARK_TEMPLATE(BenchFunc, uint16_t) BENCHMARK_ARGS; \
BENCHMARK_TEMPLATE(BenchFunc, uint32_t) BENCHMARK_ARGS; \
BENCHMARK_TEMPLATE(BenchFunc, uint64_t) BENCHMARK_ARGS;
REGISTER_BENCHMARK_FOR_TYPES(autovec_no_early_exit_single_load)
REGISTER_BENCHMARK_FOR_TYPES(autovec_early_exit_taken_first_single_load)
REGISTER_BENCHMARK_FOR_TYPES(autovec_early_exit_taken_mid_single_load)
REGISTER_BENCHMARK_FOR_TYPES(autovec_no_early_exit_two_loads)
REGISTER_BENCHMARK_FOR_TYPES(autovec_early_exit_taken_first_two_loads)
REGISTER_BENCHMARK_FOR_TYPES(autovec_early_exit_taken_mid_two_loads)
REGISTER_BENCHMARK_FOR_TYPES(autovec_no_early_exit_three_loads)
REGISTER_BENCHMARK_FOR_TYPES(autovec_early_exit_taken_first_three_loads)
REGISTER_BENCHMARK_FOR_TYPES(autovec_early_exit_taken_mid_three_loads)