// blob: a14b8f1610ee1c53f40303c85387b9163d59a74e [file] [log] [blame]
#include <iostream>
#include <math.h>
#include <memory>
#include <random>
#include "benchmark/benchmark.h"
// Fill the first N elements of A with pseudo-random values. Each value is
// drawn from a uniform real distribution constructed with bounds -100 and 100
// and then converted to T. The generator is re-created with the same fixed
// seed on every call, so every call produces the same sequence.
template <typename T> static void init_data(T *A, unsigned N) {
  std::mt19937 engine(12345);
  std::uniform_real_distribution<> range(-100, 100);
  for (unsigned idx = 0; idx != N; ++idx) {
    A[idx] = static_cast<T>(range(engine));
  }
}
// Computes A[i] ^= B[i] for every i in [0, N).
//
// A and B are plain pointers, so the two ranges may overlap; elements are
// processed in increasing index order. The function is marked noinline so it
// stays a separate call, and the loop carries a clang hint requesting full
// unrolling.
//
// The induction variable is unsigned to match the type of N (avoids a
// signed/unsigned comparison) and the loop in do_multiply_accumulate.
template <unsigned N, typename T>
__attribute__((noinline)) void do_xor(T *A, T *B) {
#pragma clang loop unroll(full)
  for (unsigned i = 0; i < N; i++) {
    A[i] ^= B[i];
  }
}
// Benchmark body: repeatedly calls do_xor<N> on two separate local arrays of
// N elements each. Both arrays are filled by init_data before the timed loop.
// (Per the benchmark's name, this is presumably the case in which overlap
// checks on the two pointers succeed.)
template <unsigned N, typename T>
void benchmark_xor_runtime_checks_pass(benchmark::State &state) {
  T Dst[N];
  T Src[N];
  init_data(Dst, N);
  init_data(Src, N);
  for (auto _ : state) {
    do_xor<N>(Dst, Src);
    // Keep the arrays observable so the work above is not optimized away.
    benchmark::DoNotOptimize(Dst);
    benchmark::DoNotOptimize(Src);
    benchmark::ClobberMemory();
  }
}
// Benchmark body: repeatedly calls do_xor<N> with both pointers into the same
// local array of 2 * N elements. The second pointer starts 3 elements after
// the first, so the range written and the range read overlap when N > 3.
template <unsigned N, typename T>
void benchmark_xor_runtime_checks_fail(benchmark::State &state) {
  constexpr unsigned Len = 2 * N;
  T Buf[Len];
  init_data(Buf, Len);
  for (auto _ : state) {
    do_xor<N>(Buf, Buf + 3);
    // Keep the array observable so the work above is not optimized away.
    benchmark::DoNotOptimize(Buf);
    benchmark::ClobberMemory();
  }
}
// Benchmark body: performs A[i] ^= B[i] for i in [0, N) directly inside the
// timed loop instead of calling do_xor. A and B are two distinct local arrays
// visible in this function, filled by init_data before timing starts.
template <unsigned N, typename T>
void benchmark_xor_no_runtime_checks_needed(benchmark::State &state) {
  T A[N];
  T B[N];
  init_data(&A[0], N);
  init_data(&B[0], N);
  for (auto _ : state) {
    // Unsigned induction variable: matches the type of N and avoids a
    // signed/unsigned comparison.
#pragma clang loop unroll(full)
    for (unsigned i = 0; i < N; i++) {
      A[i] ^= B[i];
    }
    // Keep the arrays observable so the loop is not optimized away.
    benchmark::DoNotOptimize(A);
    benchmark::DoNotOptimize(B);
    benchmark::ClobberMemory();
  }
}
// Register the three xor benchmark variants with T = int, for N = 4 and
// N = 16.
BENCHMARK_TEMPLATE(benchmark_xor_runtime_checks_pass, 4, int);
BENCHMARK_TEMPLATE(benchmark_xor_runtime_checks_fail, 4, int);
BENCHMARK_TEMPLATE(benchmark_xor_no_runtime_checks_needed, 4, int);
BENCHMARK_TEMPLATE(benchmark_xor_runtime_checks_pass, 16, int);
BENCHMARK_TEMPLATE(benchmark_xor_runtime_checks_fail, 16, int);
BENCHMARK_TEMPLATE(benchmark_xor_no_runtime_checks_needed, 16, int);
// Computes A[i] ^= B[i] + C[i] for every i in [0, N).
//
// A, B and C are plain pointers, so the ranges may overlap; elements are
// processed in increasing index order. The function is marked noinline so it
// stays a separate call, and the loop carries a clang hint requesting full
// unrolling.
//
// The induction variable is unsigned to match the type of N (avoids a
// signed/unsigned comparison) and the loop in do_multiply_accumulate.
template <unsigned N, typename T>
__attribute__((noinline)) void do_add_xor(T *A, T *B, T *C) {
#pragma clang loop unroll(full)
  for (unsigned i = 0; i < N; i++) {
    A[i] ^= B[i] + C[i];
  }
}
// Benchmark body: repeatedly calls do_add_xor<N> on three separate local
// arrays of N elements each, all filled by init_data before the timed loop.
// (Per the benchmark's name, this is presumably the case in which overlap
// checks on the pointers succeed.)
template <unsigned N, typename T>
void benchmark_add_xor_runtime_checks_pass(benchmark::State &state) {
  T Dst[N];
  T Lhs[N];
  T Rhs[N];
  init_data(Dst, N);
  init_data(Lhs, N);
  init_data(Rhs, N);
  for (auto _ : state) {
    do_add_xor<N>(Dst, Lhs, Rhs);
    // Keep the arrays observable so the work above is not optimized away.
    benchmark::DoNotOptimize(Dst);
    benchmark::DoNotOptimize(Lhs);
    benchmark::DoNotOptimize(Rhs);
    benchmark::ClobberMemory();
  }
}
// Benchmark body: repeatedly calls do_add_xor<N> where the destination and
// the first source both point into the same local array of 2 * N elements
// (the source starts 3 elements after the destination, so the ranges overlap
// when N > 3). The second source is a separate N-element array.
template <unsigned N, typename T>
void benchmark_add_xor_runtime_checks_fail(benchmark::State &state) {
  constexpr unsigned Len = 2 * N;
  T Buf[Len];
  T Other[N];
  init_data(Buf, Len);
  init_data(Other, N);
  for (auto _ : state) {
    do_add_xor<N>(Buf, Buf + 3, Other);
    // Keep the arrays observable so the work above is not optimized away.
    benchmark::DoNotOptimize(Buf);
    benchmark::DoNotOptimize(Other);
    benchmark::ClobberMemory();
  }
}
// Benchmark body: performs A[i] ^= B[i] + C[i] for i in [0, N) directly
// inside the timed loop instead of calling do_add_xor. A, B and C are three
// distinct local arrays visible in this function, filled by init_data before
// timing starts.
template <unsigned N, typename T>
void benchmark_add_xor_no_runtime_checks_needed(benchmark::State &state) {
  T A[N];
  T B[N];
  T C[N];
  init_data(&A[0], N);
  init_data(&B[0], N);
  init_data(&C[0], N);
  for (auto _ : state) {
    // Unsigned induction variable: matches the type of N and avoids a
    // signed/unsigned comparison.
#pragma clang loop unroll(full)
    for (unsigned i = 0; i < N; i++) {
      A[i] ^= B[i] + C[i];
    }
    // Keep the arrays observable so the loop is not optimized away.
    benchmark::DoNotOptimize(A);
    benchmark::DoNotOptimize(B);
    benchmark::DoNotOptimize(C);
    benchmark::ClobberMemory();
  }
}
// Register the three add-xor benchmark variants with T = int, for N = 4 and
// N = 16.
BENCHMARK_TEMPLATE(benchmark_add_xor_runtime_checks_pass, 4, int);
BENCHMARK_TEMPLATE(benchmark_add_xor_runtime_checks_fail, 4, int);
BENCHMARK_TEMPLATE(benchmark_add_xor_no_runtime_checks_needed, 4, int);
BENCHMARK_TEMPLATE(benchmark_add_xor_runtime_checks_pass, 16, int);
BENCHMARK_TEMPLATE(benchmark_add_xor_runtime_checks_fail, 16, int);
BENCHMARK_TEMPLATE(benchmark_add_xor_no_runtime_checks_needed, 16, int);
// Computes A[k] += c * B[k] for every k in [0, N).
//
// A and B are plain pointers, so the two ranges may overlap; elements are
// processed in increasing index order. The scale factor c is passed by value.
// The function is marked noinline so it stays a separate call, and the loop
// carries a clang hint requesting full unrolling.
template <unsigned N, typename T>
__attribute__((noinline)) void do_multiply_accumulate(T *A, T *B, T c) {
#pragma clang loop unroll(full)
  for (unsigned k = 0; k < N; ++k) {
    // Single expression on purpose: keeps the multiply and add together.
    A[k] += c * B[k];
  }
}
// Benchmark body: repeatedly calls do_multiply_accumulate<N> on two separate
// local arrays of N elements each, using the first element of the source
// array as the scale factor. Both arrays are filled by init_data before the
// timed loop. (Per the benchmark's name, this is presumably the case in which
// overlap checks on the two pointers succeed.)
template <unsigned N, typename T>
void benchmark_multiply_accumulate_runtime_checks_pass(
    benchmark::State &state) {
  T Acc[N];
  T In[N];
  init_data(Acc, N);
  init_data(In, N);
  for (auto _ : state) {
    do_multiply_accumulate<N>(Acc, In, In[0]);
    // Keep the arrays observable so the work above is not optimized away.
    benchmark::DoNotOptimize(Acc);
    benchmark::DoNotOptimize(In);
    benchmark::ClobberMemory();
  }
}
// Benchmark body: repeatedly calls do_multiply_accumulate<N> with both
// pointers into the same local array; the source pointer starts one element
// after the destination, so the range written and the range read overlap.
// The scale factor is the current value of A[0].
template <unsigned N, typename T>
void benchmark_multiply_accumulate_runtime_checks_fail(
    benchmark::State &state) {
  // The callee reads B[0..N-1] with B == &A[1], i.e. A[1..N]. The array
  // therefore needs N + 1 initialized elements; with only N elements the last
  // read would be one past the end of the array.
  T A[N + 1];
  init_data(&A[0], N + 1);
  for (auto _ : state) {
    do_multiply_accumulate<N>(&A[0], &A[1], A[0]);
    // Keep the array observable so the work above is not optimized away.
    benchmark::DoNotOptimize(A);
    benchmark::ClobberMemory();
  }
}
// Benchmark body: performs Acc[k] += Factor * In[k] for k in [0, N) directly
// inside the timed loop instead of calling do_multiply_accumulate. Acc and In
// are two distinct local arrays visible in this function, filled by init_data
// before timing starts. The scale factor is re-read from Acc[0] at the start
// of every benchmark iteration.
template <unsigned N, typename T>
void benchmark_multiply_accumulate_no_runtime_checks_needed(
    benchmark::State &state) {
  T Acc[N];
  T In[N];
  init_data(Acc, N);
  init_data(In, N);
  for (auto _ : state) {
    const T Factor = Acc[0];
#pragma clang loop unroll(full)
    for (unsigned k = 0; k < N; ++k) {
      Acc[k] += Factor * In[k];
    }
    // Keep the arrays observable so the loop is not optimized away.
    benchmark::DoNotOptimize(Acc);
    benchmark::DoNotOptimize(In);
    benchmark::ClobberMemory();
  }
}
// Register the three multiply-accumulate benchmark variants with T = double,
// for N = 2, N = 3 and N = 4.
BENCHMARK_TEMPLATE(benchmark_multiply_accumulate_runtime_checks_pass, 2,
double);
BENCHMARK_TEMPLATE(benchmark_multiply_accumulate_runtime_checks_fail, 2,
double);
BENCHMARK_TEMPLATE(benchmark_multiply_accumulate_no_runtime_checks_needed, 2,
double);
BENCHMARK_TEMPLATE(benchmark_multiply_accumulate_runtime_checks_pass, 3,
double);
BENCHMARK_TEMPLATE(benchmark_multiply_accumulate_runtime_checks_fail, 3,
double);
BENCHMARK_TEMPLATE(benchmark_multiply_accumulate_no_runtime_checks_needed, 3,
double);
BENCHMARK_TEMPLATE(benchmark_multiply_accumulate_runtime_checks_pass, 4,
double);
BENCHMARK_TEMPLATE(benchmark_multiply_accumulate_runtime_checks_fail, 4,
double);
BENCHMARK_TEMPLATE(benchmark_multiply_accumulate_no_runtime_checks_needed, 4,
double);