blob: aba758f2871efa5e8e18a07207fd2e723ddb6da4 [file] [log] [blame] [edit]
#include <algorithm>
#include <functional>
#include <iostream>
/// N is divisible by common VFs. Used to test vector code path without the
/// scalar epilogue.
#define N 512
/// N_ODD is not divisible by common VFs (4, 8, 16) to test scalar epilogue.
#define N_ODD 511
/// N_SMALL is a small trip count divisible by common VFs
#define N_SMALL 64
template <typename Ty>
using TestFnTy = std::function<unsigned(std::function<void(Ty *)>)>;
// Helper to call \p f with \p args and acts as optimization barrier for \p f.
template <typename F, typename... Args>
__attribute__((optnone)) static auto callThroughOptnone(F &&f, Args &&...args) {
return f(std::forward<Args>(args)...);
}
template <typename Ty>
static void
checkVectorFunction(TestFnTy<Ty> ScalarFn, TestFnTy<Ty> VectorFn,
TestFnTy<Ty> ForcedVectorFn, TestFnTy<Ty> InterleavedFn,
TestFnTy<Ty> InterleavedOnlyFn, const char *Name) {
std::cout << "Checking " << Name << "\n";
std::array Tests = {std::make_pair(VectorFn, "autovec"),
std::make_pair(ForcedVectorFn, "vector-forced"),
std::make_pair(InterleavedFn, "interleave-forced"),
std::make_pair(InterleavedOnlyFn, "interleave-only")};
// Check finding the target element at all indices between 0 and N.
for (unsigned IdxToFind = 0; IdxToFind < N; ++IdxToFind) {
// Lambda to initialize all array elements to one, except the one to look
// for to zero.
auto Init1 = [IdxToFind](Ty *Arr) {
std::fill_n(Arr, N, 1);
if (IdxToFind < N)
Arr[IdxToFind] = 0;
};
// Lambda to initialize all array elements to one, except the one to look
// for and the IdxToFind + 3 to zero.
auto Init2 = [IdxToFind](Ty *Arr) {
std::fill_n(Arr, N, 1);
if (IdxToFind < N)
Arr[IdxToFind] = 0;
if (IdxToFind + 3 < N)
Arr[IdxToFind + 3] = 0;
};
auto Reference1 = ScalarFn(Init1);
auto Reference2 = ScalarFn(Init2);
// Run vector functions and check against the scalar result.
for (const auto &[Fn, Name] : Tests) {
auto ToCheck1 = callThroughOptnone(Fn, Init1);
if (Reference1 != ToCheck1) {
std::cerr << "Miscompare for " << Name << ": " << Reference1
<< " != " << ToCheck1 << "\n";
exit(1);
}
auto ToCheck2 = callThroughOptnone(Fn, Init2);
if (Reference2 != ToCheck2) {
std::cerr << "Miscompare for " << Name << ": " << Reference2
<< " != " << ToCheck2 << "\n";
exit(1);
}
}
}
}
// Test function type for three arrays
template <typename Ty>
using TestFnTy3 =
std::function<unsigned(std::function<void(Ty *, Ty *, Ty *)>)>;
// Unified check function for multiple early exits
template <typename Ty, unsigned NumExits, unsigned TripCount, unsigned Stride1,
unsigned Stride2, unsigned Stride3>
static void
checkVectorFunctionMulti(TestFnTy3<Ty> ScalarFn, TestFnTy3<Ty> VectorFn,
TestFnTy3<Ty> ForcedVectorFn,
TestFnTy3<Ty> InterleavedFn,
TestFnTy3<Ty> InterleavedOnlyFn, const char *Name) {
static_assert(NumExits == 2 || NumExits == 3, "NumExits must be 2 or 3");
std::cout << "Checking " << Name << "\n";
std::array Tests = {std::make_pair(VectorFn, "autovec"),
std::make_pair(ForcedVectorFn, "vector-forced"),
std::make_pair(InterleavedFn, "interleave-forced"),
std::make_pair(InterleavedOnlyFn, "interleave-only")};
// Test all combinations of exit positions
// Use stride to reduce test count while still covering important cases
for (unsigned Exit1Idx = 0; Exit1Idx <= TripCount; Exit1Idx += Stride1) {
for (unsigned Exit2Idx = 0; Exit2Idx <= TripCount; Exit2Idx += Stride2) {
for (unsigned Exit3Idx = 0; Exit3Idx <= TripCount; Exit3Idx += Stride3) {
auto Init = [Exit1Idx, Exit2Idx, Exit3Idx](Ty *A, Ty *B, Ty *C) {
// Fill arrays with different values
for (unsigned I = 0; I < TripCount; I++) {
A[I] = I + 1;
B[I] = I + TripCount;
C[I] = I + 2 * TripCount;
}
// Set up first exit condition: A[Exit1Idx] == 0
if (Exit1Idx < TripCount)
A[Exit1Idx] = 0;
// Set up second exit condition: A[Exit2Idx] == B[Exit2Idx]
if (Exit2Idx < TripCount)
B[Exit2Idx] = A[Exit2Idx];
// Set up third exit condition: B[Exit3Idx] == C[Exit3Idx]
if (Exit3Idx < TripCount)
C[Exit3Idx] = B[Exit3Idx];
};
auto Reference = ScalarFn(Init);
for (const auto &[Fn, FnName] : Tests) {
auto ToCheck = callThroughOptnone(Fn, Init);
if (Reference != ToCheck) {
if constexpr (NumExits == 2) {
std::cerr << "Miscompare for " << FnName
<< " at Exit1Idx=" << Exit1Idx
<< ", Exit2Idx=" << Exit2Idx << ": " << Reference
<< " != " << ToCheck << "\n";
} else {
std::cerr << "Miscompare for " << FnName
<< " at Exit1Idx=" << Exit1Idx
<< ", Exit2Idx=" << Exit2Idx
<< ", Exit3Idx=" << Exit3Idx << ": " << Reference
<< " != " << ToCheck << "\n";
}
exit(1);
}
}
}
}
}
// Explicit test for regular exit (no early exit taken)
// All ExitIdx = TripCount means no early exit condition is triggered
auto InitNoExit = [](Ty *A, Ty *B, Ty *C) {
for (unsigned I = 0; I < TripCount; I++) {
A[I] = I + 1;
B[I] = I + TripCount;
C[I] = I + 2 * TripCount;
}
};
auto Reference = callThroughOptnone(ScalarFn, InitNoExit);
for (const auto &[Fn, FnName] : Tests) {
auto ToCheck = callThroughOptnone(Fn, InitNoExit);
if (Reference != ToCheck) {
std::cerr << "Miscompare for " << FnName
<< " at regular exit (no early exit taken): " << Reference
<< " != " << ToCheck << "\n";
exit(1);
}
}
}
/// Define 3 test functions
/// * one with vectorization and interleaving disabled,
/// * one with auto-vectorization w/o any pragmas
/// * one where VF is picked automatically and interleaving is forced.
#define DEFINE_SCALAR_AND_VECTOR_EARLY_EXIT(Init, Src, Loop) \
auto ScalarFn = [](std::function<void(int *)> II) -> unsigned { \
Init; \
II(Src); \
_Pragma("clang loop vectorize(disable) interleave_count(1)") Loop \
}; \
auto VectorFn = [](std::function<void(int *)> II) -> unsigned { \
Init; \
II(Src); \
_Pragma("clang loop vectorize(enable)") Loop \
}; \
auto ForcedVectorFn = [](std::function<void(int *)> II) -> unsigned { \
Init; \
II(Src); \
_Pragma("clang loop vectorize_width(8) interleave_count(1)") Loop \
}; \
auto InterleavedFn = [](std::function<void(int *)> II) -> unsigned { \
Init; \
II(Src); \
_Pragma("clang loop vectorize(enable) interleave_count(4)") Loop \
}; \
auto InterleavedOnlyFn = [](std::function<void(int *)> II) -> unsigned { \
Init; \
II(Src); \
_Pragma("clang loop vectorize_width(1) interleave_count(4)") Loop \
};
/// Define test functions for two early exits (uses 3 arrays, third is unused).
#define DEFINE_SCALAR_AND_VECTOR_EARLY_EXIT_2(Init, Src1, Src2, Src3, Loop) \
auto ScalarFn = \
[](std::function<void(int *, int *, int *)> II) -> unsigned { \
Init II(Src1, Src2, Src3); \
_Pragma("clang loop vectorize(disable) interleave_count(1)") Loop \
}; \
auto VectorFn = \
[](std::function<void(int *, int *, int *)> II) -> unsigned { \
Init II(Src1, Src2, Src3); \
_Pragma("clang loop vectorize(enable)") Loop \
}; \
auto ForcedVectorFn = \
[](std::function<void(int *, int *, int *)> II) -> unsigned { \
Init II(Src1, Src2, Src3); \
_Pragma("clang loop vectorize_width(8) interleave_count(1)") Loop \
}; \
auto InterleavedFn = \
[](std::function<void(int *, int *, int *)> II) -> unsigned { \
Init II(Src1, Src2, Src3); \
_Pragma("clang loop vectorize(enable) interleave_count(4)") Loop \
}; \
auto InterleavedOnlyFn = \
[](std::function<void(int *, int *, int *)> II) -> unsigned { \
Init II(Src1, Src2, Src3); \
_Pragma("clang loop vectorize_width(1) interleave_count(4)") Loop \
};
/// Define test functions for three arrays (for three early exits)
#define DEFINE_SCALAR_AND_VECTOR_EARLY_EXIT_3(Init, Src1, Src2, Src3, Loop) \
auto ScalarFn = \
[](std::function<void(int *, int *, int *)> II) -> unsigned { \
Init II(Src1, Src2, Src3); \
_Pragma("clang loop vectorize(disable) interleave_count(1)") Loop \
}; \
auto VectorFn = \
[](std::function<void(int *, int *, int *)> II) -> unsigned { \
Init II(Src1, Src2, Src3); \
_Pragma("clang loop vectorize(enable)") Loop \
}; \
auto ForcedVectorFn = \
[](std::function<void(int *, int *, int *)> II) -> unsigned { \
Init II(Src1, Src2, Src3); \
_Pragma("clang loop vectorize_width(8) interleave_count(1)") Loop \
}; \
auto InterleavedFn = \
[](std::function<void(int *, int *, int *)> II) -> unsigned { \
Init II(Src1, Src2, Src3); \
_Pragma("clang loop vectorize(enable) interleave_count(4)") Loop \
}; \
auto InterleavedOnlyFn = \
[](std::function<void(int *, int *, int *)> II) -> unsigned { \
Init II(Src1, Src2, Src3); \
_Pragma("clang loop vectorize_width(1) interleave_count(4)") Loop \
};
int main(void) {
{
DEFINE_SCALAR_AND_VECTOR_EARLY_EXIT(
int A[N];, A, for (unsigned I = 0; I < N; I++) {
if (A[I] == 0)
return I;
} return -1;);
checkVectorFunction<int>(ScalarFn, VectorFn, ForcedVectorFn, InterleavedFn,
InterleavedOnlyFn, "early_exit_find_step_1");
}
{
DEFINE_SCALAR_AND_VECTOR_EARLY_EXIT(
int A[N]; unsigned J = 2;, A, for (unsigned I = 0; I < N; I++) {
if (A[I] == 0)
return J;
J += 3;
} return -1;);
checkVectorFunction<int>(ScalarFn, VectorFn, ForcedVectorFn, InterleavedFn,
InterleavedOnlyFn, "early_exit_find_step_3");
}
// Test: Two early exits to the same exit block.
// First exit: A[I] == 0
// Second exit: A[I] == B[I]
// This tests that exits are checked in the correct program order.
{
DEFINE_SCALAR_AND_VECTOR_EARLY_EXIT_2(
int A[N]; int B[N]; int C[N];
, A, B, C, for (unsigned I = 0; I < N; I++) {
if (A[I] == 0)
return I;
if (A[I] == B[I])
return I + 1000;
} return -1;);
checkVectorFunctionMulti<int, 2, N, 3, 3, N + 1>(
ScalarFn, VectorFn, ForcedVectorFn, InterleavedFn, InterleavedOnlyFn,
"two_early_exits_same_block");
}
// Test: Two early exits with different return values.
// First exit: A[I] == 0 -> return I
// Second exit: A[I] == B[I] -> return I * 2
// Latch exit: return -1
{
DEFINE_SCALAR_AND_VECTOR_EARLY_EXIT_2(
int A[N]; int B[N]; int C[N];
, A, B, C, for (unsigned I = 0; I < N; I++) {
if (A[I] == 0)
return I;
if (A[I] == B[I])
return I * 2;
} return -1;);
checkVectorFunctionMulti<int, 2, N, 3, 3, N + 1>(
ScalarFn, VectorFn, ForcedVectorFn, InterleavedFn, InterleavedOnlyFn,
"two_early_exits_different_values");
}
// Test: Two early exits with different comparison types.
// First exit: A[I] == 0
// Second exit: A[I] == B[I]
// Both use equality checks that checkVectorFunctionMulti can set up.
{
DEFINE_SCALAR_AND_VECTOR_EARLY_EXIT_2(
int A[N]; int B[N]; int C[N];
, A, B, C, for (unsigned I = 0; I < N; I++) {
if (A[I] == 0)
return I + 500;
if (A[I] == B[I])
return I;
} return -1;);
checkVectorFunctionMulti<int, 2, N, 3, 3, N + 1>(
ScalarFn, VectorFn, ForcedVectorFn, InterleavedFn, InterleavedOnlyFn,
"two_early_exits_swapped_returns");
}
// Test: Three early exits to the same exit block.
// First exit: A[I] == 0
// Second exit: A[I] == B[I]
// Third exit: B[I] == C[I]
{
DEFINE_SCALAR_AND_VECTOR_EARLY_EXIT_3(
int A[N]; int B[N]; int C[N];
, A, B, C, for (unsigned I = 0; I < N; I++) {
if (A[I] == 0)
return I;
if (A[I] == B[I])
return I + 1000;
if (B[I] == C[I])
return I + 2000;
} return -1;);
checkVectorFunctionMulti<int, 3, N, 17, 17, 17>(
ScalarFn, VectorFn, ForcedVectorFn, InterleavedFn, InterleavedOnlyFn,
"three_early_exits_same_block");
}
// Test: Three early exits with different return expressions.
{
DEFINE_SCALAR_AND_VECTOR_EARLY_EXIT_3(
int A[N]; int B[N]; int C[N];
, A, B, C, for (unsigned I = 0; I < N; I++) {
if (A[I] == 0)
return I;
if (A[I] == B[I])
return I * 2;
if (B[I] == C[I])
return I * 3;
} return -1;);
checkVectorFunctionMulti<int, 3, N, 17, 17, 17>(
ScalarFn, VectorFn, ForcedVectorFn, InterleavedFn, InterleavedOnlyFn,
"three_early_exits_different_values");
}
// Test: Two early exits with odd trip count (tests scalar epilogue).
{
DEFINE_SCALAR_AND_VECTOR_EARLY_EXIT_2(
int A[N_ODD]; int B[N_ODD]; int C[N_ODD];
, A, B, C, for (unsigned I = 0; I < N_ODD; I++) {
if (A[I] == 0)
return I;
if (A[I] == B[I])
return I + 1000;
} return -1;);
checkVectorFunctionMulti<int, 2, N_ODD, 3, 3, N_ODD + 1>(
ScalarFn, VectorFn, ForcedVectorFn, InterleavedFn, InterleavedOnlyFn,
"two_early_exits_odd_trip_count");
}
// Test: Two early exits with odd trip count and different return values.
{
DEFINE_SCALAR_AND_VECTOR_EARLY_EXIT_2(
int A[N_ODD]; int B[N_ODD]; int C[N_ODD];
, A, B, C, for (unsigned I = 0; I < N_ODD; I++) {
if (A[I] == 0)
return I;
if (A[I] == B[I])
return I * 2;
} return -1;);
checkVectorFunctionMulti<int, 2, N_ODD, 3, 3, N_ODD + 1>(
ScalarFn, VectorFn, ForcedVectorFn, InterleavedFn, InterleavedOnlyFn,
"two_early_exits_odd_different_values");
}
// Test: Three early exits with odd trip count.
{
DEFINE_SCALAR_AND_VECTOR_EARLY_EXIT_3(
int A[N_ODD]; int B[N_ODD]; int C[N_ODD];
, A, B, C, for (unsigned I = 0; I < N_ODD; I++) {
if (A[I] == 0)
return I;
if (A[I] == B[I])
return I + 1000;
if (B[I] == C[I])
return I + 2000;
} return -1;);
checkVectorFunctionMulti<int, 3, N_ODD, 17, 17, 17>(
ScalarFn, VectorFn, ForcedVectorFn, InterleavedFn, InterleavedOnlyFn,
"three_early_exits_odd_trip_count");
}
// Test: Two early exits with small known trip count (no scalar remainder).
{
DEFINE_SCALAR_AND_VECTOR_EARLY_EXIT_2(
int A[N_SMALL]; int B[N_SMALL]; int C[N_SMALL];
, A, B, C, for (unsigned I = 0; I < N_SMALL; I++) {
if (A[I] == 0)
return I;
if (A[I] == B[I])
return I + 1000;
} return -1;);
checkVectorFunctionMulti<int, 2, N_SMALL, 3, 3, N_SMALL + 1>(
ScalarFn, VectorFn, ForcedVectorFn, InterleavedFn, InterleavedOnlyFn,
"two_early_exits_small_known_tc");
}
// Test: Three early exits with small known trip count (no scalar remainder).
{
DEFINE_SCALAR_AND_VECTOR_EARLY_EXIT_3(
int A[N_SMALL]; int B[N_SMALL]; int C[N_SMALL];
, A, B, C, for (unsigned I = 0; I < N_SMALL; I++) {
if (A[I] == 0)
return I;
if (A[I] == B[I])
return I + 1000;
if (B[I] == C[I])
return I + 2000;
} return -1;);
checkVectorFunctionMulti<int, 3, N_SMALL, 7, 7, 7>(
ScalarFn, VectorFn, ForcedVectorFn, InterleavedFn, InterleavedOnlyFn,
"three_early_exits_small_known_tc");
}
// Test: Two early exits with small known trip count and different returns.
{
DEFINE_SCALAR_AND_VECTOR_EARLY_EXIT_2(
int A[N_SMALL]; int B[N_SMALL]; int C[N_SMALL];
, A, B, C, for (unsigned I = 0; I < N_SMALL; I++) {
if (A[I] == 0)
return I;
if (A[I] == B[I])
return I * 2;
} return -1;);
checkVectorFunctionMulti<int, 2, N_SMALL, 3, 3, N_SMALL + 1>(
ScalarFn, VectorFn, ForcedVectorFn, InterleavedFn, InterleavedOnlyFn,
"two_early_exits_small_different_values");
}
return 0;
}