| // This program tests performance impact of Interleaving Count with varying loop |
| // iteration count for different types of loops, such as loops with or |
| // without reductions inside it, loops with different vectorization widths. |
| #include <iostream> |
| #include <memory> |
| #include <random> |
| |
| #include "benchmark/benchmark.h" |
| |
| #define ELEMENTS 2048 |
| #define ALIGNED16 __attribute__((aligned(16))) |
| |
| static std::mt19937 rng; |
| unsigned int g_sum = 0; |
| |
| int A[ELEMENTS] ALIGNED16; |
| int B[ELEMENTS] ALIGNED16; |
| int C[ELEMENTS] ALIGNED16; |
| int D[ELEMENTS] ALIGNED16; |
| int E[ELEMENTS] ALIGNED16; |
| int F[ELEMENTS] ALIGNED16; |
| |
| // Initialize arrays with random numbers. |
| static void init_data(unsigned N) { |
| std::uniform_int_distribution<int> distrib(std::numeric_limits<int>::min(), |
| std::numeric_limits<int>::max()); |
| for (unsigned I = 0; I < N; I++) { |
| A[I] = distrib(rng); |
| B[I] = distrib(rng); |
| C[I] = distrib(rng); |
| D[I] = distrib(rng); |
| E[I] = distrib(rng); |
| F[I] = distrib(rng); |
| } |
| } |
| |
| static void __attribute__((always_inline)) |
| runBenchForLoopInterleaving(benchmark::State &state, int (*Fn)(int), |
| int Iterations) { |
| std::uniform_int_distribution<int> distrib(std::numeric_limits<int>::min(), |
| std::numeric_limits<int>::max()); |
| init_data(ELEMENTS); |
| for (auto _ : state) { |
| benchmark::DoNotOptimize(A); |
| benchmark::DoNotOptimize(B); |
| benchmark::DoNotOptimize(C); |
| benchmark::DoNotOptimize(D); |
| benchmark::DoNotOptimize(E); |
| benchmark::DoNotOptimize(F); |
| benchmark::ClobberMemory(); |
| g_sum += Fn(Iterations); |
| } |
| } |
| |
| #define STRINGIFY(a) #a |
| |
| // Loops without Reduction with different vectorization configurations |
| |
| static int __attribute__((noinline)) loopNoReductionAutoVec(int Iterations) { |
| #pragma clang loop unroll(disable) |
| for (int J = 0; J < Iterations; J++) { |
| A[J] = B[J] + C[J]; |
| } |
| return 0; |
| } |
| |
| static int __attribute__((noinline)) bigLoopNoReductionAutoVec(int Iterations) { |
| #pragma clang loop unroll(disable) |
| for (int J = 0; J < Iterations; J++) { |
| A[J] = B[J] + C[J]; |
| D[J]++; |
| E[J] *= 2; |
| F[J] /= 5; |
| } |
| return 0; |
| } |
| |
| #define loopNoReductionWithVecHint(vw, ic) \ |
| static int __attribute__((noinline)) \ |
| loopWithVW##vw##IC##ic(int Iterations) { \ |
| _Pragma(STRINGIFY(clang loop vectorize_width(vw) interleave_count( \ |
| ic))) for (int J = 0; J < Iterations; J++) { \ |
| A[J] = B[J] + C[J]; \ |
| } \ |
| return 0; \ |
| } |
| |
| #define bigLoopNoReductionWithVecHint(vw, ic) \ |
| static int __attribute__((noinline)) \ |
| bigLoopWithVW##vw##IC##ic(int Iterations) { \ |
| _Pragma(STRINGIFY(clang loop vectorize_width(vw) interleave_count( \ |
| ic))) for (int J = 0; J < Iterations; J++) { \ |
| A[J] = B[J] + C[J]; \ |
| D[J]++; \ |
| E[J] *= 2; \ |
| F[J] /= 5; \ |
| } \ |
| return 0; \ |
| } |
| |
| // Loops with Reduction with different vectorization configurations |
| |
| static int __attribute__((noinline)) loopWithReductionAutoVec(int Iterations) { |
| unsigned sum = 0; |
| #pragma clang loop unroll(disable) |
| for (int J = 0; J < Iterations; J++) { |
| sum += A[J]; |
| } |
| return sum; |
| } |
| |
| static int __attribute__((noinline)) |
| bigLoopWithReductionAutoVec(int Iterations) { |
| unsigned sum = 0; |
| #pragma clang loop unroll(disable) |
| for (int J = 0; J < Iterations; J++) { |
| sum += A[J]; |
| D[J]++; |
| E[J] *= 2; |
| F[J] /= 5; |
| } |
| return sum; |
| } |
| |
| #define loopWithReductionWithVecHint(vw, ic) \ |
| static int __attribute__((noinline)) \ |
| loopWithReductionWithVW##vw##IC##ic(int Iterations) { \ |
| unsigned sum = 0; \ |
| _Pragma(STRINGIFY(clang loop vectorize_width(vw) interleave_count( \ |
| ic))) for (int J = 0; J < Iterations; J++) { \ |
| sum += A[J]; \ |
| } \ |
| return sum; \ |
| } |
| |
| #define bigLoopWithReductionWithVecHint(vw, ic) \ |
| static int __attribute__((noinline)) \ |
| bigLoopWithReductionWithVW##vw##IC##ic(int Iterations) { \ |
| unsigned sum = 0; \ |
| _Pragma(STRINGIFY(clang loop vectorize_width(vw) interleave_count( \ |
| ic))) for (int J = 0; J < Iterations; J++) { \ |
| sum += A[J]; \ |
| D[J]++; \ |
| E[J] *= 2; \ |
| F[J] /= 5; \ |
| } \ |
| return sum; \ |
| } |
| |
| // We are evaluating 4 types of loops for different vectorization configurations |
| // 1) Loops without reductions |
| // 2) Loops with reductions |
| // 3) Bigger loop bodies without reductions |
| // 4) Bigger loop bodies with some reductions |
| // For each, we are evaluating the following vectorization configurations of |
| // vectorization width (VW), interleaving count (IC): |
| // 1) automatically selected by the compiler (without vectorization hint) |
| // 2) VW=4, IC=1 |
| // 3) VW=4, IC=2 |
| // 4) VW=4, IC=4 |
| // 5) VW=1, IC=1 |
| // 6) VW=1, IC=2 |
| // 7) VW=1, IC=4 |
| // Of these, configurations 5-7 are skipped for loop type 1 & 3). |
| // Creating a function for the above configurations with different Vectorization |
| // Hints: |
| loopNoReductionWithVecHint(4, 1); |
| loopNoReductionWithVecHint(4, 2); |
| loopNoReductionWithVecHint(4, 4); |
| loopWithReductionWithVecHint(4, 1); |
| loopWithReductionWithVecHint(4, 2); |
| loopWithReductionWithVecHint(4, 4); |
| loopWithReductionWithVecHint(1, 1); |
| loopWithReductionWithVecHint(1, 2); |
| loopWithReductionWithVecHint(1, 4); |
| bigLoopNoReductionWithVecHint(4, 1); |
| bigLoopNoReductionWithVecHint(4, 2); |
| bigLoopNoReductionWithVecHint(4, 4); |
| bigLoopWithReductionWithVecHint(4, 1); |
| bigLoopWithReductionWithVecHint(4, 2); |
| bigLoopWithReductionWithVecHint(4, 4); |
| bigLoopWithReductionWithVecHint(1, 1); |
| bigLoopWithReductionWithVecHint(1, 2); |
| bigLoopWithReductionWithVecHint(1, 4); |
| |
| #define ADD_BENCHMARK(Itr) \ |
| void benchAutoVecForLoopTC##Itr(benchmark::State &state) { \ |
| runBenchForLoopInterleaving(state, &loopNoReductionAutoVec, Itr); \ |
| } \ |
| BENCHMARK(benchAutoVecForLoopTC##Itr); \ |
| void benchForIC1VW4LoopTC##Itr(benchmark::State &state) { \ |
| runBenchForLoopInterleaving(state, &loopWithVW4IC1, Itr); \ |
| } \ |
| BENCHMARK(benchForIC1VW4LoopTC##Itr); \ |
| void benchForIC2VW4LoopTC##Itr(benchmark::State &state) { \ |
| runBenchForLoopInterleaving(state, &loopWithVW4IC2, Itr); \ |
| } \ |
| BENCHMARK(benchForIC2VW4LoopTC##Itr); \ |
| void benchForIC4VW4LoopTC##Itr(benchmark::State &state) { \ |
| runBenchForLoopInterleaving(state, &loopWithVW4IC4, Itr); \ |
| } \ |
| BENCHMARK(benchForIC4VW4LoopTC##Itr); \ |
| void benchForLoopWithReductionAutoVecTC##Itr(benchmark::State &state) { \ |
| runBenchForLoopInterleaving(state, &loopWithReductionAutoVec, Itr); \ |
| } \ |
| BENCHMARK(benchForLoopWithReductionAutoVecTC##Itr); \ |
| void benchForIC1VW4LoopWithReductionTC##Itr(benchmark::State &state) { \ |
| runBenchForLoopInterleaving(state, &loopWithReductionWithVW4IC1, Itr); \ |
| } \ |
| BENCHMARK(benchForIC1VW4LoopWithReductionTC##Itr); \ |
| void benchForIC2VW4LoopWithReductionTC##Itr(benchmark::State &state) { \ |
| runBenchForLoopInterleaving(state, &loopWithReductionWithVW4IC2, Itr); \ |
| } \ |
| BENCHMARK(benchForIC2VW4LoopWithReductionTC##Itr); \ |
| void benchForIC4VW4LoopWithReductionTC##Itr(benchmark::State &state) { \ |
| runBenchForLoopInterleaving(state, &loopWithReductionWithVW4IC4, Itr); \ |
| } \ |
| BENCHMARK(benchForIC4VW4LoopWithReductionTC##Itr); \ |
| void benchForIC1VW1LoopWithReductionTC##Itr(benchmark::State &state) { \ |
| runBenchForLoopInterleaving(state, &loopWithReductionWithVW1IC1, Itr); \ |
| } \ |
| BENCHMARK(benchForIC1VW1LoopWithReductionTC##Itr); \ |
| void benchForIC2VW1LoopWithReductionTC##Itr(benchmark::State &state) { \ |
| runBenchForLoopInterleaving(state, &loopWithReductionWithVW1IC2, Itr); \ |
| } \ |
| BENCHMARK(benchForIC2VW1LoopWithReductionTC##Itr); \ |
| void benchForIC4VW1LoopWithReductionTC##Itr(benchmark::State &state) { \ |
| runBenchForLoopInterleaving(state, &loopWithReductionWithVW1IC4, Itr); \ |
| } \ |
| BENCHMARK(benchForIC4VW1LoopWithReductionTC##Itr); \ |
| void benchAutoVecForBigLoopTC##Itr(benchmark::State &state) { \ |
| runBenchForLoopInterleaving(state, &loopNoReductionAutoVec, Itr); \ |
| } \ |
| BENCHMARK(benchAutoVecForBigLoopTC##Itr); \ |
| void benchForIC1VW4BigLoopTC##Itr(benchmark::State &state) { \ |
| runBenchForLoopInterleaving(state, &bigLoopWithVW4IC1, Itr); \ |
| } \ |
| BENCHMARK(benchForIC1VW4BigLoopTC##Itr); \ |
| void benchForIC2VW4BigLoopTC##Itr(benchmark::State &state) { \ |
| runBenchForLoopInterleaving(state, &bigLoopWithVW4IC2, Itr); \ |
| } \ |
| BENCHMARK(benchForIC2VW4BigLoopTC##Itr); \ |
| void benchForIC4VW4BigLoopTC##Itr(benchmark::State &state) { \ |
| runBenchForLoopInterleaving(state, &bigLoopWithVW4IC4, Itr); \ |
| } \ |
| BENCHMARK(benchForIC4VW4BigLoopTC##Itr); \ |
| void benchForBigLoopWithReductionAutoVecTC##Itr(benchmark::State &state) { \ |
| runBenchForLoopInterleaving(state, &bigLoopWithReductionAutoVec, Itr); \ |
| } \ |
| BENCHMARK(benchForBigLoopWithReductionAutoVecTC##Itr); \ |
| void benchForIC1VW4BigLoopWithReductionTC##Itr(benchmark::State &state) { \ |
| runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW4IC1, Itr); \ |
| } \ |
| BENCHMARK(benchForIC1VW4BigLoopWithReductionTC##Itr); \ |
| void benchForIC2VW4BigLoopWithReductionTC##Itr(benchmark::State &state) { \ |
| runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW4IC2, Itr); \ |
| } \ |
| BENCHMARK(benchForIC2VW4BigLoopWithReductionTC##Itr); \ |
| void benchForIC4VW4BigLoopWithReductionTC##Itr(benchmark::State &state) { \ |
| runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW4IC4, Itr); \ |
| } \ |
| BENCHMARK(benchForIC4VW4BigLoopWithReductionTC##Itr); \ |
| void benchForIC1VW1BigLoopWithReductionTC##Itr(benchmark::State &state) { \ |
| runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW1IC1, Itr); \ |
| } \ |
| BENCHMARK(benchForIC1VW1BigLoopWithReductionTC##Itr); \ |
| void benchForIC2VW1BigLoopWithReductionTC##Itr(benchmark::State &state) { \ |
| runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW1IC2, Itr); \ |
| } \ |
| BENCHMARK(benchForIC2VW1BigLoopWithReductionTC##Itr); \ |
| void benchForIC4VW1BigLoopWithReductionTC##Itr(benchmark::State &state) { \ |
| runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW1IC4, Itr); \ |
| } \ |
| BENCHMARK(benchForIC4VW1BigLoopWithReductionTC##Itr); |
| |
| #ifdef ALL_LOOP_IC_TESTS |
| ADD_BENCHMARK(1) |
| ADD_BENCHMARK(2) |
| ADD_BENCHMARK(3) |
| ADD_BENCHMARK(4) |
| ADD_BENCHMARK(5) |
| ADD_BENCHMARK(6) |
| ADD_BENCHMARK(7) |
| ADD_BENCHMARK(8) |
| ADD_BENCHMARK(9) |
| ADD_BENCHMARK(10) |
| ADD_BENCHMARK(11) |
| ADD_BENCHMARK(12) |
| ADD_BENCHMARK(13) |
| ADD_BENCHMARK(14) |
| ADD_BENCHMARK(15) |
| ADD_BENCHMARK(16) |
| ADD_BENCHMARK(17) |
| ADD_BENCHMARK(18) |
| ADD_BENCHMARK(19) |
| ADD_BENCHMARK(20) |
| ADD_BENCHMARK(21) |
| ADD_BENCHMARK(22) |
| ADD_BENCHMARK(23) |
| ADD_BENCHMARK(24) |
| ADD_BENCHMARK(25) |
| ADD_BENCHMARK(26) |
| ADD_BENCHMARK(27) |
| ADD_BENCHMARK(28) |
| ADD_BENCHMARK(29) |
| ADD_BENCHMARK(30) |
| ADD_BENCHMARK(31) |
| ADD_BENCHMARK(32) |
| ADD_BENCHMARK(33) |
| ADD_BENCHMARK(34) |
| ADD_BENCHMARK(35) |
| ADD_BENCHMARK(36) |
| ADD_BENCHMARK(37) |
| ADD_BENCHMARK(38) |
| ADD_BENCHMARK(39) |
| ADD_BENCHMARK(40) |
| ADD_BENCHMARK(41) |
| ADD_BENCHMARK(42) |
| ADD_BENCHMARK(43) |
| ADD_BENCHMARK(44) |
| ADD_BENCHMARK(45) |
| ADD_BENCHMARK(46) |
| ADD_BENCHMARK(47) |
| ADD_BENCHMARK(48) |
| ADD_BENCHMARK(49) |
| ADD_BENCHMARK(50) |
| ADD_BENCHMARK(51) |
| ADD_BENCHMARK(52) |
| ADD_BENCHMARK(53) |
| ADD_BENCHMARK(54) |
| ADD_BENCHMARK(55) |
| ADD_BENCHMARK(56) |
| ADD_BENCHMARK(57) |
| ADD_BENCHMARK(58) |
| ADD_BENCHMARK(59) |
| ADD_BENCHMARK(60) |
| ADD_BENCHMARK(61) |
| ADD_BENCHMARK(62) |
| ADD_BENCHMARK(63) |
| ADD_BENCHMARK(64) |
| ADD_BENCHMARK(65) |
| ADD_BENCHMARK(66) |
| ADD_BENCHMARK(67) |
| ADD_BENCHMARK(68) |
| ADD_BENCHMARK(69) |
| ADD_BENCHMARK(70) |
| ADD_BENCHMARK(71) |
| ADD_BENCHMARK(72) |
| ADD_BENCHMARK(73) |
| ADD_BENCHMARK(74) |
| ADD_BENCHMARK(75) |
| ADD_BENCHMARK(76) |
| ADD_BENCHMARK(77) |
| ADD_BENCHMARK(78) |
| ADD_BENCHMARK(79) |
| ADD_BENCHMARK(80) |
| ADD_BENCHMARK(81) |
| ADD_BENCHMARK(82) |
| ADD_BENCHMARK(83) |
| ADD_BENCHMARK(84) |
| ADD_BENCHMARK(85) |
| ADD_BENCHMARK(86) |
| ADD_BENCHMARK(87) |
| ADD_BENCHMARK(88) |
| ADD_BENCHMARK(89) |
| ADD_BENCHMARK(90) |
| ADD_BENCHMARK(91) |
| ADD_BENCHMARK(92) |
| ADD_BENCHMARK(93) |
| ADD_BENCHMARK(94) |
| ADD_BENCHMARK(95) |
| ADD_BENCHMARK(96) |
| ADD_BENCHMARK(97) |
| ADD_BENCHMARK(98) |
| ADD_BENCHMARK(99) |
| ADD_BENCHMARK(100) |
| ADD_BENCHMARK(101) |
| ADD_BENCHMARK(102) |
| ADD_BENCHMARK(103) |
| ADD_BENCHMARK(104) |
| ADD_BENCHMARK(105) |
| ADD_BENCHMARK(106) |
| ADD_BENCHMARK(107) |
| ADD_BENCHMARK(108) |
| ADD_BENCHMARK(109) |
| ADD_BENCHMARK(110) |
| ADD_BENCHMARK(111) |
| ADD_BENCHMARK(112) |
| ADD_BENCHMARK(113) |
| ADD_BENCHMARK(114) |
| ADD_BENCHMARK(115) |
| ADD_BENCHMARK(116) |
| ADD_BENCHMARK(117) |
| ADD_BENCHMARK(118) |
| ADD_BENCHMARK(119) |
| ADD_BENCHMARK(120) |
| ADD_BENCHMARK(121) |
| ADD_BENCHMARK(122) |
| ADD_BENCHMARK(123) |
| ADD_BENCHMARK(124) |
| ADD_BENCHMARK(125) |
| ADD_BENCHMARK(126) |
| ADD_BENCHMARK(127) |
| ADD_BENCHMARK(128) |
| #else |
| ADD_BENCHMARK(1) |
| ADD_BENCHMARK(2) |
| ADD_BENCHMARK(3) |
| ADD_BENCHMARK(4) |
| ADD_BENCHMARK(7) |
| ADD_BENCHMARK(8) |
| ADD_BENCHMARK(15) |
| ADD_BENCHMARK(16) |
| ADD_BENCHMARK(31) |
| ADD_BENCHMARK(32) |
| ADD_BENCHMARK(63) |
| ADD_BENCHMARK(64) |
| ADD_BENCHMARK(127) |
| ADD_BENCHMARK(128) |
| #endif |