#include <malloc.h>
#include <memory.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
| |
// Per-level type aliases for the three nested loops (levels 0, 1 and 2).
// All of them expand to LOOP_TYPES, which is not defined in this file and
// is presumably supplied by the including test or the compile command
// (TODO confirm).

// Types of the induction variables i, j and k.
#define LOOP_IV_TYPE0 LOOP_TYPES
#define LOOP_IV_TYPE1 LOOP_TYPES
#define LOOP_IV_TYPE2 LOOP_TYPES

// Types of the iLB/iUB, jA0/jB0 and kA0/kB0 globals.
#define LOOP_TYPE0 LOOP_TYPES
#define LOOP_TYPE1 LOOP_TYPES
#define LOOP_TYPE2 LOOP_TYPES

// Types of the iStep, jA1/jB1/jStep and kA1/kB1/kStep globals.
#define LOOP_STYPE0 LOOP_TYPES
#define LOOP_STYPE1 LOOP_TYPES
#define LOOP_STYPE2 LOOP_TYPES

// Upper limit on the team size requested by test().
#define MAX_THREADS 256

// Diagnostics are printed only in VERBOSE builds; otherwise PRINTF expands
// to nothing and its arguments are never evaluated.
#ifdef VERBOSE
#define PRINTF(...) printf(__VA_ARGS__)
#else
#define PRINTF(...)
#endif
| |
| LOOP_TYPE0 iLB, iUB; |
| LOOP_TYPE1 jA0, jB0; |
| LOOP_TYPE2 kA0, kB0; |
| |
| LOOP_STYPE0 iStep; |
| LOOP_STYPE1 jA1, jB1, jStep; |
| LOOP_STYPE2 kA1, kB1, kStep; |
| |
// Relational operator used by each loop level. The supported operators are
// <=, <, >= and > (a != loop follows a different pattern and is not handled
// here). The level is selected by defining exactly one of LOOP_LEn, LOOP_LTn,
// LOOP_GEn or LOOP_GTn; keeping those selector macros around is also handy
// when building the calls of the test from main. If none is defined for a
// level, COMPAREn is left undefined.

#if defined(LOOP_LE0)
#define COMPARE0 <=
#elif defined(LOOP_LT0)
#define COMPARE0 <
#elif defined(LOOP_GE0)
#define COMPARE0 >=
#elif defined(LOOP_GT0)
#define COMPARE0 >
#endif

#if defined(LOOP_LE1)
#define COMPARE1 <=
#elif defined(LOOP_LT1)
#define COMPARE1 <
#elif defined(LOOP_GE1)
#define COMPARE1 >=
#elif defined(LOOP_GT1)
#define COMPARE1 >
#endif

#if defined(LOOP_LE2)
#define COMPARE2 <=
#elif defined(LOOP_LT2)
#define COMPARE2 <
#elif defined(LOOP_GE2)
#define COMPARE2 >=
#elif defined(LOOP_GT2)
#define COMPARE2 >
#endif
| |
| typedef struct { |
| LOOP_IV_TYPE0 i; |
| LOOP_IV_TYPE1 j; |
| LOOP_IV_TYPE2 k; |
| } spaceType; |
| |
| spaceType *AllocSpace(unsigned size) { |
| |
| spaceType *p = (spaceType *)malloc(size * sizeof(spaceType)); |
| memset(p, 0, size * sizeof(spaceType)); |
| return p; |
| } |
| |
| void FreeSpace(spaceType *space) { free(space); } |
| |
| // record an iteration |
| void Set(spaceType *space, unsigned count, unsigned trueCount, LOOP_IV_TYPE0 i, |
| LOOP_IV_TYPE1 j, LOOP_IV_TYPE0 k) { |
| if (count > trueCount) { |
| // number of iterations exceeded |
| // will be reported with checks |
| return; |
| } |
| space[count - 1].i = i; |
| space[count - 1].j = j; |
| space[count - 1].k = k; |
| } |
| int test() { |
| int pass = 1; |
| LOOP_IV_TYPE0 i; |
| LOOP_IV_TYPE1 j; |
| LOOP_IV_TYPE2 k; |
| |
| spaceType *openmpSpace; |
| spaceType *scalarSpace; |
| |
| unsigned trueCount = 0; |
| unsigned openmpCount = 0; |
| unsigned scalarCount = 0; |
| unsigned uselessThreadsOpenMP = 0; |
| unsigned usefulThreadsOpenMP = 0; |
| |
| // Use half of the available threads/logical processors. |
| unsigned num_threads = omp_get_max_threads() / 2; |
| |
| // Make sure num_threads is not 0 after the division in case |
| // omp_get_max_threads() returns 1. |
| if (num_threads == 0) |
| num_threads = 1; |
| |
| if (num_threads > MAX_THREADS) |
| num_threads = MAX_THREADS; |
| |
| unsigned long *chunkSizesOpenmp = |
| (unsigned long *)malloc(sizeof(unsigned long) * num_threads); |
| memset(chunkSizesOpenmp, 0, sizeof(unsigned long) * num_threads); |
| |
| // count iterations and allocate space |
| LOOP { ++trueCount; } |
| |
| openmpSpace = AllocSpace(trueCount); |
| scalarSpace = AllocSpace(trueCount); |
| |
| // fill the scalar (compare) space |
| LOOP { |
| ++scalarCount; |
| Set(scalarSpace, scalarCount, trueCount, i, j, k); |
| } |
| |
| // test run body: |
| // perform and record OpenMP iterations and thread use |
| #pragma omp parallel num_threads(num_threads) |
| { |
| unsigned gtid = omp_get_thread_num(); |
| #pragma omp for collapse(3) private(i, j, k) |
| LOOP { |
| unsigned count; |
| #pragma omp atomic update |
| ++chunkSizesOpenmp[gtid]; |
| #pragma omp atomic capture |
| count = ++openmpCount; |
| Set(openmpSpace, count, trueCount, i, j, k); |
| } |
| } |
| |
| // check for the right number of iterations processed |
| // (only need to check for less, greater is checked when recording) |
| if (openmpCount < trueCount) { |
| PRINTF("OpenMP FAILURE: Openmp processed fewer iterations: %d vs %d\n", |
| openmpCount, trueCount); |
| pass = 0; |
| } else if (openmpCount > trueCount) { |
| PRINTF("OpenMP FAILURE: Openmp processed more iterations: %d vs %d\n", |
| openmpCount, trueCount); |
| pass = 0; |
| } |
| |
| // check openMP for iteration correctnes against scalar |
| for (unsigned i = 0; i < trueCount; i++) { |
| unsigned j; |
| for (j = 0; j < openmpCount; j++) { |
| if ((scalarSpace[i].i == openmpSpace[j].i) && |
| (scalarSpace[i].j == openmpSpace[j].j) && |
| (scalarSpace[i].k == openmpSpace[j].k)) { |
| break; |
| } |
| } |
| if (j == openmpCount) { |
| PRINTF("OpenMP FAILURE: (%d %d %d) not processed\n", scalarSpace[i].i, |
| scalarSpace[i].j, scalarSpace[i].k); |
| pass = 0; |
| } |
| } |
| |
| // check for efficient thread use |
| for (unsigned i = 0; i < num_threads; ++i) { |
| if (chunkSizesOpenmp[i] == 0) { |
| ++uselessThreadsOpenMP; |
| } |
| } |
| |
| // a check to see if at least more than one thread was used (weakish) |
| if ((uselessThreadsOpenMP == num_threads - 1) && (trueCount > 1)) { |
| PRINTF("OpenMP FAILURE: threads are not used\n"); |
| pass = 0; |
| } |
| |
| #if 0 |
| // a check to see if the load was spread more or less evenly so that |
| // when there was more work than threads each one got at least something |
| // (stronger, but may currently fail for a general collapse case) |
| if ((trueCount >= num_threads) && (uselessThreadsOpenMP > 0)) { |
| PRINTF("OpenMP FAILURE: %d threads not used with %d iterations\n", |
| uselessThreadsOpenMP, openmpCount); |
| pass = 0; |
| } |
| #endif |
| |
| // clean up space |
| FreeSpace(openmpSpace); |
| FreeSpace(scalarSpace); |
| free(chunkSizesOpenmp); |
| return pass; |
| } |